In [1]:
import mmh3
import numpy as np
import pandas as pd

from utils import *
from tqdm.auto import tqdm

# Задание 1

In [2]:
from bloom_filter import BloomFilter

In [3]:
# Filter sizes to try; each value is passed to the Bloom filter constructor.
bloom_sizes = [8, 64, 1_024, 64_000, 16_000_000]
# Key-set sizes; each value selects the input file bloom_<size>_seq.txt and is
# used as the expected number of lines for the progress bar.
set_sizes = [5, 50, 500, 5_000, 5_000_000]

Сгенерируем файлы с уникальными ключами для тестирования

In [3]:
# One-off generation of the input files with unique keys, one file per set size
# (the experiments below read them back as bloom_<size>_seq.txt).
# NOTE(review): left commented out, presumably because the files already exist;
# uncomment to regenerate them.
# for size in set_sizes:
#     gen_uniq_seq(f'bloom_{size}_seq.txt', size)

Протестируем реализованный фильтр

In [4]:
# Task 1 experiment: for every (filter size, key-set size) pair, stream the keys
# of the matching file through a fresh BloomFilter and record how it behaved.
# Columns of matrix_res:
#   0 - filter size, 1 - key-set size,
#   2 - number of keys the filter reported as present before they were inserted,
#   3 - value returned by b_filter.size() after all insertions.
n_set_sizes = len(set_sizes)
matrix_res = np.zeros((len(bloom_sizes) * n_set_sizes, 4), dtype=int)

for bloom_pos, b_size in enumerate(bloom_sizes):
    for set_pos, set_size in enumerate(set_sizes):
        # Rows are laid out filter-size-major, mirroring the nested loop order.
        # Indexing a single row gives a view, so writes land in matrix_res.
        row = matrix_res[bloom_pos * n_set_sizes + set_pos]
        row[:2] = b_size, set_size

        b_filter = BloomFilter(b_size)

        with open(f'bloom_{set_size}_seq.txt', 'r') as f:
            progress = tqdm(
                f,
                total=set_size,
                desc=f'bloom {b_size} with {set_size} seq'
            )
            for s in progress:
                # Query first, insert second: a hit here happens before this
                # line has been added to the filter.
                if b_filter.get(s):
                    row[2] += 1
                b_filter.put(s)

        row[3] += b_filter.size()

bloom 8 with 5 seq:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 8 with 50 seq:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 8 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 8 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 8 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64 with 5 seq:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64 with 50 seq:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5 seq:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 1024 with 50 seq:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 1024 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64000 with 5 seq:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64000 with 50 seq:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64000 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64000 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64000 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 16000000 with 5 seq:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 16000000 with 50 seq:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 16000000 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 16000000 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 16000000 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

In [5]:
# Label the Task 1 result columns and save the table as CSV (without the index).
task_1_columns = ['bf_size', 'set_size', 'fp_count', 'ones_count']
table_res = pd.DataFrame(data=matrix_res, columns=task_1_columns)
table_res.to_csv('result_task_1.csv', index=False)

In [6]:
# Render the Task 1 results table as this cell's output.
table_res

Unnamed: 0,bf_size,set_size,fp_count,ones_count
0,8,5,1,4
1,8,50,42,8
2,8,500,492,8
3,8,5000,4992,8
4,8,5000000,4999992,8
5,64,5,0,5
6,64,50,18,32
7,64,500,436,64
8,64,5000,4936,64
9,64,5000000,4999936,64


**Вывод:** 
1. При увеличении размера фильтра уменьшается число ложноположительных ответов (хеш уже существует, но такую строку ещё не добавляли). Важно выбирать оптимальный размер фильтра относительно размера текста, т.е. чем меньше выборка с текстом, тем меньший фильтр можно задавать.

# Задание 2

In [7]:
from bloom_filter_n_hash import BloomFilterNHash

In [8]:
# Numbers of hash functions to try in Task 2: 1 through 4.
count_hashes = list(range(1, 5))

Протестируем реализованный фильтр

In [9]:
# Task 2 experiment: same measurement as Task 1, but for every combination of
# filter size, key-set size and number of hash functions.
# Columns of matrix_res_hashes:
#   0 - number of hashes, 1 - filter size, 2 - key-set size,
#   3 - number of keys reported as present before they were inserted,
#   4 - value returned by b_filter.size() after all insertions.
total_runs = len(bloom_sizes) * len(set_sizes) * len(count_hashes)
matrix_res_hashes = np.zeros((total_runs, 5), dtype=int)

# Iterating a 2-D array yields one row view per step, in the same order in
# which the nested loops below visit the parameter combinations.
result_rows = iter(matrix_res_hashes)

for b_size in bloom_sizes:
    for set_size in set_sizes:
        for k_hash in count_hashes:
            row = next(result_rows)
            row[:3] = k_hash, b_size, set_size

            b_filter = BloomFilterNHash(k_hash, b_size)

            with open(f'bloom_{set_size}_seq.txt', 'r') as f:
                progress = tqdm(
                    f,
                    total=set_size,
                    desc=f'bloom {b_size} with {set_size} seq and with {k_hash} hashes'
                )
                for s in progress:
                    # Query before inserting so a hit cannot come from this line.
                    if b_filter.get(s):
                        row[3] += 1
                    b_filter.put(s)

            row[4] += b_filter.size()

bloom 8 with 5 seq and with 1 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 8 with 5 seq and with 2 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 8 with 5 seq and with 3 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 8 with 5 seq and with 4 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 8 with 50 seq and with 1 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 8 with 50 seq and with 2 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 8 with 50 seq and with 3 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 8 with 50 seq and with 4 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 8 with 500 seq and with 1 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 8 with 500 seq and with 2 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 8 with 500 seq and with 3 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 8 with 500 seq and with 4 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 8 with 5000 seq and with 1 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 8 with 5000 seq and with 2 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 8 with 5000 seq and with 3 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 8 with 5000 seq and with 4 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 8 with 5000000 seq and with 1 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 8 with 5000000 seq and with 2 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 8 with 5000000 seq and with 3 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 8 with 5000000 seq and with 4 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64 with 5 seq and with 1 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64 with 5 seq and with 2 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64 with 5 seq and with 3 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64 with 5 seq and with 4 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64 with 50 seq and with 1 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64 with 50 seq and with 2 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64 with 50 seq and with 3 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64 with 50 seq and with 4 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64 with 500 seq and with 1 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64 with 500 seq and with 2 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64 with 500 seq and with 3 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64 with 500 seq and with 4 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64 with 5000 seq and with 1 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64 with 5000 seq and with 2 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64 with 5000 seq and with 3 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64 with 5000 seq and with 4 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64 with 5000000 seq and with 1 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64 with 5000000 seq and with 2 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64 with 5000000 seq and with 3 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64 with 5000000 seq and with 4 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5 seq and with 1 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 1024 with 5 seq and with 2 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 1024 with 5 seq and with 3 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 1024 with 5 seq and with 4 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 1024 with 50 seq and with 1 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 1024 with 50 seq and with 2 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 1024 with 50 seq and with 3 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 1024 with 50 seq and with 4 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 1024 with 500 seq and with 1 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 500 seq and with 2 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 500 seq and with 3 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 500 seq and with 4 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 5000 seq and with 1 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000 seq and with 2 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000 seq and with 3 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000 seq and with 4 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq and with 1 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq and with 2 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq and with 3 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq and with 4 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64000 with 5 seq and with 1 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64000 with 5 seq and with 2 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64000 with 5 seq and with 3 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64000 with 5 seq and with 4 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 64000 with 50 seq and with 1 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64000 with 50 seq and with 2 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64000 with 50 seq and with 3 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64000 with 50 seq and with 4 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 64000 with 500 seq and with 1 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64000 with 500 seq and with 2 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64000 with 500 seq and with 3 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64000 with 500 seq and with 4 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 64000 with 5000 seq and with 1 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64000 with 5000 seq and with 2 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64000 with 5000 seq and with 3 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64000 with 5000 seq and with 4 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 64000 with 5000000 seq and with 1 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64000 with 5000000 seq and with 2 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64000 with 5000000 seq and with 3 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 64000 with 5000000 seq and with 4 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 16000000 with 5 seq and with 1 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 16000000 with 5 seq and with 2 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 16000000 with 5 seq and with 3 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 16000000 with 5 seq and with 4 hashes:   0%|          | 0/5 [00:00<?, ?it/s]

bloom 16000000 with 50 seq and with 1 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 16000000 with 50 seq and with 2 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 16000000 with 50 seq and with 3 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 16000000 with 50 seq and with 4 hashes:   0%|          | 0/50 [00:00<?, ?it/s]

bloom 16000000 with 500 seq and with 1 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 16000000 with 500 seq and with 2 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 16000000 with 500 seq and with 3 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 16000000 with 500 seq and with 4 hashes:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 16000000 with 5000 seq and with 1 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 16000000 with 5000 seq and with 2 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 16000000 with 5000 seq and with 3 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 16000000 with 5000 seq and with 4 hashes:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 16000000 with 5000000 seq and with 1 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 16000000 with 5000000 seq and with 2 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 16000000 with 5000000 seq and with 3 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 16000000 with 5000000 seq and with 4 hashes:   0%|          | 0/5000000 [00:00<?, ?it/s]

In [10]:
# Label the Task 2 result columns and save the table as CSV (without the index).
task_2_columns = ['k_hash', 'bf_size', 'set_size', 'fp_count', 'ones_count']
table_res_2 = pd.DataFrame(data=matrix_res_hashes, columns=task_2_columns)
table_res_2.to_csv('result_task_2.csv', index=False)

In [11]:
# Temporarily raise pandas' display.max_rows to 1000 so the whole Task 2 table
# is rendered; the option is restored when the `with` block exits.
with pd.option_context("display.max_rows", 1000):
    display(table_res_2)

Unnamed: 0,k_hash,bf_size,set_size,fp_count,ones_count
0,1,8,5,1,4
1,2,8,5,1,3
2,3,8,5,2,2
3,4,8,5,2,2
4,1,8,50,42,8
5,2,8,50,45,4
6,3,8,50,45,2
7,4,8,50,47,2
8,1,8,500,492,8
9,2,8,500,495,4


**Вывод**:

При правильном выборе размера фильтра добавление hash-функций (когда для одной строки считается несколько hash-функций) в среднем уменьшает число ложноположительных срабатываний.

Но при большом количестве hash-функций может произойти обратное, так как существует большая вероятность, что для двух разных значений различные hash-функции дадут одинаковый результат.

# Задание 3

In [12]:
from counter_bloom_filter import ConutersBloomFilter

Определим новые параметры для фильтра (иначе слишком много тестовых значений)

In [13]:
# Reduced parameter grid for the counting Bloom filter experiment (Task 3).
counters_bloom_sizes = [1_024, 1_600_000]      # passed as filter_size
counters_set_sizes = [500, 5_000, 5_000_000]   # selects bloom_<size>_seq.txt
counters_count_hashes = [1, 3]                 # passed as hash_num
counters_nums = [1, 3, 5]                      # passed as counter_num

Протестируем реализованный фильтр

In [15]:
# Task 3 experiment: run the counting Bloom filter over every combination of
# filter size, key-set size, number of hashes and counter_num.
# Columns of matrix_res_counters:
#   0 - filter size, 1 - key-set size, 2 - number of hashes, 3 - counter_num,
#   4 - number of keys reported as present before they were inserted,
#   5 - value returned by b_filter.size() after all insertions.
row_count = (
    len(counters_bloom_sizes)
    * len(counters_set_sizes)
    * len(counters_count_hashes)
    * len(counters_nums)
)
matrix_res_counters = np.zeros((row_count, 6), dtype=int)

# Iterating a 2-D array yields one row view per step, in the same order in
# which the nested loops below visit the parameter combinations.
result_rows = iter(matrix_res_counters)

for b_size in counters_bloom_sizes:
    for set_size in counters_set_sizes:
        for k_hash in counters_count_hashes:
            for c_nums in counters_nums:
                row = next(result_rows)
                row[:4] = b_size, set_size, k_hash, c_nums

                b_filter = ConutersBloomFilter(hash_num=k_hash, filter_size=b_size, counter_num=c_nums)

                with open(f'bloom_{set_size}_seq.txt', 'r') as f:
                    progress = tqdm(
                        f,
                        total=set_size,
                        desc=f'bloom {b_size} with {set_size} seq, {k_hash} hashes, {c_nums} counters'
                    )
                    for s in progress:
                        # Query before inserting so a hit cannot come from this line.
                        if b_filter.get(s):
                            row[4] += 1
                        b_filter.put(s)

                row[5] = b_filter.size()

bloom 1024 with 500 seq, 1 hashes, 1 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 500 seq, 1 hashes, 3 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 500 seq, 1 hashes, 5 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 500 seq, 3 hashes, 1 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 500 seq, 3 hashes, 3 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 500 seq, 3 hashes, 5 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1024 with 5000 seq, 1 hashes, 1 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000 seq, 1 hashes, 3 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000 seq, 1 hashes, 5 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000 seq, 3 hashes, 1 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000 seq, 3 hashes, 3 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000 seq, 3 hashes, 5 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq, 1 hashes, 1 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq, 1 hashes, 3 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq, 1 hashes, 5 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq, 3 hashes, 1 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq, 3 hashes, 3 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1024 with 5000000 seq, 3 hashes, 5 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1600000 with 500 seq, 1 hashes, 1 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1600000 with 500 seq, 1 hashes, 3 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1600000 with 500 seq, 1 hashes, 5 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1600000 with 500 seq, 3 hashes, 1 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1600000 with 500 seq, 3 hashes, 3 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1600000 with 500 seq, 3 hashes, 5 counters:   0%|          | 0/500 [00:00<?, ?it/s]

bloom 1600000 with 5000 seq, 1 hashes, 1 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1600000 with 5000 seq, 1 hashes, 3 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1600000 with 5000 seq, 1 hashes, 5 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1600000 with 5000 seq, 3 hashes, 1 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1600000 with 5000 seq, 3 hashes, 3 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1600000 with 5000 seq, 3 hashes, 5 counters:   0%|          | 0/5000 [00:00<?, ?it/s]

bloom 1600000 with 5000000 seq, 1 hashes, 1 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1600000 with 5000000 seq, 1 hashes, 3 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1600000 with 5000000 seq, 1 hashes, 5 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1600000 with 5000000 seq, 3 hashes, 1 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1600000 with 5000000 seq, 3 hashes, 3 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

bloom 1600000 with 5000000 seq, 3 hashes, 5 counters:   0%|          | 0/5000000 [00:00<?, ?it/s]

In [16]:
# Label the Task 3 result columns and save the table as CSV (without the index).
task_3_columns = ['bf_size', 'set_size', 'k_hash', 'count_num', 'fp_count', 'ones_count']
table_res_3 = pd.DataFrame(data=matrix_res_counters, columns=task_3_columns)
table_res_3.to_csv('result_task_3.csv', index=False)

In [17]:
# Temporarily raise pandas' display.max_rows to 1000 so the whole Task 3 table
# is rendered; the option is restored when the `with` block exits.
with pd.option_context("display.max_rows", 1000):
    display(table_res_3)

Unnamed: 0,bf_size,set_size,k_hash,count_num,fp_count,ones_count
0,1024,500,1,1,105,395
1,1024,500,1,3,105,497
2,1024,500,1,5,105,500
3,1024,500,3,1,85,264
4,1024,500,3,3,85,473
5,1024,500,3,5,85,500
6,1024,5000,1,1,3985,1015
7,1024,5000,1,3,3985,2891
8,1024,5000,1,5,3985,4767
9,1024,5000,3,1,4380,341


**Вывод**:


При count_thres (пороге), равном 1, данный фильтр работает как простой BloomFilter.

Порог должен быть не больше, чем минимальное количество вхождений одного ключа, иначе этот ключ не будет считаться присутствующим, даже если мы его добавим.

Повышая порог, мы можем уменьшать вероятность ошибки. Ведение счётчика позволит нам не только добавлять элементы, но и удалять их.

# Задание 4

In [18]:
from hyper_log_log import HyperLogLog

Протестируем реализованный класс

In [19]:
# Task 4 experiment: feed each key file into a HyperLogLog built with a given
# precision parameter and record the estimated number of distinct keys.

# Precision values to try; each one is passed to the HyperLogLog constructor.
precision = [4, 6, 8, 16]
# Dedicated name (like the counters_* lists of Task 3) so the `set_sizes` list
# used by the Task 1/2 cells is not overwritten; otherwise re-running those
# cells after this one would silently use a different grid.
hll_set_sizes = [500, 5000, 5000000]

# Columns: 0 - precision parameter, 1 - real number of keys in the file,
# 2 - value returned by hll.est_size() (stored as an integer because of dtype=int).
matrix_res_hll = np.zeros(
    (len(precision) * len(hll_set_sizes), 3),
    dtype=int
)

curent_row_idx = 0
for b in precision:
    for set_size in hll_set_sizes:
        hll = HyperLogLog(b)
        with open(f'bloom_{set_size}_seq.txt', 'r') as f:
            s_iter = tqdm(
                (line for line in f),
                total=set_size,
                desc=f'precision {b} with {set_size} seq'
            )
            for s in s_iter:
                hll.put(s)

        matrix_res_hll[curent_row_idx, 0] = b
        matrix_res_hll[curent_row_idx, 1] = set_size
        matrix_res_hll[curent_row_idx, 2] = hll.est_size()

        curent_row_idx += 1

precision 4 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

precision 4 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

precision 4 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

precision 6 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

precision 6 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

precision 6 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

precision 8 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

precision 8 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

precision 8 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

precision 16 with 500 seq:   0%|          | 0/500 [00:00<?, ?it/s]

precision 16 with 5000 seq:   0%|          | 0/5000 [00:00<?, ?it/s]

precision 16 with 5000000 seq:   0%|          | 0/5000000 [00:00<?, ?it/s]

In [20]:
# Label the Task 4 result columns, save the table as CSV (without the index)
# and render it as the cell output.
# The column names must not carry trailing spaces: they would end up in the CSV
# header and make lookups such as table_res_4['param'] fail.
table_res_4 = pd.DataFrame(matrix_res_hll,
                         columns=['param', 'set_size', 'pred_set_size'])
table_res_4.to_csv('result_task_4.csv', index=False)
table_res_4

Unnamed: 0,param,set_size,pred_set_size
0,4,500,160
1,4,5000,191
2,4,5000000,165
3,6,500,2743
4,6,5000,2051
5,6,5000000,2009
6,8,500,1310
7,8,5000,24087
8,8,5000000,26602
9,16,500,216


**Вывод**:

При увеличении параметра b увеличивается число регистров и тем самым уменьшается ошибка.

# Задание 5

In [21]:
# Добавлена поддержка генерации с определенным ключем
from task5 import gen_grouped_seq
from task5 import read_csv_keys, count_keys

Сгенерируем 2 файла с повторяющимися ключами (уменьшим кол-во строк с 10 миллиардов до 1 миллиона для экономии ресурсов)

In [23]:
# Total number of rows in each generated Task 5 table.
target_table_row_num = 1_000_000

In [18]:
# Explicit import: the keys below are built with uuid.uuid4(), and no other cell
# of the notebook imports uuid directly, so a fresh-kernel run must not depend
# on a star import or on leftover kernel state.
import uuid

# Occurrence counts for the six repeated keys.
repeated_nums = [55000, 60000, 65000, 70000, 75000, 80000]
# Keys look like "<index>:<random uuid4>", so they differ on every run.
repeated_keys = [f"{key}:{uuid.uuid4()}" for key in range(len(repeated_nums))]
# (key, count) pairs; the second file uses the same keys with the counts in
# reverse order.
repeated_patterns_1 = list(zip(repeated_keys, repeated_nums))
repeated_patterns_2 = list(zip(repeated_keys, reversed(repeated_nums)))

print('repeated_patterns in "task5_1.csv":', repeated_patterns_1)
print('repeated_patterns in "task5_2.csv":', repeated_patterns_2)

# The last pattern is (rows left after the repeated keys, 1).
# NOTE(review): presumably gen_grouped_seq fills that many rows with keys that
# occur once each — confirm against task5.gen_grouped_seq.
gen_grouped_seq('task5_1.csv', [*repeated_patterns_1, (target_table_row_num - sum(repeated_nums), 1)])
gen_grouped_seq('task5_2.csv', [*repeated_patterns_2, (target_table_row_num - sum(repeated_nums), 1)])

repeated_patterns in "task5_1.csv": [('0:d364ecb3-97f2-44b9-835e-de6eda176e8b', 55000), ('1:692395d7-2a5a-41bf-b901-0203c00c3377', 60000), ('2:6a0bb504-ed5e-421a-8e5f-0017b41d6d3b', 65000), ('3:ea4e597b-4df7-4e54-9f3e-eeb66c0d6340', 70000), ('4:e3671e67-db99-4208-b475-1e7dd30ee815', 75000), ('5:279a51eb-8464-4b33-a37b-244eb0fff013', 80000)]
repeated_patterns in "task5_2.csv": [('0:d364ecb3-97f2-44b9-835e-de6eda176e8b', 80000), ('1:692395d7-2a5a-41bf-b901-0203c00c3377', 75000), ('2:6a0bb504-ed5e-421a-8e5f-0017b41d6d3b', 70000), ('3:ea4e597b-4df7-4e54-9f3e-eeb66c0d6340', 65000), ('4:e3671e67-db99-4208-b475-1e7dd30ee815', 60000), ('5:279a51eb-8464-4b33-a37b-244eb0fff013', 55000)]


100%|██████████| 1000000/1000000 [00:03<00:00, 288976.36it/s]
100%|██████████| 1000000/1000000 [00:03<00:00, 260407.18it/s]


Реализуем задачу по поиску совпадающих ключей

In [24]:
filter_size = target_table_row_num // 10
counter_num = count_thres = 60000
hash_num = 5  # the author puts the optimal count at 1200; a smaller value is used to save resources

# Both files get their own ConutersBloomFilter built with identical settings.
# Author's note for the second filter: fewer unique keys are added to it than
# to the first one, so filter_size should be enough to store all keys without
# false positives.
counter_bf_settings = dict(
    filter_size=filter_size,
    hash_num=hash_num,
    counter_num=counter_num,
    count_thres=count_thres,
)
file_1_counter_bf = ConutersBloomFilter(**counter_bf_settings)
file_2_counter_bf = ConutersBloomFilter(**counter_bf_settings)

# Pass 1: count the keys of the first file with its filter.
count_keys(
    tqdm(read_csv_keys('task5_1.csv'), desc='Collect 1 file', total=target_table_row_num),
    counter_bf=file_1_counter_bf,
)

# Pass 2: walk the second file, using the first file's filter as the supporting
# one, and collect the keys that count_keys returns.
second_thres_keys = count_keys(
    tqdm(read_csv_keys('task5_2.csv'), desc='Collect 2 file', total=target_table_row_num),
    counter_bf=file_2_counter_bf,
    sup_counter_bf=file_1_counter_bf,
    return_keys=True,
)

print(f'Repeated > {count_thres} keys:', second_thres_keys)

Collect 1 file:   0%|          | 0/1000000 [00:00<?, ?it/s]

Collect 2 file:   0%|          | 0/1000000 [00:00<?, ?it/s]

Add new key: 1:692395d7-2a5a-41bf-b901-0203c00c3377
Add new key: 2:6a0bb504-ed5e-421a-8e5f-0017b41d6d3b
Add new key: 3:ea4e597b-4df7-4e54-9f3e-eeb66c0d6340
Add new key: 4:e3671e67-db99-4208-b475-1e7dd30ee815
Repeated > 60000 keys: {'3:ea4e597b-4df7-4e54-9f3e-eeb66c0d6340', '4:e3671e67-db99-4208-b475-1e7dd30ee815', '1:692395d7-2a5a-41bf-b901-0203c00c3377', '2:6a0bb504-ed5e-421a-8e5f-0017b41d6d3b'}


Все ключи, которые встречались в обоих файлах > 60 000 раз, вывелись. Уникальные ключи остались нетронутыми.

# Задание 6

In [25]:
from collections import defaultdict

from task6 import count_join_size

Сгенерируем файлы для тестирования

In [26]:
# Total number of rows in each generated Task 6 table.
target_table_row_num = 1_000_000

In [99]:
# Explicit import: the keys below are built with uuid.uuid4(), and no other cell
# of the notebook imports uuid directly, so a fresh-kernel run must not depend
# on a star import or on leftover kernel state.
import uuid

# Repetition counts: [K] * N stands for N distinct keys, each occurring K times.
repeated_nums = [12] * 10 + [11] * 20 + [10] * 30 + [8] * 40 + [5] * 80 + [4] * 100 + [3] * 300 + [2] * 500 + [1] * 10**5
# Both files share the same patterns, so a key occurring K times in each file
# contributes K * K rows to the JOIN.
print('Num JOIN rows for non-accurate test', sum(num**2 for num in repeated_nums))
# Generate the repeated keys ("<index>:<random uuid4>", different on every run).
repeated_keys = [f"{key}:{uuid.uuid4()}" for key in range(len(repeated_nums))]
# Build the (key, count) patterns; the second file reuses the very same list.
repeated_patterns_1 = list(zip(repeated_keys, repeated_nums))
repeated_patterns_2 = repeated_patterns_1

# The last pattern is (rows left after the repeated keys, 1).
# NOTE(review): presumably gen_grouped_seq fills that many rows with keys that
# occur once each — confirm against task5.gen_grouped_seq.
gen_grouped_seq('task6_1_non-accurate.csv', [*repeated_patterns_1, (target_table_row_num - sum(repeated_nums), 1)])
gen_grouped_seq('task6_2_non-accurate.csv', [*repeated_patterns_2, (target_table_row_num - sum(repeated_nums), 1)])

Num JOIN rows for non-accurate test 117720


Process 1 and 2 file:   0%|          | 0/1000000 [47:08<?, ?it/s]
Process 1 and 2 file:   0%|          | 0/1000000 [45:12<?, ?it/s]
100%|██████████| 1000000/1000000 [00:06<00:00, 157045.22it/s]
100%|██████████| 1000000/1000000 [00:05<00:00, 175271.48it/s]


Итого количество строк в JOIN таблице должно быть равно 117 720 (если ключ повторяется в 1 таблице N раз, а во 2 таблице K раз, то он породит N*K строк)

Проверим работу точного тестирования

In [28]:
# Exact check: both thresholds are set to the full row count and no saved keys
# of the first table are supplied.
# NOTE(review): presumably this makes count_join_size take its exact path —
# confirm against task6.count_join_size.
accurate_join_size = count_join_size(
    read_csv_keys('task6_1_non-accurate.csv'),
    read_csv_keys('task6_2_non-accurate.csv'),
    saved_1_table_keys=None,
    unique_key_thres=target_table_row_num,
    max_unique_key_size=target_table_row_num,
)
print('Размер JOIN', accurate_join_size)

Use accurate algorythm
Размер JOIN 117720


Проверим работу приблизительного тестирования

In [29]:
# Approximate check: unique_key_thres is lowered to 10**3 and a second, separate
# pass over the first file's keys is supplied as saved_1_table_keys.
# NOTE(review): presumably the low threshold switches count_join_size to its
# approximate path — confirm against task6.count_join_size.
non_accurate_join_size = count_join_size(
    read_csv_keys('task6_1_non-accurate.csv'),
    read_csv_keys('task6_2_non-accurate.csv'),
    saved_1_table_keys=read_csv_keys('task6_1_non-accurate.csv'),
    unique_key_thres=10**3,
    max_unique_key_size=target_table_row_num,
)
print('Размер JOIN', non_accurate_join_size)

Use non-accurate algorythm
Размер JOIN 130540



**Вывод**:

Тестирование точного ответа отработало корректно, показав нужный результат

Тестирование приближенного ответа выдало ошибку около 11%