# test3.py -- from a fork of yunwilliamyu/hyperminhash.
#
# Compares intersection estimates from two pairs of HyperMinHash sketches
# (one pair built with 0 minhash bits, one with `minhash_bits`) against a
# target intersection size as keys are inserted.
from hyperminhash import HyperMinHash
import sys
# --- Benchmark parameters -------------------------------------------------
seed = 'a'          # prefix baked into every generated key ("<seed>|<i>")
n_keys = 2 ** 20    # total number of keys inserted into the full sketches
mod = 999           # every `mod`-th key is also inserted into the *_small sketches
ratio = 0.001       # target intersection size is computed as i * ratio
index_bits = 16     # first constructor argument of every sketch
minhash_bits = 16   # third constructor argument of the hmh sketches

# --- Sketches -------------------------------------------------------------
# Two pairs of (full, subsampled) sketches. The "hll" pair is built with 0 as
# the third constructor argument, the "hmh" pair with `minhash_bits`.
# The second argument (6) is passed through unchanged; its meaning is defined
# by HyperMinHash and is not visible in this file.
hll, hll_small = (HyperMinHash(index_bits, 6, 0) for _ in range(2))
hmh, hmh_small = (HyperMinHash(index_bits, 6, minhash_bits) for _ in range(2))
def calc_error(estimate, expected):
    """Return the absolute percent error of *estimate* relative to *expected*.

    Args:
        estimate: The measured/estimated value.
        expected: The reference value; must be non-zero.

    Returns:
        ``100 * |estimate - expected| / |expected|``. The result is never
        negative; a NaN ``estimate`` yields NaN.

    Raises:
        ZeroDivisionError: If ``expected`` is zero.
    """
    # abs() covers both the over- and under-estimate cases; dividing by
    # abs(expected) keeps the percentage non-negative for a negative reference.
    return 100 * abs(estimate - expected) / abs(expected)
def _format_report(label, i, big, small, target):
    """Build one report line for a (full, subsampled) sketch pair.

    Args:
        label: Short tag printed at the start of the line ("hll" or "hmh").
        i: Loop index at which the report is produced (printed as the first
            count column).
        big: Sketch that receives every key.
        small: Sketch that receives only every `mod`-th key.
        target: Reference intersection size the estimate is compared against.

    Returns:
        The formatted line, containing both sketches' ``count()`` values, the
        first element of ``big.intersection(small)`` and its percent error
        versus ``target``.

    Raises:
        ValueError: Propagated from the sketch calls or from formatting; the
            caller treats it as "no usable estimate" (the NaN case).
    """
    big_count = big.count()
    small_count = small.count()
    intersect_count = big.intersection(small)[0]
    error = calc_error(intersect_count, target)
    return (
        "{} - count ({:10.0f} {:10.0f} {:10.0f}) intersect {:10.0f} "
        "target {:10.0f} error {:3.3f}"
    ).format(label, i, big_count, small_count, intersect_count, target, error)


print("seed {} n_keys {} mod {} ratio {} index_bits {} minhash_bits {}".format(
    seed, n_keys, mod, ratio, index_bits, minhash_bits))

for i in range(1, n_keys + 1):
    # Report whenever i is a power of two. The report runs *before* key i is
    # inserted, so at this point the sketches hold keys 1..i-1.
    if (i & (i - 1)) == 0:
        target = i * ratio
        try:
            # Both rows are computed and formatted before anything is printed,
            # so a failure in either pair suppresses the whole report for
            # this i (same all-or-nothing behavior as before).
            rows = [
                _format_report("hll", i, hll, hll_small, target),
                _format_report("hmh", i, hmh, hmh_small, target),
            ]
        except ValueError as exc:
            # NaN case: no usable estimate. Keep stdout clean, but leave a
            # trace on stderr so a skipped data point is not silently lost.
            sys.stderr.write("skipping report at i={}: {}\n".format(i, exc))
        else:
            for row in rows:
                print(row)
            sys.stdout.flush()

    key = "{}|{}".format(seed, i)
    hll.update([key])
    hmh.update([key])
    # Every `mod`-th key is shared with the subsampled sketches, which is
    # what makes the intersection non-empty.
    if i % mod == 0:
        hll_small.update([key])
        hmh_small.update([key])

print("DONE")