-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_min_sketch.py
72 lines (54 loc) · 1.95 KB
/
count_min_sketch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from typing import List
from hash_generator import HashGenerator
class CountMinSketch():
def __init__(self, K: int, N: int, i_dim: int = 128, i_len: int = 32) -> None:
"""
Initializes Count-Min Sketch class.
Parameters
----------
K: integer
Number of buckets (width).
N: integer
Number of hash functions (height).
i_dim: integer, optional
Input range.
(e.g. for ASCII: n == 128)
i_len: integer, optional
Input length.
"""
self.K = K
self.N = N
self.i_dim = i_dim
self.i_len = i_len
self.hash_generators = [HashGenerator(self.i_dim, self.i_len, self.K) for i in range(self.N)]
for hg in self.hash_generators:
hg.generate_hash_function()
self.sketch = [[0 for i in range(self.K)] for j in range(self.N)]
def process_stream(self, stream: List[object]) -> None:
"""
Executes the CMS algorithm on a stream (list) of elements.
Parameters
----------
stream: List[object]
Stream of objects represented as a list.
"""
for value in stream:
for i in range(self.N):
bucket = self.hash_generators[i].hash(value)
self.sketch[i][bucket] += 1
def get_freq(self, values: List[object]) -> List[int]:
"""
Returns the frequencies of elements in the CMS.
Parameters
----------
values: List[object]
List of elements for which to return frequencies.
"""
l = len(values)
self.frequencies = {}
for val_idx in range(l):
if values[val_idx] not in self.frequencies:
self.frequencies[values[val_idx]] = \
min([self.sketch[i][self.hash_generators[i].hash(values[val_idx])] \
for i in range(self.N)])
return self.frequencies