# Stream Algorithms

In [1]:
import numpy as np
import random
import string

In [2]:
def randomString(n=5):
    """
    Create a random alphanumeric string of the requested length.

    Characters are drawn from the ASCII uppercase letters and the digits
    using random.SystemRandom, i.e. the operating system's randomness
    source rather than the seeded module-level generator.

    n -- number of characters to generate (default 5); 0 yields ''.
    """
    # ascii_uppercase is locale-independent and available on both Python 2
    # and Python 3, unlike string.uppercase.
    alphabet = string.ascii_uppercase + string.digits
    # One generator for the whole string instead of a fresh one per character.
    chooser = random.SystemRandom()
    return ''.join(chooser.choice(alphabet) for _ in range(n))

# Notebook demo: the bare call displays one sample string as the cell output.
randomString()

'N76MD'

In [3]:
class StreamSampler(object):
    """
    Keep a fixed proportion of a stream by hashing one field of each record.

    A record is retained when hash(record[key]) % 100 is below the bucket
    threshold (proportion * 100), so records that share a key value always
    receive the same keep/drop decision within one run.
    """

    def __init__(self, proportion, key):
        """
        proportion -- fraction of the 100 hash buckets to keep; rounded to
                      two decimals.
        key        -- name of the record field whose value is hashed.
        """
        self.proportion = np.around(proportion, decimals=2)
        # Store a whole number of buckets. Without the rounding, a product
        # such as 0.07 * 100 == 7.000000000000001 lets bucket 7 slip past
        # the comparison in readData and 8 buckets are kept instead of 7.
        self.threshold = int(round(float(self.proportion) * 100))
        self.hashFunction = self.createHash(proportion)
        self.key = key
        self.sample = []

    def __len__(self):
        """Return the number of records retained so far."""
        return len(self.sample)

    def __iter__(self):
        """Iterate over the retained records in arrival order."""
        return iter(self.sample)

    def __getitem__(self, idx):
        """
        Return the retained record at idx, or a list for a slice.

        An integer index that is out of range (in either direction)
        yields None rather than raising IndexError.
        """
        try:
            # list indexing handles both integers and slice objects.
            return self.sample[idx]
        except IndexError:
            return None

    def __getslice__(self, startIdx, endIdx):
        """Python 2 slice hook; Python 3 sends slices to __getitem__."""
        return self.sample[startIdx:endIdx]

    def createHash(self, proportion):
        """
        Build the bucketing function mapping a value to 0..99.

        proportion is accepted for interface compatibility but is not used.
        """
        def newHash(value):
            return hash(value) % 100
        return newHash

    def readData(self, newData):
        """Append newData to the sample if its key hashes below the threshold."""
        if self.hashFunction(newData[self.key]) < self.threshold:
            self.sample.append(newData)

    def readStream(self, data):
        """Feed every record of the iterable data through readData."""
        for datum in data:
            self.readData(datum)
    
# Demo: sample 1000 random records by hashing their "name" field, keeping
# those that land in the first 10 of the 100 hash buckets.
test = StreamSampler(0.1, "name")

testStream = [{"name": randomString(n=5)} for _ in range(1000)]

test.readStream(testStream)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(test))
for item in test[10:13]:
    print(item)

# Prints None when fewer than 46 records were kept.
print(test[45])

97
{'name': 'ZXZO1'}
{'name': 'J2HIS'}
{'name': 'FDWJV'}
{'name': 'GUB1U'}


In [4]:
class BloomFilter(object):
    """
    Bloom filter over a fixed set of string keys.

    Each key sets one slot per hash function in a fixed-length array.
    A datum is accepted only when every slot it hashes to is set, so
    inserted keys are always accepted and other values may occasionally
    be accepted as well (false positives).
    """

    def __init__(self, lengthArray, noHashes, keys):
        """
        lengthArray -- number of slots in the filter.
        noHashes    -- number of salted hash functions to use.
        keys        -- iterable of strings to insert.
        """
        # Boolean slots take one byte each instead of the eight bytes of
        # np.zeros' default float64.
        self.bitArray = np.zeros(lengthArray, dtype=bool)
        self.hashes = self.createHashes(noHashes)
        self.keys = set(keys)
        self.hashKeys()

    def __len__(self):
        """Return the number of slots in the filter."""
        return len(self.bitArray)

    def createHashes(self, noHashes):
        """
        Return noHashes hash functions, each mapping a string to a slot index.

        Every function appends its own random 10-character salt to the
        value before hashing, which is why values must be strings.
        """
        def addRandomStuff():
            adder = randomString(n=10)

            def randomHash(value):
                return hash(value + adder) % len(self)
            return randomHash
        return [addRandomStuff() for _ in range(noHashes)]

    def hashKeys(self):
        """Set the slot chosen by every hash function for every key."""
        for key in self.keys:
            for hashFunc in self.hashes:
                self.bitArray[hashFunc(key)] = True

    def readData(self, datum):
        """Return "Accepted" if all of datum's slots are set, else "Not allowed"."""
        if all(self.bitArray[hashFunc(datum)] for hashFunc in self.hashes):
            return "Accepted"
        return "Not allowed"

    def readStream(self, data):
        """
        Run readData on every item of data and return the list of verdicts.

        The verdicts are returned (in input order) so the call has an
        observable result; callers that ignore the return value are
        unaffected.
        """
        return [self.readData(datum) for datum in data]

In [5]:
# Build a filter over 1000 random keys, then replay those keys followed by
# 10000 fresh random strings through it.
keys = [randomString(n=10) for _ in range(1000)]
bloom = BloomFilter(10**7, 10, keys)

stream = keys + [randomString(n=10) for _ in range(10000)]

# Everything the filter lets through: the inserted keys plus any false
# positives among the fresh strings.
allowed = [thing for thing in stream if bloom.readData(thing) == "Accepted"]

# Sort both lists so they can be compared position by position.
allowed, keys = sorted(allowed), sorted(keys)

In [6]:
# Count how many leading positions of the two sorted lists agree. The bound
# comes from the lists themselves rather than a hard-coded 1000, so the loop
# cannot index past the end of either list.
idx = 0
limit = min(len(allowed), len(keys))
while idx < limit and allowed[idx] == keys[idx]:
    idx += 1
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(idx)

883
