In [None]:
# Widen the notebook container so cells use the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container{width:100% !important;}</style>"))

In [None]:
from math import pow
import random
import json

from roaringbitmap import RoaringBitmap, MultiRoaringBitmap

class RoundRobinSamplingBitmap:
    """A bitmap sketch with fixed max size via variable sampling.

    Values are kept in a RoaringBitmap that is clamped to the window
    ``(0, curMaxAllowed)``. Whenever the bitmap holds more than ``maxSamples``
    values, ``curMaxAllowed`` is halved and the bitmap is clamped again, so the
    retained sample never exceeds ``maxSamples`` entries. The true cardinality
    is estimated by scaling the sample size by ``initMaxAllowed / curMaxAllowed``.
    """

    MAX_uint32_t = 0xFFFFFFFF  ## TODO find a more pythonic way to assert this. this is hard in python3

    # TODO be strict to remove the chance of uninitialized member vars
    # TODO having a version that requires everything we need for the results of intersection, union, etc
    # TODO consider dealing with negatives in pruning
    def __init__(self, maxSamples, initMaxAllowed=MAX_uint32_t):
        """Create an empty sketch.

        Args:
            maxSamples: maximum number of values the bitmap may retain.
            initMaxAllowed: initial upper bound of the sampling window; also
                the numerator used when scaling the cardinality estimate.
        """
        self.maxSamples = maxSamples
        self.initMaxAllowed = initMaxAllowed

        self.rbm = RoaringBitmap()
        self.curMaxAllowed = initMaxAllowed

    def _derive(self, rbm, curMaxAllowed):
        """Build a sketch sharing this one's init values but with the given state."""
        result = RoundRobinSamplingBitmap(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)
        result.rbm = rbm
        result.curMaxAllowed = curMaxAllowed
        return result

    def _requireCompatible(self, other):
        """Raise RuntimeError unless ``other`` was created with the same init values."""
        ## TODO verify that we can't relax this for either value somehow
        if self.maxSamples != other.maxSamples or self.initMaxAllowed != other.initMaxAllowed:
            raise RuntimeError("you cannot mix instances with different initial values")

    def copy(self):
        """Return an independent sketch with a copy of the bitmap and the same window."""
        return self._derive(self.rbm.copy(), self.curMaxAllowed)

    def _checkSize(self):
        """Prune the bitmap and shrink the window until at most maxSamples values remain."""
        # first, prune out anything beyond the max because they're not in the sample
        self._prune()

        # now, as long as we have more than maxSamples values, shrink curMaxAllowed and re-prune.
        # floor division keeps the bound an exact int; the `> 0` guard stops the loop once the
        # window cannot shrink any further instead of spinning forever.
        while len(self.rbm) > self.maxSamples and self.curMaxAllowed > 0:
            self.curMaxAllowed //= 2   # TODO consider shrinking by something other than half
            self._prune()

    def _prune(self):
        """Replace the bitmap with its clamp to the current window."""
        ## TODO before bothering to clamp, check if needed
        self.rbm = self.rbm.clamp(0, self.curMaxAllowed)

    def add(self, i):
        """Add value ``i`` to the sketch and re-apply the size limit.

        Args:
            i: non-negative integer to add.

        Returns:
            The value ``i`` that was passed in.

        Raises:
            ValueError: if ``i`` is negative.
        """
        if i < 0:
            raise ValueError("cannot add a negative value: %r" % (i,))

        self.rbm.add(i)
        self._checkSize()
        return i

    def intersection(self, other):
        """Return a new sketch holding the intersection of both samples.

        The result uses the smaller of the two windows and is re-pruned.

        Raises:
            RuntimeError: if the two sketches have different maxSamples or
                initMaxAllowed values.
        """
        ## we need to ensure the instances have the same init values
        self._requireCompatible(other)

        result = self._derive(
            self.rbm.intersection(other.rbm),
            min(self.curMaxAllowed, other.curMaxAllowed),
        )
        result._checkSize()
        return result

    def union(self, other):
        """Return a new sketch holding the union of both samples.

        The result uses the smaller of the two windows and is re-pruned.

        Raises:
            RuntimeError: if the two sketches have different maxSamples or
                initMaxAllowed values.
        """
        ## we need to ensure the instances have the same init values
        self._requireCompatible(other)

        result = self._derive(
            self.rbm.union(other.rbm),
            min(self.curMaxAllowed, other.curMaxAllowed),
        )
        result._checkSize()
        return result

    ### TODO implement NOT for the set

    def estimatedCardinality(self):
        """Return the sample size scaled by initMaxAllowed / curMaxAllowed.

        Raises:
            ZeroDivisionError: if curMaxAllowed is 0.
        """
        return len(self.rbm)*(self.initMaxAllowed/self.curMaxAllowed)

    def curMax(self):
        """Return the largest value currently held in the bitmap."""
        return self.rbm.max()

    ## TODO deprecate this in favor of json dumps
    def dumps(self):
        """Return a JSON string summarising the sketch state."""
        state = {
            "estCard" : int(self.estimatedCardinality()),
            "curMax" : self.curMax(),
            "curMaxAllowed" : self.curMaxAllowed,
            "initMaxAllowed" : self.initMaxAllowed
        }
        return json.dumps(state)

    def errPct(self, target):
        """Return the relative error of the estimate against ``target``.

        The value is (estimate - target) / target, truncated toward zero to
        ``precision + 1`` decimal places.

        Raises:
            ZeroDivisionError: if ``target`` is 0.
        """
        precision = 3
        mult = pow(10,precision+1)
        return int(mult*(self.estimatedCardinality()-target)/target)/mult

# Print the class object; presumably a quick check that the class above was defined in this session.
print(RoundRobinSamplingBitmap)


In [None]:
import time
from collections import Counter

random.seed(time.time()*1000)


## this cell creates a rbmrr of increasing size and saves off snapshot copies along the way.
## also, for each addition it increments a count for the observed value of error between the
## actual and estimated cardinality.

rbmrr = RoundRobinSamplingBitmap(maxSamples=10*1000)


def getRandomInRange():
    """Return a random integer between 0 and rbmrr.initMaxAllowed, both inclusive."""
    return random.randint(0, rbmrr.initMaxAllowed)


iterations = 100*1000

snapshots = {}       # iteration count -> copy of rbmrr taken at that iteration
NUM_SNAPSHOTS = 200
errHistogram = Counter()   # observed errPct value -> number of iterations that produced it

## use a whole-number interval so the modulo test below is exact even when NUM_SNAPSHOTS
## does not divide iterations evenly; 0 disables snapshots (and avoids dividing by zero).
snapshotInterval = max(1, iterations // NUM_SNAPSHOTS) if NUM_SNAPSHOTS > 0 else 0

for i in range(1,iterations+1):

    # if i % (iterations/1000) == 0: print("processing %d" % i)

    rbmrr.add(getRandomInRange())

    ## i is used as the actual cardinality after i additions
    errPct = rbmrr.errPct(i)
    errHistogram[errPct] += 1

    if snapshotInterval and i % snapshotInterval == 0:
        print(i, errPct, rbmrr.dumps())
        snapshots[i] = rbmrr.copy()



In [None]:
## this cell walks the errHistogram twice to compute percentiles on observed estimation error


## the first traversal builds a new dictionary keyed on the abs value of the observed
## error between the actual and estimated cardinality, summing the counts per key.
## it also sums the counts overall so we have the total amount.

## pass 1: fold the signed-error histogram into one keyed by error magnitude,
## tallying the overall number of observations as we go.
absHist = {}
totalCount = 0
for errPct, count in errHistogram.items():
    magnitude = abs(errPct)
    absHist[magnitude] = absHist.get(magnitude, 0) + count
    totalCount += count

## pass 2: walk the magnitudes in ascending order and print, for each one,
## its count and the cumulative fraction of observations seen so far.
cumulativeCount = 0
for absErrPct in sorted(absHist):
    count = absHist[absErrPct]
    cumulativeCount += count
    print(absErrPct, count, cumulativeCount/totalCount)
    
    
## with a recent run, for example, the max error was 3.88%. that is, for all key
## values added sequentially from 1 to 100,000,000 using at most 10,000 samples,
## the max error was 3.88% and that occurred only 488 times. the p95 of error for the
## same keys was 3.49%

    

In [None]:

## intersect every snapshot with the largest one and report how far the
## estimated size of each intersection is from the snapshot's own key.

largestSnapshotSize = max(snapshots)
largestSnapshot = snapshots[largestSnapshotSize]

print("intersecting with the largest snapshot with %d entries\n" % largestSnapshotSize)

intersectErrPcts = []
for k, v in snapshots.items():
    intersect = largestSnapshot.intersection(v)
    ## k is the snapshot's key and serves as the target value for errPct
    intersectErr = intersect.errPct(k)
    intersectErrPcts.append(intersectErr)
    print(k, intersectErr, intersect.dumps())



