In [2]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container{width:100% !important;}</style>"))

In [3]:
from math import pow
import random
import json

from roaringbitmap import RoaringBitmap, MultiRoaringBitmap

class RbmRoundRobyn:
    """A bitmap sketch with fixed max size via variable sampling.

    Added values are stored in a RoaringBitmap. After every change the bitmap is
    pruned with ``rbm.clamp(0, curMaxAllowed)``; while more than ``maxSamples``
    values remain, ``curMaxAllowed`` is halved and the bitmap is pruned again.
    The cardinality estimate scales the number of retained values by
    ``initMaxAllowed / curMaxAllowed``.
    """

    MAX_uint32_t = 0xFFFFFFFF ## TODO find a more pythonic way to assert this. this is hard in python3

    # TODO to be stict to remove chance of unitialized member vars
    # TODO having a version that requires everything we need for the results of intersection, union, etc
    # TODO consider dealing with negatives in pruning
    def __init__(self, maxSamples, initMaxAllowed=MAX_uint32_t):
        """Create an empty sketch.

        Args:
            maxSamples: maximum number of values kept in the bitmap; must be >= 1.
            initMaxAllowed: initial upper bound passed to ``clamp`` when pruning.

        Raises:
            ValueError: if ``maxSamples`` is smaller than 1.
        """
        # With maxSamples < 1 the shrink loop in _checkSize can drive curMaxAllowed to 0,
        # after which it never terminates (negative maxSamples) or estimatedCardinality
        # divides by zero. Reject the configuration up front instead.
        if maxSamples < 1:
            raise ValueError("maxSamples must be at least 1, got %r" % (maxSamples,))

        self.maxSamples = maxSamples
        self.initMaxAllowed = initMaxAllowed

        self.rbm = RoaringBitmap()
        self.curMaxAllowed = initMaxAllowed
        self.numAdded = 0

    def copy(self):
        """Return a new sketch with the same settings, a copy of the bitmap and the same counters."""
        result = RbmRoundRobyn(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = self.rbm.copy()
        result.curMaxAllowed = self.curMaxAllowed
        result.numAdded = self.numAdded
        return result

    def _checkSize(self):
        """Prune the bitmap, halving ``curMaxAllowed`` until at most ``maxSamples`` values remain."""
        # first, prune out anything beyond the max because they're not in the sample
        self._prune()

        # now, as long as we have more than the maxSamples number, we need to shrink our
        # curMaxAllowed value and re-prune
        while len(self.rbm) > self.maxSamples:
            self.curMaxAllowed = int(self.curMaxAllowed/2)   # TODO consider shrinking by something other than half
            self._prune()

    def _prune(self):
        """Replace the bitmap with ``rbm.clamp(0, curMaxAllowed)``."""
        self.rbm = self.rbm.clamp(0, self.curMaxAllowed)

    def add(self, i):
        """Add ``i`` to the sketch, re-check the size, and count the addition.

        Args:
            i: non-negative integer value to add.

        Returns:
            The value ``i`` that was passed in.

        Raises:
            ValueError: if ``i`` is negative.
        """
        if i < 0:
            # a bare `raise` here has no active exception to re-raise and would surface as an
            # unrelated RuntimeError; report the actual problem instead
            raise ValueError("only non-negative values can be added, got %r" % (i,))

        self.rbm.add(i)
        self._checkSize()
        self.numAdded += 1
        return i

    def intersection(self, other):
        """Return a new sketch built from the intersection of both bitmaps.

        The result uses the smaller ``curMaxAllowed`` and the smaller ``numAdded`` of the two
        inputs and is size-checked before being returned.

        Raises:
            RuntimeError: if ``maxSamples`` or ``initMaxAllowed`` differ between the sketches.
        """
        ## we need to ensure the instances have the same init values
        ## TODO verify that we can't relax this for either value somehow

        if self.maxSamples != other.maxSamples or self.initMaxAllowed != other.initMaxAllowed:
            raise RuntimeError("you cannot mix instances with different initial values")

        result = RbmRoundRobyn(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = self.rbm.intersection(other.rbm)
        result.curMaxAllowed = min(self.curMaxAllowed, other.curMaxAllowed)
        result.numAdded = min(self.numAdded, other.numAdded) ## TODO make sure this makes sense

        result._checkSize()
        return result

    def union(self, other):
        """Return a new sketch built from the union of both bitmaps.

        The result uses the smaller ``curMaxAllowed`` of the two inputs, the sum of their
        ``numAdded`` counters, and is size-checked before being returned.

        Raises:
            RuntimeError: if ``maxSamples`` or ``initMaxAllowed`` differ between the sketches.
        """
        ## we need to ensure the instances have the same init values
        ## TODO verify that we can't relax this for either value somehow

        if self.maxSamples != other.maxSamples or self.initMaxAllowed != other.initMaxAllowed:
            raise RuntimeError("you cannot mix instances with different initial values")

        result = RbmRoundRobyn(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = self.rbm.union(other.rbm)
        result.curMaxAllowed = min(self.curMaxAllowed, other.curMaxAllowed)
        result.numAdded = self.numAdded + other.numAdded

        result._checkSize()
        return result

    def estimatedCardinality(self):
        """Return the number of retained values scaled by ``initMaxAllowed / curMaxAllowed`` (a float)."""
        return len(self.rbm)*(self.initMaxAllowed/self.curMaxAllowed)

    def curMax(self):
        """Return ``self.rbm.max()``, the largest value currently held in the bitmap."""
        return self.rbm.max()


    ## TODO deprecate this in favor of json dumps
    def dumps(self):
        """Return a JSON string with estCard, numAdded, curMax and curMaxAllowed."""
        state = {
            "estCard" : int(self.estimatedCardinality()),
            "numAdded" : self.numAdded,
            "curMax" : self.curMax(),
            "curMaxAllowed" : self.curMaxAllowed,
        }
        return json.dumps(state)

    def errPct(self, target):
        """Return the signed relative error of the estimate against ``target``.

        The value is ``(estimate - target) / target`` truncated toward zero to
        ``precision + 1`` (i.e. 4) decimal places, so 0.0147 means 1.47%.

        Raises:
            ZeroDivisionError: if ``target`` is 0.
        """
        precision = 3
        mult = pow(10,precision+1)
        return int(mult*(self.estimatedCardinality()-target)/target)/mult
        

# sanity check: print the class object to confirm the definition above was executed
print(RbmRoundRobyn)


<class '__main__.RbmRoundRobyn'>


In [4]:
import time

# Seed from the wall clock so every run draws a fresh sequence, but keep (and print) the
# integer seed so a particular run can be replayed with random.seed(<printed value>).
RANDOM_SEED = int(time.time()*1000)
random.seed(RANDOM_SEED)
print("random seed:", RANDOM_SEED)


## this cell creates a rbmrr of increasing size and saves off snapshot copies along the way.
## also, for each addition it increments a count for the observed value of error between the
## number of additions so far and the estimated cardinality.
## NOTE: random draws can repeat, so the number of additions `i` can exceed the number of
## distinct values actually added; the error measured against `i` includes that difference.

rbmrr = RbmRoundRobyn(maxSamples=10000)

def getRandomInRange():
    """Draw a random integer in [0, rbmrr.initMaxAllowed] (both ends inclusive)."""
    return random.randint(0,rbmrr.initMaxAllowed)

iterations = 100*1000*1000

snapshots = {}
NUM_SNAPSHOTS = 200
# Whole number of additions between snapshots. Integer arithmetic keeps the modulo test below
# exact even when `iterations` is not a multiple of NUM_SNAPSHOTS; 0 disables snapshots.
snapshotInterval = max(1, iterations // NUM_SNAPSHOTS) if NUM_SNAPSHOTS > 0 else 0
errHistogram = {}

for i in range(1,iterations+1):

    # if i % (iterations/1000) == 0: print("processing %d" % i)

    rbmrr.add(getRandomInRange())

    # tally how often each (truncated) error value is observed
    errPct = rbmrr.errPct(i)
    errHistogram[errPct] = errHistogram.get(errPct, 0) + 1

    if snapshotInterval and i % snapshotInterval == 0:
        print(i, errPct, rbmrr.dumps())
        snapshots[i] = rbmrr.copy()



500000 -0.0147 {"estCard": 492608, "numAdded": 500000, "curMax": 67106386, "curMaxAllowed": 67108863.984375}
1000000 -0.0182 {"estCard": 981760, "numAdded": 1000000, "curMax": 33554156, "curMaxAllowed": 33554431.9921875}
1500000 -0.0082 {"estCard": 1487616, "numAdded": 1500000, "curMax": 16774902, "curMaxAllowed": 16777215.99609375}
2000000 -0.0003 {"estCard": 1999360, "numAdded": 2000000, "curMax": 16776480, "curMaxAllowed": 16777215.99609375}
2500000 0.0025 {"estCard": 2506496, "numAdded": 2500000, "curMax": 16776480, "curMaxAllowed": 16777215.99609375}
3000000 0.0024 {"estCard": 3007488, "numAdded": 3000000, "curMax": 8387791, "curMaxAllowed": 8388607.998046875}
3500000 -0.0007 {"estCard": 3497472, "numAdded": 3500000, "curMax": 8387792, "curMaxAllowed": 8388607.998046875}
4000000 0.0039 {"estCard": 4015616, "numAdded": 4000000, "curMax": 8387792, "curMaxAllowed": 8388607.998046875}
4500000 0.0043 {"estCard": 4519424, "numAdded": 4500000, "curMax": 8387792, "curMaxAllowed": 8388607.

36500000 -0.0011 {"estCard": 36458496, "numAdded": 36500000, "curMax": 1048498, "curMaxAllowed": 1048575.9997558594}
37000000 -0.0007 {"estCard": 36970496, "numAdded": 37000000, "curMax": 1048498, "curMaxAllowed": 1048575.9997558594}
37500000 -0.0021 {"estCard": 37421056, "numAdded": 37500000, "curMax": 1048498, "curMaxAllowed": 1048575.9997558594}
38000000 -0.0028 {"estCard": 37892096, "numAdded": 38000000, "curMax": 1048498, "curMaxAllowed": 1048575.9997558594}
38500000 -0.004 {"estCard": 38342656, "numAdded": 38500000, "curMax": 1048498, "curMaxAllowed": 1048575.9997558594}
39000000 -0.0034 {"estCard": 38866944, "numAdded": 39000000, "curMax": 1048498, "curMaxAllowed": 1048575.9997558594}
39500000 -0.0041 {"estCard": 39337984, "numAdded": 39500000, "curMax": 1048498, "curMaxAllowed": 1048575.9997558594}
40000000 -0.0036 {"estCard": 39854080, "numAdded": 40000000, "curMax": 1048498, "curMaxAllowed": 1048575.9997558594}
40500000 -0.0022 {"estCard": 40407040, "numAdded": 40500000, "cur

72500000 -0.0079 {"estCard": 71925760, "numAdded": 72500000, "curMax": 524221, "curMaxAllowed": 524287.9998779297}
73000000 -0.0078 {"estCard": 72425472, "numAdded": 73000000, "curMax": 524221, "curMaxAllowed": 524287.9998779297}
73500000 -0.0077 {"estCard": 72933376, "numAdded": 73500000, "curMax": 524221, "curMaxAllowed": 524287.9998779297}
74000000 -0.0087 {"estCard": 73351168, "numAdded": 74000000, "curMax": 524221, "curMaxAllowed": 524287.9998779297}
74500000 -0.0087 {"estCard": 73850880, "numAdded": 74500000, "curMax": 524221, "curMaxAllowed": 524287.9998779297}
75000000 -0.0089 {"estCard": 74326016, "numAdded": 75000000, "curMax": 524221, "curMaxAllowed": 524287.9998779297}
75500000 -0.0081 {"estCard": 74883072, "numAdded": 75500000, "curMax": 524221, "curMaxAllowed": 524287.9998779297}
76000000 -0.0085 {"estCard": 75350016, "numAdded": 76000000, "curMax": 524221, "curMaxAllowed": 524287.9998779297}
76500000 -0.0092 {"estCard": 75792384, "numAdded": 76500000, "curMax": 524221, "

In [None]:
## this cell derives cumulative percentiles of the observed estimation error from errHistogram


## step one: fold the signed error histogram into a dictionary keyed on the absolute value
## of the error, so that +x and -x share one bucket.

absHist = {}
for errPct, count in errHistogram.items():
    magnitude = abs(errPct)
    absHist[magnitude] = absHist.get(magnitude, 0) + count

## the grand total of observations is the sum of every bucket's count.
totalCount = sum(errHistogram.values())

## step two: visit the absolute errors from smallest to largest and print, for each one,
## its own count and the fraction of all observations at or below it.

cumulativeCount = 0
for absErrPct in sorted(absHist):
    count = absHist[absErrPct]
    cumulativeCount += count
    print(absErrPct, count, cumulativeCount/totalCount)
    
    
## with a recent run, for example, the max error was 3.88%. that is, for all key
## values added sequentially from 1 to 100,000,000 using at most 10,000 samples,
## the max error was 3.88% and that occurred only 488 times. the p95 of error for the
## same keys was 3.49%

    

In [None]:

## here we use the snapshots to compare error in intersect estimates

largestSnapshot = snapshots[max(snapshots.keys())]

print("intersecting with the largest snapshot with %d entries\n" % largestSnapshot.numAdded)

intersectErrPcts = []
for k,v in snapshots.items():
    # intersect every snapshot with the largest one and measure the estimate against
    # the snapshot's key k (the number of additions when it was taken)
    intersect = largestSnapshot.intersection(v)
    intersectErrPct = intersect.errPct(k)
    intersectErrPcts.append(intersectErrPct)
    print(k, largestSnapshot.numAdded, intersectErrPct, intersect.dumps())

print()

## this gets respective percentiles for the error value

intersectErrPcts = sorted(intersectErrPcts)

# map each percentile to a position in the sorted list; clamp to the last valid index
# because p=100 would otherwise point one past the end
lastIdx = len(intersectErrPcts) - 1
percentileIdxes = {p: min(lastIdx, int(len(intersectErrPcts)*p/100)) for p in [0,50,95,99,100]}

# look the value up by the computed position, not by the percentile number itself
for pct, idx in percentileIdxes.items():
    print(pct, intersectErrPcts[idx])

