In [8]:
# Stretch the notebook container to the full browser width so the wide
# snapshot lines printed by later cells are not wrapped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
display(HTML("<style>.container{width:100% !important;}</style>"))

In [9]:
from math import pow
import random
import json

from roaringbitmap import RoaringBitmap, MultiRoaringBitmap

class RbmRoundRobyn:
    """A bitmap sketch with fixed max size via variable sampling.

    Only values that are <= ``curMaxAllowed`` are retained. Whenever more than
    ``maxSamples`` values are retained, ``curMaxAllowed`` is halved and the
    values above it are discarded. The cardinality estimate scales the number
    of retained values by ``initMaxAllowed / curMaxAllowed``.

    Attributes:
        maxSamples: upper bound on the number of values kept in ``rbm``.
        initMaxAllowed: starting value of the retention threshold.
        rbm: the RoaringBitmap holding the retained values.
        curMaxAllowed: current retention threshold (becomes a float once halved).
        numAdded: number of ``add`` calls (duplicates are counted too).
    """
    # TODO to be strict to remove chance of uninitialized member vars
    # TODO having a version that requires everything we need for the results of intersection, union, etc
    # TODO consider dealing with negatives in pruning

    def __init__(self, maxSamples, initMaxAllowed=2 ** 32):
        """Create an empty sketch.

        Args:
            maxSamples: maximum number of retained values; must be >= 1.
            initMaxAllowed: initial retention threshold; must be > 0. The
                default is the integer 2**32 (numerically equal to the former
                float default).

        Raises:
            ValueError: if ``maxSamples < 1`` or ``initMaxAllowed <= 0``.
        """
        # With maxSamples < 1 the value 0 could never be pruned and the
        # shrinking loop in _checkSize would never terminate.
        if maxSamples < 1:
            raise ValueError("maxSamples must be at least 1")
        # A non-positive threshold makes estimatedCardinality divide by zero.
        if initMaxAllowed <= 0:
            raise ValueError("initMaxAllowed must be positive")

        self.maxSamples = maxSamples
        self.initMaxAllowed = initMaxAllowed

        self.rbm = RoaringBitmap()
        self.curMaxAllowed = initMaxAllowed
        self.numAdded = 0

    def copy(self):
        """Return an independent sketch with the same configuration and state."""
        result = RbmRoundRobyn(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = self.rbm.copy()
        result.curMaxAllowed = self.curMaxAllowed
        result.numAdded = self.numAdded
        return result

    def _checkSize(self):
        """Enforce the threshold, then shrink it until at most maxSamples values remain."""
        # first, prune out anything beyond the max because they're not in the sample
        self._prune()

        # now, as long as we hold more than maxSamples values, shrink curMaxAllowed and re-prune
        while len(self.rbm) > self.maxSamples:
            self.curMaxAllowed /= 2   # TODO consider shrinking by something other than half
            self._prune()

    ## TODO use clamp() for this instead
    def _prune(self):
        """Drop every retained value greater than curMaxAllowed.

        The emptiness check is part of the loop condition because the bitmap
        can be drained completely (e.g. when every value exceeds the
        threshold), and max() must not be called on an empty bitmap.
        """
        # relies on pop() removing the current maximum -- TODO confirm against the roaringbitmap docs
        while len(self.rbm) > 0 and self.rbm.max() > self.curMaxAllowed:
            self.rbm.pop()

    def add(self, i):
        """Offer value ``i`` to the sketch and return it.

        ``numAdded`` is incremented on every call, whether or not the value is
        retained or was already present.
        """
        self.rbm.add(i)
        self._checkSize()
        self.numAdded += 1
        return i

    def _requireCompatible(self, other):
        """Raise RuntimeError unless ``other`` was created with the same init values."""
        ## we need to ensure the instances have the same init values
        ## TODO verify that we can't relax this for either value somehow
        if self.maxSamples != other.maxSamples or self.initMaxAllowed != other.initMaxAllowed:
            raise RuntimeError("you cannot mix instances with different initial values")

    def _combined(self, other, rbm, numAdded):
        """Build the result sketch shared by intersection() and union()."""
        result = RbmRoundRobyn(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = rbm
        # the stricter of the two thresholds applies to the combined sample
        result.curMaxAllowed = min(self.curMaxAllowed, other.curMaxAllowed)
        result.numAdded = numAdded

        result._checkSize()
        return result

    def intersection(self, other):
        """Return a new sketch holding the values retained by both sketches.

        Raises:
            RuntimeError: if the two sketches have different init values.
        """
        self._requireCompatible(other)
        return self._combined(
            other,
            self.rbm.intersection(other.rbm),
            min(self.numAdded, other.numAdded),  ## TODO make sure this makes sense
        )

    def union(self, other):
        """Return a new sketch holding the values retained by either sketch.

        Raises:
            RuntimeError: if the two sketches have different init values.
        """
        self._requireCompatible(other)
        return self._combined(
            other,
            self.rbm.union(other.rbm),
            self.numAdded + other.numAdded,
        )

    def estimatedCardinality(self):
        """Return the retained count scaled by initMaxAllowed / curMaxAllowed."""
        return len(self.rbm)*(self.initMaxAllowed/self.curMaxAllowed)

    def curMax(self):
        """Return the largest retained value, or None when nothing is retained."""
        if len(self.rbm) == 0:
            return None
        return self.rbm.max()

    ## TODO deprecate this in favor of json dumps
    def dumps(self):
        """Return a JSON string summarizing the sketch state.

        ``curMax`` is serialized as ``null`` for an empty sketch.
        """
        state = {
            "estCard" : int(self.estimatedCardinality()),
            "numAdded" : self.numAdded,
            "curMax" : self.curMax(),
            "curMaxAllowed" : self.curMaxAllowed,
        }
        return json.dumps(state)

    def errPct(self, target):
        """Return the relative error of the estimate against ``target``.

        The value is (estimate - target) / target, truncated toward zero to
        four decimal places (``precision + 1`` digits).

        Raises:
            ZeroDivisionError: if ``target`` is 0.
        """
        precision = 3
        mult = 10 ** (precision + 1)
        return int(mult*(self.estimatedCardinality()-target)/target)/mult

# Echo the class object to confirm the definition above was executed in this kernel.
print(RbmRoundRobyn)


<class '__main__.RbmRoundRobyn'>


In [3]:
import time

## this cell creates a rbmrr of increasing size and saves off snapshot copies along the way.
## also, for each addition it increments a count for the observed value of error between the
## actual and estimated cardinality.

# Seed from the clock, but keep and print the value so a run can be reproduced
# by assigning SEED by hand.
SEED = int(time.time()*1000)
random.seed(SEED)
print("random seed:", SEED)

rbmrr = RbmRoundRobyn(maxSamples=10000)

# randrange needs an integer bound (float bounds raise TypeError on Python 3.12+)
# and excludes it, so keys fall in [0, initMaxAllowed). The inclusive bound that
# randint used could yield 2**32 itself, which presumably does not fit in the
# 32-bit bitmap -- TODO confirm against the roaringbitmap docs.
KEY_SPACE = int(rbmrr.initMaxAllowed)


def getRandomInRange():
    """Return a uniformly random key in [0, KEY_SPACE)."""
    return random.randrange(KEY_SPACE)


iterations = 100*1000*1000

snapshots = {}
NUM_SNAPSHOTS = 200
# integer interval between snapshots; 0 disables snapshotting
snapshotEvery = max(1, iterations // NUM_SNAPSHOTS) if NUM_SNAPSHOTS > 0 else 0
errHistogram = {} # TODO consider using defaultdict

for i in range(1,iterations+1):

    rbmrr.add(getRandomInRange())

    # i is used as the "actual" cardinality, i.e. duplicate keys are not discounted
    errPct = rbmrr.errPct(i)
    errHistogram[errPct] = errHistogram.get(errPct, 0) + 1

    if snapshotEvery and i % snapshotEvery == 0:
        print(i, errPct, rbmrr.dumps())
        snapshots[i] = rbmrr.copy()



500000 -0.0008 {"estCard": 499584, "numAdded": 500000, "curMax": 67107150, "curMaxAllowed": 67108864.0}
1000000 -0.0108 {"estCard": 989184, "numAdded": 1000000, "curMax": 33552248, "curMaxAllowed": 33554432.0}
1500000 -0.012 {"estCard": 1481984, "numAdded": 1500000, "curMax": 16777113, "curMaxAllowed": 16777216.0}
2000000 -0.0158 {"estCard": 1968384, "numAdded": 2000000, "curMax": 16777113, "curMaxAllowed": 16777216.0}
2500000 -0.0122 {"estCard": 2469376, "numAdded": 2500000, "curMax": 16777113, "curMaxAllowed": 16777216.0}
3000000 -0.0046 {"estCard": 2985984, "numAdded": 3000000, "curMax": 8387861, "curMaxAllowed": 8388608.0}
3500000 -0.0071 {"estCard": 3474944, "numAdded": 3500000, "curMax": 8387861, "curMaxAllowed": 8388608.0}
4000000 -0.0131 {"estCard": 3947520, "numAdded": 4000000, "curMax": 8387861, "curMaxAllowed": 8388608.0}
4500000 -0.0125 {"estCard": 4443648, "numAdded": 4500000, "curMax": 8387861, "curMaxAllowed": 8388608.0}
5000000 -0.0099 {"estCard": 4950016, "numAdded": 5

39000000 -0.0208 {"estCard": 38187008, "numAdded": 39000000, "curMax": 1048510, "curMaxAllowed": 1048576.0}
39500000 -0.0224 {"estCard": 38612992, "numAdded": 39500000, "curMax": 1048510, "curMaxAllowed": 1048576.0}
40000000 -0.0209 {"estCard": 39161856, "numAdded": 40000000, "curMax": 1048510, "curMaxAllowed": 1048576.0}
40500000 -0.022 {"estCard": 39608320, "numAdded": 40500000, "curMax": 1048510, "curMaxAllowed": 1048576.0}
41000000 -0.0228 {"estCard": 40062976, "numAdded": 41000000, "curMax": 1048510, "curMaxAllowed": 1048576.0}
41500000 -0.0244 {"estCard": 40484864, "numAdded": 41500000, "curMax": 1048510, "curMaxAllowed": 1048576.0}
42000000 -0.0247 {"estCard": 40960000, "numAdded": 42000000, "curMax": 1048510, "curMaxAllowed": 1048576.0}
42500000 -0.0221 {"estCard": 41558016, "numAdded": 42500000, "curMax": 524127, "curMaxAllowed": 524288.0}
43000000 -0.0262 {"estCard": 41869312, "numAdded": 43000000, "curMax": 524127, "curMaxAllowed": 524288.0}
43500000 -0.0269 {"estCard": 4232

78000000 -0.0203 {"estCard": 76414976, "numAdded": 78000000, "curMax": 524236, "curMaxAllowed": 524288.0}
78500000 -0.0207 {"estCard": 76873728, "numAdded": 78500000, "curMax": 524236, "curMaxAllowed": 524288.0}
79000000 -0.0207 {"estCard": 77357056, "numAdded": 79000000, "curMax": 524236, "curMaxAllowed": 524288.0}
79500000 -0.0209 {"estCard": 77832192, "numAdded": 79500000, "curMax": 524236, "curMaxAllowed": 524288.0}
80000000 -0.0205 {"estCard": 78356480, "numAdded": 80000000, "curMax": 524236, "curMaxAllowed": 524288.0}
80500000 -0.0213 {"estCard": 78782464, "numAdded": 80500000, "curMax": 524236, "curMaxAllowed": 524288.0}
81000000 -0.021 {"estCard": 79298560, "numAdded": 81000000, "curMax": 524236, "curMaxAllowed": 524288.0}
81500000 -0.0201 {"estCard": 79855616, "numAdded": 81500000, "curMax": 524236, "curMaxAllowed": 524288.0}
82000000 -0.0207 {"estCard": 80297984, "numAdded": 82000000, "curMax": 524236, "curMaxAllowed": 524288.0}
82500000 -0.0205 {"estCard": 80805888, "numAdde

In [6]:
## this cell makes two passes over errHistogram to report cumulative shares of the
## observed estimation error.


## pass 1: total up every count, and fold the signed error buckets into a new
## dictionary keyed on the absolute value of the observed difference between the
## actual and estimated cardinality.

totalCount = sum(errHistogram.values())
absHist = {}
for errPct, count in errHistogram.items():
    absHist[abs(errPct)] = absHist.get(abs(errPct), 0) + count

## pass 2: walk the absolute errors in ascending order and print, for each one,
## its count and the cumulative fraction of all observations seen so far.

cumulativeCount = 0
for absErrPct, count in sorted(absHist.items()):
    cumulativeCount = cumulativeCount + count
    print(absErrPct, count, cumulativeCount/totalCount)
    
    
## with a recent run, for example, the max error was 3.88%. that is, for all key
## values added sequentially from 1 to 100,000,000 using at most 10,000 samples,
## the max error was 3.88% and that occurred only 488 times. the p95 of error for the
## same keys was 3.49%

    

0.0 37274 0.00037274
0.0001 27828 0.00065102
0.0002 27407 0.00092509
0.0003 26939 0.00119448
0.0004 26230 0.00145678
0.0005 25104 0.00170782
0.0006 22575 0.00193357
0.0007 30229 0.00223586
0.0008 68512 0.00292098
0.0009 99473 0.00391571
0.001 72936 0.00464507
0.0011 59689 0.00524196
0.0012 77898 0.00602094
0.0013 105953 0.00708047
0.0014 103684 0.00811731
0.0015 118329 0.0093006
0.0016 118185 0.01048245
0.0017 105216 0.01153461
0.0018 133409 0.0128687
0.0019 162919 0.01449789
0.002 149259 0.01599048
0.0021 139737 0.01738785
0.0022 195504 0.01934289
0.0023 327804 0.02262093
0.0024 411781 0.02673874
0.0025 497630 0.03171504
0.0026 445171 0.03616675
0.0027 336560 0.03953235
0.0028 344869 0.04298104
0.0029 309607 0.04607711
0.003 241432 0.04849143
0.0031 158691 0.05007834
0.0032 152842 0.05160676
0.0033 132039 0.05292715
0.0034 104865 0.0539758
0.0035 167289 0.05564869
0.0036 235104 0.05799973
0.0037 271664 0.06071637
0.0038 258300 0.06329937
0.0039 246218 0.06576155
0.004 246540 0.0682269

In [54]:

## here we use the snapshots to compare error in intersect estimates

largestSnapshot = snapshots[max(snapshots.keys())]

print("intersecting with the largest snapshot with %d entries\n" % largestSnapshot.numAdded)

intersectErrPcts = []
for k,v in snapshots.items():
    intersect = largestSnapshot.intersection(v)
    # k (the number of adds when the snapshot was taken) is the comparison target
    intersectErr = intersect.errPct(k)
    intersectErrPcts.append(intersectErr)
    print(k, largestSnapshot.numAdded, intersectErr, intersect.dumps())

print()

## this gets respective percentiles for the (signed) error value

intersectErrPcts = sorted(intersectErrPcts)
# clamp to the last valid position so that p100 does not index one past the end
lastIdx = len(intersectErrPcts) - 1
percentileIdxes = {p: min(lastIdx, int(len(intersectErrPcts)*p/100)) for p in [0,50,95,99,100]}

# look each percentile up through its computed index, not through the percentile number itself
for pct, idx in percentileIdxes.items():
    print(pct, intersectErrPcts[idx])



intersecting with the largest snapshot with 100000000 entries

500000 100000000 -0.4757 {"estCard": 262144, "numAdded": 500000, "curMax": 259097, "curMaxAllowed": 262144.0}
1000000 100000000 -0.2135 {"estCard": 786432, "numAdded": 1000000, "curMax": 259097, "curMaxAllowed": 262144.0}
1500000 100000000 -0.1043 {"estCard": 1343488, "numAdded": 1500000, "curMax": 260492, "curMaxAllowed": 262144.0}
2000000 100000000 -0.1562 {"estCard": 1687552, "numAdded": 2000000, "curMax": 260492, "curMaxAllowed": 262144.0}
2500000 100000000 -0.089 {"estCard": 2277376, "numAdded": 2500000, "curMax": 260492, "curMaxAllowed": 262144.0}
3000000 100000000 -0.0879 {"estCard": 2736128, "numAdded": 3000000, "curMax": 262077, "curMaxAllowed": 262144.0}
3500000 100000000 -0.0824 {"estCard": 3211264, "numAdded": 3500000, "curMax": 262077, "curMaxAllowed": 262144.0}
4000000 100000000 -0.0865 {"estCard": 3653632, "numAdded": 4000000, "curMax": 262077, "curMaxAllowed": 262144.0}
4500000 100000000 -0.0824 {"estCard": 