In [None]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container{width:100% !important;}</style>"))

In [62]:
from math import pow
import random
import json
from collections import defaultdict

from roaringbitmap import RoaringBitmap, MultiRoaringBitmap


class RoundRobinSamplingBitmap:
    """A bitmap sketch with fixed max size via variable sampling.

    Ids are stored in a RoaringBitmap. Whenever more than ``maxSamples`` ids
    are held, ``curSampleRate`` is doubled and every id above
    ``MAX_uint32_t // curSampleRate`` is dropped, so the sketch only keeps ids
    from the low ``1/curSampleRate`` slice of the uint32 range. The cardinality
    estimate is the number of retained ids multiplied by ``curSampleRate``.

    NOTE(review): the estimate presumably relies on ids being spread evenly
    over the uint32 range (e.g. random or hashed ids) -- confirm with callers.
    """

    MAX_uint32_t = 0xFFFFFFFF ## TODO find a more pythonic way to assert this

    # TODO be strict to remove the chance of uninitialized member vars
    # TODO having a version that requires everything we need for the results of intersection, union, etc
    # TODO consider dealing with negatives in pruning
    def __init__(self, maxSamples):
        """Create an empty sketch that retains at most ``maxSamples`` ids.

        Raises:
            ValueError: if ``maxSamples`` is smaller than 1. Such a limit can
                never be satisfied once id 0 is stored, which would make
                ``_checkSize`` loop forever.
        """
        if maxSamples < 1:
            raise ValueError("maxSamples must be at least 1")
        self.maxSamples = maxSamples
        self.curSampleRate = 1   # the rate is 1/curSampleRate, so if the value is 4, it is 1/4th. start at 1/1
        self.rbm = RoaringBitmap()

    def copy(self):
        """Return an independent sketch with the same limit, sample rate and ids."""
        result = RoundRobinSamplingBitmap(maxSamples=self.maxSamples)

        result.rbm = self.rbm.copy()
        result.curSampleRate = self.curSampleRate
        return result

    def _checkSize(self):
        """Re-establish the invariant that at most ``maxSamples`` ids are held."""
        # first, prune out anything beyond the max because they're not in the sample
        self._prune()

        # now, as long as we hold more than maxSamples ids, grow the sampling rate and re-prune
        while len(self.rbm) > self.maxSamples:
            self.curSampleRate *= 2 # TODO consider different sampleRate scaling instead of 2
            self._prune()

    def _getCurMaxAllowed(self):
        """Return the largest id admitted at the current sample rate."""
        # integer division keeps the result exact instead of going through a float
        return self.MAX_uint32_t // self.curSampleRate

    def _prune(self):
        """Drop every stored id above the currently allowed maximum."""
        # an empty bitmap has no max() to inspect (asking for it raises), and
        # there is nothing to drop anyway; this happens e.g. for an empty
        # intersection or difference result
        if len(self.rbm) == 0:
            return

        curMaxAllowed = self._getCurMaxAllowed()
        if self.rbm.max() > curMaxAllowed:
            # NOTE(review): if clamp() treats its stop argument as exclusive, an id
            # exactly equal to curMaxAllowed survives the check above but is dropped
            # by a later clamp -- confirm against the roaringbitmap documentation
            self.rbm = self.rbm.clamp(0, curMaxAllowed)

    def add(self, i):
        """Add id ``i`` to the sketch and return ``i``.

        The id is discarded again right away when it lies above the currently
        allowed maximum, and the sample rate grows when the sketch is full.

        Raises:
            RuntimeError: if ``i`` is negative.
        """
        if i < 0: raise RuntimeError("cannot yet support negative id values")
        self.rbm.add(i)
        self._checkSize()
        return i

    def _performBitmapOp(self, other, opStr):
        """Combine this sketch with ``other`` using the bitmap method named ``opStr``.

        The result takes the coarser (larger) of the two sample rates and is
        then pruned down to that rate and to ``maxSamples``.

        Raises:
            RuntimeError: if the two sketches were built with different
                ``maxSamples`` values.
        """
        ## currently need to ensure the instances have the same maxSamples init values
        if self.maxSamples != other.maxSamples:
            raise RuntimeError("you cannot mix instances with different initial maxSamples values")
        ## TODO figure out how to extrapolate, if possible, to allow different init values

        result = RoundRobinSamplingBitmap(maxSamples=self.maxSamples)

        result.rbm = getattr(self.rbm, opStr)(other.rbm)
        result.curSampleRate = max(self.curSampleRate, other.curSampleRate)

        result._checkSize()
        return result


    ## the following should work logically but haven't really been tested with real data much
    def intersection(self, other):
        """Return a new sketch of the ids present in both sketches."""
        return self._performBitmapOp(other, "intersection")

    def union(self, other):
        """Return a new sketch of the ids present in either sketch."""
        return self._performBitmapOp(other, "union")

    def difference(self, other):
        """Return a new sketch of the ids in this sketch that are not in ``other``."""
        return self._performBitmapOp(other, "difference")
    ## the above should work logically but haven't really been tested with real data much


    def estimatedCardinality(self):
        """Return the estimated number of distinct ids added: retained count times sample rate."""
        return len(self.rbm)*self.curSampleRate

    ## TODO deprecate this in favor of json dumps
    def dumps(self):
        """Return a JSON string summarizing the sketch state.

        ``curMax`` is ``null`` when the sketch holds no ids.
        """
        state = {
            "estCard" : int(self.estimatedCardinality()),
            "curSampleRate" : self.curSampleRate,
            "curMax" : self.rbm.max() if len(self.rbm) > 0 else None,
            "getCurMaxAllowed" : self._getCurMaxAllowed(),
        }
        return json.dumps(state)

    def errPct(self, target):
        """Return the relative error of the estimate against the true count ``target``.

        The value is ``(estimate - target) / target`` as a fraction (0.024
        means 2.4%), truncated toward zero to ``precision + 1`` decimal places.

        Raises:
            ZeroDivisionError: if ``target`` is 0.
        """
        precision = 3
        mult = 10 ** (precision + 1)
        return int(mult*(self.estimatedCardinality()-target)/target)/mult

        
# sanity check: show that the class object is now defined in this kernel
print(RoundRobinSamplingBitmap)

<class '__main__.RoundRobinSamplingBitmap'>


In [63]:
import time

## this cell creates a rbmrr of increasing size and saves off snapshot copies along the way.
## also, for each addition it increments a count for the observed value of error between the
## actual and estimated cardinality.

## set SEED to an int to replay a run exactly; with None a fresh time-based seed is drawn.
## either way the seed in use is printed so the run can be reproduced later.
SEED = None
seed = int(time.time()*1000) if SEED is None else SEED
random.seed(seed)
print("random seed:", seed)

rbmrr = RoundRobinSamplingBitmap(maxSamples=1000)

iterations = 1000*1000 #100*1000*1000

snapshots = {}
NUM_SNAPSHOTS = 50
## integer spacing between snapshots (0 disables them). a float spacing would never
## divide i evenly when iterations is not a multiple of NUM_SNAPSHOTS, so no snapshot
## would be taken at all; with integer spacing roughly NUM_SNAPSHOTS are taken.
snapshotEvery = max(1, iterations // NUM_SNAPSHOTS) if NUM_SNAPSHOTS > 0 else 0
errHistogram = defaultdict(int)

for i in range(1, iterations+1):

    rbmrr.add(random.randint(0, rbmrr.MAX_uint32_t))
    # i ids have been added so far, so i is the count the estimate is compared against
    curErrPct = rbmrr.errPct(i)
    errHistogram[curErrPct] += 1

    if snapshotEvery and i % snapshotEvery == 0:
        print(i, curErrPct, rbmrr.dumps())
        snapshots[i] = rbmrr.copy()



20000 0.024 {"estCard": 20480, "curSampleRate": 32, "curMax": 134120865, "getCurMaxAllowed": 134217727}
40000 -0.0176 {"estCard": 39296, "curSampleRate": 64, "curMax": 66841632, "getCurMaxAllowed": 67108863}
60000 -0.0122 {"estCard": 59264, "curSampleRate": 64, "curMax": 66889151, "getCurMaxAllowed": 67108863}
80000 -0.0528 {"estCard": 75776, "curSampleRate": 128, "curMax": 33527714, "getCurMaxAllowed": 33554431}
100000 -0.0451 {"estCard": 95488, "curSampleRate": 128, "curMax": 33550992, "getCurMaxAllowed": 33554431}
120000 -0.0304 {"estCard": 116352, "curSampleRate": 128, "curMax": 33550992, "getCurMaxAllowed": 33554431}
140000 0.0038 {"estCard": 140544, "curSampleRate": 256, "curMax": 16747826, "getCurMaxAllowed": 16777215}
160000 0.0112 {"estCard": 161792, "curSampleRate": 256, "curMax": 16747826, "getCurMaxAllowed": 16777215}
180000 -0.0058 {"estCard": 178944, "curSampleRate": 256, "curMax": 16747826, "getCurMaxAllowed": 16777215}
200000 -0.0118 {"estCard": 197632, "curSampleRate":

In [64]:
## this cell makes two passes over errHistogram to compute percentiles of the
## observed estimation error.


## pass one: fold the signed error buckets into a new dictionary keyed on the absolute
## value of the observed difference between the actual and estimated cardinality.
## the counts are also summed so that the overall total is known.

totalCount = sum(errHistogram.values())
absHist = {}
for signedErr, occurrences in errHistogram.items():
    magnitude = abs(signedErr)
    absHist[magnitude] = absHist.get(magnitude, 0) + occurrences

## pass two: walk the absolute errors in ascending order and show, for each one,
## its count and the cumulative fraction of all observations seen so far.

cumulativeCount = 0
for magnitude, bucketCount in sorted(absHist.items()):
    cumulativeCount += bucketCount
    print(magnitude, bucketCount, cumulativeCount/totalCount)


## with a recent run, for example, the max error was 3.88%. that is, for all key
## values added sequentially from 1 to 100,000,000 using at most 10,000 samples,
## the max error was 3.88% and that occurred only 488 times. the p95 of error for the
## same keys was 3.49%

    

0.0 6334 0.006334
0.0001 5505 0.011839
0.0002 5698 0.017537
0.0003 5617 0.023154
0.0004 5685 0.028839
0.0005 5573 0.034412
0.0006 5535 0.039947
0.0007 5409 0.045356
0.0008 5423 0.050779
0.0009 5356 0.056135
0.001 5476 0.061611
0.0011 5483 0.067094
0.0012 5244 0.072338
0.0013 4890 0.077228
0.0014 5011 0.082239
0.0015 4829 0.087068
0.0016 4636 0.091704
0.0017 4909 0.096613
0.0018 5011 0.101624
0.0019 4771 0.106395
0.002 4793 0.111188
0.0021 4740 0.115928
0.0022 4829 0.120757
0.0023 4903 0.12566
0.0024 4881 0.130541
0.0025 4960 0.135501
0.0026 5047 0.140548
0.0027 5135 0.145683
0.0028 5073 0.150756
0.0029 4826 0.155582
0.003 4709 0.160291
0.0031 4567 0.164858
0.0032 4633 0.169491
0.0033 4678 0.174169
0.0034 4852 0.179021
0.0035 4854 0.183875
0.0036 4735 0.18861
0.0037 4978 0.193588
0.0038 4889 0.198477
0.0039 5061 0.203538
0.004 4934 0.208472
0.0041 4866 0.213338
0.0042 4662 0.218
0.0043 4722 0.222722
0.0044 5205 0.227927
0.0045 5213 0.23314
0.0046 4937 0.238077
0.0047 4703 0.24278
0.0048

In [None]:

## use the saved snapshots to compare the error of intersection estimates.
## every snapshot is intersected with the largest one, and the estimate is compared
## against the snapshot's key (the number of ids added when it was taken).

largestSnapshotSize = max(snapshots)
largestSnapshot = snapshots[largestSnapshotSize]

print("intersecting with the largest snapshot with %d entries\n" % largestSnapshotSize)

intersectErrPcts = []
for snapshotSize, snapshot in snapshots.items():
    intersect = largestSnapshot.intersection(snapshot)
    errVsActual = intersect.errPct(snapshotSize)
    intersectErrPcts.append(errVsActual)
    print(snapshotSize, errVsActual, intersect.dumps())





In [None]:
## pack the frozen bitmap of every snapshot into one MultiRoaringBitmap and
## report the mean buffer size per snapshot
frozenBitmaps = [snap.rbm.freeze() for snap in snapshots.values()]
mrb = MultiRoaringBitmap(frozenBitmaps)
print("avg snapshot size: %d bytes" % (mrb.bufsize()/len(snapshots)))

## one (snapshot key, sample rate, buffer size) triple per snapshot, displayed in sorted order
snapshotBufsize = []
for k, v in snapshots.items():
    snapshotBufsize.append((k, v.curSampleRate, MultiRoaringBitmap([v.rbm]).bufsize()))
sorted(snapshotBufsize)

In [60]:


# NOTE(review): `foo` is not defined in any visible cell and this cell's execution
# count (60) precedes the cells above, so it looks like leftover scratch state that
# would fail on Restart & Run All -- confirm and remove if it is no longer needed
foo[1]

0