In [None]:
# Stretch the notebook's ".container" element to the full browser width.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container{width:100% !important;}</style>"))

In [None]:
from math import pow
import random
import json
from collections import defaultdict

from roaringbitmap import RoaringBitmap, MultiRoaringBitmap


class RoundRobinSamplingBitmap:
    """A bitmap sketch with a fixed maximum size, achieved via variable sampling.

    Ids are stored in a ``RoaringBitmap`` (``self.rbm``). Only ids at or below a
    moving ceiling of ``MAX_uint32_t // curSampleRate`` are retained. Whenever
    the bitmap holds more than ``maxSamples`` ids, ``curSampleRate`` is doubled
    (halving the ceiling) and ids above the new ceiling are discarded.

    The cardinality estimate is ``len(rbm) * curSampleRate``.
    NOTE(review): this presumably relies on ids being spread uniformly over
    ``[0, MAX_uint32_t]`` (e.g. hashed ids); nothing here enforces that.

    Attributes:
        maxSamples: upper bound on the number of ids kept in ``rbm``.
        curSampleRate: denominator of the sampling rate; a value of 4 means
            a rate of 1/4. Starts at 1 (keep everything).
        rbm: the underlying ``RoaringBitmap`` holding the retained ids.
    """

    # 2**32 - 1; the top of the id space that the sampling ceiling is derived from.
    MAX_uint32_t = 0xFFFFFFFF ## TODO find a more pythonic way to assert this

    # TODO to be strict to remove chance of uninitialized member vars
    # TODO having a version that requires everything we need for the results of intersection, union, etc
    # TODO consider dealing with negatives in pruning
    def __init__(self, maxSamples):
        """Create an empty sketch that keeps at most ``maxSamples`` ids.

        Raises:
            ValueError: if ``maxSamples`` is smaller than 1. Such a bound can
                never be satisfied in general, and ``_checkSize`` could then
                double the sampling rate forever.
        """
        if maxSamples < 1:
            raise ValueError("maxSamples must be at least 1, got %r" % (maxSamples,))
        self.maxSamples = maxSamples
        self.curSampleRate = 1   # the rate is 1/curSampleRate, so if the value is 4, it is 1/4th. start at 1/1
        self.rbm = RoaringBitmap()

    def copy(self):
        """Return an independent sketch with the same bound, rate and bitmap contents."""
        result = RoundRobinSamplingBitmap(maxSamples=self.maxSamples)

        result.rbm = self.rbm.copy()
        result.curSampleRate = self.curSampleRate
        return result

    def _checkSize(self):
        """Enforce the current ceiling, then lower the rate until the size bound holds."""
        # first, prune out anything beyond the max because they're not in the sample
        self._prune()

        # now, as long as we have more than maxSamples ids, grow the sampling
        # rate denominator and re-prune
        while len(self.rbm) > self.maxSamples:
            self.curSampleRate *= 2 # TODO consider different sampleRate scaling instead of 2
            self._prune()

    def _getCurMaxAllowed(self):
        """Return the current id ceiling, ``MAX_uint32_t // curSampleRate``."""
        # Integer division gives the same result as the former int(a / b) for
        # these non-negative operands, without a round trip through float.
        return self.MAX_uint32_t // self.curSampleRate

    def _prune(self):
        """Drop ids above the current ceiling; a no-op on an empty bitmap."""
        # An empty bitmap has no max(); this case arises e.g. when two disjoint
        # sketches are intersected.
        if len(self.rbm) == 0:
            return
        curMaxAllowed = self._getCurMaxAllowed()
        if self.rbm.max() > curMaxAllowed:
            # NOTE(review): confirm whether clamp()'s upper bound is inclusive;
            # the guard above treats curMaxAllowed itself as allowed.
            self.rbm = self.rbm.clamp(0, curMaxAllowed)

    def add(self, i):
        """Add id ``i`` to the sketch, re-sampling if the size bound is exceeded.

        Returns:
            ``i`` unchanged, whether or not it survived pruning.

        Raises:
            RuntimeError: if ``i`` is negative.
        """
        if i < 0: raise RuntimeError("cannot yet support negative id values")
        self.rbm.add(i)
        self._checkSize()
        return i

    def _performBitmapOp(self, other, opStr):
        """Apply the bitmap method named ``opStr`` to both sketches' bitmaps.

        The result adopts the larger ``curSampleRate`` of the two operands and
        is then pruned/re-sampled by ``_checkSize`` so that it respects
        ``maxSamples``.

        Args:
            other: another ``RoundRobinSamplingBitmap`` with the same ``maxSamples``.
            opStr: name of a binary method on the underlying bitmap,
                e.g. ``"intersection"``.

        Raises:
            RuntimeError: if the two sketches have different ``maxSamples``.
        """
        ## currently need to ensure the instances have the same maxSamples init values
        if self.maxSamples != other.maxSamples:
            raise RuntimeError("you cannot mix instances with different initial maxSamples values")
        ## TODO figure out how to extrapolate, if possible, to allow different init values

        result = RoundRobinSamplingBitmap(maxSamples=self.maxSamples)

        result.rbm = getattr(self.rbm, opStr)(other.rbm)
        result.curSampleRate = max(self.curSampleRate, other.curSampleRate)

        result._checkSize()
        return result

    ## the following should work logically but haven't really been tested with real data much
    def intersection(self, other):
        """Return a new sketch estimating the intersection of ``self`` and ``other``."""
        return self._performBitmapOp(other, "intersection")

    def union(self, other):
        """Return a new sketch estimating the union of ``self`` and ``other``."""
        return self._performBitmapOp(other, "union")

    def difference(self, other):
        """Return a new sketch estimating ``self`` minus ``other``."""
        return self._performBitmapOp(other, "difference")
    ## the above should work logically but haven't really been tested with real data much

    def estimatedCardinality(self):
        """Return the estimated number of distinct ids: retained count times ``curSampleRate``."""
        return len(self.rbm)*self.curSampleRate

    ## TODO deprecate this in favor of json dumps
    def dumps(self):
        """Return a JSON string summarising the sketch state.

        Keys: ``estCard``, ``curSampleRate``, ``curMax`` (largest retained id,
        or ``null`` when the bitmap is empty) and ``getCurMaxAllowed``.
        """
        state = {
            "estCard" : int(self.estimatedCardinality()),
            "curSampleRate" : self.curSampleRate,
            # max() is not defined for an empty bitmap, so report null instead of raising
            "curMax" : self.rbm.max() if len(self.rbm) > 0 else None,
            "getCurMaxAllowed" : self._getCurMaxAllowed(),
        }
        return json.dumps(state)

    def errPct(self, target):
        """Return the relative error of the estimate against ``target``.

        The value is ``(estimate - target) / target`` expressed as a fraction
        (0.0388 means 3.88%), truncated toward zero to four decimal places.

        Raises:
            ZeroDivisionError: if ``target`` is 0.
        """
        precision = 3
        mult = pow(10,precision+1)
        return int(mult*(self.estimatedCardinality()-target)/target)/mult

# Echo the class object as a quick check that the class definition cell executed.
print(RoundRobinSamplingBitmap)

In [None]:
import time

## This cell grows an rbmrr of increasing size and saves off snapshot copies along the way.
## Also, for each addition it increments a count for the observed value of error between the
## actual and estimated cardinality.

# Seed from the clock as before, but keep and print the value so a run can be replayed
# by assigning the printed number to SEED.
SEED = int(time.time()*1000)
random.seed(SEED)
print("random seed:", SEED)

rbmrr = RoundRobinSamplingBitmap(maxSamples=1000)

iterations = 1000*1000 #100*1000*1000

snapshots = {}
NUM_SNAPSHOTS = 50
errHistogram = defaultdict(int)

# Integer spacing between snapshots. The former float interval
# (iterations/NUM_SNAPSHOTS) never matched `i % interval == 0` unless it
# happened to be a whole number, silently producing no snapshots.
# 0 disables snapshots; otherwise roughly NUM_SNAPSHOTS copies are taken.
snapshotEvery = max(1, iterations // NUM_SNAPSHOTS) if NUM_SNAPSHOTS > 0 else 0

for i in range(1,iterations+1):

    rbmrr.add(random.randint(0,rbmrr.MAX_uint32_t))
    # i counts additions; any duplicate ids drawn by randint are not deducted
    # from this "actual" cardinality.
    errHistogram[rbmrr.errPct(i)] += 1

    if snapshotEvery and i % snapshotEvery == 0:
        print(i, rbmrr.errPct(i), rbmrr.dumps())
        snapshots[i] = rbmrr.copy()



In [None]:
## Two passes over errHistogram yield cumulative percentiles of the observed estimation error.


## Pass one: total up every observation and fold the signed error buckets into a new
## dictionary keyed on the absolute value of the error between the actual and the
## estimated cardinality.

totalCount = sum(errHistogram.values())
absHist = {}
for signedErr, hits in errHistogram.items():
    magnitude = abs(signedErr)
    absHist[magnitude] = absHist.get(magnitude, 0) + hits

## Pass two: walk the absolute errors in ascending order and show, for each one, its
## own count and the cumulative share of all observations at or below it.

cumulativeCount = 0
for absErrPct, count in sorted(absHist.items()):
    cumulativeCount = cumulativeCount + count
    print(absErrPct, count, cumulativeCount/totalCount)
    
    
## In a recent run, for example, the max error was 3.88%. That is, across all keys
## added sequentially from 1 to 100,000,000 using at most 10,000 samples,
## the max error was 3.88%, and that occurred only 488 times. The p95 of the error
## for the same keys was 3.49%.

    

In [None]:

## Use the snapshots to measure the error of intersection estimates: every snapshot is
## intersected with the largest one and the estimate is compared to the snapshot's own size.

largestSnapshotSize = max(snapshots)
largestSnapshot = snapshots[largestSnapshotSize]

print("intersecting with the largest snapshot with %d entries\n" % largestSnapshotSize)

intersectErrPcts = []
for size, snap in snapshots.items():
    intersect = largestSnapshot.intersection(snap)
    # errPct is a pure computation, so evaluate it once and reuse the value
    err = intersect.errPct(size)
    intersectErrPcts.append(err)
    print(size, err, intersect.dumps())





In [None]:
# Pack the frozen bitmap of every snapshot into one MultiRoaringBitmap and report the
# average of its bufsize() per snapshot.
frozenBitmaps = [snap.rbm.freeze() for snap in snapshots.values()]
mrb = MultiRoaringBitmap(frozenBitmaps)
print("avg snapshot size: %d bytes" % (mrb.bufsize()/len(snapshots)))

# One (entries added, sample rate, bufsize) tuple per snapshot, displayed in sorted order.
snapshotBufsize = []
for size, snap in snapshots.items():
    snapshotBufsize.append((size, snap.curSampleRate, MultiRoaringBitmap([snap.rbm]).bufsize()))
sorted(snapshotBufsize)