In [1]:
# Stretch the classic-notebook container to the full browser width so the
# wide JSON dumps printed further down do not wrap.
# `display` is imported from the public `IPython.display` module; importing it
# from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container{width:100% !important;}</style>"))

In [2]:
from math import pow
import random
import json

from roaringbitmap import RoaringBitmap, MultiRoaringBitmap

class RoundRobinSamplingBitmap:
    """A bitmap sketch with a fixed maximum size, achieved via variable sampling.

    Ids are stored in a ``RoaringBitmap`` (``self.rbm``), but only ids inside
    the current sampling window, bounded above by ``self.curMaxAllowed``, are
    retained. Whenever more than ``maxSamples`` ids are retained the window is
    halved and everything outside it is discarded. The cardinality of the full
    stream is estimated by scaling the retained count by
    ``initMaxAllowed / curMaxAllowed``.

    NOTE(review): the scaling only yields a sensible estimate when ids are
    spread evenly over the initial window (e.g. random ids or hashes) --
    confirm this matches the intended inputs.
    NOTE(review): whether an id exactly equal to ``curMaxAllowed`` is retained
    depends on whether ``RoaringBitmap.clamp`` treats its stop as inclusive --
    confirm against the roaringbitmap documentation.
    """

    # Largest unsigned 32-bit value; used as the default upper bound for ids.
    MAX_uint32_t = 0xFFFFFFFF ## TODO find a more pythonic way to assert this. this is hard in python3

    # TODO to be stict to remove chance of unitialized member vars
    # TODO having a version that requires everything we need for the results of intersection, union, etc
    # TODO consider dealing with negatives in pruning
    def __init__(self, maxSamples, initMaxAllowed=MAX_uint32_t):
        """Create an empty sketch.

        Args:
            maxSamples: maximum number of ids kept in the bitmap before the
                sampling window is shrunk.
            initMaxAllowed: initial upper bound of the sampling window.
        """
        self.maxSamples = maxSamples
        self.initMaxAllowed = initMaxAllowed

        self.rbm = RoaringBitmap()
        self.curMaxAllowed = initMaxAllowed

    def copy(self):
        """Return an independent sketch with the same parameters, window and ids."""
        result = RoundRobinSamplingBitmap(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = self.rbm.copy()
        result.curMaxAllowed = self.curMaxAllowed
        return result

    def _checkSize(self):
        """Shrink the sampling window until at most ``maxSamples`` ids remain."""
        # first, prune out anything beyond the max because they're not in the sample
        self._prune()

        # now, as long as we have more than maxSamples ids, shrink curMaxAllowed and re-prune.
        # The `curMaxAllowed > 0` guard stops the loop once the window cannot shrink any
        # further; without it a bitmap that can never get below maxSamples (for example a
        # negative maxSamples) would spin forever at curMaxAllowed == 0.
        while len(self.rbm) > self.maxSamples and self.curMaxAllowed > 0:
            # Integer halving; avoids the float round-trip of int(x/2).
            self.curMaxAllowed //= 2   # TODO consider shrinking by something other than half
            self._prune()

    def _prune(self):
        """Drop ids that fall outside the current sampling window."""
        # Skip the clamp (which builds a new bitmap) when nothing can be dropped:
        # either the bitmap is empty or its largest id is strictly below the bound.
        # An id equal to the bound still goes through clamp, so the result is the
        # same whether clamp's stop is inclusive or exclusive.
        if len(self.rbm) == 0 or self.rbm.max() < self.curMaxAllowed:
            return
        self.rbm = self.rbm.clamp(0, self.curMaxAllowed)

    def add(self, i):
        """Offer id ``i`` to the sketch and return ``i``.

        The id is added to the bitmap and the size check is run afterwards, so
        the id may already have been pruned again when this method returns.

        Raises:
            RuntimeError: if ``i`` is negative.
        """
        if i < 0:
            raise RuntimeError("cannot yet support negative id values")

        self.rbm.add(i)
        self._checkSize()
        return i

    def _performBitmapOp(self, other, op):
        """Apply a bitmap set operation and wrap the result in a new sketch.

        Args:
            other: the sketch whose bitmap is passed to ``op``.
            op: callable taking ``other.rbm`` and returning the combined bitmap;
                the public wrappers pass a bound method of ``self.rbm``.

        Returns:
            A new sketch whose window is the smaller of the two operands'
            windows, pruned and size-checked.

        Raises:
            RuntimeError: if the two sketches were created with different
                ``maxSamples`` or ``initMaxAllowed``.
        """
        ## we need to ensure the instances have the same init values
        ## TODO verify that we can't relax this for either value somehow
        if self.maxSamples != other.maxSamples or self.initMaxAllowed != other.initMaxAllowed:
            raise RuntimeError("you cannot mix instances with different initial values")

        result = RoundRobinSamplingBitmap(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = op(other.rbm)
        # Only ids inside BOTH windows were sampled by both operands, so the
        # result is restricted to the narrower window before the size check.
        result.curMaxAllowed = min(self.curMaxAllowed, other.curMaxAllowed)

        result._checkSize()
        return result

    def intersection(self, other):
        """Return a new sketch of the ids present in both sketches."""
        return self._performBitmapOp(other, self.rbm.intersection)

    def union(self, other):
        """Return a new sketch of the ids present in either sketch."""
        return self._performBitmapOp(other, self.rbm.union)

    def difference(self, other):
        """Return a new sketch of the ids in this sketch but not in ``other``."""
        return self._performBitmapOp(other, self.rbm.difference)

    def estimatedCardinality(self):
        """Estimate the number of distinct ids offered to the sketch.

        Returns:
            The retained count scaled by ``initMaxAllowed / curMaxAllowed``
            (a float). If the window has collapsed to 0 no scale factor exists,
            so the raw retained count is returned instead of raising
            ``ZeroDivisionError``.
        """
        if self.curMaxAllowed == 0:
            return float(len(self.rbm))
        return len(self.rbm)*(self.initMaxAllowed/self.curMaxAllowed)

    def curMax(self):
        """Return the largest retained id, or ``None`` when the sketch is empty."""
        # Asking an empty bitmap for its maximum is not meaningful; returning None
        # lets dumps() work on a fresh sketch (it is serialized as JSON null).
        if len(self.rbm) == 0:
            return None
        return self.rbm.max()

    ## TODO deprecate this in favor of json dumps
    def dumps(self):
        """Return a JSON string summarising the sketch state.

        Keys: ``estCard`` (estimate truncated to int), ``curMax``,
        ``curMaxAllowed`` and ``initMaxAllowed``.
        """
        state = {
            "estCard" : int(self.estimatedCardinality()),
            "curMax" : self.curMax(),
            "curMaxAllowed" : self.curMaxAllowed,
            "initMaxAllowed" : self.initMaxAllowed
        }
        return json.dumps(state)

    def errPct(self, target):
        """Return the signed relative error of the estimate against ``target``.

        The value is a fraction (0.05 means 5% over), truncated toward zero to
        ``precision + 1`` decimal places.

        Raises:
            ZeroDivisionError: if ``target`` is 0.
        """
        precision = 3
        mult = pow(10,precision+1)
        return int(mult*(self.estimatedCardinality()-target)/target)/mult

# Sanity check that the class statement in this cell executed: prints the class object.
print(RoundRobinSamplingBitmap)


<class '__main__.RoundRobinSamplingBitmap'>


In [39]:
import time

# Seed from the wall clock, but keep the value in SEED so that a particular run
# can be replayed later by passing the same value back to random.seed().
SEED = time.time()*1000
random.seed(SEED)


## this cell creates a rbmrr of increasing size and saves off snapshot copies along the way.
## also, for each addition it increments a count for the observed value of error between the
## actual and estimated cardinality.
## NOTE(review): the loop counter i is used as the "actual" cardinality, which assumes that
## random.randint produced no duplicate ids -- confirm this is acceptable for large runs.

rbmrr = RoundRobinSamplingBitmap(maxSamples=10 )#*1000)

iterations = 1000 #100*1000*1000

snapshots = {}
NUM_SNAPSHOTS = 100
errHistogram = {} # TODO consider using defaultdict

# Number of additions between snapshots, computed once as a whole number
# (ceiling division, so at most NUM_SNAPSHOTS snapshots are kept). A float
# interval such as 1000/3 would never divide i evenly and no snapshot would
# ever be taken. 0 disables snapshotting.
snapshotInterval = max(1, -(-iterations // NUM_SNAPSHOTS)) if NUM_SNAPSHOTS > 0 else 0

for i in range(1,iterations+1):

    rbmrr.add(random.randint(0,rbmrr.initMaxAllowed))

    # signed relative error of the estimate after i additions
    errPct = rbmrr.errPct(i)
    errHistogram[errPct] = errHistogram.get(errPct, 0) + 1

    if snapshotInterval and i % snapshotInterval == 0:
        print(i, errPct, rbmrr.dumps())
        snapshots[i] = rbmrr.copy()



10 0.0 {"estCard": 10, "curMax": 4054006276, "curMaxAllowed": 4294967295, "initMaxAllowed": 4294967295}
20 0.0 {"estCard": 20, "curMax": 2105537544, "curMaxAllowed": 2147483647, "initMaxAllowed": 4294967295}
30 0.2 {"estCard": 36, "curMax": 1025654529, "curMaxAllowed": 1073741823, "initMaxAllowed": 4294967295}
40 0.0 {"estCard": 40, "curMax": 536827900, "curMaxAllowed": 536870911, "initMaxAllowed": 4294967295}
50 -0.0399 {"estCard": 48, "curMax": 536827900, "curMaxAllowed": 536870911, "initMaxAllowed": 4294967295}
60 0.0666 {"estCard": 64, "curMax": 536827900, "curMaxAllowed": 536870911, "initMaxAllowed": 4294967295}
70 -0.3142 {"estCard": 48, "curMax": 123999810, "curMaxAllowed": 268435455, "initMaxAllowed": 4294967295}
80 0.0 {"estCard": 80, "curMax": 206216716, "curMaxAllowed": 268435455, "initMaxAllowed": 4294967295}
90 -0.1111 {"estCard": 80, "curMax": 206216716, "curMaxAllowed": 268435455, "initMaxAllowed": 4294967295}
100 -0.0399 {"estCard": 96, "curMax": 206216716, "curMaxAllow

In [40]:
## this cell makes two passes over errHistogram to show how the observed
## estimation error is distributed


## pass one folds the signed error buckets into a new dictionary keyed on the
## absolute value of the observed difference between the actual and estimated
## cardinality, and adds up every count along the way so we know the overall total.

absHist = {}
totalCount = 0
for errPct, count in errHistogram.items():
    absHist[abs(errPct)] = absHist.get(abs(errPct), 0) + count
    totalCount = totalCount + count

## pass two visits the absolute errors in ascending order and prints, for each one,
## its own count and the fraction of all observations at or below that error.

cumulativeCount = 0
for absErrPct, count in sorted(absHist.items()):
    cumulativeCount = cumulativeCount + count
    print(absErrPct, count, cumulativeCount/totalCount)
    
    
## with a recent run, for example, the max error was 3.88%. that is, for all key
## values added sequentially from 1 to 100,000,000 using at most 10,000 samples,
## the max error was 3.88% and that occurred only 488 times. the p95 of error for the
## same keys was 3.49%

    

0.0 19 0.019
0.0015 2 0.021
0.0031 2 0.023
0.0046 1 0.024
0.0047 1 0.025
0.0052 1 0.026
0.0062 4 0.03
0.0069 1 0.031
0.0077 1 0.032
0.0078 1 0.033
0.0087 1 0.034
0.0092 1 0.035
0.0094 1 0.036
0.0103 1 0.037
0.0105 2 0.039
0.0108 1 0.04
0.011 1 0.041
0.0123 4 0.045
0.0126 3 0.048
0.0138 1 0.049
0.014 1 0.05
0.0142 1 0.051
0.0153 1 0.052
0.0158 3 0.055
0.0168 1 0.056
0.0174 1 0.057
0.0175 1 0.058
0.0176 1 0.059
0.0181 1 0.06
0.0184 2 0.062
0.0191 2 0.064
0.0194 1 0.065
0.0199 1 0.066
0.0204 2 0.068
0.0207 1 0.069
0.0212 4 0.073
0.0214 1 0.074
0.0223 1 0.075
0.0229 2 0.077
0.023 1 0.078
0.024 1 0.079
0.0243 3 0.082
0.0249 1 0.083
0.0256 4 0.087
0.0258 1 0.088
0.0267 1 0.089
0.0272 1 0.09
0.0285 2 0.092
0.0289 1 0.093
0.0303 3 0.096
0.0304 1 0.097
0.0305 1 0.098
0.0322 5 0.103
0.0339 1 0.104
0.0341 1 0.105
0.0355 1 0.106
0.0359 1 0.107
0.0361 2 0.109
0.037 1 0.11
0.0372 1 0.111
0.0375 1 0.112
0.0378 1 0.113
0.0389 3 0.116
0.0397 1 0.117
0.0399 2 0.119
0.0406 1 0.12
0.0415 1 0.121
0.0419 1 

0.3476 1 0.827
0.3482 1 0.828
0.3489 1 0.829
0.3495 1 0.83
0.3502 1 0.831
0.3509 1 0.832
0.3515 1 0.833
0.3522 1 0.834
0.3528 1 0.835
0.3535 1 0.836
0.3541 1 0.837
0.3548 1 0.838
0.3554 1 0.839
0.3561 1 0.84
0.3567 1 0.841
0.3574 1 0.842
0.358 1 0.843
0.3587 1 0.844
0.3593 1 0.845
0.3599 1 0.846
0.3655 1 0.847
0.3663 1 0.848
0.3671 1 0.849
0.3679 1 0.85
0.3686 1 0.851
0.4164 1 0.852
0.4172 1 0.853
0.4181 1 0.854
0.419 1 0.855
0.4199 1 0.856
0.4208 1 0.857
0.4216 1 0.858
0.4225 1 0.859
0.4234 1 0.86
0.4242 1 0.861
0.4251 1 0.862
0.426 1 0.863
0.4268 1 0.864
0.4277 1 0.865
0.4285 1 0.866
0.4294 1 0.867
0.4302 1 0.868
0.4311 1 0.869
0.4319 1 0.87
0.4327 1 0.871
0.4336 1 0.872
0.4344 1 0.873
0.4352 1 0.874
0.4361 1 0.875
0.4369 1 0.876
0.4377 1 0.877
0.4385 1 0.878
0.4394 1 0.879
0.4402 1 0.88
0.441 1 0.881
0.4418 1 0.882
0.4426 1 0.883
0.4434 1 0.884
0.4442 1 0.885
0.445 1 0.886
0.4458 1 0.887
0.4466 1 0.888
0.4474 1 0.889
0.4482 1 0.89
0.449 1 0.891
0.4498 1 0.892
0.4506 1 0.893
0.4514 1

In [None]:

## here we use the snapshots to compare error in intersect estimates
## each snapshot key is the number of additions made when the copy was taken, and it
## is used as the expected size of that snapshot's intersection with the largest one.
## NOTE(review): this presumes every snapshot is a subset of the largest snapshot's
## input stream -- confirm if the way snapshots are produced changes.

intersectErrPcts = []

if not snapshots:
    # the producing cell takes no snapshots when NUM_SNAPSHOTS is 0; max() on an
    # empty dict would raise ValueError, so say so explicitly instead.
    print("no snapshots were recorded; nothing to intersect")
else:
    largestSnapshotSize = max(snapshots.keys())
    largestSnapshot = snapshots[largestSnapshotSize]

    print("intersecting with the largest snapshot with %d entries\n" % largestSnapshotSize)

    for k,v in snapshots.items():
        intersect = largestSnapshot.intersection(v)
        # compute the error once and reuse it for both the list and the printout
        intersectErrPct = intersect.errPct(k)
        intersectErrPcts.append(intersectErrPct)
        print(k, intersectErrPct, intersect.dumps())





In [22]:
# Scratch cell: binds r to a new, empty RoaringBitmap.
# NOTE(review): the saved output under this cell shows two bound `union` methods,
# which this statement cannot print -- presumably stale output from an earlier
# version of the cell; re-run or clear it.
r = RoaringBitmap()  



<built-in method union of roaringbitmap.RoaringBitmap object at 0x106db9810> <built-in method union of roaringbitmap.RoaringBitmap object at 0x106db9810>
