In [3]:
from math import pow
import random
import json

from roaringbitmap import RoaringBitmap

class RbmRoundRobyn:
    """A bitmap sketch with a fixed maximum size, maintained via variable sampling.

    Values are kept in a RoaringBitmap. Only values ``<= curMaxAllowed`` are
    retained; whenever more than ``maxSamples`` values are held,
    ``curMaxAllowed`` is halved and everything above it is discarded.
    ``estimatedCardinality`` scales the retained count back up by
    ``initMaxAllowed / curMaxAllowed``.

    NOTE(review): the estimate presumably assumes the added values are spread
    uniformly over ``[0, initMaxAllowed]`` -- confirm for non-random inputs.
    """
    # TODO replace default initMaxAllowed with pythonic MAX_INT
    # TODO be strict, to remove the chance of uninitialized member vars
    # TODO having a version that requires everything we need for the results of intersection, union, etc
    def __init__(self, maxSamples, initMaxAllowed=pow(2,32)):
        """Create an empty sketch.

        Args:
            maxSamples: maximum number of values retained in the bitmap.
            initMaxAllowed: initial (largest) admissible value; also the
                reference range used when scaling the estimate.

        Raises:
            ValueError: if ``maxSamples`` is smaller than 1. With no room for
                a single sample the shrink loop in ``_checkSize`` cannot be
                guaranteed to terminate.
        """
        if maxSamples < 1:
            raise ValueError("maxSamples must be at least 1")

        self.maxSamples = maxSamples
        self.initMaxAllowed = initMaxAllowed

        self.rbm = RoaringBitmap()
        self.curMaxAllowed = initMaxAllowed
        self.numAdded = 0

    def copy(self):
        """Return an independent sketch with the same parameters and contents."""
        result = RbmRoundRobyn(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = self.rbm.copy()
        result.curMaxAllowed = self.curMaxAllowed
        result.numAdded = self.numAdded
        return result

    def _checkCompatible(self, other):
        """Raise RuntimeError unless ``other`` was built with the same init values."""
        ## TODO verify that we can't relax this for either value somehow
        if self.maxSamples != other.maxSamples or self.initMaxAllowed != other.initMaxAllowed:
            raise RuntimeError("you cannot mix instances with different initial values")

    def _checkSize(self):
        """Enforce the ``maxSamples`` bound, shrinking ``curMaxAllowed`` as needed."""
        # first, prune out anything beyond the max because they're not in the sample
        self._prune()

        # while we hold more than maxSamples values, shrink curMaxAllowed and re-prune
        while len(self.rbm) > self.maxSamples:
            self.curMaxAllowed /= 2   # TODO consider shrinking by something other than half
            self._prune()

    def _prune(self):
        """Drop every stored value greater than ``curMaxAllowed``.

        Relies on ``RoaringBitmap.pop()`` removing the largest element. The
        emptiness test is re-evaluated on every pass so that removing the last
        element does not lead to ``max()`` being called on an empty bitmap.
        """
        while len(self.rbm) > 0 and self.rbm.max() > self.curMaxAllowed:
            self.rbm.pop()

    def add(self, i):
        """Offer value ``i`` to the sketch and return it.

        The value is inserted and then immediately pruned again if it lies
        above ``curMaxAllowed``. ``numAdded`` counts every call, whether or
        not the value was retained.
        """
        self.rbm.add(i)
        self._checkSize()
        self.numAdded += 1
        return i

    def intersection(self, other):
        """Return a new sketch estimating the intersection of both inputs.

        Raises:
            RuntimeError: if the two sketches have different init values.
        """
        self._checkCompatible(other)

        result = RbmRoundRobyn(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = self.rbm.intersection(other.rbm)
        # the combined sample is only valid up to the tighter of the two thresholds
        result.curMaxAllowed = min(self.curMaxAllowed, other.curMaxAllowed)
        result.numAdded = min(self.numAdded, other.numAdded) ## TODO make sure this makes sense

        result._checkSize()
        return result

    def union(self, other):
        """Return a new sketch estimating the union of both inputs.

        Raises:
            RuntimeError: if the two sketches have different init values.
        """
        self._checkCompatible(other)

        result = RbmRoundRobyn(maxSamples=self.maxSamples, initMaxAllowed=self.initMaxAllowed)

        result.rbm = self.rbm.union(other.rbm)
        # the combined sample is only valid up to the tighter of the two thresholds
        result.curMaxAllowed = min(self.curMaxAllowed, other.curMaxAllowed)
        result.numAdded = self.numAdded + other.numAdded

        result._checkSize()
        return result

    def estimatedCardinality(self):
        """Return the retained count scaled by ``initMaxAllowed / curMaxAllowed``."""
        return len(self.rbm)*(self.initMaxAllowed/self.curMaxAllowed)

    def curMax(self):
        """Return the largest retained value, or None when the sketch is empty."""
        if len(self.rbm) == 0:
            return None
        return self.rbm.max()

    def dumps(self):
        """Return a JSON string summarising the sketch state.

        ``curMax`` is serialised as ``null`` for an empty sketch.
        """
        state = {
            "estCard" : int(self.estimatedCardinality()),
            "numAdded" : self.numAdded,
            "curMax" : self.curMax(),
            "curMaxAllowed" : self.curMaxAllowed,
        }
        return json.dumps(state)

    def errPct(self, target):
        """Return the relative error of the estimate against ``target``.

        The result is ``(estimate - target) / target`` truncated (not rounded)
        to ``precision + 1`` decimal places.

        Raises:
            ZeroDivisionError: if ``target`` is 0.
        """
        precision = 3
        mult = pow(10,precision+1)
        return int(mult*(self.estimatedCardinality()-target)/target)/mult
        

# Echo the class object to confirm the definition above was evaluated in this kernel.
print(RbmRoundRobyn)


<class '__main__.RbmRoundRobyn'>


In [10]:
import time

# Seed from the wall clock so every run draws a different stream. The seed is
# kept in RUN_SEED so a run can be replayed from the live kernel.
RUN_SEED = time.time()*1000
random.seed(RUN_SEED)

rbmrr = RbmRoundRobyn(maxSamples=10000)

# random.randint needs integer bounds (float bounds are rejected on Python
# 3.12+), while initMaxAllowed defaults to a float produced by math.pow.
# The upper bound is inclusive, so stop one short of initMaxAllowed: with the
# default of 2**32 the bitmap only holds 32-bit unsigned values.
MAX_RANDOM_VALUE = int(rbmrr.initMaxAllowed) - 1

def getRandomInRange():
    """Return a random integer in [0, MAX_RANDOM_VALUE]."""
    # TODO consider dealing with negatives
    return random.randint(0, MAX_RANDOM_VALUE)

iterations = 20*1000*1000

snapshots = {}
NUM_SNAPSHOTS = 200
errHistogram = {} # TODO consider using defaultdict

# Integer cadence between snapshots; 0 disables snapshotting entirely.
snapshotEvery = max(1, iterations // NUM_SNAPSHOTS) if NUM_SNAPSHOTS > 0 else 0

for i in range(1,iterations+1):

    rbmrr.add(getRandomInRange())

    # i adds have happened so far, so i is the target the estimate is judged against
    errPct = rbmrr.errPct(i)
    errHistogram[errPct] = errHistogram.get(errPct, 0) + 1

    if snapshotEvery and i % snapshotEvery == 0:
        print(i, errPct, rbmrr.dumps())
        snapshots[i] = rbmrr.copy()



100000 -0.006 {"estCard": 99392, "numAdded": 100000, "curMax": 268434345, "curMaxAllowed": 268435456.0}
200000 -0.0155 {"estCard": 196896, "numAdded": 200000, "curMax": 134204519, "curMaxAllowed": 134217728.0}
300000 -0.013 {"estCard": 296096, "numAdded": 300000, "curMax": 134204519, "curMaxAllowed": 134217728.0}
400000 0.0027 {"estCard": 401088, "numAdded": 400000, "curMax": 67104630, "curMaxAllowed": 67108864.0}
500000 0.0053 {"estCard": 502656, "numAdded": 500000, "curMax": 67104630, "curMaxAllowed": 67108864.0}
600000 0.0086 {"estCard": 605184, "numAdded": 600000, "curMax": 67104630, "curMaxAllowed": 67108864.0}
700000 0.0037 {"estCard": 702592, "numAdded": 700000, "curMax": 33551625, "curMaxAllowed": 33554432.0}
800000 0.0104 {"estCard": 808320, "numAdded": 800000, "curMax": 33551625, "curMaxAllowed": 33554432.0}
900000 0.0127 {"estCard": 911488, "numAdded": 900000, "curMax": 33551625, "curMaxAllowed": 33554432.0}
1000000 0.0097 {"estCard": 1009792, "numAdded": 1000000, "curMax": 

8000000 0.0109 {"estCard": 8087552, "numAdded": 8000000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8100000 0.0108 {"estCard": 8187904, "numAdded": 8100000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8200000 0.0106 {"estCard": 8287232, "numAdded": 8200000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8300000 0.011 {"estCard": 8391680, "numAdded": 8300000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8400000 0.0129 {"estCard": 8508416, "numAdded": 8400000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8500000 0.0131 {"estCard": 8611840, "numAdded": 8500000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8600000 0.0109 {"estCard": 8693760, "numAdded": 8600000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8700000 0.0106 {"estCard": 8793088, "numAdded": 8700000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8800000 0.0096 {"estCard": 8885248, "numAdded": 8800000, "curMax": 4194294, "curMaxAllowed": 4194304.0}
8900000 0.0099 {"estCard": 8988672, "numAdded": 8900000, "curMax"

15800000 -0.0007 {"estCard": 15788032, "numAdded": 15800000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
15900000 -0.0015 {"estCard": 15876096, "numAdded": 15900000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
16000000 -0.0012 {"estCard": 15980544, "numAdded": 16000000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
16100000 -0.0013 {"estCard": 16078848, "numAdded": 16100000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
16200000 -0.0012 {"estCard": 16179200, "numAdded": 16200000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
16300000 0.0002 {"estCard": 16304128, "numAdded": 16300000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
16400000 0.0 {"estCard": 16400384, "numAdded": 16400000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
16500000 0.0005 {"estCard": 16508928, "numAdded": 16500000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
16600000 0.0009 {"estCard": 16615424, "numAdded": 16600000, "curMax": 2097074, "curMaxAllowed": 2097152.0}
16700000 0.002 {"estCard": 16734208

In [None]:

# Intersect the current sketch with each saved snapshot and report the error of
# the intersection estimate against the snapshot's key (the add count at which
# the snapshot was taken).
for k in snapshots:
    v = snapshots[k]
    intersect = rbmrr.intersection(v)
    report = (k, rbmrr.numAdded, intersect.errPct(k), intersect.dumps())
    print(*report)


In [17]:


# Fold the signed error histogram into one keyed by absolute error, counting
# the total number of observations along the way.
absHist = {}
totalCount = 0
for errPct, count in errHistogram.items():
    totalCount += count
    magnitude = abs(errPct)
    absHist[magnitude] = absHist.get(magnitude, 0) + count

# Walk the absolute errors in ascending order and print, for each one, its
# count and the cumulative fraction of observations at or below it.
cumulativeCount = 0
for absErrPct in sorted(absHist):
    count = absHist[absErrPct]
    cumulativeCount += count
    print(absErrPct, count, cumulativeCount/totalCount)
    
    
    
    

0.0 324306 0.0162153
0.0001 288742 0.0306524
0.0002 277472 0.044526
0.0003 282634 0.0586577
0.0004 292941 0.07330475
0.0005 273788 0.08699415
0.0006 251946 0.09959145
0.0007 212303 0.1102066
0.0008 194091 0.11991115
0.0009 179700 0.12889615
0.001 191146 0.13845345
0.0011 188308 0.14786885
0.0012 180020 0.15686985
0.0013 165197 0.1651297
0.0014 146027 0.17243105
0.0015 137616 0.17931185
0.0016 150890 0.18685635
0.0017 155121 0.1946124
0.0018 131356 0.2011802
0.0019 134588 0.2079096
0.002 137400 0.2147796
0.0021 164298 0.2229945
0.0022 145010 0.230245
0.0023 128372 0.2366636
0.0024 116093 0.24246825
0.0025 114873 0.2482119
0.0026 131280 0.2547759
0.0027 132210 0.2613864
0.0028 110264 0.2668996
0.0029 94797 0.27163945
0.003 90497 0.2761643
0.0031 85319 0.28043025
0.0032 81996 0.28453005
0.0033 73217 0.2881909
0.0034 74892 0.2919355
0.0035 83154 0.2960932
0.0036 101066 0.3011465
0.0037 122078 0.3072504
0.0038 137774 0.3141391
0.0039 135471 0.32091265
0.004 136254 0.32772535
0.0041 142064 0