Bump to 2.0. Fix ScalableBloomFilter and make the tests resilient to FP error.
jaybaird · Dec 26, 2012 · 64e42c8 · glangford · Dec 31, 2012 · 64e42c8
1 parent dcb8e49
commit 64e42c8
Show file tree

Hide file tree

Showing 6 changed files with 75 additions and 58 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,3 +1,11 @@
+Changes in 2.0
+==============
+Made major corrections to the algorithms for both BloomFilter and
+ScalableBloomFilter. Filters serialized by previous versions are not
+numerically compatible with this release. Specifically, in previous
+versions BloomFilter was more accurate than requested and
+ScalableBloomFilter was much less accurate than requested.
+
 Changes in 1.1
 ==============
-Added copy, intersection and union functions to BloomFilter
+Added copy, intersection and union functions to BloomFilter
diff --git a/README.txt b/README.txt
@@ -27,7 +27,7 @@ True
 >>> f = BloomFilter(capacity=1000, error_rate=0.001)
 >>> for i in xrange(0, f.capacity):
 ...     _ = f.add(i)
->>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
 True
 
 >>> from pybloom import ScalableBloomFilter
@@ -36,8 +36,9 @@ True
 >>> for i in xrange(0, count):
 ...     _ = sbf.add(i)
 ...
->>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
 True
 
-# len(sbf) may not equal the entire input length. 0.006% error is well
+# len(sbf) may not equal the entire input length. 0.01% error is well
-# below the default 0.1% error threshold
+# below the default 0.1% error threshold. As the capacity goes up, the
+# error will approach 0.1%.
diff --git a/pybloom/__init__.py b/pybloom/__init__.py
@@ -1,5 +1,4 @@
 """pybloom
- 
+
 """
 from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
-
diff --git a/pybloom/benchmarks.py b/pybloom/benchmarks.py
@@ -5,41 +5,44 @@
 from pybloom import BloomFilter
 import bitarray, math, time
 
-def main():
+def main(capacity=100000, request_error_rate=0.1):
-	request_error_rate = 0.01
+    f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
-	f = BloomFilter(capacity=200000, error_rate=request_error_rate)
+    assert (capacity == f.capacity)
-	start = time.time()
+    start = time.time()
-	for i in xrange(0, f.capacity):
+    for i in xrange(0, f.capacity):
-		f.add(i, skip_check=True)
+        f.add(i, skip_check=True)
-	end = time.time()
+    end = time.time()
-	print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( end-start, f.capacity/(end-start))
+    print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
-	oneBits = f.bitarray.count(True)
+            end - start, f.capacity / (end - start))
-	zeroBits = f.bitarray.count(False)
+    oneBits = f.bitarray.count(True)
-	#print "Number of 1 bits:", oneBits
+    zeroBits = f.bitarray.count(False)
-	#print "Number of 0 bits:", zeroBits
+    #print "Number of 1 bits:", oneBits
-	print "Number of Filter Bits:", f.num_bits
+    #print "Number of 0 bits:", zeroBits
-	print "Number of slices:", f.num_slices
+    print "Number of Filter Bits:", f.num_bits
-	print "Bits per slice:", f.bits_per_slice
+    print "Number of slices:", f.num_slices
-	print "------"
+    print "Bits per slice:", f.bits_per_slice
-	print "Fraction of 1 bits at capacity: {:5.3f}".format( oneBits / float(f.num_bits) )
+    print "------"
-	# Look for false positives and measure the actual fp rate
+    print "Fraction of 1 bits at capacity: {:5.3f}".format(
-	trials = f.capacity
+            oneBits / float(f.num_bits))
-	fp = 0
+    # Look for false positives and measure the actual fp rate
-	start = time.time()
+    trials = f.capacity
-	for i in xrange(f.capacity, f.capacity+trials):
+    fp = 0
-		if i in f:
+    start = time.time()
-			fp += 1
+    for i in xrange(f.capacity, f.capacity + trials + 1):
-	end = time.time()
+        if i in f:
-	print "{:5.3f} seconds to check false positives, {:10.2f} checks/second".format(end-start, trials/(end-start))
+            fp += 1
-	print "Requested FP rate: {:2.4f}".format( request_error_rate )
+    end = time.time()
-	print "Experimental false positive rate: {:2.4f}".format( fp / float(trials))
+    print ("{:5.3f} seconds to check false positives, "
-	# Compute theoretical fp max (Goel/Gupta)
+           "{:10.2f} checks/second".format(end - start, trials / (end - start)))
-	k = f.num_slices
+    print "Requested FP rate: {:2.4f}".format(request_error_rate)
-	m = f.num_bits
+    print "Experimental false positive rate: {:2.4f}".format(fp / float(trials))
-	n = f.capacity
+    # Compute theoretical fp max (Goel/Gupta)
-	fp_theory = math.pow((1 - math.exp(-k * (n+0.5)/(m-1))), k)
+    k = f.num_slices
-	print "Projected FP rate (Goel/Gupta): {:2.6f}".format( fp_theory )
+    m = f.num_bits
+    n = f.capacity
+    fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
+    print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory)
 
-if __name__ == '__main__' : 
+if __name__ == '__main__' :
-	status = main()
+    status = main()
-	sys.exit(status)
+    sys.exit(status)
diff --git a/pybloom/pybloom.py b/pybloom/pybloom.py
@@ -16,7 +16,7 @@
     False
     >>> len(f) <= f.capacity
     True
-    >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+    >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
     True
 
     >>> from pybloom import ScalableBloomFilter
@@ -29,7 +29,7 @@
     True
     >>> len(sbf) <= count
     True
-    >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+    >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
     True
 
 """
@@ -42,8 +42,8 @@
 except ImportError:
     raise ImportError('pybloom requires bitarray >= 0.3.4')
 
-__version__ = '1.1'
+__version__ = '2.0'
-__author__  = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>,\
+__author__  = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
                Marius Eriksen <marius@monkey.org>,\
                Alex Brasetvik <alex@brasetvik.com>"
 
@@ -111,12 +111,13 @@ def __init__(self, capacity, error_rate=0.001):
             raise ValueError("Error_Rate must be between 0 and 1.")
         if not capacity > 0:
             raise ValueError("Capacity must be > 0")
-        # given M = num_bits, k = num_slices, p = error_rate, n = capacity
+        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
+        #       k = log2(1/P)
         # solving for m = bits_per_slice
         # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
         # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
         # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
-        num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
+        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
         bits_per_slice = int(math.ceil(
             (capacity * abs(math.log(error_rate))) /
             (num_slices * (math.log(2) ** 2))))
@@ -337,13 +338,18 @@ def add(self, key):
         """
         if key in self:
             return True
-        filter = self.filters[-1] if self.filters else None
+        if not self.filters:
-        if filter is None or filter.count >= filter.capacity:
-            num_filters = len(self.filters)
             filter = BloomFilter(
-                capacity=self.initial_capacity * (self.scale ** num_filters),
+                capacity=self.initial_capacity,
-                error_rate=self.error_rate * (self.ratio ** num_filters))
+                error_rate=self.error_rate * (1.0 - self.ratio))
             self.filters.append(filter)
+        else:
+            filter = self.filters[-1]
+            if filter.count >= filter.capacity:
+                filter = BloomFilter(
+                    capacity=filter.capacity * self.scale,
+                    error_rate=filter.error_rate * self.ratio)
+                self.filters.append(filter)
         filter.add(key, skip_check=True)
         return False
 

diff --git a/setup.py b/setup.py
@@ -6,16 +6,16 @@
 
 from setuptools import setup, find_packages, Extension
 
-VERSION = '1.0.3'
+VERSION = '2.0.0'
 DESCRIPTION = "PyBloom: A Probabilistic data structure"
 LONG_DESCRIPTION = """
 pybloom is a Python implementation of the bloom filter probabilistic data
-structure. The module also provides a Scalable Bloom Filter that allows a 
+structure. The module also provides a Scalable Bloom Filter that allows a
 bloom filter to grow without knowing the original set size.
 """
 
 CLASSIFIERS = filter(None, map(str.strip,
-"""                 
+"""
 Intended Audience :: Developers
 License :: OSI Approved :: MIT License
 Programming Language :: Python