Skip to content
Browse files

Bump to 2.0. Fix ScalableBloomFilter and make the tests resilient to FP error.
  • Loading branch information...
1 parent dcb8e49 commit 64e42c8f1e4e739b63196708d89b4fa126020856 @etrepum etrepum committed Dec 26, 2012
Showing with 75 additions and 58 deletions.
  1. +9 −1 CHANGES.txt
  2. +5 −4 README.txt
  3. +1 −2 pybloom/__init__.py
  4. +40 −37 pybloom/benchmarks.py
  5. +17 −11 pybloom/pybloom.py
  6. +3 −3 setup.py
View
10 CHANGES.txt
@@ -1,3 +1,11 @@
+Changes in 2.0
+==============
+Made major corrections to the algorithms for both BloomFilter and
+ScalableBloomFilter. Not numerically compatible with serialized
+representations of filters from previous versions. Specifically,
+BloomFilter was more accurate than requested and ScalableBloomFilter
+was much less accurate than requested.
+
Changes in 1.1
==============
-Added copy, intersection and union functions to BloomFilter
+Added copy, intersection and union functions to BloomFilter
View
9 README.txt
@@ -27,7 +27,7 @@ True
>>> f = BloomFilter(capacity=1000, error_rate=0.001)
>>> for i in xrange(0, f.capacity):
... _ = f.add(i)
->>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True
>>> from pybloom import ScalableBloomFilter
@@ -36,8 +36,9 @@ True
>>> for i in xrange(0, count):
... _ = sbf.add(i)
...
->>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True
-# len(sbf) may not equal the entire input length. 0.006% error is well
-# below the default 0.1% error threshold
+# len(sbf) may not equal the entire input length. 0.01% error is well
+# below the default 0.1% error threshold. As the capacity goes up, the
+# error will approach 0.1%.
View
3 pybloom/__init__.py
@@ -1,5 +1,4 @@
"""pybloom
-
+
"""
from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
-
View
77 pybloom/benchmarks.py
@@ -5,41 +5,44 @@
from pybloom import BloomFilter
import bitarray, math, time
-def main():
- request_error_rate = 0.01
- f = BloomFilter(capacity=200000, error_rate=request_error_rate)
- start = time.time()
- for i in xrange(0, f.capacity):
- f.add(i, skip_check=True)
- end = time.time()
- print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( end-start, f.capacity/(end-start))
- oneBits = f.bitarray.count(True)
- zeroBits = f.bitarray.count(False)
- #print "Number of 1 bits:", oneBits
- #print "Number of 0 bits:", zeroBits
- print "Number of Filter Bits:", f.num_bits
- print "Number of slices:", f.num_slices
- print "Bits per slice:", f.bits_per_slice
- print "------"
- print "Fraction of 1 bits at capacity: {:5.3f}".format( oneBits / float(f.num_bits) )
- # Look for false positives and measure the actual fp rate
- trials = f.capacity
- fp = 0
- start = time.time()
- for i in xrange(f.capacity, f.capacity+trials):
- if i in f:
- fp += 1
- end = time.time()
- print "{:5.3f} seconds to check false positives, {:10.2f} checks/second".format(end-start, trials/(end-start))
- print "Requested FP rate: {:2.4f}".format( request_error_rate )
- print "Experimental false positive rate: {:2.4f}".format( fp / float(trials))
- # Compute theoretical fp max (Goel/Gupta)
- k = f.num_slices
- m = f.num_bits
- n = f.capacity
- fp_theory = math.pow((1 - math.exp(-k * (n+0.5)/(m-1))), k)
- print "Projected FP rate (Goel/Gupta): {:2.6f}".format( fp_theory )
+def main(capacity=100000, request_error_rate=0.1):
+ f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
+ assert (capacity == f.capacity)
+ start = time.time()
+ for i in xrange(0, f.capacity):
+ f.add(i, skip_check=True)
+ end = time.time()
+ print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
+ end - start, f.capacity / (end - start))
+ oneBits = f.bitarray.count(True)
+ zeroBits = f.bitarray.count(False)
+ #print "Number of 1 bits:", oneBits
+ #print "Number of 0 bits:", zeroBits
+ print "Number of Filter Bits:", f.num_bits
+ print "Number of slices:", f.num_slices
+ print "Bits per slice:", f.bits_per_slice
+ print "------"
+ print "Fraction of 1 bits at capacity: {:5.3f}".format(
+ oneBits / float(f.num_bits))
+ # Look for false positives and measure the actual fp rate
+ trials = f.capacity
+ fp = 0
+ start = time.time()
+ for i in xrange(f.capacity, f.capacity + trials + 1):
@glangford
glangford added a note Dec 31, 2012

The original code was correct:

for i in xrange(f.capacity, f.capacity + trials):

The +1 here is incorrect.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ if i in f:
+ fp += 1
+ end = time.time()
+ print ("{:5.3f} seconds to check false positives, "
+ "{:10.2f} checks/second".format(end - start, trials / (end - start)))
+ print "Requested FP rate: {:2.4f}".format(request_error_rate)
+ print "Experimental false positive rate: {:2.4f}".format(fp / float(trials))
+ # Compute theoretical fp max (Goel/Gupta)
+ k = f.num_slices
+ m = f.num_bits
+ n = f.capacity
+ fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
+ print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory)
-if __name__ == '__main__' :
- status = main()
- sys.exit(status)
+if __name__ == '__main__' :
+ status = main()
+ sys.exit(status)
View
28 pybloom/pybloom.py
@@ -16,7 +16,7 @@
False
>>> len(f) <= f.capacity
True
- >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+ >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True
>>> from pybloom import ScalableBloomFilter
@@ -29,7 +29,7 @@
True
>>> len(sbf) <= count
True
- >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+ >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True
"""
@@ -42,8 +42,8 @@
except ImportError:
raise ImportError('pybloom requires bitarray >= 0.3.4')
-__version__ = '1.1'
-__author__ = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>,\
+__version__ = '2.0'
+__author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
Marius Eriksen <marius@monkey.org>,\
Alex Brasetvik <alex@brasetvik.com>"
@@ -111,12 +111,13 @@ def __init__(self, capacity, error_rate=0.001):
raise ValueError("Error_Rate must be between 0 and 1.")
if not capacity > 0:
raise ValueError("Capacity must be > 0")
- # given M = num_bits, k = num_slices, p = error_rate, n = capacity
+ # given M = num_bits, k = num_slices, P = error_rate, n = capacity
+ # k = log2(1/P)
# solving for m = bits_per_slice
# n ~= M * ((ln(2) ** 2) / abs(ln(P)))
# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
- num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
+ num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
bits_per_slice = int(math.ceil(
(capacity * abs(math.log(error_rate))) /
(num_slices * (math.log(2) ** 2))))
@@ -337,13 +338,18 @@ def add(self, key):
"""
if key in self:
return True
- filter = self.filters[-1] if self.filters else None
- if filter is None or filter.count >= filter.capacity:
- num_filters = len(self.filters)
+ if not self.filters:
filter = BloomFilter(
- capacity=self.initial_capacity * (self.scale ** num_filters),
- error_rate=self.error_rate * (self.ratio ** num_filters))
+ capacity=self.initial_capacity,
+ error_rate=self.error_rate * (1.0 - self.ratio))
self.filters.append(filter)
+ else:
+ filter = self.filters[-1]
+ if filter.count >= filter.capacity:
+ filter = BloomFilter(
+ capacity=filter.capacity * self.scale,
+ error_rate=filter.error_rate * self.ratio)
+ self.filters.append(filter)
filter.add(key, skip_check=True)
return False
View
6 setup.py
@@ -6,16 +6,16 @@
from setuptools import setup, find_packages, Extension
-VERSION = '1.0.3'
+VERSION = '2.0.0'
DESCRIPTION = "PyBloom: A Probabilistic data structure"
LONG_DESCRIPTION = """
pybloom is a Python implementation of the bloom filter probabilistic data
-structure. The module also provides a Scalable Bloom Filter that allows a
+structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size.
"""
CLASSIFIERS = filter(None, map(str.strip,
-"""
+"""
Intended Audience :: Developers
License :: OSI Approved :: MIT License
Programming Language :: Python

0 comments on commit 64e42c8

Please sign in to comment.
Something went wrong with that request. Please try again.