Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Bump to 2.0. Fix ScalableBloomFilter and make the tests resilient to FP error.
  • Loading branch information...
commit 64e42c8f1e4e739b63196708d89b4fa126020856 1 parent dcb8e49
@etrepum etrepum authored
View
10 CHANGES.txt
@@ -1,3 +1,11 @@
+Changes in 2.0
+==============
+Made major corrections to the algorithms for both BloomFilter and
+ScalableBloomFilter. Not numerically compatible with serialized
+representations of filters from previous versions. Specifically,
+BloomFilter was more accurate than requested and ScalableBloomFilter
+was much less accurate than requested.
+
Changes in 1.1
==============
-Added copy, intersection and union functions to BloomFilter
+Added copy, intersection and union functions to BloomFilter
View
9 README.txt
@@ -27,7 +27,7 @@ True
>>> f = BloomFilter(capacity=1000, error_rate=0.001)
>>> for i in xrange(0, f.capacity):
... _ = f.add(i)
->>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True
>>> from pybloom import ScalableBloomFilter
@@ -36,8 +36,9 @@ True
>>> for i in xrange(0, count):
... _ = sbf.add(i)
...
->>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True
-# len(sbf) may not equal the entire input length. 0.006% error is well
-# below the default 0.1% error threshold
+# len(sbf) may not equal the entire input length. 0.01% error is well
+# below the default 0.1% error threshold. As the capacity goes up, the
+# error will approach 0.1%.
View
3  pybloom/__init__.py
@@ -1,5 +1,4 @@
"""pybloom
-
+
"""
from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__
-
View
77 pybloom/benchmarks.py
@@ -5,41 +5,44 @@
from pybloom import BloomFilter
import bitarray, math, time
-def main():
- request_error_rate = 0.01
- f = BloomFilter(capacity=200000, error_rate=request_error_rate)
- start = time.time()
- for i in xrange(0, f.capacity):
- f.add(i, skip_check=True)
- end = time.time()
- print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( end-start, f.capacity/(end-start))
- oneBits = f.bitarray.count(True)
- zeroBits = f.bitarray.count(False)
- #print "Number of 1 bits:", oneBits
- #print "Number of 0 bits:", zeroBits
- print "Number of Filter Bits:", f.num_bits
- print "Number of slices:", f.num_slices
- print "Bits per slice:", f.bits_per_slice
- print "------"
- print "Fraction of 1 bits at capacity: {:5.3f}".format( oneBits / float(f.num_bits) )
- # Look for false positives and measure the actual fp rate
- trials = f.capacity
- fp = 0
- start = time.time()
- for i in xrange(f.capacity, f.capacity+trials):
- if i in f:
- fp += 1
- end = time.time()
- print "{:5.3f} seconds to check false positives, {:10.2f} checks/second".format(end-start, trials/(end-start))
- print "Requested FP rate: {:2.4f}".format( request_error_rate )
- print "Experimental false positive rate: {:2.4f}".format( fp / float(trials))
- # Compute theoretical fp max (Goel/Gupta)
- k = f.num_slices
- m = f.num_bits
- n = f.capacity
- fp_theory = math.pow((1 - math.exp(-k * (n+0.5)/(m-1))), k)
- print "Projected FP rate (Goel/Gupta): {:2.6f}".format( fp_theory )
+def main(capacity=100000, request_error_rate=0.1):
+ f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
+ assert (capacity == f.capacity)
+ start = time.time()
+ for i in xrange(0, f.capacity):
+ f.add(i, skip_check=True)
+ end = time.time()
+ print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
+ end - start, f.capacity / (end - start))
+ oneBits = f.bitarray.count(True)
+ zeroBits = f.bitarray.count(False)
+ #print "Number of 1 bits:", oneBits
+ #print "Number of 0 bits:", zeroBits
+ print "Number of Filter Bits:", f.num_bits
+ print "Number of slices:", f.num_slices
+ print "Bits per slice:", f.bits_per_slice
+ print "------"
+ print "Fraction of 1 bits at capacity: {:5.3f}".format(
+ oneBits / float(f.num_bits))
+ # Look for false positives and measure the actual fp rate
+ trials = f.capacity
+ fp = 0
+ start = time.time()
+ for i in xrange(f.capacity, f.capacity + trials + 1):

The original code was correct:

for i in xrange(f.capacity, f.capacity + trials):

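The +1 here is incorrect: it makes the loop test trials + 1 keys, while the experimental rate is still computed as fp / float(trials), so the reported false positive rate can be slightly overstated.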

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ if i in f:
+ fp += 1
+ end = time.time()
+ print ("{:5.3f} seconds to check false positives, "
+ "{:10.2f} checks/second".format(end - start, trials / (end - start)))
+ print "Requested FP rate: {:2.4f}".format(request_error_rate)
+ print "Experimental false positive rate: {:2.4f}".format(fp / float(trials))
+ # Compute theoretical fp max (Goel/Gupta)
+ k = f.num_slices
+ m = f.num_bits
+ n = f.capacity
+ fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
+ print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory)
-if __name__ == '__main__' :
- status = main()
- sys.exit(status)
+if __name__ == '__main__' :
+ status = main()
+ sys.exit(status)
View
28 pybloom/pybloom.py
@@ -16,7 +16,7 @@
False
>>> len(f) <= f.capacity
True
- >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
+ >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True
>>> from pybloom import ScalableBloomFilter
@@ -29,7 +29,7 @@
True
>>> len(sbf) <= count
True
- >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
+ >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True
"""
@@ -42,8 +42,8 @@
except ImportError:
raise ImportError('pybloom requires bitarray >= 0.3.4')
-__version__ = '1.1'
-__author__ = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>,\
+__version__ = '2.0'
+__author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
Marius Eriksen <marius@monkey.org>,\
Alex Brasetvik <alex@brasetvik.com>"
@@ -111,12 +111,13 @@ def __init__(self, capacity, error_rate=0.001):
raise ValueError("Error_Rate must be between 0 and 1.")
if not capacity > 0:
raise ValueError("Capacity must be > 0")
- # given M = num_bits, k = num_slices, p = error_rate, n = capacity
+ # given M = num_bits, k = num_slices, P = error_rate, n = capacity
+ # k = log2(1/P)
# solving for m = bits_per_slice
# n ~= M * ((ln(2) ** 2) / abs(ln(P)))
# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
- num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
+ num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
bits_per_slice = int(math.ceil(
(capacity * abs(math.log(error_rate))) /
(num_slices * (math.log(2) ** 2))))
@@ -337,13 +338,18 @@ def add(self, key):
"""
if key in self:
return True
- filter = self.filters[-1] if self.filters else None
- if filter is None or filter.count >= filter.capacity:
- num_filters = len(self.filters)
+ if not self.filters:
filter = BloomFilter(
- capacity=self.initial_capacity * (self.scale ** num_filters),
- error_rate=self.error_rate * (self.ratio ** num_filters))
+ capacity=self.initial_capacity,
+ error_rate=self.error_rate * (1.0 - self.ratio))
self.filters.append(filter)
+ else:
+ filter = self.filters[-1]
+ if filter.count >= filter.capacity:
+ filter = BloomFilter(
+ capacity=filter.capacity * self.scale,
+ error_rate=filter.error_rate * self.ratio)
+ self.filters.append(filter)
filter.add(key, skip_check=True)
return False
View
6 setup.py
@@ -6,16 +6,16 @@
from setuptools import setup, find_packages, Extension
-VERSION = '1.0.3'
+VERSION = '2.0.0'
DESCRIPTION = "PyBloom: A Probabilistic data structure"
LONG_DESCRIPTION = """
pybloom is a Python implementation of the bloom filter probabilistic data
-structure. The module also provides a Scalable Bloom Filter that allows a
+structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size.
"""
CLASSIFIERS = filter(None, map(str.strip,
-"""
+"""
Intended Audience :: Developers
License :: OSI Approved :: MIT License
Programming Language :: Python
@glangford

The original code was correct:

for i in xrange(f.capacity, f.capacity + trials):

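The +1 here is incorrect: it makes the loop test trials + 1 keys, while the experimental rate is still computed as fp / float(trials), so the reported false positive rate can be slightly overstated.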

Please sign in to comment.
Something went wrong with that request. Please try again.