From 64e42c8f1e4e739b63196708d89b4fa126020856 Mon Sep 17 00:00:00 2001 From: Bob Ippolito Date: Wed, 26 Dec 2012 12:23:24 -0800 Subject: [PATCH] Bump to 2.0. Fix ScalableBloomFilter and make the tests resilient to FP error. --- CHANGES.txt | 10 +++++- README.txt | 9 ++--- pybloom/__init__.py | 3 +- pybloom/benchmarks.py | 77 ++++++++++++++++++++++--------------------- pybloom/pybloom.py | 28 +++++++++------- setup.py | 6 ++-- 6 files changed, 75 insertions(+), 58 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 44a24e2..4a4d5da 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,11 @@ +Changes in 2.0 +============== +Made major corrections to the algorithms for both BloomFilter and +ScalableBloomFilter. Not numerically compatible with serialized +representations of filters from previous versions. Specifically, +BloomFilter was more accurate than requested and ScalableBloomFilter +was much less accurate than requested. + Changes in 1.1 ============== -Added copy, intersection and union functions to BloomFilter \ No newline at end of file +Added copy, intersection and union functions to BloomFilter diff --git a/README.txt b/README.txt index 46c3964..6d31837 100644 --- a/README.txt +++ b/README.txt @@ -27,7 +27,7 @@ True >>> f = BloomFilter(capacity=1000, error_rate=0.001) >>> for i in xrange(0, f.capacity): ... _ = f.add(i) ->>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate +>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18 True >>> from pybloom import ScalableBloomFilter @@ -36,8 +36,9 @@ True >>> for i in xrange(0, count): ... _ = sbf.add(i) ... ->>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate +>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18 True -# len(sbf) may not equal the entire input length. 0.006% error is well -# below the default 0.1% error threshold +# len(sbf) may not equal the entire input length. 0.01% error is well +# below the default 0.1% error threshold. 
As the capacity goes up, the +# error will approach 0.1%. diff --git a/pybloom/__init__.py b/pybloom/__init__.py index 2c1b7ea..b533952 100644 --- a/pybloom/__init__.py +++ b/pybloom/__init__.py @@ -1,5 +1,4 @@ """pybloom - + """ from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__ - \ No newline at end of file diff --git a/pybloom/benchmarks.py b/pybloom/benchmarks.py index 6590061..aa224a8 100755 --- a/pybloom/benchmarks.py +++ b/pybloom/benchmarks.py @@ -5,41 +5,44 @@ from pybloom import BloomFilter import bitarray, math, time -def main(): - request_error_rate = 0.01 - f = BloomFilter(capacity=200000, error_rate=request_error_rate) - start = time.time() - for i in xrange(0, f.capacity): - f.add(i, skip_check=True) - end = time.time() - print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( end-start, f.capacity/(end-start)) - oneBits = f.bitarray.count(True) - zeroBits = f.bitarray.count(False) - #print "Number of 1 bits:", oneBits - #print "Number of 0 bits:", zeroBits - print "Number of Filter Bits:", f.num_bits - print "Number of slices:", f.num_slices - print "Bits per slice:", f.bits_per_slice - print "------" - print "Fraction of 1 bits at capacity: {:5.3f}".format( oneBits / float(f.num_bits) ) - # Look for false positives and measure the actual fp rate - trials = f.capacity - fp = 0 - start = time.time() - for i in xrange(f.capacity, f.capacity+trials): - if i in f: - fp += 1 - end = time.time() - print "{:5.3f} seconds to check false positives, {:10.2f} checks/second".format(end-start, trials/(end-start)) - print "Requested FP rate: {:2.4f}".format( request_error_rate ) - print "Experimental false positive rate: {:2.4f}".format( fp / float(trials)) - # Compute theoretical fp max (Goel/Gupta) - k = f.num_slices - m = f.num_bits - n = f.capacity - fp_theory = math.pow((1 - math.exp(-k * (n+0.5)/(m-1))), k) - print "Projected FP rate (Goel/Gupta): {:2.6f}".format( fp_theory ) +def main(capacity=100000, 
request_error_rate=0.1): # Time bulk adds and lookups on a BloomFilter, then print measured vs. requested FP rate. + f = BloomFilter(capacity=capacity, error_rate=request_error_rate) + assert (capacity == f.capacity) + start = time.time() + for i in xrange(0, f.capacity): + f.add(i, skip_check=True) + end = time.time() + print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( + end - start, f.capacity / (end - start)) + oneBits = f.bitarray.count(True) + zeroBits = f.bitarray.count(False) + #print "Number of 1 bits:", oneBits + #print "Number of 0 bits:", zeroBits + print "Number of Filter Bits:", f.num_bits + print "Number of slices:", f.num_slices + print "Bits per slice:", f.bits_per_slice + print "------" + print "Fraction of 1 bits at capacity: {:5.3f}".format( + oneBits / float(f.num_bits)) + # Look for false positives and measure the actual fp rate by probing exactly `trials` keys that were never added, so the divisions by `trials` below are exact + trials = f.capacity + fp = 0 + start = time.time() + for i in xrange(f.capacity, f.capacity + trials): + if i in f: + fp += 1 + end = time.time() + print ("{:5.3f} seconds to check false positives, " + "{:10.2f} checks/second".format(end - start, trials / (end - start))) + print "Requested FP rate: {:2.4f}".format(request_error_rate) + print "Experimental false positive rate: {:2.4f}".format(fp / float(trials)) + # Compute theoretical fp max (Goel/Gupta) + k = f.num_slices + m = f.num_bits + n = f.capacity + fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k) + print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory) + +if __name__ == '__main__' : + status = main() + sys.exit(status) diff --git a/pybloom/pybloom.py b/pybloom/pybloom.py index 1bacbd1..2383ac5 100644 --- a/pybloom/pybloom.py +++ b/pybloom/pybloom.py @@ -16,7 +16,7 @@ False >>> len(f) <= f.capacity True - >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate + >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18 True >>> from pybloom import ScalableBloomFilter @@ -29,7 +29,7 @@ True >>> len(sbf) <= count True 
- >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate + >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18 True """ @@ -42,8 +42,8 @@ except ImportError: raise ImportError('pybloom requires bitarray >= 0.3.4') -__version__ = '1.1' -__author__ = "Jay Baird , Bob Ippolito ,\ +__version__ = '2.0' +__author__ = "Jay Baird , Bob Ippolito ,\ Marius Eriksen ,\ Alex Brasetvik " @@ -111,12 +111,13 @@ def __init__(self, capacity, error_rate=0.001): raise ValueError("Error_Rate must be between 0 and 1.") if not capacity > 0: raise ValueError("Capacity must be > 0") - # given M = num_bits, k = num_slices, p = error_rate, n = capacity + # given M = num_bits, k = num_slices, P = error_rate, n = capacity + # k = log2(1/P) # solving for m = bits_per_slice # n ~= M * ((ln(2) ** 2) / abs(ln(P))) # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P))) # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2)) - num_slices = int(math.ceil(math.log(1 / error_rate, 2))) + num_slices = int(math.ceil(math.log(1.0 / error_rate, 2))) bits_per_slice = int(math.ceil( (capacity * abs(math.log(error_rate))) / (num_slices * (math.log(2) ** 2)))) @@ -337,13 +338,18 @@ def add(self, key): """ if key in self: return True - filter = self.filters[-1] if self.filters else None - if filter is None or filter.count >= filter.capacity: - num_filters = len(self.filters) + if not self.filters: filter = BloomFilter( - capacity=self.initial_capacity * (self.scale ** num_filters), - error_rate=self.error_rate * (self.ratio ** num_filters)) + capacity=self.initial_capacity, + error_rate=self.error_rate * (1.0 - self.ratio)) self.filters.append(filter) + else: + filter = self.filters[-1] + if filter.count >= filter.capacity: + filter = BloomFilter( + capacity=filter.capacity * self.scale, + error_rate=filter.error_rate * self.ratio) + self.filters.append(filter) filter.add(key, skip_check=True) return False diff --git a/setup.py b/setup.py index 40ac009..e9655a3 100644 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ 
from setuptools import setup, find_packages, Extension -VERSION = '1.0.3' +VERSION = '2.0.0' DESCRIPTION = "PyBloom: A Probabilistic data structure" LONG_DESCRIPTION = """ pybloom is a Python implementation of the bloom filter probabilistic data -structure. The module also provides a Scalable Bloom Filter that allows a +structure. The module also provides a Scalable Bloom Filter that allows a bloom filter to grow without knowing the original set size. """ CLASSIFIERS = filter(None, map(str.strip, -""" +""" Intended Audience :: Developers License :: OSI Approved :: MIT License Programming Language :: Python