Skip to content

Commit

Permalink
Bump to 2.0. Fix ScalableBloomFilter and make the tests resilient to …
Browse files Browse the repository at this point in the history
…FP error.
  • Loading branch information
etrepum committed Dec 26, 2012
1 parent dcb8e49 commit 64e42c8
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 58 deletions.
10 changes: 9 additions & 1 deletion CHANGES.txt
@@ -1,3 +1,11 @@
Changes in 2.0
==============
Made major corrections to the algorithms for both BloomFilter and
ScalableBloomFilter. Previously, BloomFilter was more accurate than
requested and ScalableBloomFilter was much less accurate than
requested. Because of these corrections, filters are not numerically
compatible with serialized representations from previous versions.

Changes in 1.1 Changes in 1.1
============== ==============
Added copy, intersection and union functions to BloomFilter Added copy, intersection and union functions to BloomFilter
9 changes: 5 additions & 4 deletions README.txt
Expand Up @@ -27,7 +27,7 @@ True
>>> f = BloomFilter(capacity=1000, error_rate=0.001) >>> f = BloomFilter(capacity=1000, error_rate=0.001)
>>> for i in xrange(0, f.capacity): >>> for i in xrange(0, f.capacity):
... _ = f.add(i) ... _ = f.add(i)
>>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True True


>>> from pybloom import ScalableBloomFilter >>> from pybloom import ScalableBloomFilter
Expand All @@ -36,8 +36,9 @@ True
>>> for i in xrange(0, count): >>> for i in xrange(0, count):
... _ = sbf.add(i) ... _ = sbf.add(i)
... ...
>>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True True


# len(sbf) may not equal the entire input length. 0.006% error is well # len(sbf) may not equal the entire input length. 0.01% error is well
# below the default 0.1% error threshold # below the default 0.1% error threshold. As the capacity goes up, the
# error will approach 0.1%.
3 changes: 1 addition & 2 deletions pybloom/__init__.py
@@ -1,5 +1,4 @@
"""pybloom """pybloom
""" """
from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__ from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__

77 changes: 40 additions & 37 deletions pybloom/benchmarks.py
Expand Up @@ -5,41 +5,44 @@
from pybloom import BloomFilter from pybloom import BloomFilter
import bitarray, math, time import bitarray, math, time


def main(): def main(capacity=100000, request_error_rate=0.1):
request_error_rate = 0.01 f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
f = BloomFilter(capacity=200000, error_rate=request_error_rate) assert (capacity == f.capacity)
start = time.time() start = time.time()
for i in xrange(0, f.capacity): for i in xrange(0, f.capacity):
f.add(i, skip_check=True) f.add(i, skip_check=True)
end = time.time() end = time.time()
print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( end-start, f.capacity/(end-start)) print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
oneBits = f.bitarray.count(True) end - start, f.capacity / (end - start))
zeroBits = f.bitarray.count(False) oneBits = f.bitarray.count(True)
#print "Number of 1 bits:", oneBits zeroBits = f.bitarray.count(False)
#print "Number of 0 bits:", zeroBits #print "Number of 1 bits:", oneBits
print "Number of Filter Bits:", f.num_bits #print "Number of 0 bits:", zeroBits
print "Number of slices:", f.num_slices print "Number of Filter Bits:", f.num_bits
print "Bits per slice:", f.bits_per_slice print "Number of slices:", f.num_slices
print "------" print "Bits per slice:", f.bits_per_slice
print "Fraction of 1 bits at capacity: {:5.3f}".format( oneBits / float(f.num_bits) ) print "------"
# Look for false positives and measure the actual fp rate print "Fraction of 1 bits at capacity: {:5.3f}".format(
trials = f.capacity oneBits / float(f.num_bits))
fp = 0 # Look for false positives and measure the actual fp rate
start = time.time() trials = f.capacity
for i in xrange(f.capacity, f.capacity+trials): fp = 0
if i in f: start = time.time()
fp += 1 for i in xrange(f.capacity, f.capacity + trials + 1):

This comment has been minimized.

Copy link
@glangford

glangford Dec 31, 2012

Contributor

The original code was correct:

for i in xrange(f.capacity, f.capacity + trials):

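The +1 here is incorrect: it makes the loop perform trials + 1 membership checks, while the experimental false positive rate is still computed as fp / float(trials).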

end = time.time() if i in f:
print "{:5.3f} seconds to check false positives, {:10.2f} checks/second".format(end-start, trials/(end-start)) fp += 1
print "Requested FP rate: {:2.4f}".format( request_error_rate ) end = time.time()
print "Experimental false positive rate: {:2.4f}".format( fp / float(trials)) print ("{:5.3f} seconds to check false positives, "
# Compute theoretical fp max (Goel/Gupta) "{:10.2f} checks/second".format(end - start, trials / (end - start)))
k = f.num_slices print "Requested FP rate: {:2.4f}".format(request_error_rate)
m = f.num_bits print "Experimental false positive rate: {:2.4f}".format(fp / float(trials))
n = f.capacity # Compute theoretical fp max (Goel/Gupta)
fp_theory = math.pow((1 - math.exp(-k * (n+0.5)/(m-1))), k) k = f.num_slices
print "Projected FP rate (Goel/Gupta): {:2.6f}".format( fp_theory ) m = f.num_bits
n = f.capacity
fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory)


if __name__ == '__main__' : if __name__ == '__main__' :
status = main() status = main()
sys.exit(status) sys.exit(status)
28 changes: 17 additions & 11 deletions pybloom/pybloom.py
Expand Up @@ -16,7 +16,7 @@
False False
>>> len(f) <= f.capacity >>> len(f) <= f.capacity
True True
>>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True True
>>> from pybloom import ScalableBloomFilter >>> from pybloom import ScalableBloomFilter
Expand All @@ -29,7 +29,7 @@
True True
>>> len(sbf) <= count >>> len(sbf) <= count
True True
>>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True True
""" """
Expand All @@ -42,8 +42,8 @@
except ImportError: except ImportError:
raise ImportError('pybloom requires bitarray >= 0.3.4') raise ImportError('pybloom requires bitarray >= 0.3.4')


__version__ = '1.1' __version__ = '2.0'
__author__ = "Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>,\ __author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
Marius Eriksen <marius@monkey.org>,\ Marius Eriksen <marius@monkey.org>,\
Alex Brasetvik <alex@brasetvik.com>" Alex Brasetvik <alex@brasetvik.com>"


Expand Down Expand Up @@ -111,12 +111,13 @@ def __init__(self, capacity, error_rate=0.001):
raise ValueError("Error_Rate must be between 0 and 1.") raise ValueError("Error_Rate must be between 0 and 1.")
if not capacity > 0: if not capacity > 0:
raise ValueError("Capacity must be > 0") raise ValueError("Capacity must be > 0")
# given M = num_bits, k = num_slices, p = error_rate, n = capacity # given M = num_bits, k = num_slices, P = error_rate, n = capacity
# k = log2(1/P)
# solving for m = bits_per_slice # solving for m = bits_per_slice
# n ~= M * ((ln(2) ** 2) / abs(ln(P))) # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
# n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P))) # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
# m ~= n * abs(ln(P)) / (k * (ln(2) ** 2)) # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
num_slices = int(math.ceil(math.log(1 / error_rate, 2))) num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
bits_per_slice = int(math.ceil( bits_per_slice = int(math.ceil(
(capacity * abs(math.log(error_rate))) / (capacity * abs(math.log(error_rate))) /
(num_slices * (math.log(2) ** 2)))) (num_slices * (math.log(2) ** 2))))
Expand Down Expand Up @@ -337,13 +338,18 @@ def add(self, key):
""" """
if key in self: if key in self:
return True return True
filter = self.filters[-1] if self.filters else None if not self.filters:
if filter is None or filter.count >= filter.capacity:
num_filters = len(self.filters)
filter = BloomFilter( filter = BloomFilter(
capacity=self.initial_capacity * (self.scale ** num_filters), capacity=self.initial_capacity,
error_rate=self.error_rate * (self.ratio ** num_filters)) error_rate=self.error_rate * (1.0 - self.ratio))
self.filters.append(filter) self.filters.append(filter)
else:
filter = self.filters[-1]
if filter.count >= filter.capacity:
filter = BloomFilter(
capacity=filter.capacity * self.scale,
error_rate=filter.error_rate * self.ratio)
self.filters.append(filter)
filter.add(key, skip_check=True) filter.add(key, skip_check=True)
return False return False


Expand Down
6 changes: 3 additions & 3 deletions setup.py
Expand Up @@ -6,16 +6,16 @@


from setuptools import setup, find_packages, Extension from setuptools import setup, find_packages, Extension


VERSION = '1.0.3' VERSION = '2.0.0'
DESCRIPTION = "PyBloom: A Probabilistic data structure" DESCRIPTION = "PyBloom: A Probabilistic data structure"
LONG_DESCRIPTION = """ LONG_DESCRIPTION = """
pybloom is a Python implementation of the bloom filter probabilistic data pybloom is a Python implementation of the bloom filter probabilistic data
structure. The module also provides a Scalable Bloom Filter that allows a structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size. bloom filter to grow without knowing the original set size.
""" """


CLASSIFIERS = filter(None, map(str.strip, CLASSIFIERS = filter(None, map(str.strip,
""" """
Intended Audience :: Developers Intended Audience :: Developers
License :: OSI Approved :: MIT License License :: OSI Approved :: MIT License
Programming Language :: Python Programming Language :: Python
Expand Down

0 comments on commit 64e42c8

Please sign in to comment.