Skip to content

Commit

Permalink
Import, fix, cleanup, extend.
Browse files Browse the repository at this point in the history
  • Loading branch information
igrigorik committed Dec 27, 2008
0 parents commit dffe7c3
Show file tree
Hide file tree
Showing 9 changed files with 516 additions and 0 deletions.
51 changes: 51 additions & 0 deletions README.rdoc
@@ -0,0 +1,51 @@
= BloomFilter

Scalable Bloom Filter implemented in Ruby.

A Bloom filter is a space-efficient probabilistic data structure that is used to
test whether an element is a member of a set. False positives are possible, but
false negatives are not. For more detail, see: http://en.wikipedia.org/wiki/Bloom_filter

== Implementation

Instead of using k different hash functions, this implementation seeds the CRC32 hash
with k different initial values (0, 1, ..., k-1). This may or may not give you a good
distribution; it all depends on the data.

== Example

require 'bloomfilter'

# M (size of bit array)
# K (number of hash functions)
# R (random seed)
# e.g. M = 100000000, K = 4, R = 1

# M, K, R
bf = BloomFilter.new(10, 2, 1)
bf.insert("test")
bf.include?("test")
=> true
bf.include?("test2")
=> false
bf.insert("test2")
bf.include?("test2")
=> true
bf.stats
Number of filter bits (m): 10
Number of filter elements (n): 2
Number of filter hashes (k) : 2
Predicted false positive rate = 10.87%


== Configuring Bloom Filter

Performance of the Bloom filter depends on a number of variables:
- size of the bit array
- number of hash functions

To figure out the values for these parameters, refer to:
http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/

== Credits
Tatsuya Mori <valdzone@gmail.com> (Original: http://vald.x0.com/sb/)
Ilya Grigorik <ilya@igvita.com> (Fix, cleanup, update)
52 changes: 52 additions & 0 deletions Rakefile
@@ -0,0 +1,52 @@
require 'rake'
require 'rake/clean'
require 'rake/rdoctask'
require 'rake/gempackagetask'
require 'fileutils'
include FileUtils

# Running bare `rake` is the same as running `rake compile`.
task :default => [:compile]

# Runs `make` with +makedir+ as the working directory. The block form of
# Dir.chdir switches back to the previous directory afterwards.
def make(makedir)
  Dir.chdir(makedir) do
    sh 'make'
  end
end

# Runs extconf.rb with the Ruby interpreter from inside +dir+. The block form
# of Dir.chdir switches back to the previous directory afterwards.
def extconf(dir)
  Dir.chdir(dir) do
    ruby "extconf.rb"
  end
end

# Declares the Rake tasks that configure, build and install one C extension.
#
# dir       - subdirectory of ext/ that holds the sources; "" means ext/ itself
# extension - extension name; used for the task name and the shared object name
#
# Tasks declared:
#   <extension>                - depends on the Makefile and the shared object
#   <ext>/Makefile             - generated by running extconf.rb
#   <ext>/<extension>.<DLEXT>  - runs make, then copies the result into lib/
#   lib                        - directory task for the install target
def setup_extension(dir, extension)
  # Avoid "ext//..." paths when the sources sit directly in ext/.
  ext = dir.to_s.empty? ? "ext" : "ext/#{dir}"
  # RbConfig replaces the deprecated (and later removed) top-level Config.
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    "lib"
  ]

  # Declared as a real directory task: it creates lib/ when missing and never
  # counts as newer than the shared object, so it does not force a rebuild.
  directory "lib"

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so]

  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    extconf ext
  end

  file ext_so => ext_files do
    make ext
    cp ext_so, "lib"
  end
end

# Register the build tasks for the extension whose sources live directly in ext/.
setup_extension("", "sbloomfilter")

# :compile is the umbrella build task; it depends on the sbloomfilter task.
task :compile => :sbloomfilter

# Patterns added to the CLEAN file list.
CLEAN.include %w[build/* **/*.o **/*.so **/*.a **/*.log pkg ext/Makefile]
54 changes: 54 additions & 0 deletions examples/bf.rb
@@ -0,0 +1,54 @@
#!/usr/bin/env ruby
require 'bitset'
require 'zlib'

#
# Pure ruby implementation of a Bloom filter, just for kicks
#

# Small Bloom filter backed by a BitSet. Each key is mapped to bit positions
# by running CRC32 over it with consecutive starting values.
class BloomFilter
  # max_entries - number of bits in the filter (converted with to_i)
  # num_hashes  - how many CRC32 rounds are run per key
  # seed        - offset added to the round index to form the CRC32 start value
  def initialize(max_entries, num_hashes, seed)
    @seed = seed
    @num_hashes = num_hashes
    @size = max_entries.to_i
    @bitmap = BitSet.new(@size)
    # Scratch bit set that make_mask clears and refills on every call.
    @__mask = BitSet.new(@size)
  end

  # ORs the key's mask into the filter; returns the updated bitmap.
  def insert(key)
    @bitmap = @bitmap | make_mask(key)
  end

  # True when at least one bit of the key's mask is not yet set in the filter.
  def new?(key)
    probe = make_mask(key)
    (@bitmap & probe) != probe
  end

  # Builds the key's bit mask in the shared scratch bit set and returns it.
  # The returned object is overwritten by the next call.
  def make_mask(key)
    @__mask.clear
    @num_hashes.to_i.times do |round|
      position = Zlib.crc32(key, round + @seed) % @size
      @__mask.set(position, 1)
    end
    @__mask
  end
end


# Reads lines from ARGF (files named on the command line, or stdin), counts
# the lines the Bloom filter reports as not seen before, and prints the count.
def main
  bf = BloomFilter.new(1000000, 4, 0)
  num = 0
  ARGF.each_line do |line|
    # chomp strips only the line terminator; chop would also eat a real
    # character from a final line that has no trailing newline.
    data = line.chomp

    # BloomFilter exposes new?, not new_entry?.
    if bf.new?(data)
      num += 1
      bf.insert(data)
    end
  end
  print "#element = #{num}\n"
end

# Script entry point: run the counter over the input supplied via ARGF.
main
24 changes: 24 additions & 0 deletions examples/simple.rb
@@ -0,0 +1,24 @@
#!/usr/bin/env ruby
require 'bloomfilter'

# Words stored in the filter, and the words probed afterwards.
WORDS = %w[duck penguin bear panda]
TEST = %w[penguin moose racooon]

# Filter parameters: m = 100, k = 4, seed = 1
bf = BloomFilter.new(100, 4, 1)

WORDS.each do |word|
  bf.insert(word)
end

# Print the membership answer for every probe word.
TEST.each { |candidate| puts "#{candidate}: #{bf.include?(candidate)}" }

bf.stats

# penguin: true
# moose: false
# racooon: false
#
# Number of filter bits (m): 100
# Number of filter elements (n): 4
# Number of filter hashes (k) : 4
# Predicted false positive rate = 0.05%
32 changes: 32 additions & 0 deletions ext/crc32.c
@@ -0,0 +1,32 @@
/* simple CRC32 code */
/*
* Copyright 2005 Aris Adamantiadis
*
* This file is part of the SSH Library
*
* The SSH Library is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at your
* option) any later version.
*
*
* The SSH Library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with the SSH Library; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
* MA 02111-1307, USA. */

#include "crc32.h"

/*
 * Folds `len` bytes starting at `buf` into the running value `crc` and
 * returns the result. Each byte costs one crc_table lookup: the low 8 bits
 * of (crc XOR byte) select the entry, which is XORed with crc shifted right
 * by 8. The value is used exactly as passed in and returned without any
 * final inversion. A `len` of zero or less returns `crc` unchanged.
 */
unsigned int crc32(unsigned int crc, char *buf, int len) {
  int i;

  for (i = 0; i < len; i++) {
    crc = crc_table[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);
  }
  return crc;
}
78 changes: 78 additions & 0 deletions ext/crc32.h
@@ -0,0 +1,78 @@
/* simple CRC32 code */
/*
* Copyright 2005 Aris Adamantiadis
*
* This file is part of the SSH Library
*
* The SSH Library is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at your
* option) any later version.
*
*
* The SSH Library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with the SSH Library; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
* MA 02111-1307, USA. */

/*
 * 256-entry lookup table for the byte-at-a-time update done by crc32();
 * the index is the low 8 bits of (crc XOR input byte).
 * Entry 128 is 0xedb88320, which matches the table of the reflected CRC-32
 * polynomial -- presumably the standard CRC-32 table; verify before relying
 * on interoperability with other CRC-32 implementations.
 * Because the array is `static` in a header, every translation unit that
 * includes this file gets its own private copy.
 */
static unsigned int crc_table[] = {
0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
0x2d02ef8dUL
};

/*
 * Updates the running CRC value `crc` with `len` bytes read from `buf` and
 * returns the new value. The caller chooses the starting value of `crc`.
 */
unsigned int crc32(unsigned int crc, char *buf, int len);
4 changes: 4 additions & 0 deletions ext/extconf.rb
@@ -0,0 +1,4 @@
#!/usr/bin/env ruby
require "mkmf"

# Generate the Makefile for the extension named below.
extension_name = "sbloomfilter"
create_makefile(extension_name)

0 comments on commit dffe7c3

Please sign in to comment.