# Exercise 2: Bloom filter

In [4]:
# Implement a Bloom filter using bitarray
import bitarray as bit
from hashlib import sha3_256, sha256, blake2b

class BloomFilter:
    """Fixed-size Bloom filter backed by a ``bitarray``.

    A Bloom filter answers "possibly present" or "definitely absent":
    ``lookup`` never returns False for a value that was added, but it may
    return True for a value that was never added (a false positive).

    Args:
        size: Number of bits in the filter; must be positive.
        hashes: Sequence of callables invoked as ``fn(filter, value)`` and
            returning an int, e.g.
            ``[BloomFilter.my_hash, BloomFilter.my_hash2]``. Passing the
            unbound methods lets callers choose any combination.

    Raises:
        ValueError: If ``size`` is not positive.
    """

    def __init__(self, size, hashes):
        if size <= 0:
            # Every index is computed modulo size, so zero would only fail
            # later with an unhelpful ZeroDivisionError.
            raise ValueError("size must be a positive number of bits")
        self.size = size  # Number of bits in the filter
        self.hashes = hashes  # Hash functions applied to every value
        self.bitarray = bit.bitarray(size)
        # bitarray(n) has historically returned *uninitialized* bits; clear
        # them explicitly so leftover 1s cannot cause spurious positives.
        self.bitarray.setall(0)

    def _digest_index(self, hasher, value):
        """Hash the lower-cased ``value`` with ``hasher`` and reduce it to a bit index."""
        digest = hasher(value.lower().encode()).digest()
        # Big-endian interpretation equals int(hexdigest, 16).
        return int.from_bytes(digest, "big") % self.size

    # Built-in hash functions (all case-insensitive)
    def my_hash(self, value):
        """Return the SHA-256 based bit index for ``value``."""
        return self._digest_index(sha256, value)

    def my_hash2(self, value):
        """Return the BLAKE2b based bit index for ``value``."""
        return self._digest_index(blake2b, value)

    def my_hash3(self, value):
        """Return the SHA3-256 based bit index for ``value``."""
        return self._digest_index(sha3_256, value)

    def add(self, value):
        """Insert ``value`` by setting one bit per configured hash function."""
        for hash_fn in self.hashes:
            # Reduce modulo size here as well, so user-supplied hash
            # functions may return arbitrary integers.
            self.bitarray[hash_fn(self, value) % self.size] = 1

    def lookup(self, value):
        """Return True if ``value`` may have been added, False if it definitely was not."""
        return all(
            self.bitarray[hash_fn(self, value) % self.size]
            for hash_fn in self.hashes
        )

In [8]:
# Smoke-test the Bloom filter on a tiny 100-bit instance using all three hashes
all_hashes = [BloomFilter.my_hash, BloomFilter.my_hash2, BloomFilter.my_hash3]
add_test = BloomFilter(size=100, hashes=all_hashes)
add_test.add("Hello!")

# "Hello!" was added, so its lookup is always True (a Bloom filter has no false
# negatives). "Goodbye!" was never added, so False is expected; a True here
# would be a false positive.
for probe in ("Hello!", "Goodbye!"):
    print(add_test.lookup(probe))

True
False


In [10]:
# Store words.txt into Bloom filter (assuming all words are correctly spelled)
# Sizing note: collisions cannot be avoided entirely, only made unlikely. For n
# stored words, k hash functions and m bits, the expected false-positive rate is
# about (1 - exp(-k*n/m))**k. Taking the corpus as ~500,000 words with k = 3,
# that is roughly 47% at 1,000,000 bits and about 0.3% at 10,000,000 bits.
words = BloomFilter(size = 10_000_000, hashes = [BloomFilter.my_hash, BloomFilter.my_hash2, BloomFilter.my_hash3])

with open('words.txt') as file:
    # Read word list one line at a time
    for line in file:
        word = line.strip()
        if word:  # Skip blank lines so the empty string is never stored as a "word"
            words.add(word)

# Test the filter
print(words.lookup("cb&b634"))       # False
print(words.lookup("computational")) # True
print(words.lookup("methods"))       # True
print(words.lookup("for"))           # True
print(words.lookup("informatics"))   # True

False
True
True
True
True


In [16]:
# Bloom filter spell checker
def spell_checker(word):
    """Check ``word`` against the module-level ``words`` Bloom filter.

    Returns a confirmation string when the filter reports the word as present.
    Otherwise returns a list of every single-letter substitution (a-z at each
    position) that the filter reports as present; the list may be empty.

    Because the filter can give false positives, a suggestion is not guaranteed
    to be a real word; this is more likely with smaller filters and fewer hash
    functions.
    """
    # A hit in the filter is treated as "correctly spelled"
    if words.lookup(word):
        return f"'{word}' is spelled correctly."

    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # Lazily enumerate substitutions, position by position, letter by letter
    substitutions = (
        word[:pos] + letter + word[pos + 1:]
        for pos in range(len(word))
        for letter in alphabet
    )
    # Keep only the candidates the filter recognises, in generation order
    return [option for option in substitutions if words.lookup(option)]

# Run the checker on two sample inputs and show what it returns for each
for sample in ("bruh", "moment"):
    print(spell_checker(sample))

False
True


In [None]:
# Plot filter size against hash function choice (first, first two, all three)
# Vary size by log scale, perhaps with 100 points