# Exercise 2: Bloom filter

In [4]:
# Implement a Bloom filter using bitarray
import bitarray as bit
from hashlib import sha3_256, sha256, blake2b

class BloomFilter:
    def __init__(self, size, hashes):
        self.size = size # Specify filter size
        self.hashes = hashes # Specify desired hash functions
        self.bitarray = bit.bitarray(size) # Initialize bitarray with defined size

    # Define hash functions
    def my_hash(self, value):
        return int(sha256(value.lower().encode()).hexdigest(), 16) % self.size

    def my_hash2(self, value):
        return int(blake2b(value.lower().encode()).hexdigest(), 16) % self.size

    def my_hash3(self, value):
        return int(sha3_256(value.lower().encode()).hexdigest(), 16) % self.size 
    
    # Add a value to the Bloom filter
    def add(self, value):
        for hash in self.hashes:
            hashed = hash(self, value) # Access hash functions as instance methods for flexibility to choose combinations
            index = hashed % self.size # Convert hashed value to index
            self.bitarray[index] = 1 # Set index position in bitarray to 1

    # Check if a value is in the Bloom filter (may return false positives)
    def lookup(self, value):
        for hash in self.hashes:
            hashed = hash(self, value) # Access hash functions as instance methods
            index = hashed % self.size # Convert hashed value to index
            if self.bitarray[index] == 0: # Check if index position in bitarray is occupied
                return False
        return True

In [8]:
# Test the Bloom filter
add_test = BloomFilter(size = 100, hashes = [BloomFilter.my_hash, BloomFilter.my_hash2, BloomFilter.my_hash3])
add_test.add("Hello!")
print(add_test.lookup("Hello!"))   # True (potentially a false positive)
print(add_test.lookup("Goodbye!")) # False

True
False


In [10]:
# Store words.txt into Bloom filter (assuming all words are correctly spelled)
# Note: To avoid hash collisions, the absolute minimum for a corpus of 500,000 words with 3 hash functions is 150,000 bits.
# For a starting point, at least 1,000,000 bits is recommended; 10,000,000 bits to be safe.
words = BloomFilter(size = 10_000_000, hashes = [BloomFilter.my_hash, BloomFilter.my_hash2, BloomFilter.my_hash3])

with open("words.txt") as file:
    # Read word list one at a time
    for line in file:
        word = line.strip()
        words.add(word)

# Test the filter
print(words.lookup("cb&b634"))       # False
print(words.lookup("computational")) # True
print(words.lookup("methods"))       # True
print(words.lookup("for"))           # True
print(words.lookup("informatics"))   # True

False
True
True
True
True


In [45]:
# Bloom filter spell checker
# Note: May return more false positives (suggesting a word not in the list) with smaller filters and fewer hash functions.
def spell_checker(word, BloomFilter):
    # If the word is in the specified Bloom filter, it is correctly spelled
    if BloomFilter.lookup(word):
        return(f"'{word}' is spelled correctly.")
    
    # If not, generate all possible single-letter substitutions and test them against Bloom filter
    word_suggestions = []
    for i in range(len(word)):
        for letter in "abcdefghijklmnopqrstuvwxyz":
            candidate = word[:i] + letter + word[i+1:] # Slice word to test every letter at every character position
            if BloomFilter.lookup(candidate):
                word_suggestions.append(candidate)

    # Return candidate words
    return word_suggestions

print(spell_checker("bruh", words))
print(spell_checker("moment", words))

['pruh', 'bluh', 'brum', 'brut']
'moment' is spelled correctly.


In [64]:
# Test spell checker on typos.json
import json

def accuracy_checker(BloomFilter):
    # Load list of typos
    with open("typos.json") as file:
        typos = json.load(file) # Creates list of [typed_word, correct_word] pairs
    correct_count = 0
    
    # Check if correct word is produced by the specified Bloom filter and that it gives no more than 3 suggestions
    for typo in typos:
        if (typo[1] in spell_checker(typo[0], BloomFilter)) and (len(spell_checker(typo[0], BloomFilter)) <= 3):
            correct_count = correct_count + 1

    # Return spell checker accuracy
    return(correct_count / len(typos))

print("Suggestion accuracy:", round(accuracy_checker(words) * 100, 3), "%")


Suggestion accuracy: 45.8 %


In [62]:
# Plot filter size against hash function choice (first, first two, all three)
import numpy as np
import pandas as pd

sizes = list(np.linspace(start = 1e5, stop = 1e7, num = 10, dtype = int)) # Generate evenly spaced Bloom filter sizes
accuracies = []
for size in sizes:
    one_hash_filter = BloomFilter(size = size, hashes = [BloomFilter.my_hash])
    accuracies.append()

    two_hash_filter = BloomFilter(size = size, hashes = [BloomFilter.my_hash, BloomFilter.my_hash2])
    three_hash_filter = BloomFilter(size = size, hashes = [BloomFilter.my_hash, BloomFilter.my_hash2, BloomFilter.my_hash3])

one_hash_results = pd.DataFrame({"filter_size": sizes, "accuracy": accuracies})
two_hash_results = pd.DataFrame({"filter_size": sizes, "accuracy": accuracies})
three_hash_results = pd.DataFrame({"filter_size": sizes, "accuracy": accuracies})


array([ 100000.,  200000.,  300000.,  400000.,  500000.,  600000.,
        700000.,  800000.,  900000., 1000000., 1100000., 1200000.,
       1300000., 1400000., 1500000., 1600000., 1700000., 1800000.,
       1900000., 2000000., 2100000., 2200000., 2300000., 2400000.,
       2500000., 2600000., 2700000., 2800000., 2900000., 3000000.,
       3100000., 3200000., 3300000., 3400000., 3500000., 3600000.,
       3700000., 3800000., 3900000., 4000000., 4100000., 4200000.,
       4300000., 4400000., 4500000., 4600000., 4700000., 4800000.,
       4900000., 5000000., 5100000., 5200000., 5300000., 5400000.,
       5500000., 5600000., 5700000., 5800000., 5900000., 6000000.,
       6100000., 6200000., 6300000., 6400000., 6500000., 6600000.,
       6700000., 6800000., 6900000., 7000000., 7100000., 7200000.,
       7300000., 7400000., 7500000., 7600000., 7700000., 7800000.,
       7900000., 8000000., 8100000., 8200000., 8300000., 8400000.,
       8500000., 8600000., 8700000., 8800000., 8900000., 90000