# Exercise 2: Bloom filter

In [36]:
# Implement a Bloom filter using bitarray
import bitarray as bit
from hashlib import sha3_256, sha256, blake2b

class BloomFilter:
    def __init__(self, size, hashes):
        self.size = size # Specify filter size
        self.hashes = hashes # Specify desired hash functions
        self.bitarray = bit.bitarray(size) # Initialize bitarray with defined size
        self.bitarray.setall(0) # Set all values in the bitarray to zero

    # Define generic hash function, which returns the bitarray index
    def get_hash(self, value, hash):
        return int(hash(value.lower().encode()).hexdigest(), 16) % self.size
    
    # Add a value to the Bloom filter
    def add(self, value):
        for hash in self.hashes: # Apply value through selected hash functions
            index = self.get_hash(value, hash) # Get hashed value to bitarray index
            self.bitarray[index] = 1 # Set corresponding position in bitarray to 1 (occupied)

    # Check if a value is in the Bloom filter (may return false positives)
    def lookup(self, value):
        for hash in self.hashes:
            index = self.get_hash(value, hash)
            if self.bitarray[index] == 0: # If index position in bitarray is empty, the word is not there
                return False
        return True

In [37]:
# Test the Bloom filter
hi = BloomFilter(size = 100, hashes = [sha256])
hi.add("Hello!")
print(hi.lookup("Hello!"))   # True
print(hi.lookup("Goodbye!")) # False

bruh = BloomFilter(size = 10_000, hashes = [sha256, blake2b, sha3_256])
print(bruh.lookup("bruh")) # False
bruh.add("bruh")
print(bruh.lookup("bruh")) # True

True
False
False
True


In [38]:
# Store words.txt (~500,000 words, assuming correct spelling) into a Bloom filter
# Note: To avoid hash collisions, the absolute minimum for a corpus of 500,000 words with 3 hash functions is 150,000 bits.
# For a starting point, at least 1,000,000 bits is recommended; 10,000,000 bits to be safe.
corpus = BloomFilter(size = 100_000_000, hashes = [sha256, blake2b, sha3_256])

with open("words.txt") as file:
    for line in file:
        word = line.strip()
        corpus.add(word)

# Test the primed Bloom filter
print(corpus.lookup("cb&b634"))       # False
print(corpus.lookup("computational")) # True
print(corpus.lookup("methods"))       # True
print(corpus.lookup("for"))           # True
print(corpus.lookup("informatics"))   # True
print(corpus.lookup("robert"))        # True
print(corpus.lookup("mcdougal"))      # True (huh?)

False
True
True
True
True
True
True


In [43]:
# Create a spell checker using the Bloom filter class
# Note: May return more false positives (e.g. suggesting a word not in the list) with smaller filters and fewer hash functions.
def spell_checker(word, BloomFilter):
    # If the word is in the specified Bloom filter, it may be correctly spelled
    if BloomFilter.lookup(word):
        return(f"'{word}' may be spelled correctly.")
    
    # If not, generate all possible single-letter substitutions and test them against Bloom filter
    word_suggestions = []
    for char in range(len(word)):
        for letter in "abcdefghijklmnopqrstuvwxyz":
            candidate = word[:char] + letter + word[char+1:] # Slice word to test every letter at every character position
            if BloomFilter.lookup(candidate): # Suggest candidate if it is in the Bloom filter
                word_suggestions.append(candidate)

    # Return candidate words
    return word_suggestions

print(spell_checker("bruh", corpus))   # ['pruh', 'bluh', 'brum', 'brut'] (non-deterministic for some reason? shouldn't a hash be deterministic?)
print(spell_checker("moment", corpus)) # 'moment' may be spelled correctly.

['pruh', 'bluh', 'brum', 'brut']
'moment' may be spelled correctly.


In [47]:
# Test spell checker on typos.json
import json

def accuracy_checker(BloomFilter):
    # Load list of typos
    with open("typos.json") as file:
        typos = json.load(file) # Creates list of [typed_word, correct_word] pairs

    # Check if correct word is produced by the specified Bloom filter and that it gives no more than 3 suggestions
    correct_count = 0
    for typo in typos:
        suggestions = spell_checker(typo[0], BloomFilter)
        if (typo[1] in suggestions) and (len(suggestions) <= 3):
            correct_count += 1

    # Return spell checker accuracy
    return(correct_count / len(typos))

In [None]:
# Test spell checker accuracy
corpus = BloomFilter(size = 1_000_000_000, hashes = [sha256, blake2b, sha3_256])
with open("words.txt") as file:
    for line in file:
        word = line.strip()
        corpus.add(word)

print("Suggestion accuracy:", round(accuracy_checker(corpus) * 100, 3), "%")

Suggestion accuracy: 47.404 %


In [14]:
# Plot filter size against hash function choice (first, first two, all three)
import numpy as np
import pandas as pd

sizes = list(np.linspace(start = 10_000, stop = 100_000_000, num = 10, dtype = int)) # Generate evenly spaced Bloom filter sizes

accuracies = []
for size in sizes:
    one_hash_filter = BloomFilter(size = size, hashes = [sha256])
    accuracy = accuracy_checker(one_hash_filter)
    accuracies.append(accuracy)

one_hash_results = pd.DataFrame({"filter_size": sizes, "accuracy": accuracies})

accuracies = []
for size in sizes:
    two_hash_filter = BloomFilter(size = size, hashes = [sha256, blake2b])
    accuracy = accuracy_checker(two_hash_filter)
    accuracies.append(accuracy)

two_hash_results = pd.DataFrame({"filter_size": sizes, "accuracy": accuracies})

accuracies = []
for size in sizes:
    three_hash_filter = BloomFilter(size = size, hashes = [sha256, blake2b, sha3_256])
    accuracy = accuracy_checker(three_hash_filter)
    accuracies.append(accuracy)

three_hash_results = pd.DataFrame({"filter_size": sizes, "accuracy": accuracies})

one_hash_results
two_hash_results
three_hash_results

# accuracy percentage on y-axis
# size on x-axis
# three lines for each hash choice


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/kevin/Library/Python/3.11/lib/python/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/rn/tstwsd3n5fs4ynm3vrms9khw0000gn/T/ipykernel_29398/262557583.py", line 10, in <module>
    accuracy = accuracy_checker(one_hash_filter)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/rn/tstwsd3n5fs4ynm3vrms9khw0000gn/T/ipykernel_29398/3353725917.py", line 12, in accuracy_checker
    if (typo[1] in spell_checker(typo[0], BloomFilter)) and (len(spell_checker(typo[0], BloomFilter)) <= 3):
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/rn/tstwsd3n5fs4ynm3vrms9khw0000gn/T/ipykernel_29398/2038068952.py", line -1, in spell_checker
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/kevin/Library/Python/3.11/lib/python/site-pac

In [None]:
# plot = sns.lineplot(x = tree_sizes, y = setup_times, label = "Tree setup time")
# plot.set(xscale = "log", yscale = "log")
# sns.lineplot(x = tree_sizes, y = o_n, label = "O(n)", ax = plot)
# sns.lineplot(x = tree_sizes, y = o_n_squared, label = "O(n^2)", ax = plot)
# plot.set_xlabel("n (tree size)")
# plot.set_ylabel("log(time)")
# plot.set_title("Time Complexity Plot of Tree Instantiation Performance")