# SDSC3001 - Course Project

In [2]:
import random
import math
import numpy as np
from sklearn.metrics import jaccard_score

In [3]:
def hash_to_uniform(value, seed=42):
    random.seed(value + seed)
    return random.random()  # Returns a uniform value in (0,1)

In [4]:
def compute_log_rank(value, seed=0):
    h_value = hash_to_uniform(value, seed)
    return math.floor(-math.log2(h_value))


In [5]:
class MaxLogHashSketch:
    def __init__(self, k, max_bits=6):
        self.k = k  # Number of hash functions/registers
        self.max_bits = max_bits
        self.mu = [0] * k  # Stores the maximum log-rank for each register
        self.s = [1] * k  # Stores the uniqueness indicator for each register

    def update(self, value):
        for i in range(self.k):
            log_rank = compute_log_rank(value, seed=i)
            if log_rank > self.mu[i]:
                self.mu[i] = log_rank
                self.s[i] = 1  # New max log-rank, so set indicator to 1
            elif log_rank == self.mu[i]:
                self.s[i] = 0  # Duplicate max log-rank, set indicator to 0


In [6]:
def estimate_jaccard_similarity(sketch1, sketch2):
    k = sketch1.k
    matching_registers = 0

    for i in range(k):
        if sketch1.mu[i] != sketch2.mu[i]:
            continue  # Different max log-ranks, not a match
        if sketch1.s[i] == 1 and sketch2.s[i] == 1:
            matching_registers += 1  # Match found for both mu and s

    return matching_registers / k  # Jaccard similarity estimate


In [7]:
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    return intersection / union

In [None]:
# Example sets
set_a: set[int] = {1, 2, 3, 4, 5, 6, 7, 8}
set_b: set[int] = {5, 6, 7, 8, 9, 10, 11, 12}

In [9]:
# Generate sketches for each set
sketch_a = MaxLogHashSketch(k=128)
sketch_b = MaxLogHashSketch(k=128)

for element in set_a:
    sketch_a.update(element)

for element in set_b:
    sketch_b.update(element)

# Estimate Jaccard similarity
jaccard_estimated = estimate_jaccard_similarity(sketch_a, sketch_b)
print(f"Estimated Jaccard Similarity: {jaccard_estimated}")

Estimated Jaccard Similarity: 0.1640625


In [10]:
jaccard_true = jaccard_similarity(set_a, set_b)
print(f"Actual Jaccard Similarity: {jaccard_true}")

Actual Jaccard Similarity: 0.3333333333333333
