# SDSC3001 - Course Project

## Jaccard similarity coefficient

In [1]:
def jaccard_similarity(set1, set2):
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)


# Example usage
set1 = {"a", "b", "c", "d"}
set2 = {"c", "d", "e", "f"}

similarity = jaccard_similarity(set1, set2)
print(f"Jaccard Similarity: {similarity}")

Jaccard Similarity: 0.3333333333333333


## Sketching techniques for the Jaccard similarity coefficient

In [2]:
import math
import mmh3
import numpy as np
import polars as pl
import random


random_seed = 42


def generate_synthetic_stream(cardinality, jaccard_true):
    total_num = cardinality * 2
    sim = (2 * jaccard_true) / (1 + jaccard_true)
    the_same_index = total_num / 2 * sim
    setA_uni_index = total_num / 2 * 1
    setB_uni_index = total_num / 2 * (2 - sim)

    stream = []
    for num in range(total_num):
        if num <= the_same_index:
            stream.append(["setA", num])
            stream.append(["setB", num])
        elif num <= setA_uni_index:
            stream.append(["setA", num])
        elif num <= setB_uni_index:
            stream.append(["setB", num])
        else:
            break
    return stream

### MinHash

In [3]:
class MinHash:
    def __init__(self, k, random_seed=random_seed):
        """
        Initialize MinHash
        k: number of hash functions
        random_seed: random seed for reproducibility
        """
        self.k = k
        self.seed = random_seed
        self.totalShingles = (1 << 32) - 1
        self.minHashSignatures = {}  # Store signatures for each set
        self.randomNoA = self._hash_parameter()
        self.randomNoB = self._hash_parameter()

    def _hash_parameter(self):
        """Generate random hash parameters"""
        randList = []
        k_temp = self.k
        randIndex = random.randint(0, self.totalShingles - 1)
        randList.append(randIndex)
        while k_temp > 0:
            while randIndex in randList:
                randIndex = random.randint(0, self.totalShingles - 1)
            randList.append(randIndex)
            k_temp -= 1
        return randList

    def process_stream(self, stream):
        """
        Process streaming data
        stream: list of [set_id, element] pairs
        """
        for item in stream:
            set_id, element = item[0], item[1]

            # Initialize signature if not exists
            if set_id not in self.minHashSignatures:
                self.minHashSignatures[set_id] = [float("inf")] * self.k

            # Update minimum hash values
            for i in range(self.k):
                hash_value = (
                    self.randomNoA[i] * mmh3.hash(str(element), self.seed) + self.randomNoB[i]
                ) % self.totalShingles
                self.minHashSignatures[set_id][i] = min(self.minHashSignatures[set_id][i], hash_value)

    def estimate_similarity(self, setA="setA", setB="setB"):
        """
        Estimate Jaccard similarity between two sets
        setA, setB: identifiers of the sets to compare
        """
        if setA not in self.minHashSignatures or setB not in self.minHashSignatures:
            raise ValueError("Sets not found in signatures")

        # Count matching signatures
        matches = sum(1 for i in range(self.k) if self.minHashSignatures[setA][i] == self.minHashSignatures[setB][i])

        # Estimate Jaccard similarity
        return matches / self.k

In [4]:
if __name__ == "__main__":
    # Parameters
    k = 128
    cardinality = 10000
    jaccard_true = 0.9

    # Generate synthetic stream
    stream = generate_synthetic_stream(cardinality, jaccard_true)

    # Create MinHash instance and process stream
    minhash = MinHash(k)
    minhash.process_stream(stream)

    # Estimate Jaccard similarity
    jaccard_est = minhash.estimate_similarity()
    print(f"True Jaccard: {jaccard_true}, Estimated Jaccard: {jaccard_est}")

True Jaccard: 0.9, Estimated Jaccard: 0.90625


### b-bit MinHash

In [5]:
class BbitMinHash:
    def __init__(self, k, b, random_seed=random_seed):
        """
        Initialize b-bit MinHash
        k: number of hash functions
        b: number of bits to keep from each hash value
        random_seed: random seed for reproducibility
        """
        self.k = k
        self.b = b
        self.seed = random_seed
        self.totalShingles = (1 << 32) - 1
        self.minHashSignatures = {}  # Store original MinHash signatures
        self.bbitSignatures = {}  # Store b-bit signatures
        self.randomNoA = self._hash_parameter()
        self.randomNoB = self._hash_parameter()
        self.mask = (1 << b) - 1  # Mask for getting lowest b bits

    def _hash_parameter(self):
        """Generate random hash parameters"""
        randList = []
        k_temp = self.k
        randIndex = random.randint(0, self.totalShingles - 1)
        randList.append(randIndex)
        while k_temp > 0:
            while randIndex in randList:
                randIndex = random.randint(0, self.totalShingles - 1)
            randList.append(randIndex)
            k_temp -= 1
        return randList

    def _get_lowest_b_bits(self, value):
        """Extract lowest b bits from a value"""
        return value & self.mask

    def process_stream(self, stream):
        """
        Process streaming data
        stream: list of [set_id, element] pairs
        """
        # First compute regular MinHash signatures
        for item in stream:
            set_id, element = item[0], item[1]

            # Initialize signature if not exists
            if set_id not in self.minHashSignatures:
                self.minHashSignatures[set_id] = [float("inf")] * self.k

            # Update minimum hash values
            for i in range(self.k):
                hash_value = (
                    self.randomNoA[i] * mmh3.hash(str(element), self.seed) + self.randomNoB[i]
                ) % self.totalShingles
                self.minHashSignatures[set_id][i] = min(self.minHashSignatures[set_id][i], hash_value)

        # Convert MinHash signatures to b-bit signatures
        for set_id in self.minHashSignatures:
            self.bbitSignatures[set_id] = [
                self._get_lowest_b_bits(int(value)) for value in self.minHashSignatures[set_id]
            ]

    def estimate_similarity(self, setA="setA", setB="setB"):
        """
        Estimate Jaccard similarity between two sets using b-bit MinHash
        setA, setB: identifiers of the sets to compare
        """
        if setA not in self.bbitSignatures or setB not in self.bbitSignatures:
            raise ValueError("Sets not found in signatures")

        # Count matching b-bit signatures
        matches = sum(1 for i in range(self.k) if self.bbitSignatures[setA][i] == self.bbitSignatures[setB][i])

        # Estimate Jaccard similarity using b-bit MinHash formula
        # Formula: (matches/k - 1/2^b)/(1 - 1/2^b)
        denominator = 1.0 - 1.0 / (1 << self.b)
        numerator = matches / float(self.k) - 1.0 / (1 << self.b)

        return numerator / denominator

In [6]:
if __name__ == "__main__":
    # Parameters
    k = 128
    b = 1  # number of bits to keep
    cardinality = 10000
    jaccard_true = 0.9

    # Generate synthetic stream
    stream = generate_synthetic_stream(cardinality, jaccard_true)

    # Create b-bit MinHash instance and process stream
    bbit_minhash = BbitMinHash(k, b)
    bbit_minhash.process_stream(stream)

    # Estimate Jaccard similarity
    jaccard_est = bbit_minhash.estimate_similarity()
    print(f"True Jaccard: {jaccard_true}, Estimated Jaccard: {jaccard_est}")

True Jaccard: 0.9, Estimated Jaccard: 0.890625


### Odd Sketch

In [7]:
class OddSketch:
    def __init__(self, k, z, random_seed=random_seed):
        """
        Initialize Odd Sketch
        k: number of hash functions (for MinHash)
        z: number of bits in Odd Sketch
        random_seed: random seed for reproducibility
        """
        self.k = k
        self.z = z
        self.seed = random_seed
        self.totalShingles = (1 << 32) - 1
        self.minHashSignatures = {}  # Store MinHash signatures
        self.oddSketches = {}  # Store Odd Sketches
        self.randomNoA = self._hash_parameter()
        self.randomNoB = self._hash_parameter()

    def _hash_parameter(self):
        """Generate random hash parameters"""
        randList = []
        k_temp = self.k
        randIndex = random.randint(0, self.totalShingles - 1)
        randList.append(randIndex)
        while k_temp > 0:
            while randIndex in randList:
                randIndex = random.randint(0, self.totalShingles - 1)
            randList.append(randIndex)
            k_temp -= 1
        return randList

    def _compute_odd_sketch(self, minhash_signature):
        """
        Compute Odd Sketch from MinHash signature
        Uses XOR-based sketching
        """
        odd_sketch = np.zeros(self.z, dtype=bool)

        for i in range(self.k):
            # Hash (i, minhash_value) to position in odd sketch
            position = mmh3.hash(str((i, minhash_signature[i])), self.seed) % self.z
            odd_sketch[position] ^= True  # XOR operation

        return odd_sketch

    def process_stream(self, stream):
        """
        Process streaming data
        stream: list of [set_id, element] pairs
        """
        # First compute regular MinHash signatures
        for item in stream:
            set_id, element = item[0], item[1]

            # Initialize signature if not exists
            if set_id not in self.minHashSignatures:
                self.minHashSignatures[set_id] = [float("inf")] * self.k

            # Update minimum hash values
            for i in range(self.k):
                hash_value = (
                    self.randomNoA[i] * mmh3.hash(str(element), self.seed) + self.randomNoB[i]
                ) % self.totalShingles
                self.minHashSignatures[set_id][i] = min(self.minHashSignatures[set_id][i], hash_value)

        # Convert MinHash signatures to Odd Sketches
        for set_id in self.minHashSignatures:
            self.oddSketches[set_id] = self._compute_odd_sketch(self.minHashSignatures[set_id])

    def estimate_similarity(self, setA="setA", setB="setB"):
        """
        Estimate Jaccard similarity between two sets using Odd Sketch
        setA, setB: identifiers of the sets to compare
        """
        if setA not in self.oddSketches or setB not in self.oddSketches:
            raise ValueError("Sets not found in sketches")

        # Count differing bits between odd sketches
        hamming_distance = np.sum(self.oddSketches[setA] != self.oddSketches[setB])

        # Estimate Jaccard similarity using Odd Sketch formula
        # J = 1 + (z/4k)ln(1 - 2d/z)
        # where d is the Hamming distance and z is the sketch size
        if hamming_distance == self.z:
            return 0.0

        similarity = 1.0 + (self.z / (4.0 * self.k)) * np.log(1.0 - (2.0 * hamming_distance) / self.z)

        # Clamp similarity to [0,1]
        return max(0.0, min(1.0, similarity))


In [8]:
if __name__ == "__main__":
    # Parameters
    k = 128
    z = 512  # Odd Sketch size
    cardinality = 10000
    jaccard_true = 0.9

    # Generate synthetic stream
    stream = generate_synthetic_stream(cardinality, jaccard_true)

    # Create Odd Sketch instance and process stream
    odd_sketch = OddSketch(k, z)
    odd_sketch.process_stream(stream)

    # Estimate Jaccard similarity
    jaccard_est = odd_sketch.estimate_similarity()
    print(f"True Jaccard: {jaccard_true}, Estimated Jaccard: {jaccard_est}")

True Jaccard: 0.9, Estimated Jaccard: 0.9270932291919122


## MaxLogHash

In [9]:
class MaxLogHash:
    def __init__(self, k, random_seed=random_seed):
        self.k = k
        self.seed = random_seed
        self.totalShingles = (1 << 32) - 1
        self.maxShingleID = {}
        self.randomNoA = self._hash_parameter()
        self.randomNoB = self._hash_parameter()

    def _hash_parameter(self):
        randList = []
        k_temp = self.k
        randIndex = random.randint(0, self.totalShingles - 1)
        randList.append(randIndex)
        while k_temp > 0:
            while randIndex in randList:
                randIndex = random.randint(0, self.totalShingles - 1)
            randList.append(randIndex)
            k_temp -= 1
        return randList

    def process_stream(self, stream):
        for item in stream:
            if item[0] in self.maxShingleID:
                max_hash_val_list = self.maxShingleID[item[0]][0]
                max_hash_sig_list = self.maxShingleID[item[0]][1]

                for x in range(self.k):
                    temp = (
                        self.randomNoA[x] * mmh3.hash(str(item[1]), self.seed) + self.randomNoB[x]
                    ) % self.totalShingles
                    temp = temp / float(self.totalShingles)
                    log_temp = -math.log(temp, 2)
                    hash_val = math.ceil(log_temp)

                    if hash_val > max_hash_val_list[x]:
                        max_hash_val_list[x] = hash_val
                        max_hash_sig_list[x] = 1
                    elif hash_val == max_hash_val_list[x]:
                        max_hash_sig_list[x] = 0

                self.maxShingleID[item[0]][0] = max_hash_val_list
                self.maxShingleID[item[0]][1] = max_hash_sig_list
            else:
                max_hash_val_list = [-1] * self.k
                max_hash_sig_list = [0] * self.k
                self.maxShingleID[item[0]] = [max_hash_val_list, max_hash_sig_list]

    def estimate_similarity(self, setA="setA", setB="setB"):
        con = 0
        for x in range(self.k):
            if self.maxShingleID[setA][0][x] > self.maxShingleID[setB][0][x] and self.maxShingleID[setA][1][x] == 1:
                con += 1
            elif self.maxShingleID[setA][0][x] < self.maxShingleID[setB][0][x] and self.maxShingleID[setB][1][x] == 1:
                con += 1

        jaccard_sim = 1.0 - con * (1 / float(self.k)) * (1 / 0.7213)
        return jaccard_sim

In [10]:
if __name__ == "__main__":
    # Parameters
    k = 128
    cardinality = 10000
    jaccard_true = 0.9

    # Generate synthetic stream
    stream = generate_synthetic_stream(cardinality, jaccard_true)

    # Create MaxLogHash instance and process stream
    maxlog = MaxLogHash(k)
    maxlog.process_stream(stream)

    # Estimate Jaccard similarity
    jaccard_est = maxlog.estimate_similarity()
    print(f"True Jaccard: {jaccard_true}, Estimated Jaccard: {jaccard_est}")

True Jaccard: 0.9, Estimated Jaccard: 0.9458443088867323


In [11]:
def compare_all_methods():
    """
    Compare all similarity estimation methods
    """
    # Parameters
    k = 128  # Number of hash functions
    cardinality = 10000
    jaccard_true = 0.9

    # Generate synthetic stream
    stream = generate_synthetic_stream(cardinality, jaccard_true)

    # Regular MinHash estimation
    minhash = MinHash(k)
    minhash.process_stream(stream)
    jaccard_est_min = minhash.estimate_similarity()

    # b-bit MinHash estimation
    b = 1  # b-bit MinHash parameter
    bbit_minhash = BbitMinHash(k, b)
    bbit_minhash.process_stream(stream)
    jaccard_est_bbit = bbit_minhash.estimate_similarity()

    # Odd Sketch estimation
    z = 512  # Odd Sketch size
    odd_sketch = OddSketch(k, z)
    odd_sketch.process_stream(stream)
    jaccard_est_odd = odd_sketch.estimate_similarity()

    # MaxLogHash estimation
    maxlog = MaxLogHash(k)
    maxlog.process_stream(stream)
    jaccard_est_max = maxlog.estimate_similarity()

    print(f"True Jaccard: {jaccard_true}")
    print(f"MinHash Estimation: {jaccard_est_min:.4f}")
    print(f"b-bit MinHash Estimation: {jaccard_est_bbit:.4f}")
    print(f"Odd Sketch Estimation: {jaccard_est_odd:.4f}")
    print(f"MaxLogHash Estimation: {jaccard_est_max:.4f}")

In [12]:
compare_all_methods()

True Jaccard: 0.9
MinHash Estimation: 0.8828
b-bit MinHash Estimation: 0.9219
Odd Sketch Estimation: 0.9016
MaxLogHash Estimation: 0.8917


## Datasets

### Synthetic datasets

Generate set A by randomly selecting n different numbers from I

Generate set B by randomly selecting $|A \cup B| = \frac{J_{A, B}|A|}{1+J_{A, B}}$ different numbers from set A and $n - |A \cup B|$ different numbers from set I\A

n = 10,000 by default

In [13]:
# Balanced set-pairs (i.e., |A| = |B| = n)

In [14]:
# Unbalanced set-pairs (i.e., |A| != |B|)

### Real-world datasets

In [15]:
def load_dataset(filepath):
    with open(filepath, "r") as file:
        dataset = pl.DataFrame(np.loadtxt(file, dtype=int))

    item_record_pairs = {}

    for record_id, record in enumerate(dataset):
        for item in record:
            if item not in item_record_pairs:
                item_record_pairs[item] = []
            item_record_pairs[item].append(record_id)

    pairs = [(item, rec) for item, recs in item_record_pairs.items() for rec in recs]

    return dataset, item_record_pairs, pairs

#### MUSHROOM dataset

In [16]:
mushroom_dataset_path = "./data/mushroom.dat"
mushroom = load_dataset(mushroom_dataset_path)

mushroom_dataset = mushroom[0]
mushroom_item_record_pairs = mushroom[1]
mushroom_pairs = mushroom[2]

- 8,124 records with 119 distinct items
- 186,852 item-record pairs

In [17]:
print(mushroom_dataset[:5])
print(mushroom_dataset.shape)
print(len(mushroom_item_record_pairs))
print(len(mushroom_pairs))

shape: (5, 23)
┌──────────┬──────────┬──────────┬──────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ column_0 ┆ column_1 ┆ column_2 ┆ column_3 ┆ … ┆ column_19 ┆ column_20 ┆ column_21 ┆ column_22 │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ i32      ┆ i32      ┆ i32      ┆ i32      ┆   ┆ i32       ┆ i32       ┆ i32       ┆ i32       │
╞══════════╪══════════╪══════════╪══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ 3        ┆ 9        ┆ 13       ┆ … ┆ 93        ┆ 98        ┆ 107       ┆ 113       │
│ 2        ┆ 3        ┆ 9        ┆ 14       ┆ … ┆ 93        ┆ 99        ┆ 108       ┆ 114       │
│ 2        ┆ 4        ┆ 9        ┆ 15       ┆ … ┆ 93        ┆ 99        ┆ 108       ┆ 115       │
│ 1        ┆ 3        ┆ 10       ┆ 15       ┆ … ┆ 93        ┆ 98        ┆ 107       ┆ 113       │
│ 2        ┆ 3        ┆ 9        ┆ 16       ┆ … ┆ 94        ┆ 99        ┆ 109       ┆ 114       │
└────

#### CONNECT dataset

In [18]:
connect_dataset_path = "./data/connect.dat"
connect = load_dataset(connect_dataset_path)

connect_dataset = connect[0]
connect_item_record_pairs = connect[1]
connect_pairs = connect[2]

- 67,557 records with 127 distinct items
- 2,904,951 item-record pairs

In [19]:
print(connect_dataset[:5])
print(connect_dataset.shape)
print(len(connect_item_record_pairs))
print(len(connect_pairs))

shape: (5, 43)
┌──────────┬──────────┬──────────┬──────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ column_0 ┆ column_1 ┆ column_2 ┆ column_3 ┆ … ┆ column_39 ┆ column_40 ┆ column_41 ┆ column_42 │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ i32      ┆ i32      ┆ i32      ┆ i32      ┆   ┆ i32       ┆ i32       ┆ i32       ┆ i32       │
╞══════════╪══════════╪══════════╪══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ 4        ┆ 7        ┆ 10       ┆ … ┆ 118       ┆ 121       ┆ 124       ┆ 127       │
│ 1        ┆ 4        ┆ 7        ┆ 10       ┆ … ┆ 118       ┆ 121       ┆ 124       ┆ 127       │
│ 1        ┆ 4        ┆ 7        ┆ 10       ┆ … ┆ 118       ┆ 121       ┆ 124       ┆ 127       │
│ 1        ┆ 4        ┆ 7        ┆ 10       ┆ … ┆ 118       ┆ 121       ┆ 124       ┆ 127       │
│ 1        ┆ 5        ┆ 7        ┆ 10       ┆ … ┆ 118       ┆ 121       ┆ 124       ┆ 127       │
└────