# SDSC3001 - Course Project

## Jaccard similarity coefficient

In [1]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity coefficient of two sets.

    The coefficient is the size of the intersection divided by the size
    of the union.

    Args:
        set1: A ``set`` (or ``frozenset``) of hashable items.
        set2: A second set of hashable items.

    Returns:
        A float in ``[0.0, 1.0]``. Two empty sets are treated as identical
        and score ``1.0`` (the ratio itself is undefined for an empty
        union and previously raised ``ZeroDivisionError``).
    """
    union = set1.union(set2)
    if not union:
        # Both inputs are empty: avoid 0 / 0 and report "identical".
        return 1.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)


# Example usage: the two sets share "c" and "d" out of six distinct elements.
set1 = set("abcd")
set2 = set("cdef")

similarity = jaccard_similarity(set1, set2)
print(f"Jaccard Similarity: {similarity}")

Jaccard Similarity: 0.3333333333333333


## Sketching techniques for the Jaccard similarity coefficient

In [2]:
import numpy as np
import random


# Parameters
k = 128  # Number of hash functions
n = 10_000  # Cardinality of the sets

# Seed the standard-library RNG with a fixed value so runs are repeatable.
random_seed = 42
random.seed(random_seed)

In [3]:
import sys
import os

# Put the current working directory on the module search path so that
# modules stored next to this notebook can be imported.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

In [4]:
from hashSketch import MinHash, B_bitMinHash, OddSketch, MaxLogHash

## Datasets

### Synthetic datasets

Generate set A by randomly selecting n different numbers from I

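Generate set B by randomly selecting $|A \cap B| = \frac{2 J_{A, B}|A|}{1+J_{A, B}}$ different numbers from set A and $n - |A \cap B|$ different numbers from set $I \setminus A$ (for $|A| = |B| = n$, this follows from $J_{A, B} = \frac{|A \cap B|}{2n - |A \cap B|}$)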

n = 10,000 by default

- Balanced set-pairs (i.e., |A| = |B| = n)
- Unbalanced set-pairs (i.e., |A| != |B|)

In [None]:
# syntheticDataset.py
# Signature-only placeholder: the body is just `...`, so calling this
# definition does nothing and returns None.
# NOTE(review): presumably the real implementation lives in
# syntheticDataset.py and is shown here for reference only — confirm.
def compare_all_methods(stream, num_runs, k=128, n=10000): ...

In [None]:
import polars as pl


# Load the stored result tables for the balanced set-pairs, one Parquet file
# per summary statistic (mean, median, std, RMSE, bias).
df_balance_mean = pl.read_parquet("result/balanced_mean_results.parquet")
df_balance_median = pl.read_parquet("result/balanced_median_results.parquet")
df_balance_std = pl.read_parquet("result/balanced_std_results.parquet")
df_balance_rmse = pl.read_parquet("result/balanced_rmse_results.parquet")
df_balance_bias = pl.read_parquet("result/balanced_bias_results.parquet")

# Same five statistics for the unbalanced set-pairs.
# NOTE(review): these paths end in ".parquest" and use the prefix "unbalance_",
# whereas the balanced files above use ".parquet" and "balanced_". Confirm the
# names match the files actually present in result/; if this is a typo the
# reads will presumably fail because the files cannot be found.
df_unbalance_mean = pl.read_parquet("result/unbalance_mean_results.parquest")
df_unbalance_median = pl.read_parquet("result/unbalance_median_results.parquest")
df_unbalance_std = pl.read_parquet("result/unbalance_std_results.parquest")
df_unbalance_rmse = pl.read_parquet("result/unbalance_rmse_results.parquest")
df_unbalance_bias = pl.read_parquet("result/unbalance_bias_results.parquest")

### Real-world datasets

In [5]:
def load_dataset(filepath):
    """Read a transaction file into a flat stream of ``[record_id, item]`` pairs.

    Every line of the file is one record; its 0-based line index is used as
    the record id. The line is split on whitespace and each token is
    converted with ``int``.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of two-element lists ``[record_id, item]`` in file order.
        A blank line contributes no pairs but still consumes a record id.

    Raises:
        ValueError: If a token cannot be converted by ``int``.
    """
    pairs = []
    with open(filepath, "r") as handle:
        for line_no, raw_line in enumerate(handle):
            tokens = raw_line.strip().split()
            pairs.extend([line_no, int(token)] for token in tokens)
    return pairs

#### MUSHROOM dataset

- 8,124 records with 119 distinct items
- 186,852 item-record pairs

In [6]:
# Load the MUSHROOM transaction file through load_dataset.
mushroom_dataset_path = "./data/mushroom.dat"
mushroom_stream = load_dataset(mushroom_dataset_path)

# NOTE(review): the commented-out lines below index a tuple-like `mushroom`
# object that is never assigned in this cell; they look like leftovers from
# an earlier load_dataset return shape — confirm before re-enabling.
# mushroom_dataset = mushroom[0]
# mushroom_item_record_pairs = mushroom[1]
# mushroom_pairs = mushroom[2]

# print(mushroom_dataset[:5])
# print(mushroom_dataset.shape)
# print(len(mushroom_item_record_pairs))
# print(len(mushroom_pairs))

In [7]:
# Create one instance of each sketch class imported from hashSketch, all
# constructed with the shared parameter k.
minhash = MinHash(k)
bbit_minhash = B_bitMinHash(k, b=4)  # presumably b = bits kept per hash value — confirm in hashSketch
odd_sketch = OddSketch(k, z=4 * k)  # presumably z = sketch length in bits — confirm in hashSketch
maxlog = MaxLogHash(k)

In [None]:
# Run every sketch over the MUSHROOM stream and print its similarity estimate
# for the pair (0, 1) — presumably the ids of the first two records.
# Order: MinHash, b-bit MinHash, Odd Sketch, MaxLogHash.
for sketch in (minhash, bbit_minhash, odd_sketch, maxlog):
    sketch.process_stream(mushroom_stream)
    print(sketch.estimate_similarity(0, 1))

#### CONNECT dataset

- 67,557 records with 127 distinct items
- 2,904,951 item-record pairs

In [None]:
# Load the CONNECT transaction file through load_dataset.
connect_dataset_path = "./data/connect.dat"
connect_stream = load_dataset(connect_dataset_path)

# NOTE(review): the commented-out lines below index a tuple-like `connect`
# object that is never assigned in this cell; they look like leftovers from
# an earlier load_dataset return shape — confirm before re-enabling.
# connect_dataset = connect[0]
# connect_item_record_pairs = connect[1]
# connect_pairs = connect[2]

# print(connect_dataset[:5])
# print(connect_dataset.shape)
# print(len(connect_item_record_pairs))
# print(len(connect_pairs))