# SDSC3001 - Course Project

## Jaccard similarity coefficient

In [None]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity coefficient |set1 ∩ set2| / |set1 ∪ set2|.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float in [0, 1]. Two empty sets are treated as identical and
        yield 1.0 (the ratio 0/0 is otherwise undefined).
    """
    union = set1.union(set2)
    if not union:
        # Both inputs are empty: avoid dividing by zero.
        return 1.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)


# Example usage: the two sets share "c" and "d" out of six distinct letters.
set1 = set("abcd")
set2 = set("cdef")

similarity = jaccard_similarity(set1, set2)
print(f"Jaccard Similarity: {similarity}")

## Sketching techniques for the Jaccard similarity coefficient

In [None]:
import numpy as np
import random


# Seed Python's built-in `random` module so anything drawing from it is repeatable.
# NOTE(review): NumPy's random generator is not seeded here — confirm whether the
# sketch implementations rely on it.
random_seed = 42
random.seed(random_seed)
# Parameters
k = 128  # Number of hash functions
n = 10_000  # cardinality of the sets

In [None]:
def generate_balanced_synthetic_stream(n, jaccard_true):
    """Build a deterministic stream for two equal-sized sets with a target Jaccard similarity.

    Both sets contain exactly ``n`` integers. The shared elements come first
    (emitted once for each set), followed by the elements unique to setA and
    then the elements unique to setB.

    Args:
        n: Cardinality of each set (|A| = |B| = n).
        jaccard_true: Desired Jaccard similarity, in [0, 1].

    Returns:
        A list of ``[set_id, element]`` pairs where ``set_id`` is ``"setA"``
        or ``"setB"`` and ``element`` is an integer.

    Raises:
        ValueError: If ``jaccard_true`` is outside [0, 1].
    """
    if not 0 <= jaccard_true <= 1:
        raise ValueError(f"jaccard_true must be in [0, 1], got {jaccard_true}")

    # With |A| = |B| = n:  J = |A ∩ B| / (2n - |A ∩ B|)  =>  |A ∩ B| = 2nJ / (1 + J).
    # Rounding to an integer count keeps both sets at exactly n elements.
    shared_count = round(n * (2 * jaccard_true) / (1 + jaccard_true))

    stream = []
    # Elements 0 .. shared_count - 1 belong to both sets.
    for num in range(shared_count):
        stream.append(["setA", num])
        stream.append(["setB", num])
    # Elements shared_count .. n - 1 complete setA.
    stream.extend(["setA", num] for num in range(shared_count, n))
    # Elements n .. 2n - shared_count - 1 complete setB.
    stream.extend(["setB", num] for num in range(n, 2 * n - shared_count))
    return stream

In [None]:
def generate_unbalanced_synthetic_stream(n, jaccard_true):
    """Placeholder for an unbalanced (|A| != |B|) stream generator.

    Not implemented: the body is a bare ``...``, so calling this returns None.
    """
    ...

In [None]:
import sys
import os

# Put the current working directory on the import path and print whether it
# was present before and after.
cwd = os.getcwd()
print(cwd in sys.path)
if cwd not in sys.path:
    # Guard so that re-running this cell does not pile up duplicate entries.
    sys.path.append(cwd)
print(cwd in sys.path)

In [None]:
from hashSketch import MinHash, B_bitMinHash, OddSketch, MaxLogHash

### MinHash

### b-bit MinHash

### Odd Sketch

### MaxLogHash

## Datasets

### Synthetic datasets

Generate set A by randomly selecting n different numbers from I

Generate set B by randomly selecting $|A \cap B| = \frac{2 J_{A, B}|A|}{1+J_{A, B}}$ different numbers from set A and $n - |A \cap B|$ different numbers from set $I \setminus A$

n = 10,000 by default

- Balanced set-pairs (i.e., |A| = |B| = n)
- Unbalanced set-pairs (i.e., |A| != |B|)

In [None]:
def compare_all_methods(jaccard_true, k=128, n=10000, *, b=1, z=512):
    """Estimate one synthetic set pair's Jaccard similarity with every sketch.

    A balanced synthetic stream is generated once and fed to each sketch in
    turn (MinHash, b-bit MinHash, Odd Sketch, MaxLogHash).

    Args:
        jaccard_true: True Jaccard similarity used to generate the stream.
        k: Number of hash functions, passed to every sketch.
        n: Cardinality passed to ``generate_balanced_synthetic_stream``.
        b: b-bit MinHash parameter, forwarded to ``B_bitMinHash``.
        z: Odd Sketch size, forwarded to ``OddSketch``.

    Returns:
        A 4-tuple of estimates in the order
        (MinHash, b-bit MinHash, Odd Sketch, MaxLogHash).
    """
    # Generate synthetic stream
    stream = generate_balanced_synthetic_stream(n, jaccard_true)

    def _estimate(sketch):
        # Every sketch is driven the same way: consume the stream, then query.
        sketch.process_stream(stream)
        return sketch.estimate_similarity()

    # Each sketch is constructed, fed and queried before the next one is
    # created, matching the original evaluation order.
    jaccard_est_min = _estimate(MinHash(k))
    jaccard_est_bbit = _estimate(B_bitMinHash(k, b))
    jaccard_est_odd = _estimate(OddSketch(k, z))
    jaccard_est_max = _estimate(MaxLogHash(k))

    return jaccard_est_min, jaccard_est_bbit, jaccard_est_odd, jaccard_est_max

In [None]:
# Single run at a true similarity of 0.8 with the default k and n.
compare_all_methods(0.8)

In [None]:
# Sweep the true similarity from 0.80 to 1.00 in steps of 0.01 and collect
# one row per step: [true value, MinHash, b-bit MinHash, Odd Sketch, MaxLogHash].
results = []

for i in range(80, 101):
    jaccard_true = i / 100
    estimates = compare_all_methods(jaccard_true)
    jaccard_est_min, jaccard_est_bbit, jaccard_est_odd, jaccard_est_max = estimates

    results.append([jaccard_true, *estimates])

In [None]:
import polars as pl


# Create a DataFrame from the results.
# Each entry of `results` is one row (true similarity followed by the four
# estimates), so the column names follow that order and orient="row" is
# passed explicitly.
columns = ["True Jaccard", "MinHash", "b-bit MinHash", "Odd Sketch", "MaxLogHash"]
df = pl.DataFrame(results, schema=columns, orient="row")

# Display the DataFrame
print(df)

In [None]:
import matplotlib.pyplot as plt

# One line per estimator, all plotted against the true similarity.
plt.figure(figsize=(10, 8))

true_values = df["True Jaccard"]
for method in columns[1:]:
    plt.plot(true_values, df[method], label=method)

# Title, axis labels and legend
plt.title("Jaccard Similarity Estimates by Different Methods")
plt.xlabel("True Jaccard Index")
plt.ylabel("Estimated Jaccard Index")
plt.legend()

plt.show()

In [None]:
import matplotlib.pyplot as plt

# (estimate column / legend label, error column, line format, colour) per method
error_specs = [
    ("MinHash", "MinHash_error", "o-", "blue"),
    ("b-bit MinHash", "bbit_error", "s-", "red"),
    ("Odd Sketch", "oddsketch_error", "^-", "green"),
    ("MaxLogHash", "maxlog_error", "D-", "purple"),
]

# Absolute error of every estimate against the true similarity
df = df.with_columns(
    [
        (pl.col(estimate_column) - pl.col("True Jaccard")).abs().alias(error_column)
        for estimate_column, error_column, _, _ in error_specs
    ]
)

plt.figure(figsize=(10, 6))

# One error curve per method
for estimate_column, error_column, line_format, line_color in error_specs:
    plt.plot(df["True Jaccard"], df[error_column], line_format, label=estimate_column, color=line_color)

# Labels, grid and legend
plt.xlabel("True Jaccard Similarity")
plt.ylabel("Absolute Error")
plt.title("Absolute Error vs True Jaccard Similarity")
plt.grid(True, linestyle="--", alpha=0.7)
plt.legend()

# Leave a small margin around the 0.80 - 1.00 sweep
plt.xlim(0.78, 1.02)

# Log scale makes small errors readable; the scale is set before the limits
plt.yscale("log")
plt.ylim(0.001, 0.1)

# Faint reference lines at 1% and 5% absolute error
for reference_level in (0.01, 0.05):
    plt.axhline(y=reference_level, color="gray", linestyle="--", alpha=0.3)

# Prevent label cutoff
plt.tight_layout()

plt.show()

### Real-world datasets

In [None]:
def load_dataset(filepath):
    """Load an integer transaction file and index its records by item.

    The file is parsed with ``np.loadtxt`` (integer dtype) and wrapped in a
    polars DataFrame; each DataFrame row is treated as one record.

    Args:
        filepath: Path to the text file to read.

    Returns:
        A 3-tuple ``(dataset, item_record_pairs, pairs)`` where ``dataset`` is
        the polars DataFrame, ``item_record_pairs`` maps each item to the list
        of record ids (row indices) it occurs in, and ``pairs`` is the
        flattened list of ``(item, record_id)`` tuples.
    """
    with open(filepath, "r") as file:
        dataset = pl.DataFrame(np.loadtxt(file, dtype=int))

    item_record_pairs = {}

    # Iterating a polars DataFrame directly yields its columns, so rows are
    # walked explicitly to make record_id a row (record) index.
    for record_id, record in enumerate(dataset.iter_rows()):
        for item in record:
            item_record_pairs.setdefault(item, []).append(record_id)

    pairs = [(item, rec) for item, recs in item_record_pairs.items() for rec in recs]

    return dataset, item_record_pairs, pairs

#### MUSHROOM dataset

In [None]:
# Load the MUSHROOM file and unpack the three results in one step.
mushroom_dataset_path = "./data/mushroom.dat"
mushroom = load_dataset(mushroom_dataset_path)

mushroom_dataset, mushroom_item_record_pairs, mushroom_pairs = mushroom

- 8,124 records with 119 distinct items
- 186,852 item-record pairs

In [None]:
# Preview the first five entries, then report the frame shape and the sizes
# of the item index and the flattened pair list.
print(mushroom_dataset[:5])
print(mushroom_dataset.shape)
for collection in (mushroom_item_record_pairs, mushroom_pairs):
    print(len(collection))

In [None]:
# def create_stream_from_dataset(dataset):
#     """
#     Convert dataset into a stream of [item_id, record_id] pairs
#     dataset: DataFrame where each row is a record containing items

#     Returns:
#     stream: list of [item_id, record_id] pairs
#     """
#     stream = []

#     # Iterate through each record
#     for record_id, record in enumerate(dataset.iter_rows()):
#         # For each item in the record
#         for item in record:
#             if item != 0:  # Assuming 0 is not a valid item ID
#                 # Add (item, record) pair to stream
#                 stream.append([f"set{record_id}", item])
#                 # print(record_id, item)

#     return stream


# def analyze_stream(stream):
#     """
#     Print statistics about the stream
#     """
#     unique_items = len(set(pair[0] for pair in stream))
#     unique_records = len(set(pair[1] for pair in stream))
#     total_pairs = len(stream)

#     print(f"Number of unique items: {unique_items}")
#     print(f"Number of records: {unique_records}")
#     print(f"Total item-record pairs: {total_pairs}")

# # To estimate similarity between two items
# def get_item_similarity(minhash, item1, item2):
#     return minhash.estimate_similarity(f"item_{item1}", f"item_{item2}")

In [None]:
# # Convert mushroom_dataset to a stream of [set_id, element] pairs
# stream = []

# for record_id, record in enumerate(mushroom_dataset):
#     for item in record:
#         stream.append([f"set{record_id}", item])

# # Now you can pass the stream to the sketch methods
# # Example usage with MinHash
# minhash = MinHash(k=128)
# minhash.process_stream(stream)
# jaccard_est_min = minhash.estimate_similarity(setA="set0", setB="set1")

# # Example usage with MaxLogHash
# maxlog = MaxLogHash(k=128)
# maxlog.process_stream(stream)
# jaccard_est_max = maxlog.estimate_similarity(setA="set0", setB="set1")

# print(f"MinHash Jaccard Estimate: {jaccard_est_min}")
# print(f"MaxLogHash Jaccard Estimate: {jaccard_est_max}")

#### CONNECT dataset

In [None]:
# Load the CONNECT file and unpack the three results in one step.
connect_dataset_path = "./data/connect.dat"
connect = load_dataset(connect_dataset_path)

connect_dataset, connect_item_record_pairs, connect_pairs = connect

- 67,557 records with 127 distinct items
- 2,904,951 item-record pairs

In [None]:
# Preview the first five entries, then report the frame shape and the sizes
# of the item index and the flattened pair list.
print(connect_dataset[:5])
print(connect_dataset.shape)
for collection in (connect_item_record_pairs, connect_pairs):
    print(len(collection))