In [4]:
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_from_disk

# Paths to the two on-disk datasets whose tokenized sequences are compared
normalized_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.dataset"
non_normalized_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd.dataset"

# Load datasets
normalized_data = load_from_disk(normalized_path)
non_normalized_data = load_from_disk(non_normalized_path)

# Rows are compared pairwise by position, so both datasets must have the same
# number of rows. Raise explicitly instead of using `assert`: assertions are
# stripped when Python runs with -O, which would silently skip this check.
if len(normalized_data) != len(non_normalized_data):
    raise ValueError(
        "Datasets have different lengths! "
        f"({len(normalized_data)} vs {len(non_normalized_data)} rows)"
    )

# Compare tokenized sequences row by row.
similarity_scores = []  # Per row: fraction of positions holding the same token
jaccard_scores = []     # Per row: Jaccard similarity of the two token sets

# Walk both datasets in lockstep; each row is a dict with an "input_ids" entry.
for row_norm, row_non_norm in zip(normalized_data, non_normalized_data):
    tokens_norm = row_norm["input_ids"]
    tokens_non_norm = row_non_norm["input_ids"]

    # Only the overlapping prefix can be compared position by position.
    min_len = min(len(tokens_norm), len(tokens_non_norm))

    if min_len == 0:
        # At least one sequence is empty, so both ratios below would divide
        # by zero. Two empty sequences are treated as identical (1.0); an
        # empty sequence against a non-empty one shares nothing (0.0).
        both_empty = len(tokens_norm) == len(tokens_non_norm)
        empty_score = 1.0 if both_empty else 0.0
        similarity_scores.append(empty_score)
        jaccard_scores.append(empty_score)
        continue

    tokens_norm = tokens_norm[:min_len]
    tokens_non_norm = tokens_non_norm[:min_len]

    # Positional match: count equal positions in one vectorized pass instead
    # of summing the boolean array element by element in Python.
    matching_positions = np.count_nonzero(
        np.asarray(tokens_norm) == np.asarray(tokens_non_norm)
    )
    similarity_scores.append(matching_positions / min_len)

    # Jaccard similarity (set-based, order-insensitive). It is computed on the
    # truncated sequences, so tokens past the shorter sequence's length are
    # not considered. Each set is built once and reused.
    unique_norm = set(tokens_norm)
    unique_non_norm = set(tokens_non_norm)
    jaccard = len(unique_norm & unique_non_norm) / len(unique_norm | unique_non_norm)
    jaccard_scores.append(jaccard)

# Summarize both per-row metrics as dataset-wide averages, in percent
mean_match_pct = np.mean(similarity_scores) * 100
mean_jaccard_pct = np.mean(jaccard_scores) * 100
print(f"Average Token Match Percentage: {mean_match_pct:.2f}%")
print(f"Average Jaccard Similarity: {mean_jaccard_pct:.2f}%")

Average Token Match Percentage: 3.04%
Average Jaccard Similarity: 35.83%
