In [1]:
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_from_disk

# On-disk locations of the two tokenized datasets being compared, each loaded
# immediately after its path is defined.
normalized_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.dataset"
normalized_data = load_from_disk(normalized_path)

non_normalized_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd.dataset"
non_normalized_data = load_from_disk(non_normalized_path)

# The comparison below pairs rows by index, so both datasets must contain
# the same number of rows.
assert len(non_normalized_data) == len(normalized_data), "Datasets have different lengths!"

# Compare tokenized sequences
def _row_similarity(tokens_a, tokens_b):
    """Return ``(match_fraction, jaccard)`` for two token-id sequences.

    Both sequences are truncated to the length of the shorter one before
    either metric is computed.

    * ``match_fraction`` -- share of aligned positions holding the same id.
    * ``jaccard`` -- ``|A & B| / |A | B|`` over the sets of truncated ids.

    When either sequence is empty both metrics are undefined; ``nan`` is
    returned for each instead of raising ``ZeroDivisionError``.
    """
    min_len = min(len(tokens_a), len(tokens_b))
    if min_len == 0:
        return float("nan"), float("nan")

    tokens_a = tokens_a[:min_len]
    tokens_b = tokens_b[:min_len]

    # count_nonzero is vectorised; the builtin sum() would walk the boolean
    # array one element at a time in Python.
    matches = np.count_nonzero(np.asarray(tokens_a) == np.asarray(tokens_b))
    match_fraction = matches / min_len

    # NOTE(review): the Jaccard score is computed on the *truncated*
    # sequences, as in the original analysis -- confirm this is intended if
    # row lengths can differ.
    set_a, set_b = set(tokens_a), set(tokens_b)
    jaccard = len(set_a & set_b) / len(set_a | set_b)  # union non-empty here

    return match_fraction, jaccard


similarity_scores = []  # Positional match fraction per row
jaccard_scores = []     # Jaccard similarity per row

# Iterate both datasets in lock-step instead of doing a per-index lookup.
for row_norm, row_non_norm in zip(normalized_data, non_normalized_data):
    match_fraction, jaccard = _row_similarity(
        row_norm["input_ids"], row_non_norm["input_ids"]
    )
    similarity_scores.append(match_fraction)
    jaccard_scores.append(jaccard)

# Print summary statistics (nanmean skips rows whose metrics were undefined)
print(f"Average Token Match Percentage: {np.nanmean(similarity_scores) * 100:.2f}%")
print(f"Average Jaccard Similarity: {np.nanmean(jaccard_scores) * 100:.2f}%")

Average Token Match Percentage: 3.04%
Average Jaccard Similarity: 35.83%


In [2]:
import pandas as pd
from datasets import load_from_disk

# Dataset locations on disk
normalized_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.dataset"
non_normalized_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd.dataset"

# Load each dataset and build a DataFrame from it right away
normalized_data = load_from_disk(normalized_path)
df_normalized = pd.DataFrame(normalized_data)

non_normalized_data = load_from_disk(non_normalized_path)
df_non_normalized = pd.DataFrame(non_normalized_data)

# Leading 10 token ids of row 0 in each dataset
first_row_norm = df_normalized["input_ids"].iloc[0][:10]
first_row_non_norm = df_non_normalized["input_ids"].iloc[0][:10]

# Show the two token prefixes one above the other
print("🔹 First 10 Input IDs (Normalized):", first_row_norm)
print("🔹 First 10 Input IDs (Non-Normalized):", first_row_non_norm)

# (Optional) Preview the top rows of each DataFrame
print("\n📌 First few rows of Normalized Dataset:\n", df_normalized.head())
print("\n📌 First few rows of Non-Normalized Dataset:\n", df_non_normalized.head())


🔹 First 10 Input IDs (Normalized): [2821, 1289, 612, 3670, 87, 2976, 4704, 5901, 2058, 3236]
🔹 First 10 Input IDs (Non-Normalized): [2132, 1408, 5326, 4025, 1131, 612, 5323, 3419, 6383, 2058]

📌 First few rows of Normalized Dataset:
                                            input_ids  length
0  [2821, 1289, 612, 3670, 87, 2976, 4704, 5901, ...     512
1  [4580, 6708, 5639, 2013, 6383, 4025, 2786, 103...     512
2  [6708, 4580, 2132, 2013, 4462, 5639, 2786, 590...     512
3  [6708, 4580, 3453, 2013, 2487, 5639, 2786, 446...     512
4  [4580, 6708, 1289, 2786, 2487, 5300, 2013, 154...     512

📌 First few rows of Non-Normalized Dataset:
                                            input_ids  length
0  [2132, 1408, 5326, 4025, 1131, 612, 5323, 3419...     512
1  [2132, 1408, 5326, 4025, 1131, 612, 5323, 3419...     512
2  [2132, 5326, 1408, 4025, 1131, 612, 5323, 3419...     512
3  [2132, 1408, 5326, 4025, 1131, 612, 5323, 3419...     512
4  [2132, 1408, 5326, 4025, 1131, 5323, 3419, 612