### Building Geneformer-Like Dataset From Yeast Master Matrix

#### Building .loom file

##### Replacing NaNs with 0s

In [None]:
import pandas as pd

file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/original_yeast_master_matrix_sgd_copy.csv"

# The matrix was saved tab-separated, so it has to be read (and written back)
# with sep='\t'. Chaining fillna onto the load yields the zero-filled frame
# directly, without mutating an intermediate frame in place.
df = pd.read_csv(file_path, sep='\t', index_col=0).fillna(0)

# Overwrite the same file with the cleaned matrix
df.to_csv(file_path, sep='\t')

print("NaNs replaced with 0")

##### Translating to .loom

In [None]:
import pandas as pd
import numpy as np
import loompy

input_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/original_yeast_master_matrix_sgd_copy.csv"
output_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.loom"

# Tab-separated matrix: first column becomes the index (gene IDs),
# every remaining column is one experiment.
df = pd.read_csv(input_file_path, sep='\t', index_col=0)

# Column totals: one value per experiment (an experiment plays the role
# that a cell plays in Geneformer).
n_counts = df.sum(axis=0).astype(np.float32)

# Row attribute: the gene IDs taken from the index, stored under "ensembl_id".
row_attrs = dict(ensembl_id=df.index.tolist())

# Column attribute: only the per-experiment totals are stored
# (experiment names are intentionally not written).
col_attrs = dict(n_counts=n_counts.values)

# Write the float32 expression matrix together with both attribute sets.
expression_matrix = df.to_numpy(dtype=np.float32)
loompy.create(output_file_path, expression_matrix, row_attrs, col_attrs)

print(f"Loom file saved as: {output_file_path}")

In [None]:
import numpy as np
# Environment sanity check: report the NumPy version the kernel is actually using.
print(np.__version__)  # Should print something like 1.26.4

# Had to downgrade to numpy < 2.0 to get this notebook running.
# NOTE(review): presumably required by loompy — confirm, and pin the version in the environment.

##### Verifying .loom file

In [None]:
import loompy

input_loom_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.loom"

with loompy.connect(input_loom_file_path) as loom_file:
    # Overview: attribute names and matrix dimensions
    row_attr_names = loom_file.ra.keys()  # expected to include 'ensembl_id'
    print("Row attributes:", row_attr_names)
    print("Column attributes:", loom_file.ca.keys())  # expected to include 'n_counts'
    print("Data shape (genes x exp columns):", loom_file.shape)

    # Leading five values of every row (gene) attribute
    print("\nFirst 5 Row Attributes:")
    for attr_name in row_attr_names:
        leading_values = loom_file.ra[attr_name][:5]
        print(f"{attr_name}: {leading_values}")

    # Leading five per-experiment totals
    print(f"n_counts: {loom_file.ca['n_counts'][:5]}")

#### Generating Dictionaries 

##### Tokens Dictionary

###### Load example to see the intuition

In [None]:
import pickle

# Path to the existing gc95M token dictionary in the Geneformer package directory.
# It is only inspected here as a format reference; the yeast dictionary is built separately.
token_dict_file = "/home/logs/jtorresb/Geneformer/geneformer/token_dictionary_gc95M.pkl"

# Function to inspect the token dictionary
def inspect_token_dictionary(file_path, num_samples=10):
    """Print the type, size and leading entries of a pickled token dictionary.

    Parameters
    ----------
    file_path : str or path-like
        Pickle file holding a mapping of token key -> token value.
    num_samples : int, default 10
        Number of leading entries to print (fewer are printed when the
        dictionary is smaller).

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only call this on trusted files.
    """
    with open(file_path, "rb") as f:
        token_dict = pickle.load(f)

    print(f"Token dictionary type: {type(token_dict)}")
    print(f"Total tokens: {len(token_dict)}")

    sample_items = list(token_dict.items())[:num_samples]
    # Report the number of entries actually shown instead of a hard-coded 10,
    # so the header stays correct for any num_samples / dictionary size.
    print(f"First {len(sample_items)} token entries:")
    for key, value in sample_items:
        print(f"{key}: {value}")

# Run the inspection on the reference dictionary (uses the default num_samples=10)
inspect_token_dictionary(token_dict_file)

###### Generating Token Dictionary

In [None]:
import pandas as pd
import pickle

# File paths
csv_file = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/original_yeast_master_matrix_sgd.csv"
output_pkl = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_token_dict.pkl"

# Load CSV (the first column, expected to hold the YORFs, becomes the index)
df = pd.read_csv(csv_file, sep='\t', index_col=0)

# Extract the yeast ORFs (YORFs) from the index, deduplicated and sorted
# alphabetically so token IDs are reproducible.
# Deduplication matters: a repeated ID would be numbered twice below, the
# second assignment overwriting the first and leaving an unused token ID,
# so that len(token_dict) would be smaller than the largest token ID + 1.
index_ids = df.index.tolist()
yorfs = sorted(set(index_ids))

n_duplicates = len(index_ids) - len(yorfs)
if n_duplicates:
    print(f"Warning: ignored {n_duplicates} duplicated gene ID(s) in the index")

# Initialize token dictionary with special tokens
token_dict = {
    "<pad>": 0,
    "<mask>": 1,
    "<cls>": 2,
    "<eos>": 3,
}
n_special_tokens = len(token_dict)

# Assign consecutive token IDs to the genes, starting right after the special tokens
for i, gene_id in enumerate(yorfs, start=n_special_tokens):
    token_dict[gene_id] = i

# Every gene must have received its own ID; this fails only if a gene ID
# collides with one of the special-token keys.
if len(token_dict) != n_special_tokens + len(yorfs):
    raise ValueError("A gene ID collides with a special token; token IDs would not be contiguous")

# Save dictionary as a pickle file
with open(output_pkl, "wb") as f:
    pickle.dump(token_dict, f)

print(f"Token dictionary saved as: {output_pkl}")

##### Medians Dictionary

###### Inspecting example first

In [None]:
import pickle

# Path to the existing gc95M gene median dictionary in the Geneformer package directory.
# It is only inspected here as a format reference; the yeast dictionary is built separately.
median_dict_file = "/home/logs/jtorresb/Geneformer/geneformer/gene_median_dictionary_gc95M.pkl"

# Function to inspect the median dictionary
def inspect_median_dictionary(file_path, num_samples=10):
    """Print the type, size and leading entries of a pickled median dictionary.

    Parameters
    ----------
    file_path : str or path-like
        Pickle file holding a mapping of gene key -> median value.
    num_samples : int, default 10
        Number of leading entries to print (fewer are printed when the
        dictionary is smaller).

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only call this on trusted files.
    """
    with open(file_path, "rb") as f:
        median_dict = pickle.load(f)

    print(f"Median dictionary type: {type(median_dict)}")
    print(f"Total genes in dictionary: {len(median_dict)}")

    sample_items = list(median_dict.items())[:num_samples]
    # Report the number of entries actually shown instead of a hard-coded 10,
    # so the header stays correct for any num_samples / dictionary size.
    print(f"First {len(sample_items)} median entries:")
    for key, value in sample_items:
        print(f"{key}: {value}")

# Run the inspection on the reference dictionary (uses the default num_samples=10)
inspect_median_dictionary(median_dict_file)

In [None]:
import pandas as pd
import numpy as np
import pickle

# File paths
csv_file = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/original_yeast_master_matrix_sgd_copy.csv" # Copy already replaced NaNs by 0s
output_pkl = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_median_dict.pkl"

# Load CSV (genes as index, experiments as columns)
df = pd.read_csv(csv_file, sep='\t', index_col=0)

# Per-gene median over the nonzero entries, computed in a single vectorised
# pass: zeros are masked to NaN and the row-wise median skips NaNs.
# This replaces a per-gene loop that did two label lookups per row and
# misbehaved when an index label was duplicated. Any NaN still present in
# the matrix is ignored as well, instead of turning the gene's median into NaN.
nonzero_medians = df.mask(df == 0).median(axis=1)

# Genes without a single nonzero value come out as NaN; store 0 for them.
# NOTE(review): a median of 0 is unsafe if these values are later used as
# divisors for normalisation (presumably the case in Geneformer's tokenizer —
# confirm); consider dropping such genes instead.
# If the index contains duplicated gene IDs, the last occurrence wins.
median_dict = nonzero_medians.fillna(0).to_dict()

# Save dictionary as a pickle file
with open(output_pkl, "wb") as f:
    pickle.dump(median_dict, f)

print(f"Median dictionary saved as: {output_pkl}")
