### Bulding Geneformer-Like Dataset From Yeast Master Matrix

#### Building .loom file

##### Replacing NaNs by 0s

In [1]:
import pandas as pd

file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd_copy.csv"

# Load CSV
df = pd.read_csv(file_path, sep='\t', index_col=0) # Important to keep sep='\t', since that's how it was saved 

# print(df.shape)

# Replace NaNs with 0
df.fillna(0, inplace=True)

# Save the cleaned CSV
df.to_csv(file_path, sep='\t')

print("NaNs replaced with 0")

NaNs replaced with 0


##### Translating to .loom

In [2]:
import pandas as pd
import numpy as np
import loompy

input_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd_copy.csv"
output_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd.loom"

# Load CSV
df = pd.read_csv(input_file_path, sep='\t', index_col=0) # Important to keep sep='\t', since that's how it was saved 

# Compute total read counts for each experiment (equivalent to cell in geneformer)
n_counts = df.sum(axis=0).astype(np.float32)  # Sum across genes for each column 

# Prepare row attributes (Gene IDs → Ensembl IDs assumed to be index)
row_attrs = {"ensembl_id": df.index.tolist()}  # Ensure index has Ensembl IDs

# Prepare column attributes (Cells & their total read counts)
col_attrs = {
    # "exp_name": df.columns.tolist(),  # Experiment names
    "n_counts": n_counts.values,   # Total counts per column (experiment)
}

# Convert DataFrame to Loom format & save
loompy.create(output_file_path, df.values.astype(np.float32), row_attrs, col_attrs)

print(f"Loom file saved as: {output_file_path}")

Loom file saved as: /home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd.loom


In [3]:
import numpy as np
print(np.__version__)  # Should print something like 1.26.4

# Had to downgrade numpy < 2.0

1.26.4


##### Verifying .loom file

In [4]:
import loompy

input_loom_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd.loom"

with loompy.connect(input_loom_file_path) as ds:
    # Print general metadata
    print("Row attributes:", ds.ra.keys())  # Should contain 'ensembl_id'
    print("Column attributes:", ds.ca.keys())  # Should contain 'n_counts'
    print("Data shape (genes x exp columns):", ds.shape)

    # Print first 5 genes (rows) and their attributes
    print("\nFirst 5 Row Attributes:")
    for key in ds.ra.keys():
        print(f"{key}: {ds.ra[key][:5]}")  # Print first 5 values of each row attribute

    # print(f"exp_name: {ds.ca['exp_name'][:1]}")
    print(f"n_counts: {ds.ca['n_counts'][:5]}")

Row attributes: ['ensembl_id']
Column attributes: ['n_counts']
Data shape (genes x exp columns): (6800, 11889)

First 5 Row Attributes:
ensembl_id: ['Q0010' 'Q0017' 'Q0032' 'Q0045' 'Q0050']
n_counts: [20989.98  19249.494 21266.008 19862.625 19453.807]


#### Generating Dictionaries

##### Tokens Dictionary

###### Load example to see the intuition

In [None]:
import pickle

# Path to the token dictionary file
token_dict_file = "/home/logs/jtorresb/Geneformer/geneformer/token_dictionary_gc95M.pkl"

# Function to inspect the token dictionary
def inspect_token_dictionary(file_path, num_samples=10):
    with open(file_path, "rb") as f:
        token_dict = pickle.load(f)
    
    print(f"Token dictionary type: {type(token_dict)}")
    print(f"Total tokens: {len(token_dict)}")
    print("First 10 token entries:")
    sample_items = list(token_dict.items())[:num_samples]
    for key, value in sample_items:
        print(f"{key}: {value}")

# Run the inspection
inspect_token_dictionary(token_dict_file)

###### Generating Token Dictionary

In [None]:
import pandas as pd
import pickle

# File paths
csv_file = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/original_yeast_master_matrix_sgd.csv"
output_pkl = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_token_dict.pkl"

# Load CSV (Ensure YORFs are the index)
df = pd.read_csv(csv_file, sep='\t', index_col=0)

# Extract yeast ORFs (YORFs) from index
yorfs = df.index.tolist()

# Optional: Sort alphabetically for consistency
yorfs.sort()

# Initialize token dictionary with special tokens
token_dict = {
    "<pad>": 0,
    "<mask>": 1,
    "<cls>": 2,
    "<eos>": 3,
}

# Assign unique token IDs starting from 4
for i, gene_id in enumerate(yorfs, start=4):
    token_dict[gene_id] = i

# Save dictionary as a pickle file
with open(output_pkl, "wb") as f:
    pickle.dump(token_dict, f)

print(f"Token dictionary saved as: {output_pkl}")

##### Medians Dictionary

###### Inspecting example first

In [None]:
import pickle

# Path to the median dictionary file
median_dict_file = "/home/logs/jtorresb/Geneformer/geneformer/gene_median_dictionary_gc95M.pkl"

# Function to inspect the median dictionary
def inspect_median_dictionary(file_path, num_samples=10):
    with open(file_path, "rb") as f:
        median_dict = pickle.load(f)
    
    print(f"Median dictionary type: {type(median_dict)}")
    print(f"Total genes in dictionary: {len(median_dict)}")
    print("First 10 median entries:")
    sample_items = list(median_dict.items())[:num_samples]
    for key, value in sample_items:
        print(f"{key}: {value}")

# Run the inspection
inspect_median_dictionary(median_dict_file)

In [None]:
import pandas as pd
import numpy as np
import pickle

# File paths
csv_file = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/original_yeast_master_matrix_sgd_copy.csv" # Copy already replaced NaNs by 0s
output_pkl = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_median_dict.pkl"

# Load CSV (genes as index, experiments as columns)
df = pd.read_csv(csv_file, sep='\t', index_col=0)

# Compute nonzero medians for each gene
median_dict = {}
for gene in df.index:
    nonzero_values = df.loc[gene][df.loc[gene] != 0]  # Ignore zeros
    if not nonzero_values.empty:
        median_dict[gene] = np.median(nonzero_values)  # Compute median
    else:
        median_dict[gene] = 0  # If all values are zero, set median to 0

# Save dictionary as a pickle file
with open(output_pkl, "wb") as f:
    pickle.dump(median_dict, f)

print(f"Median dictionary saved as: {output_pkl}")


#### Tokenizing Loom File

##### Quick model_input_size ChecK

In [None]:
import loompy
import pickle
import numpy as np

# File paths (update these as needed)
loom_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.loom"
median_dict_file = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_median_dict.pkl"
token_dict_file = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_token_dict.pkl"

# Load the median dictionary and token dictionary
with open(median_dict_file, "rb") as f:
    median_dict = pickle.load(f)
with open(token_dict_file, "rb") as f:
    token_dict = pickle.load(f)

# This function simulates tokenization for one cell:
# It normalizes each gene's expression by its nonzero median,
# then ranks the genes by the normalized expression (highest first),
# and finally maps the gene IDs to token IDs using token_dict.
def tokenize_cell(expr_vector, ensembl_ids):
    gene_norms = []
    for idx, expr in enumerate(expr_vector):
        # Only consider genes with nonzero expression
        if expr != 0:
            gene_id = ensembl_ids[idx]
            # Use the median if available and > 0; skip otherwise
            if gene_id in median_dict and median_dict[gene_id] > 0:
                norm_val = expr / median_dict[gene_id]
                gene_norms.append((gene_id, norm_val))
    # Sort genes by normalized value in descending order
    gene_norms_sorted = sorted(gene_norms, key=lambda x: -x[1])
    token_seq = []
    for gene_id, _ in gene_norms_sorted:
        # Only add tokens for genes that exist in the token dictionary.
        # (If a gene is missing, it will be skipped.)
        if gene_id in token_dict:
            token_seq.append(token_dict[gene_id])
    return token_seq

# Open the loom file and compute tokenized sequence lengths for a subset of cells.
token_lengths = []

with loompy.connect(loom_file_path) as ds:
    ensembl_ids = ds.ra["ensembl_id"]  # Array of gene IDs (rows)
    num_genes, num_cells = ds.shape
    print(f"Loom file shape (genes x cells): {ds.shape}")
    
    # Process a subset of cells (e.g., first 100 cells)
    num_cells_to_process = min(num_cells, 100)
    
    for cell_idx in range(num_cells_to_process):
        # Get the expression vector for the cell (all genes)
        expr_vector = ds[:, cell_idx].astype(np.float32)
        # Tokenize the cell's gene expression
        token_seq = tokenize_cell(expr_vector, ensembl_ids)
        token_lengths.append(len(token_seq))
    
# Print statistics about the tokenized sequence lengths
token_lengths = np.array(token_lengths)
print(f"Processed {num_cells_to_process} cells.")
print("Token sequence lengths (number of tokens per cell):")
print(token_lengths)
print(f"Average token sequence length: {np.mean(token_lengths):.1f}")
print(f"Median token sequence length: {np.median(token_lengths):.1f}")


##### Tokenization using TranscriptomeTokenizer

In [None]:
from geneformer import TranscriptomeTokenizer

tk = TranscriptomeTokenizer(custom_attr_name_dict=None, nproc=2, chunk_size=512, model_input_size=512, 
                            special_token=False, collapse_gene_ids=True, gene_median_file='/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_median_dict.pkl',
                            token_dictionary_file='/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_token_dict.pkl', gene_mapping_file=None)
# special_token = False was giving problems with "ensemble_ids_collapse"

<cls> and <eos> are in gene_token_dict but special_token = False. Please note that for 95M model series, special_token should be True.


In [6]:
tk.tokenize_data(data_directory="/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized",
                 output_directory="/home/logs/jtorresb/Geneformer/yeast/yeast_data/output",
                 output_prefix="unnormalized_yeast_master_matrix_sgd", 
                 file_format="loom",
                 use_generator=False)

Tokenizing /home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized/unnormalized_yeast_master_matrix_sgd.loom
/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized/unnormalized_yeast_master_matrix_sgd.loom has no column attribute 'filter_pass'; tokenizing all cells.


  subview[:, :]


Creating dataset.


##### Checking .dataset File

In [7]:
from datasets import load_from_disk

# Replace the path with the location of your .dataset file
dataset_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/unnormalized_yeast_master_matrix_sgd.dataset"
data = load_from_disk(dataset_path)

# Convert the dataset to a pandas DataFrame
df = data.to_pandas()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11889 entries, 0 to 11888
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   input_ids  11889 non-null  object
 1   length     11889 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 185.9+ KB


In [None]:
# Replace the path with the location of your .dataset file
dataset_path = "/home/logs/jtorresb/Geneformer/Genecorpus/example_input_files/gene_classification/dosage_sensitive_tfs/gc-30M_sample50k.dataset"
data = load_from_disk(dataset_path)

# Convert the dataset to a pandas DataFrame
df = data.to_pandas()

# Display the first few rows of the DataFrame
print(df.head(10))

#### Generating Example Lengths File for Pretraining

In [1]:
import pickle
from datasets import load_from_disk

# Define dataset path
dataset_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.dataset"
output_pickle_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_lengths.pkl"

# Load dataset
data = load_from_disk(dataset_path)

# Compute example lengths
example_lengths = [data[i]["length"] for i in range(len(data))]

# Save to pickle file
with open(output_pickle_path, "wb") as f:
    pickle.dump(example_lengths, f)

print(f"Example lengths saved to {output_pickle_path}")


Example lengths saved to /home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_lengths.pkl


In [3]:
import pickle

# Path to your generated pickle file
lengths_file = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_token_dict.pkl"

# Load the list from the pickle file
with open(lengths_file, "rb") as f:
    example_lengths = pickle.load(f)

# Print the first 10 values
print(example_lengths)


{'<pad>': 0, '<mask>': 1, '<cls>': 2, '<eos>': 3, 'Q0010': 4, 'Q0017': 5, 'Q0032': 6, 'Q0045': 7, 'Q0050': 8, 'Q0055': 9, 'Q0060': 10, 'Q0065': 11, 'Q0070': 12, 'Q0075': 13, 'Q0080': 14, 'Q0085': 15, 'Q0092': 16, 'Q0105': 17, 'Q0110': 18, 'Q0115': 19, 'Q0120': 20, 'Q0130': 21, 'Q0140': 22, 'Q0142': 23, 'Q0143': 24, 'Q0144': 25, 'Q0160': 26, 'Q0182': 27, 'Q0250': 28, 'Q0255': 29, 'Q0275': 30, 'Q0297': 31, 'R0010W': 32, 'R0020C': 33, 'R0030W': 34, 'R0040C': 35, 'YAL001C': 36, 'YAL002W': 37, 'YAL003W': 38, 'YAL004W': 39, 'YAL005C': 40, 'YAL007C': 41, 'YAL008W': 42, 'YAL009W': 43, 'YAL010C': 44, 'YAL011W': 45, 'YAL012W': 46, 'YAL013W': 47, 'YAL014C': 48, 'YAL015C': 49, 'YAL016C-A': 50, 'YAL016C-B': 51, 'YAL016W': 52, 'YAL017W': 53, 'YAL018C': 54, 'YAL019W': 55, 'YAL019W-A': 56, 'YAL020C': 57, 'YAL021C': 58, 'YAL022C': 59, 'YAL023C': 60, 'YAL024C': 61, 'YAL025C': 62, 'YAL026C': 63, 'YAL026C-A': 64, 'YAL027W': 65, 'YAL028W': 66, 'YAL029C': 67, 'YAL030W': 68, 'YAL031C': 69, 'YAL031W-A': 70, '