### Building Geneformer-Like Dataset From Yeast Master Matrix

#### Building .loom file

##### Replacing NaNs by 0s

In [None]:
import pandas as pd

# Tab-separated master matrix. This cell overwrites the file in place.
file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/original_yeast_master_matrix_sgd_copy.csv"

# Parse with sep='\t' (the delimiter the file was saved with), use the first
# column as the index, and substitute 0 for every missing value.
df = pd.read_csv(file_path, sep='\t', index_col=0).fillna(0)

# Write the NaN-free matrix back to the same path, keeping the tab delimiter
# so later cells can read it with the same settings.
df.to_csv(file_path, sep='\t')

print("NaNs replaced with 0")

##### Translating to .loom

In [1]:
import pandas as pd
import numpy as np
import loompy

# Source matrix (tab-separated CSV) and destination .loom file.
input_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/original_yeast_master_matrix_sgd_copy.csv"
output_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.loom"

# The file was saved with sep='\t', so it must be read back the same way.
# The first column becomes the index; rows are treated as genes and columns
# as experiments (the Geneformer "cell" equivalent).
df = pd.read_csv(input_file_path, sep='\t', index_col=0)

# Column-wise totals: one value per experiment, summed over all gene rows,
# then stored as float32.
n_counts = df.sum(axis=0).astype(np.float32)

# Row attribute: the DataFrame index, written under the key "ensembl_id".
# Assumes the index already holds the gene identifiers Geneformer should see
# -- TODO confirm.
row_attrs = {"ensembl_id": df.index.tolist()}

# Column attribute: per-experiment totals. Experiment names are deliberately
# not written; to keep them, add:  "exp_name": df.columns.tolist()
col_attrs = {"n_counts": n_counts.to_numpy()}

# Main layer of the loom file: the full matrix as float32.
expression_matrix = df.to_numpy().astype(np.float32)

loompy.create(output_file_path, expression_matrix, row_attrs, col_attrs)

print(f"Loom file saved as: {output_file_path}")

Loom file saved as: /home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.loom


In [None]:
import numpy as np
# Show which NumPy release this kernel is using (expected: a 1.x release
# such as 1.26.4).
installed_numpy = np.__version__
print(installed_numpy)

# NumPy had to be downgraded to a release below 2.0 for this notebook
# (presumably for loompy compatibility -- confirm).

##### Verifying .loom file

In [3]:
import loompy

input_loom_file_path = "/home/logs/jtorresb/Geneformer/yeast/yeast_data/output/yeast_master_matrix_sgd.loom"

# Sanity check of the generated loom file: list its attributes, report its
# shape, and preview the first few values of each attribute.
#
# NOTE(review): the recorded output of this cell shows n_counts values that
# are tiny and partly negative. That suggests the matrix holds normalized or
# centered values rather than raw read counts -- confirm this is intended
# before feeding the file to Geneformer.
with loompy.connect(input_loom_file_path) as loom_ds:
    row_attr_names = loom_ds.ra.keys()
    col_attr_names = loom_ds.ca.keys()

    # Overview: 'ensembl_id' is expected among the row attributes and
    # 'n_counts' among the column attributes.
    print("Row attributes:", row_attr_names)
    print("Column attributes:", col_attr_names)
    print("Data shape (genes x exp columns):", loom_ds.shape)

    # Preview the first 5 entries of every row (gene) attribute.
    print("\nFirst 5 Row Attributes:")
    for attr_name in row_attr_names:
        print(f"{attr_name}: {loom_ds.ra[attr_name][:5]}")

    # Preview the first 5 per-experiment totals. To preview experiment names
    # as well, an 'exp_name' column attribute would have to be written first.
    print(f"n_counts: {loom_ds.ca['n_counts'][:5]}")

Row attributes: ['ensembl_id']
Column attributes: ['n_counts']
Data shape (genes x exp columns): (6800, 11889)

First 5 Row Attributes:
ensembl_id: ['Q0010' 'Q0017' 'Q0032' 'Q0045' 'Q0050']
n_counts: [ 4.2784341e-05 -1.0654714e-05 -5.3722510e-05 -3.1103202e-05
 -7.0136011e-05]
