In [2]:
# Import libraries 
import os 
import numpy as np 
import pandas as pd
# Define paths
weights_npy = "/mnt/projects/debruinz_project/bisholea/capstone/encoder_weight_matrix_filtered_60M.npy"
names_npy   = "/mnt/projects/debruinz_project/bisholea/capstone/gene_names_filtered_60M.npy"
out_dir     = "/mnt/projects/debruinz_project/bisholea/capstone/gsea"


def orient_weights(W, n_names):
    """Return the weight matrix with one row per gene name.

    Args:
        W: 2-D array-like of weights, either (genes x factors) or
            (factors x genes).
        n_names: number of gene names the rows must line up with.

    Returns:
        A 2-D ndarray whose first dimension equals ``n_names``. The input is
        transposed only when its rows do not match but its columns do.

    Raises:
        ValueError: if ``W`` is not 2-D, or neither axis has length
            ``n_names``.
    """
    W = np.asarray(W)
    if W.ndim != 2:
        raise ValueError(f"weights must be 2-D, got shape {W.shape}")
    # If orientation is [factors x genes], transpose
    if W.shape[0] != n_names and W.shape[1] == n_names:
        W = W.T
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, which would let misaligned rows through silently.
    if W.shape[0] != n_names:
        raise ValueError(f"weights rows {W.shape[0]} != names {n_names}")
    return W


def dedupe_gene_names(W, names):
    """Strip whitespace from names and drop rows whose name is a repeat.

    Args:
        W: 2-D array-like with one row per entry of ``names``.
        names: sequence of gene-name strings.

    Returns:
        ``(W, names_s)`` where ``names_s`` is a ``pd.Series`` of stripped,
        unique names (first occurrence kept, index reset) and ``W`` is an
        ndarray holding only the matching rows.
    """
    W = np.asarray(W)
    names_s = pd.Series(names, dtype=str).str.strip()
    dup = names_s.duplicated(keep="first")
    if dup.any():
        keep = ~dup
        names_s = names_s[keep].reset_index(drop=True)
        W = W[keep.values]
    return W, names_s


# `__name__` is "__main__" both in a notebook cell and when run as a script,
# so the export still executes there; importing this file stays side-effect free.
if __name__ == "__main__":
    os.makedirs(out_dir, exist_ok=True)

    # Load filtered weights (genes x factors) and filtered names (genes,)
    W = np.load(weights_npy)                       # shape like (40508, 256)
    # NOTE(security): allow_pickle=True can execute arbitrary code embedded in
    # the file -- only load .npy files from a trusted source.
    names = np.load(names_npy, allow_pickle=True).astype(str)

    W = orient_weights(W, len(names))

    # Optional: drop duplicate symbols to keep fgsea happy (keep first)
    W, names_s = dedupe_gene_names(W, names)

    # Write exactly what the R script expects
    emb_df = pd.DataFrame(W, columns=[f"F{i+1}" for i in range(W.shape[1])])
    var_df = pd.DataFrame({"feature_name": names_s})

    emb_df.to_csv(os.path.join(out_dir, "ae_adata_var_embeddings_unique.csv"), index=False)
    var_df.to_csv(os.path.join(out_dir, "ae_adata_var_metadata_unique.csv"), index=False)

    print("Done. Shapes:", emb_df.shape, var_df.shape)


Done. Shapes: (40508, 256) (40508, 1)
