In [3]:
import os
import numpy as np
import pandas as pd
import cellxgene_census

# --- Configuration: where the gene-side (var) NMF export is written
OUT_DIR = "/mnt/projects/debruinz_project/bisholea/capstone/gsea"
EMB_CSV = os.path.join(OUT_DIR, "als_nmf_var_embeddings_unique.csv")   # factors only
VAR_CSV = os.path.join(OUT_DIR, "als_nmf_var_metadata_unique.csv")     # includes feature_name (+ extras)
os.makedirs(OUT_DIR, exist_ok=True)

OBS_TINY_N = 1000  # small obs subset so X stays tiny; W in .varm is global


def _uniquify_feature_names(names):
    """Return a Series with the same index as ``names`` whose values are all unique.

    The first occurrence of each name is kept unchanged. Every later occurrence
    gets a ``_<k>`` suffix with k = 1, 2, ... in order of appearance. A suffix
    is skipped when the resulting name is already one of the original names or
    was already handed out, so the result stays unique even when something like
    ``"A_1"`` is present next to a duplicated ``"A"``.

    Parameters
    ----------
    names : pd.Series of str
        Feature names, possibly containing duplicates.

    Returns
    -------
    pd.Series
        De-duplicated names, aligned to ``names.index``.
    """
    taken = set(names)      # every original name is reserved up front
    first_emitted = set()   # names whose first occurrence was already kept
    last_suffix = {}        # base name -> last suffix number handed out
    unique = []
    for name in names:
        if name not in first_emitted:
            first_emitted.add(name)
            unique.append(name)
            continue
        k = last_suffix.get(name, 0) + 1
        while f"{name}_{k}" in taken:
            k += 1
        last_suffix[name] = k
        taken.add(f"{name}_{k}")
        unique.append(f"{name}_{k}")
    return pd.Series(unique, index=names.index, name=names.name)


# --- Fetch a tiny AnnData slice that carries the "nmf" var embedding
with cellxgene_census.open_soma(census_version="2023-12-15") as census:
    adata = cellxgene_census.get_anndata(
        census,
        organism="homo_sapiens",
        measurement_name="RNA",
        # IMPORTANT: pass a LIST[int], not a tuple
        obs_coords=list(range(OBS_TINY_N)),
        var_embeddings=["nmf"],
        var_column_names=None,   # fetch ALL var columns for your metadata CSV
        obs_column_names=()      # no obs cols needed
        # don't pass X_name=None on this build
    )

# --- Extract W (genes × factors) from .varm
nmf_keys = [k for k in adata.varm.keys() if "nmf" in k.lower()]
if not nmf_keys:
    raise KeyError(f"No NMF var-embedding found in .varm; keys: {list(adata.varm.keys())}")
# Shortest matching key wins; on ties the first one in .varm order is used.
key = min(nmf_keys, key=len)
# NaNs are replaced by 0 (in place where numpy allows it).
W = np.nan_to_num(np.asarray(adata.varm[key]), copy=False)  # (n_genes, n_factors)

# Both CSVs are written without an index and are matched up purely by row
# position, so the embedding matrix and the var table must have equal length.
if W.shape[0] != adata.var.shape[0]:
    raise ValueError(
        f"Row mismatch: embedding has {W.shape[0]} rows but .var has {adata.var.shape[0]}"
    )

# --- Ensure we have 'feature_name' (fall back to var_names), always as str
if "feature_name" not in adata.var.columns:
    adata.var.insert(0, "feature_name", pd.Series(adata.var_names, index=adata.var.index).astype(str))
else:
    adata.var["feature_name"] = adata.var["feature_name"].astype(str)

# --- Make feature_name unique (collision-safe suffixing, see helper above)
if adata.var["feature_name"].duplicated().any():
    adata.var["feature_name"] = _uniquify_feature_names(adata.var["feature_name"])
assert adata.var["feature_name"].is_unique, "feature_name still contains duplicates"

# --- Write embeddings: factors ONLY (to match your AE pipeline)
n_factors = W.shape[1]
factor_cols = [f"F{i}" for i in range(1, n_factors + 1)]
pd.DataFrame(W.astype("float32"), columns=factor_cols).to_csv(EMB_CSV, index=False)

# --- Write ALL var columns (same row order)
adata.var.to_csv(VAR_CSV, index=False)

print(f"[OK] ALS NMF embeddings -> {EMB_CSV}  shape={(W.shape[0], W.shape[1])}")
print(f"[OK] ALS NMF var metadata -> {VAR_CSV}  rows={adata.var.shape[0]}  cols={adata.var.shape[1]}")




[OK] ALS NMF embeddings -> /mnt/projects/debruinz_project/bisholea/capstone/gsea/als_nmf_var_embeddings_unique.csv  shape=(60664, 200)
[OK] ALS NMF var metadata -> /mnt/projects/debruinz_project/bisholea/capstone/gsea/als_nmf_var_metadata_unique.csv  rows=60664  cols=6


In [5]:
import pandas as pd

# Sanity-check the two exported CSVs, embeddings first and var metadata second.
# For each file: preview the first rows, list the column index, and print
# summary statistics of the numeric columns. `df` is left bound to the
# metadata frame once the loop has finished.
for csv_path in (
    "/mnt/projects/debruinz_project/bisholea/capstone/gsea/als_nmf_var_embeddings_unique.csv",
    "/mnt/projects/debruinz_project/bisholea/capstone/gsea/als_nmf_var_metadata_unique.csv",
):
    df = pd.read_csv(csv_path)
    print(df.head())         # first 5 rows
    print(df.columns)        # column names
    print(df.describe())     # quick stats for numeric columns

    F1        F2   F3        F4   F5   F6        F7   F8        F9  F10  ...  \
0  0.0  0.000009  0.0  0.000000  0.0  0.0  0.000000  0.0  0.000005  0.0  ...   
1  0.0  0.000027  0.0  0.000000  0.0  0.0  0.000012  0.0  0.000000  0.0  ...   
2  0.0  0.000010  0.0  0.000007  0.0  0.0  0.000084  0.0  0.000000  0.0  ...   
3  0.0  0.000000  0.0  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.0  ...   
4  0.0  0.000017  0.0  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.0  ...   

   F191      F192  F193  F194      F195  F196  F197      F198      F199  F200  
0   0.0  0.000000   0.0   0.0  0.000000   0.0   0.0  0.000000  0.000000   0.0  
1   0.0  0.000000   0.0   0.0  0.000000   0.0   0.0  0.000572  0.000000   0.0  
2   0.0  0.000017   0.0   0.0  0.000000   0.0   0.0  0.000000  0.000000   0.0  
3   0.0  0.000000   0.0   0.0  0.000000   0.0   0.0  0.000000  0.000045   0.0  
4   0.0  0.000000   0.0   0.0  0.000014   0.0   0.0  0.000000  0.000000   0.0  

[5 rows x 200 columns]
Index(['F1', 'F

In [1]:
import os
import numpy as np
import pandas as pd
import cellxgene_census

# --- Configuration: where the cell-side (obs) NMF export is written
OUT_DIR = "/mnt/projects/debruinz_project/bisholea/capstone/gsea"
OBS_CSV = os.path.join(OUT_DIR, "als_nmf_obs_embeddings_unique.csv")   # cells × factors
os.makedirs(OUT_DIR, exist_ok=True)

OBS_TINY_N = 10000  # number of cells to grab (can be scaled up)

# obs coordinates 0 .. OBS_TINY_N - 1, materialised as a list of ints
obs_row_coords = list(range(OBS_TINY_N))

# --- Fetch the AnnData slice together with the "nmf" obs embedding
with cellxgene_census.open_soma(census_version="2023-12-15") as census:
    adata = cellxgene_census.get_anndata(
        census,
        organism="homo_sapiens",
        measurement_name="RNA",
        obs_coords=obs_row_coords,
        obs_embeddings=["nmf"],      # this is what puts H into .obsm
        var_column_names=(),         # no var columns requested
        obs_column_names=None,       # no explicit obs column selection
    )

# --- Extract H (cells × factors) from .obsm
nmf_keys = list(filter(lambda name: "nmf" in name.lower(), adata.obsm.keys()))
if len(nmf_keys) == 0:
    raise KeyError(f"No NMF obs-embedding found in .obsm; keys: {list(adata.obsm.keys())}")
# Shortest matching key wins; on ties the first one in .obsm order is used.
key = min(nmf_keys, key=len)

# NaNs are replaced by 0 (in place where numpy allows it).
obs_embedding = np.asarray(adata.obsm[key])
H = np.nan_to_num(obs_embedding, copy=False)  # (n_cells, n_factors)

# --- Save to CSV: one column per factor (F1 .. F<n_factors>), float32, no index
n_factors = H.shape[1]
factor_cols = ["F" + str(i + 1) for i in range(n_factors)]
obs_factor_frame = pd.DataFrame(H.astype("float32"), columns=factor_cols)
obs_factor_frame.to_csv(OBS_CSV, index=False)

print(f"[OK] ALS NMF obs embeddings -> {OBS_CSV}  shape={(H.shape[0], H.shape[1])}")


[OK] ALS NMF obs embeddings -> /mnt/projects/debruinz_project/bisholea/capstone/gsea/als_nmf_obs_embeddings_unique.csv  shape=(10000, 200)
