In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc

# Root of the shared project tree that the model output folders are joined onto.
base_dir = "/mnt/projects/debruinz_project"

# Destination directory for the exported UMAP CSV files, derived from base_dir
# so the project root is spelled out only once.
out_dir = os.path.join(
    base_dir, "bisholea", "capstone", "tsv2_benchmarks", "umap_exports"
)
os.makedirs(out_dir, exist_ok=True)  # no error if the directory already exists

# Tissues to process; each name is substituted into the model filename pattern.
tissues = [
    "Blood",
    "Bone_Marrow",
    "Lung",
    "Mammary",
    "Thymus",
]

# One tuple per model:
#   (short_name, folder_name, filename_pattern, H_key_in_obsm)
# filename_pattern contains a "{tissue}" placeholder.
models = [
    (
        "NMF",
        "Base NMF",
        "sklearn_nmf_k80_baseNMF_{tissue}.h5ad",
        "H_sklearn_nmf_k80",
    ),
    (
        "NNAE",
        "AE NMF",
        "tied_nmf_k80_no_cond_{tissue}.h5ad",
        "H_shared",
    ),
]

# Candidate obs column names holding the cell-type label, in priority order;
# the first one present is used.
CELLTYPE_COL_CANDIDATES = [
    "cell_type",
    "celltype",
    "CellType",
    "cell_type_label",
    "cell_type_ontology_term_id",
]

# Neighbor-graph and UMAP parameters (intended to match the paper defaults).
N_NEIGHBORS = 15
METRIC = "euclidean"
MIN_DIST = 0.5
SPREAD = 1.0

def pick_celltype_col(obs_cols, candidates=None):
    """Return the first candidate column name that is present in *obs_cols*.

    Args:
        obs_cols: Container of available column names; only membership
            tests (``name in obs_cols``) are performed on it.
        candidates: Optional iterable of column names to try, in priority
            order. When omitted (``None``), the module-level
            ``CELLTYPE_COL_CANDIDATES`` list is used.

    Returns:
        The first name from the candidate list found in *obs_cols*, or
        ``None`` when none of them is present.
    """
    if candidates is None:
        candidates = CELLTYPE_COL_CANDIDATES
    # Priority follows the candidate order, not the order of obs_cols.
    return next((name for name in candidates if name in obs_cols), None)

# For every (tissue, model) pair: load the AnnData file, build a neighbor
# graph and UMAP from the stored embedding, and write the coordinates plus
# cell-type labels to a CSV file in out_dir.
for tissue in tissues:
    for model_short, folder, pattern, h_key in models:
        h5ad_name = pattern.format(tissue=tissue)
        h5ad_path = os.path.join(base_dir, folder, h5ad_name)

        # A missing input file is reported and skipped, not treated as fatal.
        if os.path.exists(h5ad_path):
            print(f"[LOAD] {h5ad_path}")
        else:
            print(f"[WARN] missing: {h5ad_path}")
            continue

        adata = sc.read_h5ad(h5ad_path)

        # The embedding must be stored under the model's obsm key.
        obsm_keys = list(adata.obsm.keys())
        if h_key not in adata.obsm:
            raise KeyError(f"{h_key} not in obsm for {h5ad_path}. keys={obsm_keys}")

        # A cell-type label column is required for the export.
        ct_col = pick_celltype_col(adata.obs.columns)
        if ct_col is None:
            obs_preview = list(adata.obs.columns)[:50]
            raise KeyError(
                f"No cell-type column found in obs for {h5ad_path}. "
                f"Available obs cols (first 50): {obs_preview}"
            )

        # Store the embedding (H) under a fixed key so the neighbors call
        # below is the same for every model.
        X_emb = np.asarray(adata.obsm[h_key])
        adata.obsm["X_emb"] = X_emb

        # Neighbor graph on the embedding, then the UMAP layout.
        sc.pp.neighbors(
            adata,
            n_neighbors=N_NEIGHBORS,
            metric=METRIC,
            use_rep="X_emb",
        )
        sc.tl.umap(adata, spread=SPREAD, min_dist=MIN_DIST)

        # Assemble the output table: one row per cell, first two UMAP axes.
        um = adata.obsm["X_umap"]
        cell_ids = adata.obs_names.astype(str)
        cell_types = adata.obs[ct_col].astype(str).values
        columns = {
            "tissue": tissue,
            "model": model_short,
            "cell_id": cell_ids,
            "cell_type": cell_types,
            "umap_1": um[:, 0],
            "umap_2": um[:, 1],
        }
        df = pd.DataFrame(columns)

        csv_name = f"tsv2_umap_{tissue}_{model_short}_celltype.csv"
        out_csv = os.path.join(out_dir, csv_name)
        df.to_csv(out_csv, index=False)

        n_rows, n_cols = df.shape
        print(f"[OK] wrote {out_csv}  rows={n_rows} cols={n_cols}")


[LOAD] /mnt/projects/debruinz_project/Base NMF/sklearn_nmf_k80_baseNMF_Blood.h5ad


  from .autonotebook import tqdm as notebook_tqdm


[OK] wrote /mnt/projects/debruinz_project/bisholea/capstone/tsv2_benchmarks/umap_exports/tsv2_umap_Blood_NMF_celltype.csv  rows=17802 cols=6
[LOAD] /mnt/projects/debruinz_project/AE NMF/tied_nmf_k80_no_cond_Blood.h5ad
[OK] wrote /mnt/projects/debruinz_project/bisholea/capstone/tsv2_benchmarks/umap_exports/tsv2_umap_Blood_NNAE_celltype.csv  rows=17802 cols=6
[LOAD] /mnt/projects/debruinz_project/Base NMF/sklearn_nmf_k80_baseNMF_Bone_Marrow.h5ad
[OK] wrote /mnt/projects/debruinz_project/bisholea/capstone/tsv2_benchmarks/umap_exports/tsv2_umap_Bone_Marrow_NMF_celltype.csv  rows=8045 cols=6
[LOAD] /mnt/projects/debruinz_project/AE NMF/tied_nmf_k80_no_cond_Bone_Marrow.h5ad
[OK] wrote /mnt/projects/debruinz_project/bisholea/capstone/tsv2_benchmarks/umap_exports/tsv2_umap_Bone_Marrow_NNAE_celltype.csv  rows=8045 cols=6
[LOAD] /mnt/projects/debruinz_project/Base NMF/sklearn_nmf_k80_baseNMF_Lung.h5ad
[OK] wrote /mnt/projects/debruinz_project/bisholea/capstone/tsv2_benchmarks/umap_exports/tsv2_u