In [1]:
import os
import scanpy as sc

immune_dir = "/mnt/projects/debruinz_project/Tabula_Immune"
files = [
    "results_only_sklearn_nmf_k80.h5ad",
    "results_only_no_cond_k80_row_norm_false.h5ad",
]

for fn in files:
    path = os.path.join(immune_dir, fn)
    print("\n" + "="*80)
    print("[FILE]", path)
    adata = sc.read_h5ad(path)

    print("[SHAPE] X:", adata.shape)  # (n_cells, n_genes) if X present
    print("[OBSM keys]", list(adata.obsm.keys()))
    print("[VARM keys]", list(adata.varm.keys()))
    print("[OBS cols first 30]", list(adata.obs.columns)[:30])
    print("[VAR names]", len(adata.var_names), "genes; first 5:", list(adata.var_names[:5]))



[FILE] /mnt/projects/debruinz_project/Tabula_Immune/results_only_sklearn_nmf_k80.h5ad
[SHAPE] X: (579027, 60606)
[OBSM keys] ['H_sklearn_nmf_k80']
[VARM keys] ['W_sklearn_nmf_k80']
[OBS cols first 30] ['donor_id', 'cell_type']
[VAR names] 60606 genes; first 5: ['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460']

[FILE] /mnt/projects/debruinz_project/Tabula_Immune/results_only_no_cond_k80_row_norm_false.h5ad
[SHAPE] X: (579027, 60606)
[OBSM keys] ['H_shared_k80']
[VARM keys] ['W_tied_k80']
[OBS cols first 30] ['donor_id', 'cell_type']
[VAR names] 60606 genes; first 5: ['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460']


In [2]:
import os
import numpy as np
import pandas as pd
import scanpy as sc

# ------------------------
# Config
# ------------------------
immune_dir = "/mnt/projects/debruinz_project/Tabula_Immune"

# Output paths (rename so you don't overwrite TSV2 outputs)
out_hw_csv = (
    "/mnt/projects/debruinz_project/bisholea/capstone/"
    "tsv2_benchmarks/immune_k80_nmf_nnae_H_W_stats.csv"
)
out_diag_csv = (
    "/mnt/projects/debruinz_project/bisholea/capstone/"
    "tsv2_benchmarks/immune_k80_nmf_diag_loadings.csv"
)

# Single tissue label (constant)
TISSUE = "Immune"

# Models: (model_code_for_R, filename, H_key_in_obsm, W_key_in_varm)
models = [
    # Base NMF
    ("NMF", "results_only_sklearn_nmf_k80.h5ad",
     "H_sklearn_nmf_k80", "W_sklearn_nmf_k80"),
    # AE NMF (NNAE)
    ("AE",  "results_only_no_cond_k80_row_norm_false.h5ad",
     "H_shared_k80", "W_tied_k80"),
]

hw_rows = []
diag_rows = []

for model_code, filename, h_key, w_key in models:
    h5ad_path = os.path.join(immune_dir, filename)
    if not os.path.exists(h5ad_path):
        raise FileNotFoundError(f"Missing: {h5ad_path}")

    print(f"[LOAD] {h5ad_path}")
    adata = sc.read_h5ad(h5ad_path)

    # --- H ---
    if h_key not in adata.obsm:
        raise KeyError(
            f"{h_key} not found in adata.obsm for {h5ad_path}. "
            f"Available obsm keys: {list(adata.obsm.keys())}"
        )
    H = np.asarray(adata.obsm[h_key])   # cells × factors

    # --- W ---
    if w_key not in adata.varm:
        raise KeyError(
            f"{w_key} not found in adata.varm for {h5ad_path}. "
            f"Available varm keys: {list(adata.varm.keys())}"
        )
    W = np.asarray(adata.varm[w_key])   # genes × factors

    n_cells, k_H = H.shape
    n_genes, k_W = W.shape
    if k_H != k_W:
        raise ValueError(f"k mismatch for {h5ad_path}: H has {k_H}, W has {k_W}")
    k = k_H
    print(f"[OK] H shape: {H.shape}, W shape: {W.shape}")

    # -------- sparsity and norms for H and W --------
    # H: cells × k
    H_zero = (H == 0)
    pct_zero_H = H_zero.sum(axis=0) / float(n_cells) * 100.0
    l2_H = np.sqrt((H ** 2).sum(axis=0))
    l1_H = np.abs(H).sum(axis=0)

    # W: genes × k
    W_zero = (W == 0)
    pct_zero_W = W_zero.sum(axis=0) / float(n_genes) * 100.0
    l2_W = np.sqrt((W ** 2).sum(axis=0))
    l1_W = np.abs(W).sum(axis=0)

    for f in range(k):
        hw_rows.append({
            "model": model_code,   # "AE" or "NMF"
            "tissue": TISSUE,      # constant
            "factor": f,           # 0..k-1
            "matrix": "H",
            "pct_zero": float(pct_zero_H[f]),
            "l1": float(l1_H[f]),
            "l2": float(l2_H[f]),
        })
        hw_rows.append({
            "model": model_code,
            "tissue": TISSUE,
            "factor": f,
            "matrix": "W",
            "pct_zero": float(pct_zero_W[f]),
            "l1": float(l1_W[f]),
            "l2": float(l2_W[f]),
        })

    # -------- NMF diagonal loadings (W/H scaled to unit L2) --------
    if model_code == "NMF":
        eps = 1e-12
        inv_l2_W = 1.0 / np.maximum(l2_W, eps)
        inv_l2_H = 1.0 / np.maximum(l2_H, eps)
        diag_vals = inv_l2_W * inv_l2_H  # length k

        for f in range(k):
            diag_rows.append({
                "model": model_code,
                "tissue": TISSUE,
                "factor": f,
                "diag_loading": float(diag_vals[f]),
            })

# ---- save H/W stats ----
hw_df = pd.DataFrame(hw_rows)
print(hw_df.head())
print("Total H/W rows:", len(hw_df))
os.makedirs(os.path.dirname(out_hw_csv), exist_ok=True)
hw_df.to_csv(out_hw_csv, index=False)
print("Wrote H/W stats to:", out_hw_csv)

# ---- save diagonal loadings ----
diag_df = pd.DataFrame(diag_rows)
print(diag_df.head())
print("Total diagonal rows:", len(diag_df))
os.makedirs(os.path.dirname(out_diag_csv), exist_ok=True)
diag_df.to_csv(out_diag_csv, index=False)
print("Wrote diagonal loadings to:", out_diag_csv)


[LOAD] /mnt/projects/debruinz_project/Tabula_Immune/results_only_sklearn_nmf_k80.h5ad
[OK] H shape: (579027, 80), W shape: (60606, 80)
[LOAD] /mnt/projects/debruinz_project/Tabula_Immune/results_only_no_cond_k80_row_norm_false.h5ad
[OK] H shape: (579027, 80), W shape: (60606, 80)
  model  tissue  factor matrix   pct_zero            l1          l2
0   NMF  Immune       0      H  27.787133   2013.937622    3.977607
1   NMF  Immune       0      W  94.713395  18666.533203  968.526062
2   NMF  Immune       1      H  62.933507   3339.510498   13.088919
3   NMF  Immune       1      W  65.719566   9073.647461  308.197052
4   NMF  Immune       2      H  62.324037   1010.316895    3.418681
Total H/W rows: 320
Wrote H/W stats to: /mnt/projects/debruinz_project/bisholea/capstone/tsv2_benchmarks/immune_k80_nmf_nnae_H_W_stats.csv
  model  tissue  factor  diag_loading
0   NMF  Immune       0      0.000260
1   NMF  Immune       1      0.000248
2   NMF  Immune       2      0.000509
3   NMF  Immune     

In [2]:
# ==========================================================
# Immune: Between-factor cosine similarity (W^T W, H^T H)
# ==========================================================

import os
import numpy as np
import pandas as pd
import scanpy as sc

# ------------------------
# Config
# ------------------------

immune_dir = "/mnt/projects/debruinz_project/Tabula_Immune"

out_between_csv = (
    "/mnt/projects/debruinz_project/bisholea/capstone/"
    "tsv2_benchmarks/immune_k80_nmf_nnae_between_factor_cosine.csv"
)

# single tissue label for downstream R code compatibility
TISSUE = "Immune"

models = [
    # Base NMF
    ("NMF",
     "results_only_sklearn_nmf_k80.h5ad",
     "H_sklearn_nmf_k80", "W_sklearn_nmf_k80"),
    # AE NMF (NNAE)
    ("AE",
     "results_only_no_cond_k80_row_norm_false.h5ad",
     "H_shared_k80", "W_tied_k80"),
]

# ------------------------
# Helpers
# ------------------------

def offdiag_cosine_cols(A):
    """
    A: (n x k), factors are columns.
    Returns off-diagonal cosine similarities from A_norm^T A_norm.
    """
    A = np.asarray(A, dtype=np.float64)
    denom = np.linalg.norm(A, axis=0)
    denom[denom == 0] = np.nan
    An = A / denom
    G = An.T @ An
    iu = np.triu_indices(G.shape[0], k=1)
    v = G[iu]
    return v[np.isfinite(v)]

# ------------------------
# Compute similarities
# ------------------------

rows = []

for model_code, filename, h_key, w_key in models:
    h5ad_path = os.path.join(immune_dir, filename)
    if not os.path.exists(h5ad_path):
        print(f"[WARN] Missing file: {h5ad_path}")
        continue

    print(f"Processing {model_code} (Immune)")
    adata = sc.read_h5ad(h5ad_path)

    # --- H and W ---
    if h_key not in adata.obsm:
        raise KeyError(f"{h_key} not in obsm for {h5ad_path}. keys={list(adata.obsm.keys())}")
    if w_key not in adata.varm:
        raise KeyError(f"{w_key} not in varm for {h5ad_path}. keys={list(adata.varm.keys())}")

    H = np.asarray(adata.obsm[h_key])   # cells x k
    W = np.asarray(adata.varm[w_key])   # genes x k

    # sanity check
    if H.shape[1] != W.shape[1]:
        raise ValueError(f"k mismatch in {h5ad_path}: H={H.shape}, W={W.shape}")

    # H^T H (factor-factor)
    vH = offdiag_cosine_cols(H)
    for x in vH:
        rows.append({
            "model": model_code,
            "tissue": TISSUE,
            "matrix": "H",
            "cos_sim": float(x),
        })

    # W^T W (factor-factor)
    vW = offdiag_cosine_cols(W)
    for x in vW:
        rows.append({
            "model": model_code,
            "tissue": TISSUE,
            "matrix": "W",
            "cos_sim": float(x),
        })

# ------------------------
# Save
# ------------------------

between_df = pd.DataFrame(rows)
print(between_df.head())
print("Total rows:", len(between_df), " (expected 2 models * 2 matrices * C(80,2) =",
      2 * 2 * (80 * 79 // 2), ")")

os.makedirs(os.path.dirname(out_between_csv), exist_ok=True)
between_df.to_csv(out_between_csv, index=False)
print("Wrote:", out_between_csv)


Processing NMF (Immune)
Processing AE (Immune)
  model  tissue matrix   cos_sim
0   NMF  Immune      H  0.036067
1   NMF  Immune      H  0.150158
2   NMF  Immune      H  0.259196
3   NMF  Immune      H  0.274047
4   NMF  Immune      H  0.080470
Total rows: 12640  (expected 2 models * 2 matrices * C(80,2) = 12640 )
Wrote: /mnt/projects/debruinz_project/bisholea/capstone/tsv2_benchmarks/immune_k80_nmf_nnae_between_factor_cosine.csv
