In [1]:
# ============================================================
# QA Level2_final coherence + QA específica para HSCs
# Salidas (summary_tables_final/):
#  1) QA_Level2final_neighbor_purity.csv
#  2) QA_HSCs_neighbor_label_distribution.csv
#  3) QA_Level2final_umap_compactness.csv
#  4) QA_HSCs_marker_panel_vs_rest.csv   (+ recomendación automática de 2 marcadores)
# ============================================================

from pathlib import Path
import json
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp

# Current working directory; it is the starting point for the upward search
# for the project root performed by find_project_root below.
NOTEBOOK_DIR = Path.cwd()

def find_project_root(start: Path) -> Path:
    """Return the nearest directory, from *start* upwards, holding 'data_processed'.

    The search checks *start* itself first and then each of its parents,
    closest first.

    Raises:
        FileNotFoundError: if no directory on that chain contains an entry
            named 'data_processed'.
    """
    candidates = (start, *start.parents)
    root = next(
        (folder for folder in candidates if (folder / "data_processed").exists()),
        None,
    )
    if root is None:
        raise FileNotFoundError(f"No encuentro 'data_processed' subiendo desde: {start}")
    return root

# Project layout, resolved relative to the detected root.
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
DATA_PROCESSED = PROJECT_ROOT / "data_processed"
OUT_SUM = PROJECT_ROOT / "summary_tables_final"
OUT_SUM.mkdir(exist_ok=True)

# Inputs consumed by this QA script.
OUT_FILTER = DATA_PROCESSED / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad"
MAP_PATH = OUT_SUM / "Level2_final_map.json"
EMB_PATH = OUT_SUM / "UMAP_Harmony_embeddings.csv"

for _tag, _path in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("OUT_FILTER :", OUT_FILTER),
    ("MAP_PATH   :", MAP_PATH),
    ("EMB_PATH   :", EMB_PATH),
):
    print(_tag, _path)

# Fail early, in a fixed order, if any required input is missing.
for _prefix, _path in (
    ("No existe OUT_FILTER: ", OUT_FILTER),
    ("No existe Level2_final_map.json: ", MAP_PATH),
    ("No existe UMAP_Harmony_embeddings.csv: ", EMB_PATH),
):
    if not _path.exists():
        raise FileNotFoundError(f"{_prefix}{_path}")

# Mapping applied later to the 'Level2' labels to obtain 'Level2_final'.
level2_map = json.loads(MAP_PATH.read_text(encoding="utf-8"))

# -----------------------------
# 1) Level2_final coherence via neighbour purity (Harmony graph)
# -----------------------------
# Open the AnnData file in backed read mode; every error path below closes
# the file handle before raising.
adata_b = sc.read_h5ad(OUT_FILTER, backed="r")

need = ["patientID", "disease", "Level1_refined", "Level2"]
missing = [c for c in need if c not in adata_b.obs.columns]
if missing:
    adata_b.file.close()
    raise KeyError(f"Faltan columnas obs: {missing}")

obs = adata_b.obs[need].copy()

# IMPORTANT: do NOT cast Level2 to str before mapping (that would turn NaN
# into the literal string "nan" before the map is applied).
l2_obj = obs["Level2"].astype("object")
obs["Level2_final"] = l2_obj.replace(level2_map)
obs["Level2_final"] = obs["Level2_final"].astype("object")

# Pick the connectivities graph: prefer harmony_connectivities when present.
if "harmony_connectivities" in adata_b.obsp.keys():
    C = adata_b.obsp["harmony_connectivities"]
    graph_name = "harmony_connectivities"
elif "connectivities" in adata_b.obsp.keys():
    C = adata_b.obsp["connectivities"]
    graph_name = "connectivities"
else:
    adata_b.file.close()
    raise KeyError("No encuentro connectivities ni harmony_connectivities en obsp.")

print(f"[OK] Usando grafo: obsp['{graph_name}'] -> {C.shape}")


def _n_neighbors_for_graph(uns, graph_key, default=15):
    """Return the n_neighbors recorded in *uns* for the graph stored under *graph_key*.

    Lookup order:
      1. any dict entry of *uns* whose 'connectivities_key' equals *graph_key*
         (so the value describes the graph that is actually used);
      2. uns['neighbors'] (the previous behaviour of this script);
      3. *default*.
    Entries without a usable params['n_neighbors'] are skipped.
    """
    entries = [
        entry
        for entry in uns.values()
        if isinstance(entry, dict) and entry.get("connectivities_key") == graph_key
    ]
    legacy = uns.get("neighbors")
    if isinstance(legacy, dict):
        entries.append(legacy)

    for entry in entries:
        params = entry.get("params")
        if not isinstance(params, dict):
            continue
        try:
            return int(params["n_neighbors"])
        except (KeyError, TypeError, ValueError):
            continue
    return default


# K: use the n_neighbors stored for the selected graph when available,
# otherwise fall back to 15.
K = _n_neighbors_for_graph(adata_b.uns, graph_name, default=15)
print("K (n_neighbors):", K)

def topk_neighbors_from_connectivities(C, k=15):
    """Return, for every row of *C*, the column indices of its strongest neighbours.

    Args:
        C: square connectivities matrix (scipy sparse, or anything accepted by
            ``scipy.sparse.csr_matrix``). Entry (i, j) is the weight of the
            edge from i to j.
        k: maximum number of neighbours kept per row.

    Returns:
        A list with one integer array per row. The diagonal entry and
        explicitly stored zero weights are never reported as neighbours.
        Rows with more than *k* candidates are cut to the *k* largest weights,
        ordered by decreasing weight; rows with at most *k* candidates keep
        the order in which the CSR matrix stores them.

    Raises:
        ValueError: if *k* is negative.
    """
    if k < 0:
        # A negative k would make the partition/slice below select an
        # arbitrary subset instead of failing.
        raise ValueError(f"k must be non-negative, got {k}")

    if not sp.issparse(C):
        C = sp.csr_matrix(C)
    C = C.tocsr()
    n = C.shape[0]
    indptr, indices, weights = C.indptr, C.indices, C.data
    neigh_idx = []

    for i in range(n):
        start, end = indptr[i], indptr[i + 1]
        cols = indices[start:end]
        data = weights[start:end]

        # Drop the self edge and stored-but-zero weights (no real connection).
        keep = (cols != i) & (data != 0)
        cols = cols[keep]
        data = data[keep]

        if k == 0 or cols.size == 0:
            neigh_idx.append(np.array([], dtype=int))
            continue

        # Limit to the top-k by weight, strongest first.
        if cols.size > k:
            sel = np.argpartition(-data, k - 1)[:k]
            cols = cols[sel]
            data = data[sel]
            cols = cols[np.argsort(-data)]

        neigh_idx.append(cols.astype(int))

    return neigh_idx

neigh_idx = topk_neighbors_from_connectivities(C, k=K)

# Labels as plain strings (missing values become the string "nan" here).
labels = obs["Level2_final"].astype(str).to_numpy()
all_levels = pd.Index(sorted(pd.unique(labels)))

# Per-cell fraction of neighbours that share the cell's own label; cells
# without neighbours stay NaN.
n_cells_total = labels.shape[0]
same_frac = np.full(n_cells_total, np.nan, dtype=float)
for i in range(n_cells_total):
    nb = neigh_idx[i]
    if nb.size > 0:
        same_frac[i] = np.mean(labels[nb] == labels[i])

# One summary row per label (NaN-aware statistics of the per-cell fractions).
purity_rows = []
for lvl in all_levels:
    in_level = labels == lvl
    frac_level = same_frac[in_level]
    row = {
        "Level2_final": lvl,
        "n_cells": int(in_level.sum()),
        "mean_sameLabel_inNeighbors": float(np.nanmean(frac_level)),
        "median_sameLabel_inNeighbors": float(np.nanmedian(frac_level)),
        "p25_sameLabel_inNeighbors": float(np.nanpercentile(frac_level, 25)),
        "p75_sameLabel_inNeighbors": float(np.nanpercentile(frac_level, 75)),
        "na_cells": int(np.isnan(frac_level).sum()),
    }
    purity_rows.append(row)

# Least pure labels first; ties broken by larger population.
df_purity = pd.DataFrame(purity_rows)
df_purity = df_purity.sort_values(
    ["mean_sameLabel_inNeighbors", "n_cells"], ascending=[True, False]
)

purity_path = OUT_SUM / "QA_Level2final_neighbor_purity.csv"
df_purity.to_csv(purity_path, index=False)
print("\nSaved:", purity_path)
print(df_purity.head(10).to_string(index=False))

# Distribution of neighbour labels for HSCs (which labels do they sit next to?).
hsc_mask = labels == "HSCs"
if not hsc_mask.any():
    print("\n[WARN] No hay HSCs en Level2_final (n=0). Se omite QA_HSCs_neighbor_label_distribution.csv")
else:
    # Count one edge per (HSC cell, neighbour) pair, keyed by neighbour label.
    counts = {}
    idx_hsc = np.where(hsc_mask)[0]
    for cell in idx_hsc:
        for lab in labels[neigh_idx[cell]]:
            counts[lab] = counts.get(lab, 0) + 1

    df_hsc_nb = pd.DataFrame(
        {"neighbor_label": list(counts.keys()), "n_edges": list(counts.values())}
    )
    df_hsc_nb = df_hsc_nb.sort_values("n_edges", ascending=False)
    total_edges = df_hsc_nb["n_edges"].sum()
    df_hsc_nb["share"] = df_hsc_nb["n_edges"] / total_edges

    hsc_nb_path = OUT_SUM / "QA_HSCs_neighbor_label_distribution.csv"
    df_hsc_nb.to_csv(hsc_nb_path, index=False)
    print("\nSaved:", hsc_nb_path)
    print(df_hsc_nb.head(10).to_string(index=False))

# -----------------------------
# 2) UMAP compactness (no colours): mean radius to centroid
# -----------------------------
# Peek at the header first: if the CSV lacks Level2_final it is rebuilt
# from Level2 with the same mapping used for the AnnData labels.
emb_cols = pd.read_csv(EMB_PATH, nrows=1).columns.tolist()
usecols = ["UMAP1_harmony", "UMAP2_harmony"]
if "Level2_final" in emb_cols:
    usecols = ["Level2_final"] + usecols
elif "Level2" in emb_cols:
    usecols = ["Level2"] + usecols
else:
    # Close the backed AnnData file before aborting, as the other error
    # paths of this script do.
    adata_b.file.close()
    raise KeyError("UMAP_Harmony_embeddings.csv no contiene ni Level2_final ni Level2.")

emb = pd.read_csv(EMB_PATH, usecols=usecols)

if "Level2_final" not in emb.columns:
    emb["Level2_final"] = emb["Level2"].astype("object").replace(level2_map).astype(str)

def compactness(df):
    """Summarise how tightly the rows of *df* cluster in the Harmony UMAP plane.

    Reads the 'UMAP1_harmony' and 'UMAP2_harmony' columns, computes each
    row's Euclidean distance to the centroid (mean point) of the group and
    returns a Series with the number of rows plus the mean, median, 75th and
    95th percentile of those distances.
    """
    coords = df[["UMAP1_harmony", "UMAP2_harmony"]].to_numpy(float)
    offsets = coords - coords.mean(axis=0)
    radii = np.sqrt(np.square(offsets).sum(axis=1))
    stats = {
        "n_cells": int(coords.shape[0]),
        "mean_radius": float(radii.mean()),
        "median_radius": float(np.median(radii)),
        "p75_radius": float(np.percentile(radii, 75)),
        "p95_radius": float(np.percentile(radii, 95)),
    }
    return pd.Series(stats)

# Select the two UMAP columns before apply so the function never operates on
# the grouping column; this avoids the pandas deprecation warning for
# DataFrameGroupBy.apply and leaves the result unchanged, because
# compactness only reads these two columns.
df_comp = (
    emb.groupby("Level2_final", sort=False)[["UMAP1_harmony", "UMAP2_harmony"]]
    .apply(compactness)
    .reset_index()
)
comp_path = OUT_SUM / "QA_Level2final_umap_compactness.csv"
df_comp.to_csv(comp_path, index=False)
print("\nSaved:", comp_path)
# Show the most spread-out labels first.
print(df_comp.sort_values("mean_radius", ascending=False).head(10).to_string(index=False))

# -----------------------------
# 3) Numeric panel for HSCs: alternative markers vs. the rest
#    (plus an automatic suggestion of 2 genes to replace CD34/KIT if those
#    perform poorly)
# -----------------------------
# Without any HSC cell there is nothing to compare: release the backed file
# and stop with a zero exit status.
if not hsc_mask.any():
    print("\n[WARN] No hay HSCs -> se omite panel de marcadores HSCs.")
    adata_b.file.close()
    raise SystemExit(0)

# Gene symbols evaluated in the panel, CD34/KIT first.
CANDIDATES = [
    "CD34",
    "KIT",
    "SOX4",
    "SPINK2",
    "AVP",
    "GATA2",
    "LMO2",
    "TAL1",
    "MEIS1",
    "MPL",
    "HOPX",
    "HMGB2",
    "TMSB10",
    "TYMP",
]

def symbol_to_varname(adata, symbol):
    """Translate a gene *symbol* into the matching entry of ``adata.var_names``.

    The symbol is returned unchanged when it already is a var name. Otherwise,
    if ``adata.var`` has a 'symbol' column, the var name of the first row
    whose symbol (compared as a string) equals *symbol* is returned. ``None``
    means the gene could not be resolved.
    """
    if symbol in adata.var_names:
        return symbol
    if "symbol" not in adata.var.columns:
        return None

    symbols = adata.var["symbol"].astype(str).to_numpy()
    matches = np.flatnonzero(symbols == symbol)
    return adata.var_names[matches[0]] if matches.size > 0 else None

# Resolve every candidate symbol; keep only those present in the dataset,
# preserving the CANDIDATES order in both parallel lists.
resolved = [(sym, symbol_to_varname(adata_b, sym)) for sym in CANDIDATES]
found = [(sym, var) for sym, var in resolved if var is not None]
sym_kept = [sym for sym, _ in found]
varnames = [var for _, var in found]

print("\nHSC panel: símbolos encontrados =", len(sym_kept), "/", len(CANDIDATES))
if not sym_kept:
    adata_b.file.close()
    raise RuntimeError("No encontré ninguno de los genes candidatos en var_names/var['symbol'].")

# Load only the candidate genes into memory, then release the backed file.
adata_small = adata_b[:, varnames].to_memory()
adata_b.file.close()

# Expression matrix to use: the 'log1p_10k' layer when available, else X.
if "log1p_10k" in adata_small.layers.keys():
    X = adata_small.layers["log1p_10k"]
else:
    X = adata_small.X
X = X.tocsr() if sp.issparse(X) else sp.csr_matrix(X)

# Row masks: HSC cells versus every other cell.
mask_hsc = labels == "HSCs"
mask_rest = np.logical_not(mask_hsc)

def mean_and_frac(Xsub):
    """Return per-column ``(mean, fraction of rows with a value > 0)`` of *Xsub*.

    Both results are flat 1-D arrays with one entry per column.
    """
    is_positive = Xsub > 0
    col_mean = np.asarray(Xsub.mean(axis=0)).reshape(-1)
    col_frac = np.asarray(is_positive.mean(axis=0)).reshape(-1)
    return col_mean, col_frac

# Per-gene statistics inside HSCs and in every other cell.
mean_h, frac_h = mean_and_frac(X[mask_hsc])
mean_r, frac_r = mean_and_frac(X[mask_rest])

# One row per resolved marker; deltas are HSCs minus rest.
df_hsc_panel = pd.DataFrame({
    "marker": sym_kept,
    "mean_log1p_HSCs": mean_h,
    "frac_nonzero_HSCs": frac_h,
    "mean_log1p_rest": mean_r,
    "frac_nonzero_rest": frac_r,
    "delta_mean": mean_h - mean_r,
    "delta_frac": frac_h - frac_r,
})

# The same ordering (highest HSC mean, then highest HSC fraction) is used for
# both the saved CSV and the console preview.
panel_path = OUT_SUM / "QA_HSCs_marker_panel_vs_rest.csv"
panel_sorted = df_hsc_panel.sort_values(
    ["mean_log1p_HSCs", "frac_nonzero_HSCs"], ascending=False
)
panel_sorted.to_csv(panel_path, index=False)

print("\nSaved:", panel_path)
print(panel_sorted.head(10).to_string(index=False))

# Automatic suggestion: top 2 by "signal" within HSCs (mean * fraction).
# The score column is added after the CSV is written, so it is not exported.
# NOTE(review): the score ignores the "rest" columns, so a gene expressed
# everywhere can rank first; confirm this is the intended criterion.
df_hsc_panel["score"] = df_hsc_panel["mean_log1p_HSCs"] * df_hsc_panel["frac_nonzero_HSCs"]
ranked = df_hsc_panel.sort_values("score", ascending=False)
top2 = ranked["marker"].head(2).tolist()
print("\n[RECOMENDACIÓN] Marcadores HSCs (top2 por score=mean*frac):", top2)

print("\n[OK] QA coherence + HSCs listo.")


PROJECT_ROOT: D:\Users\Coni\Documents\TFM_CirrhosIS
OUT_FILTER : D:\Users\Coni\Documents\TFM_CirrhosIS\data_processed\TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad
MAP_PATH   : D:\Users\Coni\Documents\TFM_CirrhosIS\summary_tables_final\Level2_final_map.json
EMB_PATH   : D:\Users\Coni\Documents\TFM_CirrhosIS\summary_tables_final\UMAP_Harmony_embeddings.csv
[OK] Usando grafo: obsp['harmony_connectivities'] -> (220637, 220637)
K (n_neighbors): 15

Saved: D:\Users\Coni\Documents\TFM_CirrhosIS\summary_tables_final\QA_Level2final_neighbor_purity.csv
Level2_final  n_cells  mean_sameLabel_inNeighbors  median_sameLabel_inNeighbors  p25_sameLabel_inNeighbors  p75_sameLabel_inNeighbors  na_cells
     B_Other      799                    0.635924                      0.785714                   0.266667                   1.000000         0
 B_Activated      598                    0.664103                      0.857143                   0.400000                   0.933333         0
        MAIT     1

  df_comp = emb.groupby("Level2_final", sort=False).apply(compactness).reset_index()



Saved: D:\Users\Coni\Documents\TFM_CirrhosIS\summary_tables_final\QA_HSCs_marker_panel_vs_rest.csv
marker  mean_log1p_HSCs  frac_nonzero_HSCs  mean_log1p_rest  frac_nonzero_rest  delta_mean  delta_frac
TMSB10         2.364775           0.974359         3.300970           0.965594   -0.936195    0.008765
 HMGB2         0.181501           0.307692         0.488691           0.425580   -0.307190   -0.117887
  HOPX         0.124490           0.128205         0.227922           0.145495   -0.103432   -0.017290
  SOX4         0.094076           0.102564         0.118649           0.120835   -0.024573   -0.018271
  TYMP         0.075565           0.102564         0.224821           0.222300   -0.149256   -0.119736
   KIT         0.071109           0.051282         0.003657           0.003622    0.067453    0.047660
  TAL1         0.059300           0.076923         0.000529           0.000598    0.058770    0.076325
 GATA2         0.012471           0.025641         0.000729           0.0005