### 1. Imports + paths del repo + directorios de salida

In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp

from src.paths import project_paths

# Report the scanpy version used for this run.
print("Scanpy:", sc.__version__)

# Resolve the repository layout from the current working directory.
# The mapping returned by project_paths is indexed by the five keys below.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[name]
    for name in ("PROJECT_ROOT", "CONFIG_DIR", "DATA_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# Every table written by this notebook goes under this folder.
OUT_DIR = RESULTS_DIR / "summary_tables" / "qa_level2final_coherence_hscs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the run log shows where things live.
for tag, location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("CONFIG_DIR  :", CONFIG_DIR),
    ("RESULTS_DIR :", RESULTS_DIR),
    ("OUT_DIR     :", OUT_DIR),
):
    print(tag, location)

### 2. Leer config + resolver inputs + keys

In [None]:
def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` YAML file into a dict of strings.

    Only single-line ``key: value`` entries are understood. Blank lines,
    full-line comments and lines without a colon are skipped. Values are
    returned as strings with surrounding quotes removed.

    Trailing inline comments are dropped: for an unquoted value, everything
    from a ``#`` that starts the value or follows whitespace is discarded;
    for a quoted value, a ``#`` comment after the closing quote is discarded.
    A ``#`` inside quotes, or glued to preceding text (for example a URL
    fragment), is kept.

    Args:
        path: Location of the YAML file, read as UTF-8.

    Returns:
        Mapping of stripped keys to cleaned string values. Later duplicates
        overwrite earlier ones.
    """

    def _scalar(raw: str) -> str:
        # Clean one raw value: drop a trailing comment, then the quotes.
        raw = raw.strip()
        if raw[:1] in ('"', "'"):
            # Quoted scalar: only text after the closing quote can be a comment.
            closing = raw.find(raw[0], 1)
            if closing != -1 and raw[closing + 1:].lstrip().startswith("#"):
                raw = raw[: closing + 1]
        else:
            # Plain scalar: a comment starts at '#' preceded by whitespace
            # (or at the very start of the value).
            for pos, char in enumerate(raw):
                if char == "#" and (pos == 0 or raw[pos - 1].isspace()):
                    raw = raw[:pos].rstrip()
                    break
        return raw.strip('"').strip("'")

    cfg = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#") or ":" not in line:
            continue
        # Split on the first colon only so values may contain colons.
        key, _, value = line.partition(":")
        cfg[key.strip()] = _scalar(value)
    return cfg

# Read the flat project config; stop early when the file is missing.
cfg_path = CONFIG_DIR / "config.yaml"
if cfg_path.exists():
    CFG = load_simple_yaml(cfg_path)
else:
    raise FileNotFoundError(f"Falta {cfg_path}")

# Main input: output of NB10.
OUT_FILTER_NAME = CFG.get(
    "main_filtered_for_analysis_h5ad_filename",
    "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",
)
OUT_FILTER = RESULTS_DIR / OUT_FILTER_NAME

# obs column names, each overridable from the config.
PATIENT_KEY = CFG.get("patient_id_key", "patientID")
DISEASE_KEY = CFG.get("disease_key", "disease")
LEVEL1_REFINED_KEY = CFG.get("level1_refined_key", "Level1_refined")
LEVEL2_KEY = CFG.get("level2_key", "Level2")

# Expression layer: analysis_layer wins, then l2_base_layer, then the default.
_fallback_layer = CFG.get("l2_base_layer", "log1p_10k")
LAYER = CFG.get("analysis_layer", _fallback_layer)

# Neighbors key (used to locate obsp['{key}_connectivities']).
NEIGHBORS_KEY = CFG.get("neighbors_key", "harmony")

for tag, setting in (
    ("OUT_FILTER        :", OUT_FILTER),
    ("PATIENT_KEY       :", PATIENT_KEY),
    ("DISEASE_KEY       :", DISEASE_KEY),
    ("LEVEL1_REFINED_KEY:", LEVEL1_REFINED_KEY),
    ("LEVEL2_KEY        :", LEVEL2_KEY),
    ("LAYER             :", LAYER),
    ("NEIGHBORS_KEY     :", NEIGHBORS_KEY),
):
    print(tag, setting)

if not OUT_FILTER.exists():
    raise FileNotFoundError(f"No existe OUT_FILTER:\n{OUT_FILTER}")

# Level2_final_map.json: the first existing candidate wins.
summary_dir = RESULTS_DIR / "summary_tables"
candidate_maps = [
    summary_dir / "conv_t_other_cleanup" / "Level2_final_map.json",
    summary_dir / "Level2_final_map.json",
]
MAP_PATH = None
for candidate in candidate_maps:
    if candidate.exists():
        MAP_PATH = candidate
        break
if MAP_PATH is None:
    tried = "\n".join(f"- {x}" for x in candidate_maps)
    raise FileNotFoundError("No encuentro Level2_final_map.json.\nProbé:\n" + tried)

with open(MAP_PATH, "r", encoding="utf-8") as map_file:
    level2_map = json.load(map_file)

print("MAP_PATH:", MAP_PATH)
print("Level2_final_map:", level2_map)

# Harmony UMAP embeddings (output of NB13): same first-match lookup.
candidate_emb = [
    summary_dir / "umap_harmony_final" / "UMAP_Harmony_embeddings.csv",
    summary_dir / "UMAP_Harmony_embeddings.csv",
]
EMB_PATH = None
for candidate in candidate_emb:
    if candidate.exists():
        EMB_PATH = candidate
        break
if EMB_PATH is None:
    tried = "\n".join(f"- {x}" for x in candidate_emb)
    raise FileNotFoundError("No encuentro UMAP_Harmony_embeddings.csv.\nProbé:\n" + tried)

print("EMB_PATH:", EMB_PATH)

### 3. Coherencia Level2_final por pureza de vecinos

In [None]:
# Neighbour-purity QA for Level2_final: for every cell, measure which fraction
# of its top-K graph neighbours carries the same label, summarise per label,
# and tabulate which labels surround the cells labelled "HSCs".
need = [PATIENT_KEY, DISEASE_KEY, LEVEL1_REFINED_KEY, LEVEL2_KEY]


def _resolve_n_neighbors(uns, graph_name, default=15):
    """Return the ``n_neighbors`` recorded in ``uns`` for ``graph_name``.

    For a graph named ``<base>_connectivities`` the lookup tries, in order,
    ``uns['<base>_neighbors']``, ``uns['<base>']`` (where scanpy's
    ``pp.neighbors(key_added=<base>)`` stores its params) and finally
    ``uns['neighbors']``. The first entry that has a ``params`` mapping wins;
    if that mapping lacks ``n_neighbors``, ``default`` is returned.
    Unreadable entries are reported and skipped instead of being silently
    ignored.
    """
    suffix = "_connectivities"
    uns_keys = []
    if graph_name.endswith(suffix):
        base = graph_name.replace(suffix, "")
        uns_keys.extend([f"{base}_neighbors", base])
    uns_keys.append("neighbors")

    for uns_key in uns_keys:
        try:
            if uns_key in uns and "params" in uns[uns_key]:
                return int(uns[uns_key]["params"].get("n_neighbors", default))
        except (AttributeError, KeyError, TypeError, ValueError) as err:
            print(f"[WARN] No pude leer n_neighbors de uns['{uns_key}']: {err!r}")
    return default


def topk_neighbors_from_connectivities(C, k=15):
    """List, for every row of ``C``, the column indices of its strongest links.

    Args:
        C: Square connectivity matrix (sparse or dense). Dense input is
            converted to CSR.
        k: Maximum number of neighbours kept per row.

    Returns:
        A list with one integer array per row. Self-links are removed. Rows
        with more than ``k`` entries are cut to the ``k`` largest weights,
        ordered by decreasing weight; shorter rows keep their stored order.
        Rows without links yield an empty array.
    """
    if not sp.issparse(C):
        C = sp.csr_matrix(C)
    C = C.tocsr()

    neigh_idx = []
    for i in range(C.shape[0]):
        start, end = C.indptr[i], C.indptr[i + 1]
        cols = C.indices[start:end]
        data = C.data[start:end]

        # Drop the self-link, if stored.
        keep = cols != i
        cols, data = cols[keep], data[keep]

        # Keep only the k heaviest links, sorted by decreasing weight.
        if cols.size > k:
            sel = np.argpartition(-data, k - 1)[:k]
            cols = cols[sel][np.argsort(-data[sel])]

        neigh_idx.append(cols.astype(int))
    return neigh_idx


adata_b = sc.read_h5ad(OUT_FILTER, backed="r")
try:
    missing = [c for c in need if c not in adata_b.obs.columns]
    if missing:
        raise KeyError(f"Faltan columnas obs: {missing}")

    obs = adata_b.obs[need].copy()

    # Map on an object-typed column so the mapping is applied to the raw
    # values rather than to their string form.
    l2_obj = obs[LEVEL2_KEY].astype("object")
    obs["Level2_final"] = l2_obj.replace(level2_map).astype("object")

    # Pick the connectivity graph: configured key first, then known fallbacks.
    candidates = [
        f"{NEIGHBORS_KEY}_connectivities",
        "harmony_connectivities",
        "connectivities",
    ]
    graph_name = next((k for k in candidates if k in adata_b.obsp.keys()), None)
    if graph_name is None:
        raise KeyError(
            "No encuentro connectivities en obsp.\nProbé:\n"
            + "\n".join([f"- obsp['{k}']" for k in candidates])
        )
    C = adata_b.obsp[graph_name]

    print(f"[OK] Usando grafo: obsp['{graph_name}'] -> {C.shape}")

    # Use the K the graph was built with when it is recorded; otherwise 15.
    K = _resolve_n_neighbors(adata_b.uns, graph_name, default=15)
    print("K (n_neighbors):", K)

    neigh_idx = topk_neighbors_from_connectivities(C, k=K)

    labels = obs["Level2_final"].astype(str).to_numpy()
    all_levels = pd.Index(sorted(pd.unique(labels)))

    # Per-cell fraction of neighbours sharing the cell's label; cells without
    # neighbours stay NaN and are counted in `na_cells` below.
    same_frac = np.full(labels.shape[0], np.nan, dtype=float)
    for i, nb in enumerate(neigh_idx):
        if nb.size:
            same_frac[i] = np.mean(labels[nb] == labels[i])

    purity_rows = []
    for lvl in all_levels:
        in_level = labels == lvl
        fractions = same_frac[in_level]
        purity_rows.append({
            "Level2_final": lvl,
            "n_cells": int(in_level.sum()),
            "mean_sameLabel_inNeighbors": float(np.nanmean(fractions)),
            "median_sameLabel_inNeighbors": float(np.nanmedian(fractions)),
            "p25_sameLabel_inNeighbors": float(np.nanpercentile(fractions, 25)),
            "p75_sameLabel_inNeighbors": float(np.nanpercentile(fractions, 75)),
            "na_cells": int(np.isnan(fractions).sum()),
        })

    # Least coherent labels first; ties broken by larger groups first.
    df_purity = pd.DataFrame(purity_rows).sort_values(
        ["mean_sameLabel_inNeighbors", "n_cells"], ascending=[True, False]
    )

    purity_path = OUT_DIR / "QA_Level2final_neighbor_purity.csv"
    df_purity.to_csv(purity_path, index=False)
    print("Saved:", purity_path)
    print(df_purity.head(10).to_string(index=False))

    # Label distribution over all neighbour links that start at an HSC cell.
    hsc_mask = labels == "HSCs"
    if int(hsc_mask.sum()) > 0:
        counts = {}
        idx_hsc = np.where(hsc_mask)[0]
        for i in idx_hsc:
            for j in neigh_idx[i]:
                lab = labels[j]
                counts[lab] = counts.get(lab, 0) + 1

        df_hsc_nb = (
            pd.DataFrame({"neighbor_label": list(counts.keys()), "n_edges": list(counts.values())})
              .sort_values("n_edges", ascending=False)
        )
        df_hsc_nb["share"] = df_hsc_nb["n_edges"] / df_hsc_nb["n_edges"].sum()

        hsc_nb_path = OUT_DIR / "QA_HSCs_neighbor_label_distribution.csv"
        df_hsc_nb.to_csv(hsc_nb_path, index=False)
        print("Saved:", hsc_nb_path)
        print(df_hsc_nb.head(10).to_string(index=False))
    else:
        print("[WARN] No hay HSCs en Level2_final (n=0). Se omite QA_HSCs_neighbor_label_distribution.csv")

finally:
    # Best-effort close of the backed file: a failure here must not mask an
    # error raised in the body above.
    try:
        adata_b.file.close()
    except Exception:
        pass

### 4. Compactación en UMAP

In [None]:
# Load only the columns that are needed: peek at the header first, then read
# the two UMAP coordinates plus whichever label column the file provides.
emb_cols = pd.read_csv(EMB_PATH, nrows=1).columns.tolist()

# Prefer the final labels; otherwise fall back to the raw Level2 column.
label_col = next((c for c in ("Level2_final", LEVEL2_KEY) if c in emb_cols), None)
if label_col is None:
    raise KeyError(f"{EMB_PATH.name} no contiene ni 'Level2_final' ni '{LEVEL2_KEY}'.")
usecols = [label_col, "UMAP1_harmony", "UMAP2_harmony"]

emb = pd.read_csv(EMB_PATH, usecols=usecols)

# When only the raw labels are stored, derive Level2_final through the map.
if "Level2_final" not in emb.columns:
    raw_labels = emb[LEVEL2_KEY].astype("object")
    emb["Level2_final"] = raw_labels.replace(level2_map).astype(str)

def compactness(df):
    """Summarise how far a group's points lie from their own UMAP centroid.

    Reads the ``UMAP1_harmony`` / ``UMAP2_harmony`` columns of ``df``, takes
    the Euclidean distance of every row to the column-wise mean, and returns
    a Series with the row count and the mean, median, 75th and 95th
    percentile of those distances.
    """
    coords = df[["UMAP1_harmony", "UMAP2_harmony"]].to_numpy(float)
    centroid = coords.mean(axis=0)
    offsets = coords - centroid
    radii = np.sqrt(np.square(offsets).sum(axis=1))

    stats = {
        "n_cells": int(coords.shape[0]),
        "mean_radius": float(radii.mean()),
        "median_radius": float(np.median(radii)),
    }
    for q in (75, 95):
        stats[f"p{q}_radius"] = float(np.percentile(radii, q))
    return pd.Series(stats)

# One row of compactness statistics per Level2_final label, in order of first
# appearance (sort=False).
df_comp = emb.groupby("Level2_final", sort=False).apply(compactness).reset_index()

# `compactness` returns a single Series mixing an int count with float radii,
# so the count comes back as float; restore integers so the CSV does not show
# cell counts as e.g. "123.0".
if "n_cells" in df_comp.columns:
    df_comp["n_cells"] = df_comp["n_cells"].astype(int)

comp_path = OUT_DIR / "QA_Level2final_umap_compactness.csv"
df_comp.to_csv(comp_path, index=False)
print("Saved:", comp_path)
# Show the most spread-out labels first.
print(df_comp.sort_values("mean_radius", ascending=False).head(10).to_string(index=False))

### 5. Panel numérico HSCs vs resto

In [None]:
# Re-open the file in backed mode to pull expression for a handful of genes.
adata_b = sc.read_h5ad(OUT_FILTER, backed="r")
try:
    if LEVEL2_KEY not in adata_b.obs.columns:
        raise KeyError(f"Falta obs['{LEVEL2_KEY}'] en OUT_FILTER.")

    # Level2_final is not persisted in the object, so it is rebuilt from obs.
    # The map is applied on object values first, so NaN is not turned into
    # the string "nan" before mapping.
    l2_obj = adata_b.obs[LEVEL2_KEY].astype("object")
    labels = l2_obj.replace(level2_map).astype("object").astype(str).to_numpy()

    mask_hsc = labels == "HSCs"
    if not mask_hsc.any():
        print("[WARN] No hay HSCs -> se omite panel de marcadores HSCs.")
        raise SystemExit(0)

    CANDIDATES = [
        "CD34", "KIT",
        "SOX4", "SPINK2", "AVP",
        "GATA2", "LMO2", "TAL1", "MEIS1", "MPL", "HOPX",
        "HMGB2", "TMSB10", "TYMP",
    ]

    def symbol_to_varname(adata, symbol):
        """Return the var_name for ``symbol``, or None when it is not found.

        A direct hit in ``adata.var_names`` wins; otherwise the first row
        whose ``var['symbol']`` equals ``symbol`` is used, if that column
        exists.
        """
        if symbol in adata.var_names:
            return symbol
        if "symbol" not in adata.var.columns:
            return None
        matches = np.flatnonzero(adata.var["symbol"].astype(str).to_numpy() == symbol)
        return adata.var_names[matches[0]] if matches.size > 0 else None

    # Keep, in candidate order, only the symbols that resolve to a variable.
    resolved = [(gene, symbol_to_varname(adata_b, gene)) for gene in CANDIDATES]
    sym_kept = [gene for gene, name in resolved if name is not None]
    varnames = [name for gene, name in resolved if name is not None]

    print("HSC panel: símbolos encontrados =", len(sym_kept), "/", len(CANDIDATES))
    if not sym_kept:
        raise RuntimeError("No encontré ninguno de los genes candidatos en var_names/var['symbol'].")

    # Bring only the selected genes into memory.
    adata_small = adata_b[:, varnames].to_memory()

finally:
    # Best-effort close of the backed file handle.
    try:
        adata_b.file.close()
    except Exception:
        pass

# Matrix to use: the configured layer when present, otherwise adata.X.
# The fallback is announced because the output columns are named
# "mean_log1p_*" regardless of which matrix ends up being summarised.
if LAYER in adata_small.layers.keys():
    X = adata_small.layers[LAYER]
else:
    print(f"[WARN] Layer '{LAYER}' no encontrada en layers; se usa adata.X para el panel.")
    X = adata_small.X
if not sp.issparse(X):
    X = sp.csr_matrix(X)
X = X.tocsr()

# Complement of the HSC mask: every other cell.
mask_rest = ~mask_hsc

def mean_and_frac(Xsub):
    """Return per-column mean and per-column fraction of entries above zero.

    Both results are flattened to 1-D arrays with one value per column of
    ``Xsub``.
    """
    column_means = np.asarray(Xsub.mean(axis=0)).reshape(-1)
    is_positive = Xsub > 0
    column_fracs = np.asarray(is_positive.mean(axis=0)).reshape(-1)
    return column_means, column_fracs

# Per-gene summaries for HSC cells and for every other cell.
mean_h, frac_h = mean_and_frac(X[mask_hsc])
mean_r, frac_r = mean_and_frac(X[mask_rest])

df_hsc_panel = pd.DataFrame({
    "marker": sym_kept,
    "mean_log1p_HSCs": mean_h,
    "frac_nonzero_HSCs": frac_h,
    "mean_log1p_rest": mean_r,
    "frac_nonzero_rest": frac_r,
})
# Differences HSCs minus rest, for the mean and for the detection fraction.
df_hsc_panel = df_hsc_panel.assign(
    delta_mean=df_hsc_panel["mean_log1p_HSCs"] - df_hsc_panel["mean_log1p_rest"],
    delta_frac=df_hsc_panel["frac_nonzero_HSCs"] - df_hsc_panel["frac_nonzero_rest"],
)

# Rank once by expression within HSCs; the same ordering is saved and shown.
panel_path = OUT_DIR / "QA_HSCs_marker_panel_vs_rest.csv"
panel_ranked = df_hsc_panel.sort_values(
    by=["mean_log1p_HSCs", "frac_nonzero_HSCs"], ascending=False
)
panel_ranked.to_csv(panel_path, index=False)

print("Saved:", panel_path)
print(panel_ranked.head(10).to_string(index=False))

# Automatic recommendation: top 2 by score = mean * frac (within HSCs).
# The score is added after the CSV is written, so it is not part of the file.
df_hsc_panel["score"] = df_hsc_panel["mean_log1p_HSCs"] * df_hsc_panel["frac_nonzero_HSCs"]
top2 = df_hsc_panel.sort_values("score", ascending=False)["marker"].head(2).tolist()
print("[RECOMENDACIÓN] Marcadores HSCs (top2 por score=mean*frac):", top2)

print("[OK] QA coherence + HSCs listo.")