### 1. Imports + paths del repo + directorios de salida

In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import h5py

from src.paths import project_paths

print("Scanpy:", getattr(sc, "__version__", "unknown"))

# Resolve the repository layout once and unpack the directories used below.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[key]
    for key in ("PROJECT_ROOT", "CONFIG_DIR", "DATA_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# Output locations: main figures and the QA tables of this notebook.
FIG_DIR = FIGURES_DIR / "main"
QA_DIR = RESULTS_DIR / "summary_tables" / "umap_harmony_final"

# Create every output directory up front (no-op when it already exists).
for out_dir in (RESULTS_DIR, FIGURES_DIR, FIG_DIR, QA_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the run log records where things go.
for label, location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("CONFIG_DIR  :", CONFIG_DIR),
    ("RESULTS_DIR :", RESULTS_DIR),
    ("FIG_DIR     :", FIG_DIR),
    ("QA_DIR      :", QA_DIR),
):
    print(label, location)

### 2. Leer config + resolver IN_PATH + cargar Level2_final_map.json (Conv_T_other -> CD4_Memory)

In [None]:
def _strip_inline_comment(value: str) -> str:
    """Drop a trailing ``# comment`` from an already-stripped config value.

    A ``#`` only starts a comment when it is the first character or is
    preceded by whitespace, so ``http://host/p#frag`` is left alone. For a
    value that starts with a quote, only text after the matching closing
    quote can be a comment, so ``"a # b"`` keeps its ``#``.
    """
    if value[:1] in ('"', "'"):
        closing = value.find(value[0], 1)
        if closing != -1 and value[closing + 1:].lstrip().startswith("#"):
            return value[:closing + 1]
        return value
    for pos, char in enumerate(value):
        if char == "#" and (pos == 0 or value[pos - 1].isspace()):
            return value[:pos].rstrip()
    return value


def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` config file into a dict of strings.

    Only a minimal YAML subset is handled: one ``key: value`` pair per line.
    Blank lines, full-line ``#`` comments and lines without a colon are
    skipped. The line is split on the first colon only, so values may contain
    colons themselves. Trailing inline comments are removed and surrounding
    quotes are stripped. No type conversion is done: every value is a string.

    Args:
        path: Location of the config file (read as UTF-8).

    Returns:
        Mapping from key to string value; later duplicates overwrite earlier
        ones.
    """
    cfg = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or ":" not in line:
            continue
        key, value = line.split(":", 1)
        # Without this, "n_pcs: 50  # comment" would yield "50  # comment"
        # and break the int(...) conversions done by the callers.
        value = _strip_inline_comment(value.strip())
        cfg[key.strip()] = value.strip('"').strip("'")
    return cfg

# Load the project configuration; it is mandatory for this notebook.
cfg_path = CONFIG_DIR / "config.yaml"
if not cfg_path.exists():
    raise FileNotFoundError(f"Falta {cfg_path}")
CFG = load_simple_yaml(cfg_path)

# Input: the NB10 output (filtered object, RBC removed, no doublets).
IN_NAME = CFG.get(
    "main_filtered_for_analysis_h5ad_filename",
    "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",
)
IN_PATH = RESULTS_DIR / IN_NAME

# obs column names, each with a fallback when the config omits it.
PATIENT_KEY = CFG.get("patient_id_key", "patientID")
DISEASE_KEY = CFG.get("disease_key", "disease")
LEVEL2_KEY = CFG.get("level2_key", "Level2")
LEVEL1_REFINED_KEY = CFG.get("level1_refined_key", "Level1_refined")

# obsm key of the PCA embedding (read later through h5py).
PCA_KEY = CFG.get("pca_key", "X_pca")

# Harmony / UMAP parameters; the batch key defaults to the patient key.
BATCH_KEY = CFG.get("umap_final_harmony_batch_key", PATIENT_KEY)
N_PCS = int(CFG.get("harmony_n_pcs", "50"))
N_NEIGHBORS = int(CFG.get("umap_n_neighbors", "15"))
RANDOM_STATE = int(CFG.get("umap_random_state", "0"))
MAX_ITER_HMY = int(CFG.get("harmony_max_iter", "20"))

# Echo the effective settings for the run log.
for setting_label, setting_value in (
    ("IN_PATH           :", IN_PATH),
    ("PATIENT_KEY       :", PATIENT_KEY),
    ("DISEASE_KEY       :", DISEASE_KEY),
    ("LEVEL1_REFINED_KEY:", LEVEL1_REFINED_KEY),
    ("LEVEL2_KEY        :", LEVEL2_KEY),
    ("PCA_KEY           :", PCA_KEY),
    ("BATCH_KEY (FINAL) :", BATCH_KEY),
    ("N_PCS             :", N_PCS),
    ("N_NEIGHBORS       :", N_NEIGHBORS),
    ("RANDOM_STATE      :", RANDOM_STATE),
    ("MAX_ITER_HMY      :", MAX_ITER_HMY),
):
    print(setting_label, setting_value)

if not IN_PATH.exists():
    raise FileNotFoundError(f"No existe IN_PATH:\n{IN_PATH}")

# Level2_final mapping (written by the Conv_T_other cleanup step): use the
# first candidate location that exists.
summary_tables_dir = RESULTS_DIR / "summary_tables"
candidate_maps = [
    summary_tables_dir / "conv_t_other_cleanup" / "Level2_final_map.json",
    summary_tables_dir / "Level2_final_map.json",
]
MAP_PATH = None
for candidate in candidate_maps:
    if candidate.exists():
        MAP_PATH = candidate
        break
if MAP_PATH is None:
    tried = "\n".join(f"- {x}" for x in candidate_maps)
    raise FileNotFoundError(
        "No encuentro Level2_final_map.json en ubicaciones esperadas.\nProbé:\n" + tried
    )

level2_map = json.loads(MAP_PATH.read_text(encoding="utf-8"))

print("MAP_PATH:", MAP_PATH)
print("Level2_final_map.json loaded:", level2_map)

### 3. Lectura mínima vía h5py (obs + X_pca) + construir Level2_final + excluir RBC solo para UMAP

In [None]:
def _read_elem(h5obj):
    """
    Lee un elemento AnnData desde h5py sin materializar /layers.
    Funciona en anndata>=0.8 (rutas pueden variar).
    """
    try:
        from anndata.experimental import read_elem as _re
        return _re(h5obj)
    except Exception:
        try:
            from anndata._io.specs import read_elem as _re
            return _re(h5obj)
        except Exception as e:
            raise ImportError(
                "No puedo importar read_elem (anndata.experimental.read_elem / anndata._io.specs.read_elem). "
                "Necesitas anndata con soporte de lectura de elementos."
            ) from e

# Also request BATCH_KEY, in case umap_final_harmony_batch_key != patientID.
needed_obs = sorted({PATIENT_KEY, DISEASE_KEY, LEVEL1_REFINED_KEY, LEVEL2_KEY, BATCH_KEY})

# Read only /obs and /obsm/<PCA_KEY> from the HDF5 file, so the expression
# matrix and layers are never loaded.
with h5py.File(IN_PATH, "r") as f:
    if "obs" not in f:
        raise KeyError("No existe grupo /obs en el .h5ad (archivo corrupto o no estándar).")

    obs_full = _read_elem(f["obs"])
    if not isinstance(obs_full, pd.DataFrame):
        raise TypeError(f"read_elem(/obs) no devolvió DataFrame. Tipo: {type(obs_full)}")

    missing = [c for c in needed_obs if c not in obs_full.columns]
    if missing:
        raise KeyError(f"Faltan columnas en obs: {missing}")

    obs_full = obs_full.copy()
    obs_full[PATIENT_KEY] = obs_full[PATIENT_KEY].astype(str)
    obs_full[DISEASE_KEY] = obs_full[DISEASE_KEY].astype(str)
    obs_full[LEVEL1_REFINED_KEY] = obs_full[LEVEL1_REFINED_KEY].astype(str)
    obs_full[BATCH_KEY] = obs_full[BATCH_KEY].astype(str)

    # Keep Level2 as object so missing values stay NaN (not the string "nan").
    l2_obj = obs_full[LEVEL2_KEY].astype("object")

    # Level2_final: apply the label mapping loaded from Level2_final_map.json.
    l2_final = l2_obj.replace(level2_map)
    obs_full["Level2_final"] = pd.Categorical(l2_final)

    print("[CHECK] Conv_T_other remaining en Level2_final (debe ser 0):",
          int((obs_full["Level2_final"].astype(str) == "Conv_T_other").sum()))
    print("[CHECK] CD4_Memory count en Level2_final:",
          int((obs_full["Level2_final"].astype(str) == "CD4_Memory").sum()))

    # RBC sanity check: any cell labelled RBC at either level is excluded,
    # but only from the object built here for Harmony/UMAP.
    keep = ~(
        (obs_full[LEVEL1_REFINED_KEY].astype(str) == "RBC") |
        (l2_obj.astype(str) == "RBC") |
        (obs_full["Level2_final"].astype(str) == "RBC")
    )
    n_rbc = int((~keep).sum())
    if n_rbc > 0:
        print(f"[WARN] Aún hay RBC en el objeto ({n_rbc} células). Para UMAP Harmony se excluirán.")

    idx_keep = np.flatnonzero(keep.to_numpy(dtype=bool))
    obs_keep = obs_full.iloc[idx_keep].copy()

    # PCA embedding
    if "obsm" not in f or PCA_KEY not in f["obsm"]:
        raise KeyError(f"No existe /obsm/{PCA_KEY} en el .h5ad. Necesitas PCA calculado antes.")

    X_pca_ds = f["obsm"][PCA_KEY]
    # Fail with a clear message if the entry is not a dense 2-D dataset
    # aligned with obs, instead of failing later inside the indexing call.
    if not isinstance(X_pca_ds, h5py.Dataset) or X_pca_ds.ndim != 2:
        raise TypeError(f"/obsm/{PCA_KEY} no es un dataset denso 2D. Tipo: {type(X_pca_ds)}")
    if X_pca_ds.shape[0] != obs_full.shape[0]:
        raise ValueError(
            f"/obsm/{PCA_KEY} tiene {X_pca_ds.shape[0]} filas pero obs tiene {obs_full.shape[0]}."
        )

    # Read the embedding in one contiguous request and subset in NumPy.
    # h5py point selection with a long per-cell index list is slow.
    X_pca = np.asarray(X_pca_ds[...], dtype=np.float32)
    if idx_keep.size != X_pca.shape[0]:
        X_pca = X_pca[idx_keep, :]

print("X_pca shape:", X_pca.shape)
print("n_obs total :", obs_full.shape[0])
print("n_obs keep  :", obs_keep.shape[0])

# Minimal AnnData for Harmony: placeholder X (one zero column), real obs.
adata_umap = ad.AnnData(
    X=np.zeros((obs_keep.shape[0], 1), dtype=np.float32),
    obs=obs_keep
)

# Never request more PCs than the stored embedding provides.
N_PCS_USED = int(min(N_PCS, X_pca.shape[1]))
adata_umap.obsm["X_pca"] = X_pca[:, :N_PCS_USED].astype(np.float32, copy=False)

print("AnnData mínimo para Harmony:", adata_umap)
print("obs columns:", adata_umap.obs.columns.tolist())
print("X_pca used shape:", adata_umap.obsm["X_pca"].shape)

### 4. Import markers + construir dict Level2→genes + overrides

In [None]:
from src import markers as mk

geneMarkers_level2 = getattr(mk, "geneMarkers_level2", {})
if not geneMarkers_level2:
    # Best-effort close of the backed object before aborting.
    # NOTE(review): `adata_b` is not defined in the visible part of this
    # notebook; if it does not exist the NameError is swallowed here.
    try:
        adata_b.file.close()
    except Exception:
        pass
    raise RuntimeError("src/markers.py no expone geneMarkers_level2 (vacío/no definido).")

# Base panel: Level2 -> gene list, flattened over the outer level of
# geneMarkers_level2. Non-dict entries and empty gene lists are skipped;
# a Level2 name seen more than once keeps its last list.
lvl2_to_symbols = {
    _l2: list(genes)
    for _l1, subdict in geneMarkers_level2.items()
    if isinstance(subdict, dict)
    for _l2, genes in subdict.items()
    if genes
}

# Minimal two-gene fallbacks for problematic populations and for labels
# introduced by Level2_final_map.
OVERRIDE_2MARKERS = {
    "B_Other":      ["MS4A1", "CD74"],
    "CD4_Memory":   ["IL7R", "CCR7"],     # key one after Conv_T_other -> CD4_Memory
    "ISG_Myeloid":  ["ISG15", "IFIT3"],
    "MonoDC_Other": ["LYZ", "FCER1G"],
    "DC3":          ["CD1C", "S100A8"],   # make sure DC3 has a panel
    "DC4":          ["FCGR3A", "LST1"],   # fallback
    "HSCs":         ["CD34", "KIT"],
    "Plasma":       ["MZB1", "JCHAIN"],
    "pDC":          ["IL3RA", "IRF7"],
}

# Apply an override only when the base panel is missing or has fewer than
# two non-empty genes (a missing panel counts as zero genes).
for l2, genes2 in OVERRIDE_2MARKERS.items():
    n_usable = sum(1 for g in lvl2_to_symbols.get(l2, []) if g)
    if n_usable < 2:
        lvl2_to_symbols[l2] = genes2

print("Markers dict construido. N Level2 con panel:", len(lvl2_to_symbols))

### 5. Preparar obs + Level2_final + orden por bloques

In [None]:
# NOTE(review): `adata_b` is not defined in the visible part of this notebook;
# presumably a backed AnnData opened elsewhere -- confirm before running.
obs = adata_b.obs.copy()

# Build Level2_final by applying the label mapping. Labels are stringified
# but missing values stay NaN: casting the whole column with astype(str)
# would turn them into the literal label "nan", which then shows up as a
# population and makes the dropna() below a no-op.
_l2_raw = obs[LEVEL2_KEY].astype("object")
_l2_labels = _l2_raw.where(_l2_raw.isna(), _l2_raw.astype(str))
obs["Level2_final"] = pd.Categorical(_l2_labels.replace(level2_map))

print("[CHECK] Conv_T_other remaining en Level2_final (debe ser 0):",
      int((obs["Level2_final"].astype(str) == "Conv_T_other").sum()))
print("[CHECK] CD4_Memory count en Level2_final:",
      int((obs["Level2_final"].astype(str) == "CD4_Memory").sum()))

# Display order, block by block, expressed in Level2_final labels.
order_by_group = {
    "B":     ["B_Naive", "B_Memory", "B_Activated", "B_Atypical", "B_Other"],
    "Plasma":["Plasma"],
    "pDC":   ["pDC"],
    "T":     ["CD4_Naive","CD4_Memory","CD8_Naive","CD8_Effector_Cytotoxic","Treg","MAIT","GammaDelta_T","Proliferative_T","Exhausted_T"],
    "NK":    ["NK"],
    "Mono":  ["Classical_Mono","NonClassical_Mono","ISG_Myeloid","MonoDC_Other"],
    "DC":    ["cDC1","cDC2","DC3","DC4","aDC"],  # DC3 included
    "HSCs":  ["HSCs"],
}

# Labels actually present (missing annotations excluded).
present_l2 = sorted(obs["Level2_final"].dropna().astype(str).unique())

# Known labels first, in block order; then any unlisted label alphabetically.
_present = set(present_l2)
level2_order = [
    l2
    for l2_list in order_by_group.values()
    for l2 in l2_list
    if l2 in _present
]
extras = [x for x in present_l2 if x not in level2_order]
level2_order = level2_order + sorted(extras)

def group_of_l2(l2: str) -> str:
    """Return the display group that lists ``l2``, or ``"Other"`` if none does.

    Groups are searched in the insertion order of the module-level
    ``order_by_group`` mapping; the first group containing the label wins.
    """
    return next(
        (group for group, members in order_by_group.items() if l2 in members),
        "Other",
    )

# Final plot label: "Group | Level2_final", kept in the block order built above.
_plot_label_of = {l2: f"{group_of_l2(l2)} | {l2}" for l2 in level2_order}
level2_plot_order = [_plot_label_of[l2] for l2 in level2_order]
obs["Level2_plot"] = pd.Categorical(
    obs["Level2_final"].astype(str).map(_plot_label_of),
    categories=level2_plot_order,
    ordered=True,
)

print("[CHECK] Level2_final presentes:", len(present_l2))
print("[CHECK] Primeros 30:", present_l2[:30])

### 6. Lista final de genes + symbols -> varnames

In [None]:
# Final gene list: up to two markers per Level2 label, in plot order,
# de-duplicated while keeping the first occurrence of each symbol.
_marker_pairs = (lvl2_to_symbols.get(l2, [])[:2] for l2 in level2_order)
gene_symbols = list(dict.fromkeys(sym for pair in _marker_pairs for sym in pair))

# Translate symbols into var names; unresolved symbols come back as None.
# NOTE(review): `adata_b` is not defined in the visible part of this notebook.
_resolved = mk.symbols_to_varnames(adata_b, gene_symbols)
missing = [sym for sym, var in zip(gene_symbols, _resolved) if var is None]
gene_varnames = [var for var in _resolved if var is not None]

print("Markers symbols total:", len(gene_symbols))
print("Markers genes found :", len(gene_varnames))
if missing:
    print("[WARN] Símbolos no encontrados (omitidos):", missing)

if not gene_varnames:
    # Best-effort close of the backed object before aborting.
    try:
        adata_b.file.close()
    except Exception:
        pass
    raise RuntimeError("No se encontró ningún gen marcador en adata.var_names. Revisa var_names / var['symbol'].")

### 7. Cargar SOLO esos genes a RAM + dotplot FINAL

In [None]:
# Load ONLY the dot plot genes into memory, then release the backed file.
# try/finally closes the HDF5 handle even when loading fails; otherwise the
# file stays open for the rest of the session.
# NOTE(review): `adata_b` and `LAYER` are not defined in the visible part of
# this notebook -- confirm they are set before this cell runs.
try:
    adata_plot = adata_b[:, gene_varnames].to_memory()
finally:
    # Closing is best-effort: a failure here must not mask the real error.
    try:
        adata_b.file.close()
    except Exception:
        pass

# Attach the ordered plot label to the small in-memory object.
adata_plot.obs["Level2_plot"] = obs.loc[adata_plot.obs_names, "Level2_plot"].values
adata_plot.obs["Level2_plot"] = pd.Categorical(
    adata_plot.obs["Level2_plot"], categories=level2_plot_order, ordered=True
)

sc.settings.autoshow = False

# show=False so the plot object is returned and can be styled before saving.
dp = sc.pl.dotplot(
    adata_plot,
    var_names=gene_varnames,
    groupby="Level2_plot",
    layer=LAYER,
    use_raw=False,
    dendrogram=False,
    standard_scale="var",
    show=False,
)

dp = dp.add_totals().style(dot_edge_color="black", dot_edge_lw=0.5)

out_png = FIG_DIR / "Fig1C_Dotplot_Global_Level2_clean_FINAL.png"
dp.savefig(out_png, dpi=300)
print("Saved:", out_png)

### 8. Guardar tablas auxiliares

In [None]:
# Table 1: cell counts per plot label, in plot order.
# NOTE(review): `OUT_SUMMARY` is not defined in the visible part of this
# notebook -- confirm it is set before this cell runs.
_counts = obs["Level2_plot"].value_counts().reindex(level2_plot_order)
totals = _counts.dropna().astype(int).reset_index()
totals.columns = ["Level2_plot", "n_cells"]

# Split "Group | Level2_final" back into its two parts.
_label_parts = totals["Level2_plot"].str.split(r" \| ")
totals["group"] = _label_parts.str[0]
totals["Level2_final"] = _label_parts.str[1]

totals_path = OUT_SUMMARY / "QA_dotplot_totals_by_Level2_plot_noRBC.csv"
totals.to_csv(totals_path, index=False)
print("Saved:", totals_path)

# Table 2: the (up to) two markers chosen for each Level2 label; a missing
# marker is recorded as None.
marker_rows = []
for l2 in level2_order:
    panel = list(lvl2_to_symbols.get(l2, [])[:2])
    marker1, marker2 = (panel + [None, None])[:2]
    l2_group = group_of_l2(l2)
    marker_rows.append({
        "Level2_final": l2,
        "group": l2_group,
        "marker1": marker1,
        "marker2": marker2,
        "Level2_plot": f"{l2_group} | {l2}",
    })

markers_df = pd.DataFrame(marker_rows)

markers_path = OUT_SUMMARY / "QA_dotplot_markers_2perLevel2_noRBC.csv"
markers_df.to_csv(markers_path, index=False)
print("Saved:", markers_path)

print("[OK] Dotplot global + tablas QA guardadas.")

### 9. EXTRA QA “Dotplot en números”

In [None]:
import scipy.sparse as sp

# Genes used = the dot plot genes, restricted to those present in the
# in-memory object. Positions are looked up by name so each statistic is
# computed from the column of its own gene, whatever the column order.
# NOTE(review): `LAYER` and `OUT_SUMMARY` are not defined in the visible part
# of this notebook -- confirm they are set before this cell runs.
_var_position = {name: pos for pos, name in enumerate(adata_plot.var_names)}
genes_used = [g for g in gene_varnames if g in _var_position]
if not genes_used:
    raise RuntimeError("genes_used quedó vacío. Revisa gene_varnames vs var_names.")

# Re-attach Level2_final to the small object from the in-memory obs table.
lvl2_final_small = obs.loc[adata_plot.obs_names, "Level2_final"].astype(str).values
adata_plot.obs["Level2_final"] = lvl2_final_small

# Use the configured layer when it exists, otherwise fall back to X.
X = adata_plot.layers[LAYER] if LAYER in adata_plot.layers.keys() else adata_plot.X
if sp.issparse(X):
    X = X.tocsr()
# Keep exactly the genes_used columns, in genes_used order. The original
# indexed statistics by position in genes_used while keeping every column.
X = X[:, [_var_position[g] for g in genes_used]]

groups = pd.Series(adata_plot.obs["Level2_final"].astype(str).values, index=adata_plot.obs_names)
uniq = sorted(groups.unique())

rows = []
for g in uniq:
    idx = np.flatnonzero(groups.values == g)
    Xg = X[idx, :]

    # Per-gene mean and fraction of cells with a value > 0 within the group.
    if sp.issparse(Xg):
        mean = np.asarray(Xg.mean(axis=0)).ravel()
        nnz = np.asarray((Xg > 0).mean(axis=0)).ravel()
    else:
        Xg = np.asarray(Xg)
        mean = np.mean(Xg, axis=0)
        nnz = np.mean((Xg > 0), axis=0)

    row = {"Level2_final": g, "n_cells": int(idx.size)}
    for j, gene in enumerate(genes_used):
        row[f"{gene}__mean_log1p"] = float(mean[j])
        row[f"{gene}__frac_nonzero"] = float(nnz[j])
    rows.append(row)

df_num = pd.DataFrame(rows).sort_values("n_cells", ascending=False)

out_path = OUT_SUMMARY / "QA_dotplot_numeric_matrix_Level2final.csv"
df_num.to_csv(out_path, index=False)

print("Saved:", out_path)
print("Shape:", df_num.shape)
print(df_num.head(8).to_string(index=False))