### 1. Imports + paths del repo

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

from src.paths import project_paths

# Report which scanpy version this session is running.
print("Scanpy:", sc.__version__)

# Ask the project helper for the repository layout, starting from the current
# working directory, and bind the entries this notebook uses.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[_name]
    for _name in ("PROJECT_ROOT", "CONFIG_DIR", "DATA_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# Expected input/output folders.
L2_DIR = RESULTS_DIR / "lineages" / "level2"  # NB07/NB08 outputs
SUMMARY_DIR = RESULTS_DIR / "summary_tables"
FIG_MAIN_DIR = FIGURES_DIR / "main"
FIG_DOTPLOT_DIR = FIGURES_DIR / "dotplots"

# Create every folder up front; nothing happens when it already exists.
for _out_dir in (L2_DIR, SUMMARY_DIR, FIG_MAIN_DIR, FIG_DOTPLOT_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the run log shows where files come from / go to.
for _label, _location in (
    ("PROJECT_ROOT", PROJECT_ROOT),
    ("CONFIG_DIR", CONFIG_DIR),
    ("DATA_DIR", DATA_DIR),
    ("RESULTS_DIR", RESULTS_DIR),
    ("L2_DIR", L2_DIR),
    ("SUMMARY_DIR", SUMMARY_DIR),
    ("FIG_MAIN_DIR", FIG_MAIN_DIR),
    ("FIG_DOTPLOT_DIR", FIG_DOTPLOT_DIR),
):
    print(f"{_label}:", _location)

### 2. Leer config + parámetros

In [None]:
def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` text file into a dict of strings.

    This is a minimal line-based reader, not a YAML parser: nested
    structures, lists and inline comments are not interpreted.

    Args:
        path: File to read; it is decoded as UTF-8.

    Returns:
        A dict mapping each key to its value. Both are stripped of
        surrounding whitespace, and the value additionally loses any
        leading/trailing double quotes and then single quotes. Values are
        always strings. When a key appears more than once, the last
        occurrence wins.
    """
    parsed = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        # Blank lines and full-line comments carry no setting.
        if not entry or entry.startswith("#"):
            continue
        # Only the first colon separates key from value, so values may
        # themselves contain colons.
        key, colon, value = entry.partition(":")
        if not colon:
            continue
        parsed[key.strip()] = value.strip().strip('"').strip("'")
    return parsed

# The notebook cannot run without its config file, so fail early when it is absent.
cfg_path = CONFIG_DIR / "config.yaml"
if not cfg_path.exists():
    raise FileNotFoundError(f"Falta {cfg_path}")

CFG = load_simple_yaml(cfg_path)

# obs / obsm keys, each with a default used when the config omits it.
LEVEL1_KEY = CFG.get("level1_key", "Level1")
EMB_KEY = CFG.get("harmony_emb_key", "X_pca_harmony")
UMAP_KEY = CFG.get("umap_key", "X_umap_harmony")

# File name of the global object that already carries Level1.
MAIN_L1_FILENAME = CFG.get(
    "main_level1_h5ad_filename",
    "TFM_CIRRHOSIS_main_with_Level1.h5ad",
)

# Lineages that have a detailed Level2 (NB07-08).
LINEAGES_WITH_LEVEL2 = ["B", "T_and_NK", "Mono_and_DC"]

# Echo the effective settings.
for _label, _value in (
    ("LEVEL1_KEY", LEVEL1_KEY),
    ("EMB_KEY", EMB_KEY),
    ("UMAP_KEY", UMAP_KEY),
    ("MAIN_L1_FILENAME", MAIN_L1_FILENAME),
    ("LINEAGES_WITH_LEVEL2", LINEAGES_WITH_LEVEL2),
):
    print(f"{_label}:", _value)

### 3. Cargar objeto global con Level1 (salida de NB06)

In [None]:
# Preferred location: directly under RESULTS_DIR.
_primary_path = RESULTS_DIR / MAIN_L1_FILENAME

# Alternative locations, tried in this order when the preferred one is absent.
fallbacks = [
    DATA_DIR / MAIN_L1_FILENAME,
    RESULTS_DIR / "objects" / MAIN_L1_FILENAME,
    DATA_DIR / "processed" / MAIN_L1_FILENAME,
]

# First existing candidate wins; None means the file is nowhere to be found.
main_with_l1_path = next(
    (candidate for candidate in [_primary_path, *fallbacks] if candidate.exists()),
    None,
)
if main_with_l1_path is None:
    raise FileNotFoundError(
        f"No encuentro el objeto global con Level1.\n"
        f"Probé:\n- {_primary_path}\n"
        + "\n".join(f"- {x}" for x in fallbacks)
    )

adata_main = sc.read_h5ad(main_with_l1_path)
print(adata_main)

# The rest of the notebook relies on the Level1 column and on both embeddings.
if LEVEL1_KEY not in adata_main.obs.columns:
    raise KeyError(f"Falta obs['{LEVEL1_KEY}'] en el objeto global.")

for obsm_key in (EMB_KEY, UMAP_KEY):
    if obsm_key not in adata_main.obsm.keys():
        raise KeyError(f"Falta obsm['{obsm_key}'] en el objeto global.")
    print(f"obsm['{obsm_key}']:", adata_main.obsm[obsm_key].shape)

# Number of cells per Level1 label.
print("\nDistribución Level1:")
print(adata_main.obs[LEVEL1_KEY].value_counts())

### 4. Rutas por linaje + LEVEL2_MAP

In [None]:
# `json` is needed to read level2_map.json and is not imported by the first
# cell, so it is imported here to keep this cell runnable on its own.
import json

# Level1 labels present in the global object: the declared categories when the
# column is categorical, otherwise the sorted unique values as strings.
_level1_col = adata_main.obs[LEVEL1_KEY]
level1_categories = (
    list(_level1_col.cat.categories)
    if hasattr(_level1_col, "cat")
    else sorted(_level1_col.astype(str).unique())
)

print("Linajes Level1 presentes:", level1_categories)

# Per-lineage objects with the NB08 results (L2markers.h5ad).
lineage_paths = {}
print("\nRutas esperadas (solo linajes con Level2):")
for l1 in level1_categories:
    if l1 not in LINEAGES_WITH_LEVEL2:
        print(f"- {l1}: sin Level2 (se queda como población única en Level1).")
        continue

    # File names use underscores where the lineage label has spaces.
    safe_l1 = l1.replace(" ", "_")
    path_l2 = L2_DIR / f"TFM_CIRRHOSIS_Level1_{safe_l1}_L2markers.h5ad"
    lineage_paths[l1] = path_l2
    print(f"- {l1}: {path_l2}")

##### LEVEL2_MAP from config/level2_map.json #####
level2_map_path = CONFIG_DIR / "level2_map.json"
if not level2_map_path.exists():
    raise FileNotFoundError(
        f"Falta {level2_map_path}. Crea el archivo en config/ con el mapeo Level2."
    )

with open(level2_map_path, "r", encoding="utf-8") as f:
    LEVEL2_MAP = json.load(f)

# The map is looked up by lineage name later on, so the JSON root must be an object.
if not isinstance(LEVEL2_MAP, dict):
    raise TypeError(
        f"level2_map.json debe contener un objeto JSON (dict), no {type(LEVEL2_MAP).__name__}."
    )

# Sanity check: every lineage declared as having Level2 needs an entry. The
# list is taken from LINEAGES_WITH_LEVEL2 so the two cannot drift apart.
missing = [k for k in LINEAGES_WITH_LEVEL2 if k not in LEVEL2_MAP]
if missing:
    raise KeyError(f"level2_map.json no contiene las claves esperadas: {missing}")

print("LEVEL2_MAP cargado desde:", level2_map_path)
print("Linajes en LEVEL2_MAP:", list(LEVEL2_MAP.keys()))

### 5. Integración Level2 desde objetos por linaje + Type_L1L2

In [None]:
# Level2 label per cell, keyed by obs_name and filled lineage by lineage.
cell_to_level2 = {}

for l1, path in lineage_paths.items():
    if not path.exists():
        print(f"[AVISO] Falta L2markers para '{l1}': {path} -> ese linaje no aportará Level2.")
        continue

    print(f"\nLeyendo linaje '{l1}' desde:\n  {path}")
    adata_l1 = sc.read_h5ad(path)
    print(adata_l1)

    if LEVEL1_KEY not in adata_l1.obs.columns:
        raise KeyError(f"Falta '{LEVEL1_KEY}' en objeto de linaje '{l1}'.")

    # A per-lineage object must contain exactly one Level1 label: its own.
    observed_l1 = adata_l1.obs[LEVEL1_KEY].astype(str).unique()
    if not (len(observed_l1) == 1 and observed_l1[0] == l1):
        raise ValueError(
            f"Objeto por linaje '{l1}' no es puro. {LEVEL1_KEY} únicos: {observed_l1}"
        )

    # Base column holding the L2 clusters; 'leiden_L2' is preferred over 'Level2'.
    base_l2_col = next(
        (cand for cand in ("leiden_L2", "Level2") if cand in adata_l1.obs.columns),
        None,
    )
    if base_l2_col is None:
        print(f"[AVISO] '{l1}' no tiene 'leiden_L2' ni 'Level2'. Se omite.")
        continue

    l2_raw = adata_l1.obs[base_l2_col].astype(str)

    # Rename clusters through the lineage's map; labels without an entry keep
    # their raw value.
    mapping = LEVEL2_MAP.get(l1, {})
    l2_bio = l2_raw.map(mapping).fillna(l2_raw) if mapping else l2_raw

    adata_l1.obs["Level2_bio"] = l2_bio.astype("category")

    # Later lineages overwrite earlier ones if the same cell id shows up twice.
    cell_to_level2.update(zip(adata_l1.obs_names, adata_l1.obs["Level2_bio"]))

# Series aligned to the global object; cells without a Level2 label stay NaN.
level2_series = pd.Series(index=adata_main.obs_names, dtype="object")
annotated_ids = level2_series.index.intersection(cell_to_level2.keys())
level2_series.loc[annotated_ids] = [cell_to_level2[c] for c in annotated_ids]

adata_main.obs["Level2"] = level2_series.astype("category")

print("\nResumen Level2 (incluye NaN si hay células sin Level2):")
print(adata_main.obs["Level2"].value_counts(dropna=False).head(50))

# Type_L1L2: "Level1__Level2" where a Level2 label exists, plain Level1 otherwise.
l1_str = adata_main.obs[LEVEL1_KEY].astype(str)
l2 = adata_main.obs["Level2"]
has_l2 = l2.notna()
type_l1l2 = l1_str.where(~has_l2, l1_str + "__" + l2.astype(str))

adata_main.obs["Type_L1L2"] = type_l1l2.astype("category")

print("\nResumen Type_L1L2 (top):")
print(adata_main.obs["Type_L1L2"].value_counts().head(30))

### 6. UMAP global (figura guardada en figures/main/)

In [None]:
# Variables used to colour the embedding: Level1 first, then Level2 when it
# exists, then whichever of the listed metadata columns are present.
obs_columns = adata_main.obs.columns
color_vars = [LEVEL1_KEY]
if "Level2" in obs_columns:
    color_vars.append("Level2")

color_vars.extend(
    col
    for col in ("sample", "sample_id", "libraryID", "disease", "condition", "group", "patientID")
    if col in obs_columns and col not in color_vars
)

print("Color vars:", color_vars)

# One global multi-panel figure, capped at eight panels.
to_plot = color_vars[:8]
sc.pl.embedding(
    adata_main,
    basis=UMAP_KEY,
    color=to_plot,
    ncols=2,
    frameon=False,
    show=False,
)

# Only the first four variable names go into the file name, to keep it short.
# plt.savefig writes pyplot's current figure — presumably the one scanpy just
# drew; TODO confirm.
fig_name = "umap_global_" + "__".join(to_plot[:4]) + ".png"
fig_path = FIG_MAIN_DIR / fig_name
plt.savefig(fig_path, bbox_inches="tight", dpi=200)
plt.close()
print("Figura:", fig_path)

### 7. Dotplot global Level1

In [None]:
from src import markers as mk

# Level1 marker panels come from src/markers.py; a missing attribute is treated
# the same as an empty panel dict.
geneMarkers_level1 = getattr(mk, "geneMarkers_level1", {})
if not geneMarkers_level1:
    print("[AVISO] geneMarkers_level1 vacío/no definido en src/markers.py -> se omite dotplot Level1.")
else:
    # Keep only the panels for which the helper returns at least one entry.
    # NOTE(review): symbols_to_varnames presumably maps gene symbols to the
    # object's var_names — confirm in src/markers.py.
    varnames_level1 = {}
    for l1, genes in geneMarkers_level1.items():
        v = mk.symbols_to_varnames(adata_main, genes)
        if v:
            varnames_level1[l1] = v

    if not varnames_level1:
        print("[AVISO] No se pudo mapear ningún gen de Level1 -> se omite dotplot Level1.")
    else:
        # return_fig=True makes scanpy hand back the DotPlot object, which is
        # what provides .savefig(). With only show=False the call returns a
        # dict of Axes and dp.savefig(...) fails with AttributeError.
        dp = sc.pl.dotplot(
            adata_main,
            var_names=varnames_level1,
            groupby=LEVEL1_KEY,
            standard_scale="var",
            dendrogram=True,
            return_fig=True,
        )
        out = FIG_DOTPLOT_DIR / "dotplot_Level1_global.png"
        dp.savefig(out)
        print("Figura:", out)

### 8. Dotplots Level2 por linaje

In [None]:
# Level2 marker panels come from src/markers.py, keyed by lineage; a missing
# attribute is treated the same as an empty dict.
geneMarkers_level2 = getattr(mk, "geneMarkers_level2", {})

for l1 in LINEAGES_WITH_LEVEL2:
    if l1 not in geneMarkers_level2:
        print(f"[INFO] '{l1}' sin paneles Level2 en src/markers.py -> se omite dotplot L2.")
        continue

    in_lineage = (adata_main.obs[LEVEL1_KEY].astype(str) == l1).to_numpy()
    if not in_lineage.any():
        print(f"[INFO] '{l1}' no tiene células en el global -> skip.")
        continue

    # Checked on the global object so no subset is copied when Level2 is absent.
    if "Level2" not in adata_main.obs.columns:
        print(f"[AVISO] '{l1}' no tiene Level2 en global -> skip.")
        continue

    # Cells without a Level2 label (e.g. the lineage file was missing) cannot be
    # grouped, so they are left out; skip the lineage when none is labelled.
    mask = in_lineage & adata_main.obs["Level2"].notna().to_numpy()
    if not mask.any():
        print(f"[AVISO] '{l1}' no tiene células con Level2 asignado -> skip.")
        continue

    adata_l1 = adata_main[mask].copy()

    # Keep only the panels for which the helper returns at least one entry.
    # NOTE(review): symbols_to_varnames presumably maps gene symbols to the
    # object's var_names — confirm in src/markers.py.
    panel_genes = geneMarkers_level2[l1]
    varnames_level2 = {}
    for l2_name, genes in panel_genes.items():
        v = mk.symbols_to_varnames(adata_l1, genes)
        if v:
            varnames_level2[l2_name] = v

    if not varnames_level2:
        print(f"[AVISO] '{l1}': no se pudo mapear ningún gen del panel Level2 -> skip.")
        continue

    # return_fig=True makes scanpy hand back the DotPlot object, which is what
    # provides .savefig(). With only show=False the call returns a dict of Axes
    # and dp.savefig(...) fails with AttributeError.
    dp = sc.pl.dotplot(
        adata_l1,
        var_names=varnames_level2,
        groupby="Level2",
        standard_scale="var",
        dendrogram=True,
        return_fig=True,
    )
    out = FIG_DOTPLOT_DIR / f"dotplot_Level2_{l1.replace(' ', '_')}.png"
    dp.savefig(out)
    print("Figura:", out)

### 9. Tablas resumen (CSV en results/summary_tables/)

In [None]:
# Level1 x Level2 (cells without Level2 fall under the "nan" string label).
if "Level2" in adata_main.obs.columns:
    ctab_L1_L2 = pd.crosstab(
        adata_main.obs[LEVEL1_KEY].astype(str),
        adata_main.obs["Level2"].astype(str),
        dropna=False,
    )
    out = SUMMARY_DIR / "cell_counts_Level1_by_Level2.csv"
    ctab_L1_L2.to_csv(out)
    print("Guardado:", out)

# First available column identifying the sample, and first one identifying
# the disease/condition; None when no candidate is present.
sample_col = next(
    (cand for cand in ("sample", "sample_id", "libraryID") if cand in adata_main.obs.columns),
    None,
)
disease_col = next(
    (cand for cand in ("disease", "condition", "group") if cand in adata_main.obs.columns),
    None,
)

# Level1 x sample, then Level1 x disease/condition: one CSV per column found.
for by_col in (sample_col, disease_col):
    if by_col is None:
        continue
    ctab = pd.crosstab(
        adata_main.obs[LEVEL1_KEY].astype(str),
        adata_main.obs[by_col].astype(str),
    )
    out = SUMMARY_DIR / f"cell_counts_Level1_by_{by_col}.csv"
    ctab.to_csv(out)
    print("Guardado:", out)

### 10. Guardar objeto global anotado (en results/)

In [None]:
# Persist the global object, now carrying Level2 and Type_L1L2, under results/.
out_path = RESULTS_DIR / "TFM_CIRRHOSIS_main_annotated.h5ad"
adata_main.write_h5ad(out_path)
print("Guardado:", out_path)

# Quick checks: most frequent labels at each level (NaN counted for Level2).
_saved_obs = adata_main.obs
print("\nLevel1 top:")
print(_saved_obs[LEVEL1_KEY].value_counts().iloc[:15])
print("\nLevel2 top (incluye NaN):")
print(_saved_obs["Level2"].value_counts(dropna=False).iloc[:30])

### 11. QA FINAL

In [None]:
# Final QA report on the Level2 / Type_L1L2 annotations of the global object.
_rule = "=" * 80
_qa_obs = adata_main.obs

print("\n" + _rule)
print("QA FINAL — Level2 / Type_L1L2 (post-LEVEL2_MAP)")
print(_rule)

# [0] Size of the object and number of distinct labels per annotation.
print("\n[0] Resumen básico")
print("  n_obs global:", adata_main.n_obs)
print("  Level1 únicos:", _qa_obs[LEVEL1_KEY].nunique())
print("  Level2 únicos (incluyendo NaN):", _qa_obs["Level2"].nunique(dropna=False))
print("  Type_L1L2 únicos:", _qa_obs["Type_L1L2"].nunique())

# [1] How many cells received a Level2 label.
print("\n[1] Cobertura Level2")
n_l2 = int(_qa_obs["Level2"].notna().sum())
pct_l2 = 100*n_l2/adata_main.n_obs
print(f"  Células con Level2: {n_l2} / {adata_main.n_obs} ({pct_l2:.2f}%)")
print("  NaN Level2:", int(_qa_obs["Level2"].isna().sum()))

# [2] Labels made only of digits are raw cluster ids that the map did not rename.
print("\n[2] Labels numéricas en Level2 (mapping incompleto)")
l2_str = _qa_obs["Level2"].astype(str)
is_numeric = l2_str.str.fullmatch(r"\d+").fillna(False)
n_numeric = int(is_numeric.sum())
print("  Nº celdas con Level2 numérico:", n_numeric)
if n_numeric:
    print("  Ejemplos:")
    print(_qa_obs.loc[is_numeric, [LEVEL1_KEY, "Level2"]].head(20))

# [3] For each lineage with Level2, its most frequent Level2 labels (non-zero only).
print("\n[3] Crosstab Level1 x Level2 (top por linaje con Level2)")
ct = pd.crosstab(_qa_obs[LEVEL1_KEY].astype(str), _qa_obs["Level2"], dropna=False)
for l1 in LINEAGES_WITH_LEVEL2:
    if l1 not in ct.index:
        continue
    top = ct.loc[l1].sort_values(ascending=False).iloc[:15]
    print(f"\n  -- {l1} --")
    print(top[top > 0])

print("\nQA FINAL completado.")
print(_rule)
print("="*80)