### 1. Imports + paths

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import scanpy.external as sce

from src.paths import project_paths

# Report the scanpy version used for this run.
print("Scanpy:", sc.__version__)

# Resolve the project directories starting from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[name] for name in ("PROJECT_ROOT", "CONFIG_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# Per-lineage result folders (level1 / level2) and the Level-2 figure folder.
_lineages_dir = RESULTS_DIR / "lineages"
L1_DIR = _lineages_dir / "level1"
L2_DIR = _lineages_dir / "level2"
L2_FIG_DIR = FIGURES_DIR / "level2"

# Only the Level-2 folders are created here; L1_DIR is left untouched.
for _out_dir in (L2_DIR, L2_FIG_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

for _label, _location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("L1_DIR:", L1_DIR),
    ("L2_DIR:", L2_DIR),
    ("L2_FIG_DIR:", L2_FIG_DIR),
):
    print(_label, _location)

### 2. Leer config + Parameters

In [None]:
def _strip_inline_comment(value: str) -> str:
    """Drop a trailing ``# comment`` from an already-stripped scalar value.

    A ``#`` starts a comment only when it opens the value or follows
    whitespace. When the value begins with a quoted string, the search starts
    after the closing quote, so a ``#`` inside the quotes is preserved.
    """
    scan_from = 0
    opening = value[:1]
    if opening in ('"', "'"):
        closing = value.find(opening, 1)
        if closing != -1:
            # Only text after the closing quote may be a comment.
            scan_from = closing + 1
    for pos in range(scan_from, len(value)):
        if value[pos] == "#" and (pos == 0 or value[pos - 1].isspace()):
            return value[:pos].rstrip()
    return value


def load_simple_yaml(path: Path) -> dict:
    """Read a flat ``key: value`` file into a dict of strings.

    This is a minimal line-based reader, not a YAML parser: nesting, lists and
    multi-line values are not interpreted.

    Args:
        path: File to read (decoded as UTF-8).

    Returns:
        Mapping from stripped key to stripped value. Values are always
        strings, with surrounding double/single quotes and any trailing
        ``# comment`` removed.

    Notes:
        Blank lines, full-line comments and lines without ``:`` are skipped.
        Each line is split on its first ``:`` only, so values may contain
        colons. When a key repeats, the last occurrence wins.
    """
    cfg = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if ":" not in line:
            continue
        k, v = line.split(":", 1)
        # Remove the inline comment before unquoting so that
        # `key: "a"  # note` yields `a` rather than `a"  # note`.
        v = _strip_inline_comment(v.strip())
        cfg[k.strip()] = v.strip('"').strip("'")
    return cfg

def as_int(x: str) -> int:
    """Convert a numeric config string to ``int`` by way of ``float``.

    Going through ``float`` accepts spellings such as ``"30.0"`` or ``"1e3"``;
    any fractional part is truncated toward zero.
    """
    as_number = float(x)
    return int(as_number)

def as_float(x: str) -> float:
    """Convert a config string to ``float`` (raises ``ValueError`` if not numeric)."""
    parsed = float(x)
    return parsed

def parse_csv_floats(s: str):
    """Parse a comma-separated string into a list of floats.

    Whitespace around each item is ignored and empty items (for example from a
    trailing comma) are skipped.
    """
    values = []
    for piece in s.split(","):
        token = piece.strip()
        if token:
            values.append(float(token))
    return values

# Locate the project configuration and stop early when it is missing.
cfg_path = CONFIG_DIR / "config.yaml"
if cfg_path.exists():
    CFG = load_simple_yaml(cfg_path)
else:
    raise FileNotFoundError(f"Falta {cfg_path}")

# ...... Parameters ...............................
# Lineage processed by this run: "T_and_NK" | "B" | "Mono_and_DC"
LINEAGE = "B"
# .................................................

# Lineages that get a Level-2 pass, and those rejected as single populations.
VALID_LINEAGES_WITH_L2 = ["T_and_NK", "B", "Mono_and_DC"]
LINEAGES_WITHOUT_L2    = ["Plasma", "pDC", "RBC", "HSCs"]

# The specific "no Level2" message takes priority over the generic one.
_is_single_population = LINEAGE in LINEAGES_WITHOUT_L2
_has_level2 = LINEAGE in VALID_LINEAGES_WITH_L2
if _is_single_population:
    raise ValueError(f"'{LINEAGE}' no tiene Level2 (población única).")
if not _has_level2:
    raise ValueError(f"LINEAGE inválido. Usa: {VALID_LINEAGES_WITH_L2}")

# All settings are read from CFG as strings, with the defaults given below.
_get = CFG.get


def _cfg_flag(key: str, default: str) -> bool:
    """Return True when the config value equals "true" (case-insensitive)."""
    return _get(key, default).lower() == "true"


# --- Base matrix ---
BASE_LAYER = _get("l2_base_layer", "log1p_10k")

# --- Highly variable genes ---
N_TOP_HVGS = as_int(_get("l2_hvg_n_top_genes", "3000"))
HVG_FLAVOR = _get("l2_hvg_flavor", "seurat_v3")

# --- PCA ---
N_PCS = as_int(_get("l2_pca_n_comps", "30"))

# --- Harmony (the batch key reuses the global, non-L2 config entry) ---
HARMONY_ENABLE = _cfg_flag("l2_harmony_enable", "true")
HARMONY_BATCH_KEY = _get("harmony_batch_key", "gem_id")
HARMONY_BASIS = _get("l2_harmony_basis", "X_pca")
HARMONY_ADJ_BASIS = _get("l2_harmony_adjusted_basis", "X_pca_harmony_lineage")

# --- Neighbors ---
LOCAL_NBR_KEY = _get("l2_neighbors_key", "lineage")
N_NEIGHBORS = as_int(_get("l2_neighbors_n_neighbors", "15"))

# --- UMAP ---
LOCAL_UMAP_KEY = _get("l2_umap_key", "X_umap_lineage")
UMAP_MIN_DIST = as_float(_get("l2_umap_min_dist", "0.3"))
UMAP_SPREAD = as_float(_get("l2_umap_spread", "1.0"))
UMAP_RANDOM_STATE = as_int(_get("l2_umap_random_state", "0"))

# --- Leiden ---
LEIDEN_RESOLUTIONS = parse_csv_floats(_get("l2_leiden_resolutions", "0.3,0.6,1.0"))
LEIDEN_MAIN_RES = as_float(_get("l2_leiden_main_resolution", "0.6"))
LEIDEN_L2_KEY = _get("l2_leiden_key", "leiden_L2")

# --- Optional per-lineage QC ---
ENABLE_LINEAGE_QC = _cfg_flag("l2_enable_lineage_qc", "false")
QC_MIN_GENES = as_int(_get("l2_qc_min_genes", "200"))
QC_MIN_COUNTS = as_int(_get("l2_qc_min_counts", "500"))
# An empty or null-like value means "no mitochondrial threshold".
_raw_max_pct_mt = _get("l2_qc_max_pct_mt", "")
if _raw_max_pct_mt in ["", "None", "none", "null"]:
    QC_MAX_PCT_MT = None
else:
    QC_MAX_PCT_MT = float(_raw_max_pct_mt)

print("LINEAGE:", LINEAGE)
print("HARMONY_ENABLE:", HARMONY_ENABLE, "| batch_key:", HARMONY_BATCH_KEY)
print("LEIDEN_RESOLUTIONS:", LEIDEN_RESOLUTIONS, "| main:", LEIDEN_MAIN_RES)

### 3. Cargar input del linaje (salida del Notebook 06)

In [None]:
# File-system friendly lineage name (spaces become underscores).
safe_l1 = "_".join(LINEAGE.split(" "))
input_path = L1_DIR / f"TFM_CIRRHOSIS_Level1_{safe_l1}.h5ad"

if not input_path.exists():
    raise FileNotFoundError(
        f"No existe el input de linaje en:\n{input_path}\n"
        "Ejecuta el Notebook 06 (split por Level1) antes."
    )

adata = sc.read_h5ad(input_path)
print(adata)

# The object must carry a Level1 annotation...
if "Level1" not in adata.obs.columns:
    raise KeyError("Falta obs['Level1'] en el objeto de linaje.")

# ...and that annotation must hold exactly one value: the selected lineage.
u = adata.obs["Level1"].unique()
_is_pure = len(u) == 1 and u[0] == LINEAGE
if not _is_pure:
    raise ValueError(f"Este archivo no es puro '{LINEAGE}'. Level1 únicos: {u}")

### 4. Matriz base (X = log1p_10k)

In [None]:
# Point adata.X at the configured base layer; warn and keep X when it is absent.
if BASE_LAYER not in adata.layers:
    print(f"[AVISO] Falta layers['{BASE_LAYER}']; se usará adata.X tal cual.")
else:
    adata.X = adata.layers[BASE_LAYER]

### 5. QC opcional dentro del linaje

In [None]:
# Optional cell filter; each criterion applies only if its obs column exists.
if not ENABLE_LINEAGE_QC:
    print("QC linaje desactivado.")
else:
    obs = adata.obs
    n0 = adata.n_obs
    mask = np.ones(n0, dtype=bool)

    # Lower bounds: detected genes and total counts per cell.
    for _column, _minimum in (
        ("n_genes_by_counts", QC_MIN_GENES),
        ("total_counts", QC_MIN_COUNTS),
    ):
        if _column in obs.columns:
            mask &= obs[_column] >= _minimum

    # Upper bound on mitochondrial percentage, only when a threshold is set.
    if QC_MAX_PCT_MT is not None and "pct_counts_mt" in obs.columns:
        mask &= obs["pct_counts_mt"] <= QC_MAX_PCT_MT

    adata = adata[mask].copy()
    print("QC linaje:", n0, "->", adata.n_obs)

### 6. HVG de linaje

In [None]:
# Select lineage-specific highly variable genes (HVGs).
#
# scanpy expects raw counts only for the "seurat_v3" family of flavors; every
# other flavor expects log-normalised data. The layer is therefore chosen from
# the configured flavor instead of always passing the raw counts.
_COUNT_BASED_HVG_FLAVORS = ("seurat_v3", "seurat_v3_paper")

if HVG_FLAVOR in _COUNT_BASED_HVG_FLAVORS:
    if "counts" not in adata.layers:
        raise KeyError("Falta layers['counts'] (cuentas crudas). Revisar Notebook 03.")
    hvg_layer = "counts"
else:
    # Use the configured base layer when present; layer=None makes scanpy
    # read adata.X instead.
    hvg_layer = BASE_LAYER if BASE_LAYER in adata.layers else None

# Keep any existing HVG flag under a separate name before it is overwritten.
if "highly_variable" in adata.var.columns:
    adata.var["highly_variable_global"] = adata.var["highly_variable"].astype(bool)

# Never request more HVGs than there are genes.
n_top = min(N_TOP_HVGS, adata.n_vars)

sc.pp.highly_variable_genes(
    adata,
    flavor=HVG_FLAVOR,
    n_top_genes=n_top,
    layer=hvg_layer,
    inplace=True,
)

print("HVG linaje:", int(adata.var["highly_variable"].sum()))

### 7. PCA (solo HVG)

In [None]:
# PCA on the HVG subset only; the scores are copied back to the full object.
hvg_mask = adata.var["highly_variable"].astype(bool).to_numpy()
adata_hvg = adata[:, hvg_mask].copy()

# The arpack solver needs strictly fewer components than min(n_obs, n_vars),
# so clamp the configured value for small lineages instead of crashing.
max_pcs = min(adata_hvg.n_obs, adata_hvg.n_vars) - 1
if max_pcs < 1:
    raise ValueError(
        f"PCA imposible: se necesitan al menos 2 células y 2 HVG (shape={adata_hvg.shape})."
    )
n_pcs_used = min(N_PCS, max_pcs)
if n_pcs_used < N_PCS:
    print(
        f"[AVISO] N_PCS={N_PCS} excede el máximo permitido ({max_pcs}); "
        f"se usarán {n_pcs_used} componentes."
    )

# Scaling is applied to the HVG copy, so adata.X and its layers stay untouched.
sc.pp.scale(adata_hvg)
sc.tl.pca(adata_hvg, n_comps=n_pcs_used, svd_solver="arpack")

adata.obsm["X_pca"] = adata_hvg.obsm["X_pca"].copy()
print("X_pca:", adata.obsm["X_pca"].shape)

### 8. Harmony local

In [None]:
# Default representation for the neighbor graph; replaced below if Harmony runs.
use_rep_for_neighbors = "X_pca"

# Harmony runs only when enabled, the batch column exists, and it has >1 batch.
_batch_present = HARMONY_BATCH_KEY in adata.obs.columns
_run_harmony = (
    HARMONY_ENABLE
    and _batch_present
    and adata.obs[HARMONY_BATCH_KEY].nunique() > 1
)

if not _run_harmony:
    print("Harmony NO aplicado (desactivado o batch=1 o falta batch_key).")
else:
    sce.pp.harmony_integrate(
        adata,
        key=HARMONY_BATCH_KEY,
        basis=HARMONY_BASIS,
        adjusted_basis=HARMONY_ADJ_BASIS,
    )
    use_rep_for_neighbors = HARMONY_ADJ_BASIS
    print("Harmony OK ->", HARMONY_ADJ_BASIS, adata.obsm[HARMONY_ADJ_BASIS].shape)

### 9. Vecinos locales

In [None]:
# Build the lineage-local neighbor graph under its own key, on the chosen
# representation (plain PCA or the Harmony-adjusted one).
_neighbors_kwargs = dict(
    n_neighbors=N_NEIGHBORS,
    use_rep=use_rep_for_neighbors,
    key_added=LOCAL_NBR_KEY,
)
sc.pp.neighbors(adata, **_neighbors_kwargs)
print("neighbors.use_rep =", use_rep_for_neighbors)

### 10. UMAP local + guardar figura

In [None]:
# Compute the lineage-local UMAP from the local neighbor graph and store it
# under its own key.
sc.tl.umap(
    adata,
    neighbors_key=LOCAL_NBR_KEY,
    min_dist=UMAP_MIN_DIST,
    spread=UMAP_SPREAD,
    random_state=UMAP_RANDOM_STATE,
    key_added=LOCAL_UMAP_KEY,
)

fig_path = L2_FIG_DIR / f"umap_{safe_l1}_local.png"

# Color by the annotations that exist. The L2 column is looked up through the
# configured key rather than a hard-coded name.
umap_colors = [c for c in ("disease", LEIDEN_L2_KEY) if c in adata.obs.columns]

sc.pl.embedding(
    adata,
    basis=LOCAL_UMAP_KEY,
    # An empty list cannot be plotted; None draws a single uncolored panel.
    color=umap_colors or None,
    frameon=False,
    show=False,
)
# show=False leaves the figure open so it can be saved and closed explicitly.
plt.savefig(fig_path, bbox_inches="tight", dpi=200)
plt.close()
print("Figura:", fig_path)

### 11. Leiden L2 (varias resoluciones) + elegir principal

In [None]:
def _leiden_key(resolution) -> str:
    """Build the obs column name for one Leiden resolution (dots -> underscores)."""
    return f"leiden_L2_r{str(resolution).replace('.', '_')}"


# One Leiden clustering per configured resolution, all on the local graph.
leiden_keys = []
for res in LEIDEN_RESOLUTIONS:
    key = _leiden_key(res)
    sc.tl.leiden(adata, resolution=res, key_added=key, neighbors_key=LOCAL_NBR_KEY)
    leiden_keys.append(key)

# The main resolution must match one of the obs columns.
main_key = _leiden_key(LEIDEN_MAIN_RES)
if main_key not in adata.obs.columns:
    raise KeyError(f"No existe {main_key}. Calculados: {leiden_keys}")

# Copy the chosen clustering to the configured L2 column.
adata.obs[LEIDEN_L2_KEY] = adata.obs[main_key].astype("category")
print("L2 clusters:", adata.obs[LEIDEN_L2_KEY].value_counts().sort_index())

### 12. UMAP coloreado por L2 + guardar figura

In [None]:
fig_path = L2_FIG_DIR / f"umap_{safe_l1}_L2clusters.png"

# Always color by the L2 clusters; add the disease panel when available.
_l2_colors = [LEIDEN_L2_KEY]
if "disease" in adata.obs.columns:
    _l2_colors.append("disease")

sc.pl.embedding(
    adata,
    basis=LOCAL_UMAP_KEY,
    color=_l2_colors,
    frameon=False,
    ncols=2,
    show=False,
)
# Save the figure left open by show=False, then release it.
plt.savefig(fig_path, bbox_inches="tight", dpi=200)
plt.close()
print("Figura:", fig_path)

### 13. Guardar salida del linaje (local, en results/)

In [None]:
# Record the settings used for this run next to the data, then write the object.
_recorded_params = (
    ("LINEAGE", LINEAGE),
    ("BASE_LAYER", BASE_LAYER),
    ("N_TOP_HVGS", N_TOP_HVGS),
    ("N_PCS", N_PCS),
    ("HARMONY_ENABLE", HARMONY_ENABLE),
    ("HARMONY_BATCH_KEY", HARMONY_BATCH_KEY),
    ("use_rep_for_neighbors", use_rep_for_neighbors),
    ("N_NEIGHBORS", N_NEIGHBORS),
    ("UMAP_MIN_DIST", UMAP_MIN_DIST),
    ("UMAP_SPREAD", UMAP_SPREAD),
    ("UMAP_RANDOM_STATE", UMAP_RANDOM_STATE),
    ("LEIDEN_RESOLUTIONS", LEIDEN_RESOLUTIONS),
    ("LEIDEN_MAIN_RES", LEIDEN_MAIN_RES),
    ("LEIDEN_L2_KEY", LEIDEN_L2_KEY),
)
adata.uns["L2_params"] = dict(_recorded_params)

out_path = L2_DIR / f"TFM_CIRRHOSIS_Level1_{safe_l1}_L2embedding.h5ad"
adata.write_h5ad(out_path)
print("Guardado:", out_path)