### 1. Imports + paths

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

from src.paths import project_paths
from src import markers

# Report the scanpy version used for this run.
print("Scanpy:", sc.__version__)

# Resolve the project directory layout from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[name]
    for name in ("PROJECT_ROOT", "CONFIG_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# Level2 output locations: lineage objects, marker tables and figures.
L2_DIR = RESULTS_DIR / "lineages" / "level2"
MARKERS_DIR = RESULTS_DIR / "markers" / "level2"
FIG_L2_DIR = FIGURES_DIR / "level2"

# Create every output directory up front (no error if it already exists).
for _out_dir in (L2_DIR, MARKERS_DIR, FIG_L2_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the notebook output records them.
for _label, _location in (
    ("PROJECT_ROOT", PROJECT_ROOT),
    ("L2_DIR", L2_DIR),
    ("MARKERS_DIR", MARKERS_DIR),
    ("FIG_L2_DIR", FIG_L2_DIR),
):
    print(f"{_label}:", _location)

### 2. Parameters

In [None]:
# ....... Parameters (edit before running) ........
LINEAGE = "B"  # one of: "T_and_NK" | "B" | "Mono_and_DC"
GROUPBY_KEY = "leiden_L2"  # obs column holding the Level2 clustering
BASE_LAYER = "log1p_10k"  # layer copied into adata.X when present
RANKGENES_KEY = "rank_genes_leiden_L2"  # adata.uns key read by the marker export
TOP_N_PLOT = 20
# .................................................

# Lineages that carry a Level2 clustering vs. single-population lineages.
VALID_LINEAGES_WITH_L2 = ["T_and_NK", "B", "Mono_and_DC"]
LINEAGES_WITHOUT_L2 = ["Plasma", "pDC", "RBC", "HSCs"]

# Fail fast on a lineage that cannot be processed by this notebook.
if LINEAGE in LINEAGES_WITHOUT_L2:
    raise ValueError(f"'{LINEAGE}' no tiene Level2 (población única).")
elif LINEAGE not in VALID_LINEAGES_WITH_L2:
    raise ValueError(f"LINEAGE inválido. Usa: {VALID_LINEAGES_WITH_L2}")

# File-system friendly lineage tag, reused in every output file name.
safe_l1 = LINEAGE.replace(" ", "_")

input_name = f"TFM_CIRRHOSIS_Level1_{safe_l1}_L2embedding.h5ad"
input_path = L2_DIR / input_name
print("Input:", input_path)

### 3. Cargar objeto L2 (salida del NB07)

In [None]:
# The Level2 embedding is an upstream product; stop early if it is missing.
if not input_path.exists():
    missing_input_msg = (
        f"No existe:\n{input_path}\n"
        "Ejecuta antes el Notebook 07 para este linaje."
    )
    raise FileNotFoundError(missing_input_msg)

adata = sc.read_h5ad(input_path)
print(adata)

# The object must carry the Level1 annotation ...
if "Level1" not in adata.obs.columns:
    raise KeyError("Falta obs['Level1'].")

# ... and must contain exactly one Level1 label, equal to the selected lineage.
level1_values = adata.obs["Level1"].unique()
is_pure_lineage = len(level1_values) == 1 and level1_values[0] == LINEAGE
if not is_pure_lineage:
    raise ValueError(
        f"Este objeto no es puro '{LINEAGE}'. Level1 únicos: {level1_values}"
    )

# The Level2 clustering column is required by every later cell.
if GROUPBY_KEY not in adata.obs.columns:
    raise KeyError(f"Falta obs['{GROUPBY_KEY}'] (clustering L2).")

# Show the number of cells per Level2 cluster, ordered by cluster label.
cluster_sizes = adata.obs[GROUPBY_KEY].value_counts().sort_index()
print("\nClusters L2:")
print(cluster_sizes)

### 4. Matriz base para dotplots/DE

In [None]:
# Point adata.X at the BASE_LAYER layer so later cells read from it;
# when the layer is absent, keep adata.X untouched and emit a warning.
if BASE_LAYER not in adata.layers:
    print(f"[AVISO] Falta layers['{BASE_LAYER}']; se usará adata.X tal cual.")
else:
    adata.X = adata.layers[BASE_LAYER]

### 5. Paneles canónicos Level2 + mapping a var_names

In [None]:
# Canonical Level2 marker panels for the selected lineage (empty when the
# markers module defines none).
geneMarkers_level2 = getattr(markers, "geneMarkers_level2", {})
l2_panels = geneMarkers_level2.get(LINEAGE, {})

print("LINEAGE:", LINEAGE)
print("¿Hay paneles Level2 para este linaje?:", bool(l2_panels))

filtered_panels = {}
if l2_panels:
    # Make sure var has a 'symbol' column: take it from 'features' when that
    # column exists, otherwise fall back to var_names.
    if "symbol" not in adata.var.columns:
        symbol_source = (
            adata.var["features"]
            if "features" in adata.var.columns
            else adata.var_names
        )
        adata.var["symbol"] = symbol_source.astype(str)

    # Translate every panel to var_names and keep only panels with >= 1 hit.
    mapped_panels = {
        panel_name: markers.symbols_to_varnames(adata, panel_genes)
        for panel_name, panel_genes in l2_panels.items()
    }
    filtered_panels = {
        panel_name: mapped
        for panel_name, mapped in mapped_panels.items()
        if mapped
    }

print("Paneles filtrados (genes presentes):", list(filtered_panels.keys()))

### 6. Dotplot canónico por cluster L2

In [None]:
# Dotplot of the canonical Level2 panels, one row per Level2 cluster.
if filtered_panels:
    # return_fig=True makes scanpy hand back the DotPlot object, which is the
    # thing that exposes .savefig(). With only show=False the call returns a
    # dict of matplotlib axes and dp.savefig(...) raises AttributeError.
    dp = sc.pl.dotplot(
        adata,
        var_names=filtered_panels,
        groupby=GROUPBY_KEY,
        standard_scale="var",
        dendrogram=True,
        return_fig=True,
    )
    fig_path = FIG_L2_DIR / f"dotplot_{safe_l1}_L2_panels.png"
    dp.savefig(fig_path)
    print("Figura:", fig_path)
else:
    print("No hay paneles canónicos mapeables para este linaje; se omite dotplot de paneles.")

### 7. Dotplot extra para T_and_NK (doublets)

In [None]:
# Extra dotplot for the T/NK lineage only: CD3 chains plus NCAM1 per cluster,
# used as a visual doublet check.
if LINEAGE == "T_and_NK":
    extra_genes = ["CD3D", "CD3E", "CD3G", "NCAM1"]
    extra_var = markers.symbols_to_varnames(adata, extra_genes)

    if extra_var:
        # return_fig=True yields the DotPlot object (which has .savefig());
        # with only show=False scanpy returns a dict of axes and
        # dp.savefig(...) would raise AttributeError.
        dp = sc.pl.dotplot(
            adata,
            var_names=extra_var,
            groupby=GROUPBY_KEY,
            standard_scale="var",
            dendrogram=False,
            return_fig=True,
        )
        fig_path = FIG_L2_DIR / f"dotplot_{safe_l1}_doublet_check.png"
        dp.savefig(fig_path)
        print("Figura:", fig_path)
    else:
        print("[AVISO] No se pudieron mapear CD3*/NCAM1.")

### 8. DE 1-vs-rest por cluster L2 (Wilcoxon) + guardar figura

In [None]:
# One-vs-rest differential expression per Level2 cluster (Wilcoxon).
# The results are stored under adata.uns[RANKGENES_KEY], which the CSV export
# cell reads; without this step that key does not exist.
sc.tl.rank_genes_groups(
    adata,
    groupby=GROUPBY_KEY,
    method="wilcoxon",
    use_raw=False,  # test on adata.X (set from BASE_LAYER above), not adata.raw
    pts=True,  # also store the fraction of expressing cells in/out of each group
    key_added=RANKGENES_KEY,
)

# Plot the top-ranked genes per cluster and save the figure.
sc.pl.rank_genes_groups(
    adata,
    key=RANKGENES_KEY,
    n_genes=TOP_N_PLOT,
    sharey=False,
    show=False,
)
fig_path = FIG_L2_DIR / f"rank_genes_{safe_l1}_L2.png"
# show=False leaves the figure open as the current matplotlib figure.
plt.gcf().savefig(fig_path, dpi=150, bbox_inches="tight")
print("Figura:", fig_path)

### 9. Export a CSV

In [None]:
def rank_genes_to_long_df(adata, key: str, groupby: str, lineage=None) -> pd.DataFrame:
    """Flatten a rank_genes_groups result into one long table.

    Parameters
    ----------
    adata
        Object exposing ``uns[key]`` with the rank_genes_groups result
        (``names``, ``scores``, ``pvals``, ``pvals_adj`` and optionally
        ``logfoldchanges``, ``pts``, ``pts_rest``).
    key
        Key in ``adata.uns`` holding the result.
    groupby
        Name of the extra column that repeats the group label (in addition
        to the fixed ``cluster`` column).
    lineage
        Value written to the ``lineage`` column. Defaults to the module-level
        ``LINEAGE`` when omitted.

    Returns
    -------
    pd.DataFrame
        One row per (group, gene) with columns ``lineage``, ``<groupby>``,
        ``cluster``, ``gene``, ``scores``, ``pvals``, ``pvals_adj``,
        ``logfoldchanges``, ``pct_in_group``, ``pct_out_group`` and ``rank``
        (1-based position within the group). Optional fields that are absent
        from the result are filled with NaN.

    Raises
    ------
    KeyError
        If ``key`` or one of the mandatory fields is missing.
    """
    if lineage is None:
        lineage = LINEAGE

    rg = adata.uns[key]
    groups = rg["names"].dtype.names

    def _optional_column(field, group, names):
        """Return ``field`` for ``group`` aligned to ``names`` (NaN if absent)."""
        table = rg.get(field, None)
        if table is None:
            return np.full(len(names), np.nan)
        values = table[group]
        if isinstance(values, pd.Series):
            # scanpy stores pts / pts_rest as DataFrames indexed by gene in
            # var_names order, NOT in rank order, so they must be aligned on
            # the gene name. Duplicate labels are dropped first because
            # reindex() rejects a non-unique index.
            values = values[~values.index.duplicated(keep="first")]
            return values.reindex(names).to_numpy()
        # Structured-array fields (e.g. logfoldchanges) are already ranked.
        return np.asarray(values)

    frames = []
    for group in groups:
        names = np.asarray(rg["names"][group])
        frames.append(
            pd.DataFrame(
                {
                    "lineage": lineage,
                    groupby: group,
                    "cluster": group,
                    "gene": names,
                    "scores": np.asarray(rg["scores"][group]),
                    "pvals": np.asarray(rg["pvals"][group]),
                    "pvals_adj": np.asarray(rg["pvals_adj"][group]),
                    "logfoldchanges": _optional_column("logfoldchanges", group, names),
                    "pct_in_group": _optional_column("pts", group, names),
                    "pct_out_group": _optional_column("pts_rest", group, names),
                    "rank": np.arange(1, len(names) + 1),
                }
            )
        )

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Flatten the stored differential-expression result into a long table.
markers_df = rank_genes_to_long_df(
    adata,
    key=RANKGENES_KEY,
    groupby=GROUPBY_KEY,
)

# Persist the table next to the other Level2 marker outputs.
csv_name = f"TFM_CIRRHOSIS_Level1_{safe_l1}_L2markers_rank_genes.csv"
csv_path = MARKERS_DIR / csv_name
markers_df.to_csv(csv_path, index=False)
print("CSV:", csv_path)

# Last expression of the cell: preview the first rows.
markers_df.head()

### 10. Resumen numérico panel vs cluster

In [None]:
def summarize_panels(markers_df: pd.DataFrame, panels: dict) -> pd.DataFrame:
    """Summarise, per cluster, how each marker panel scores in the DE table.

    Parameters
    ----------
    markers_df
        Long marker table with at least the columns ``cluster``, ``gene``,
        ``logfoldchanges`` and ``pct_in_group``.
    panels
        Mapping ``panel name -> iterable of gene identifiers`` matched against
        the ``gene`` column.

    Returns
    -------
    pd.DataFrame
        One row per (cluster, panel) with at least one matching gene, holding
        ``n_genes_found``, ``mean_lfc``, ``mean_pct_in`` and ``max_lfc``.
        Empty when nothing matches.
    """
    clusters = markers_df["cluster"].unique().tolist()

    # Prefer numeric ordering for labels such as "0", "1", ..., "10"; fall
    # back to plain sorting when a label is not an integer.
    try:
        clusters = sorted(clusters, key=int)
    except (TypeError, ValueError):
        clusters = sorted(clusters)

    rows = []
    for cl in clusters:
        df_c = markers_df[markers_df["cluster"] == cl]
        for panel_name, genes in panels.items():
            sub = df_c[df_c["gene"].isin(genes)]
            if sub.empty:
                continue
            rows.append({
                "cluster": cl,
                "panel": panel_name,
                "n_genes_found": sub.shape[0],
                "mean_lfc": sub["logfoldchanges"].mean(),
                "mean_pct_in": sub["pct_in_group"].mean(),
                "max_lfc": sub["logfoldchanges"].max(),
            })
    return pd.DataFrame(rows)


if filtered_panels:
    panel_summary = summarize_panels(markers_df, filtered_panels)
    if not panel_summary.empty:
        # Rows whose mean_lfc is NaN cannot be ranked: idxmax() would return
        # NaN (or raise, depending on the pandas version) and the .loc lookup
        # below would fail, so they are left out of the "best panel" view
        # only. The exported CSV still contains every row.
        rankable = panel_summary.dropna(subset=["mean_lfc"])
        if rankable.empty:
            best_by_mean_lfc = rankable
        else:
            best_by_mean_lfc = (
                rankable.loc[rankable.groupby("cluster")["mean_lfc"].idxmax()]
                .sort_values("cluster")
                .reset_index(drop=True)
            )
        print(best_by_mean_lfc)

        summary_path = MARKERS_DIR / f"TFM_CIRRHOSIS_Level1_{safe_l1}_L2panel_summary.csv"
        panel_summary.to_csv(summary_path, index=False)
        print("Panel summary CSV:", summary_path)
else:
    print("Sin paneles -> sin resumen panel/cluster.")

### 11. Guardar objeto enriquecido

In [None]:
# Write the object (with everything added by this notebook) next to the
# Level2 input file.
out_name = f"TFM_CIRRHOSIS_Level1_{safe_l1}_L2markers.h5ad"
out_h5ad = L2_DIR / out_name
adata.write_h5ad(out_h5ad)
print("h5ad:", out_h5ad)