### 1. Imports + rutas del repo + import de markers.py

In [None]:
from pathlib import Path
import json

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt

from src.paths import project_paths
from src import markers

# Report library versions so the notebook output documents the environment.
for _lib_label, _lib_module in (("Scanpy", sc), ("AnnData", ad)):
    print(f"{_lib_label} version:", _lib_module.__version__)

# Resolve the repository layout from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[_name]
    for _name in ("PROJECT_ROOT", "CONFIG_DIR", "DATA_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# Output locations used by this notebook, derived from the base directories.
INT_RESULTS_DIR = RESULTS_DIR / "integration"
ANN_RESULTS_DIR = RESULTS_DIR / "annotation" / "level1"
LINEAGES_DIR = RESULTS_DIR / "lineages" / "level1"
L1_FIG_DIR = FIGURES_DIR / "level1"

# Create every directory up front; existing directories are left untouched.
for _directory in (
    CONFIG_DIR,
    DATA_DIR,
    RESULTS_DIR,
    FIGURES_DIR,
    INT_RESULTS_DIR,
    ANN_RESULTS_DIR,
    LINEAGES_DIR,
    L1_FIG_DIR,
):
    _directory.mkdir(parents=True, exist_ok=True)

print("\nPROJECT_ROOT:", PROJECT_ROOT)
for _dir_label, _dir_path in (
    ("INT_RESULTS_DIR", INT_RESULTS_DIR),
    ("ANN_RESULTS_DIR", ANN_RESULTS_DIR),
    ("LINEAGES_DIR", LINEAGES_DIR),
    ("L1_FIG_DIR", L1_FIG_DIR),
):
    print(f"{_dir_label}:", _dir_path)

### 2. Leer config/config.yaml + cargar LEVEL1_MAP + leer paneles desde markers.py

In [None]:
def _parse_yaml_scalar(raw: str) -> str:
    """Return the text of one flat YAML scalar without quotes or comments.

    Args:
        raw: Everything that follows the first ``:`` on a config line.

    Returns:
        The scalar as a plain string (no type conversion is attempted).
    """
    value = raw.strip()

    # Quoted scalar: keep exactly what is between the matching quotes, so a
    # '#' inside the quotes stays data. Only accepted when nothing but an
    # optional comment follows the closing quote.
    if value[:1] in ('"', "'"):
        closing = value.find(value[0], 1)
        if closing != -1:
            trailer = value[closing + 1:].strip()
            if not trailer or trailer.startswith("#"):
                return value[1:closing]

    # Plain scalar: a '#' at the start or preceded by whitespace opens a
    # comment; drop it so it does not end up inside the value.
    for pos, char in enumerate(value):
        if char == "#" and (pos == 0 or value[pos - 1].isspace()):
            value = value[:pos].rstrip()
            break

    # Same lenient quote stripping as before for anything left over.
    return value.strip('"').strip("'")


def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` YAML file into a dict of strings.

    Only single-line mappings are understood. Blank lines, full-line
    comments and lines without a ``:`` are skipped; indentation is ignored,
    so nested keys are flattened and a repeated key keeps its last value.
    Trailing ``# comments`` are removed from values.

    Args:
        path: Location of the UTF-8 encoded config file.

    Returns:
        Mapping of key to value, both as strings.
    """
    cfg = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or ":" not in line:
            continue
        key, _, raw_value = line.partition(":")
        cfg[key.strip()] = _parse_yaml_scalar(raw_value)
    return cfg

def as_int(x: str) -> int:
    """Convert a numeric string to ``int``, going through ``float`` first.

    The detour through ``float`` lets values such as ``"200.0"`` or ``"1e3"``
    be accepted; any fractional part is truncated towards zero.
    """
    as_float = float(x)
    return int(as_float)

# The project configuration is mandatory for this notebook.
cfg_path = CONFIG_DIR / "config.yaml"
if not cfg_path.exists():
    raise FileNotFoundError(f"Falta {cfg_path}. Debe existir en el repo.")

CFG = load_simple_yaml(cfg_path)

# Required entries: a missing key raises KeyError right here.
IN_FILENAME = CFG["neighbors_umap_h5ad_filename"]
OUT_FILENAME = CFG["main_level1_h5ad_filename"]

# Embedding key, plus the basis name obtained by dropping a leading "X_".
UMAP_KEY = CFG.get("umap_key", "X_umap_harmony")
if UMAP_KEY.startswith("X_"):
    UMAP_BASIS = UMAP_KEY[len("X_"):]
else:
    UMAP_BASIS = UMAP_KEY

# obs column names: clustering input and Level1 annotation output.
LEIDEN_L1_KEY = CFG.get("leiden_l1_key", "leiden_L1")
LEVEL1_KEY = CFG.get("level1_key", "Level1")

level1_map_path = CONFIG_DIR / CFG["level1_map_json"]

# Comma-separated list of labels to drop; empty items are ignored.
_remove_label_items = CFG.get("level1_remove_labels", "").split(",")
REMOVE_LABELS = [item.strip() for item in _remove_label_items if item.strip()]

# Differential-expression settings.
RANK_KEY = CFG.get("level1_rankgenes_key", "rank_genes_Level1")
RANK_N_GENES = as_int(CFG.get("level1_rank_n_genes", "200"))

if not level1_map_path.exists():
    raise FileNotFoundError(f"Falta {level1_map_path}. Debe existir en el repo.")

# Cluster -> Level1 label mapping maintained by hand in the repo.
with level1_map_path.open(encoding="utf-8") as _map_handle:
    LEVEL1_MAP = json.load(_map_handle)

# Marker panels come from markers.py and must be present.
if not hasattr(markers, "geneMarkers_level1"):
    raise AttributeError("src/markers.py no tiene 'geneMarkers_level1'. Debe existir.")
geneMarkers_level1 = markers.geneMarkers_level1

# Optional symbol -> var_name helper from markers.py.
HAS_SYMBOLS_TO_VARNAMES = hasattr(markers, "symbols_to_varnames")

print("Markers Level1 cargados desde src/markers.py:", list(geneMarkers_level1.keys()))
print("markers.symbols_to_varnames disponible:", HAS_SYMBOLS_TO_VARNAMES)
print("LEVEL1_MAP entradas:", len(LEVEL1_MAP))

### 3. Cargar input (salida del Notebook 05) + preparar adata.X

In [None]:
# Input object produced upstream (neighbors + UMAP already computed).
in_path = INT_RESULTS_DIR / IN_FILENAME
if not in_path.exists():
    missing_input_msg = (
        f"No se encuentra el objeto con neighbors+UMAP en:\n{in_path}\n\n"
        "Ejecuta primero el Notebook 05."
    )
    raise FileNotFoundError(missing_input_msg)

print("Leyendo input:", in_path)
adata = sc.read_h5ad(in_path)
print(adata)

# Expression matrix used for dotplots / DE: prefer the log1p_10k layer.
if "log1p_10k" not in adata.layers:
    print("[AVISO] No existe layers['log1p_10k']; se usará adata.X tal cual.")
else:
    adata.X = adata.layers["log1p_10k"]

# The clustering column is required by every later step.
if LEIDEN_L1_KEY not in adata.obs.columns:
    raise KeyError(f"Falta obs['{LEIDEN_L1_KEY}']. Revisa Notebook 05.")

# Guarantee a var['symbol'] column so markers can be mapped by symbol:
# take var['features'] when available, otherwise the var_names themselves.
if "symbol" not in adata.var.columns:
    symbol_source = adata.var["features"] if "features" in adata.var.columns else adata.var_names
    adata.var["symbol"] = symbol_source.astype(str)

print("\nDistribución de clusters (leiden_L1):")
cluster_sizes = adata.obs[LEIDEN_L1_KEY].value_counts().sort_index()
print(cluster_sizes)

### 4. Construir varnames_level1 usando markers.py (y fallback simple si no tiene helper)

In [None]:
def fallback_symbols_to_varnames(adata, symbols):
    """Map gene symbols to ``adata.var_names`` via ``adata.var['symbol']``.

    Matching is case-insensitive. The result follows the order of
    ``adata.var_names`` (not the order of ``symbols``); symbols that are not
    found are silently left out.

    Args:
        adata: Object exposing ``var['symbol']`` and ``var_names``.
        symbols: Iterable of gene symbols (strings).

    Returns:
        List of var_names whose symbol is among ``symbols``.
    """
    available = adata.var["symbol"].astype(str).str.upper()
    requested = {symbol.upper() for symbol in symbols}
    is_requested = available.isin(requested)
    return adata.var_names[is_requested].tolist()

# Pick the symbol resolver once: the helper from markers.py when it exists,
# otherwise the local case-insensitive fallback.
if HAS_SYMBOLS_TO_VARNAMES:
    _resolve_symbols = markers.symbols_to_varnames
else:
    _resolve_symbols = fallback_symbols_to_varnames

# Level1 panel name -> var_names resolved in this dataset.
varnames_level1 = {
    panel_name: _resolve_symbols(adata, panel_symbols)
    for panel_name, panel_symbols in geneMarkers_level1.items()
}

### 5. UMAP overview + guardar figura

In [None]:
# Colour by the clustering plus whichever QC / metadata columns exist.
_optional_color_columns = ("gem_id", "disease", "total_counts", "n_genes_by_counts", "pct_counts_mt")
color_vars = [LEIDEN_L1_KEY] + [
    column for column in _optional_color_columns if column in adata.obs.columns
]

umap_overview_path = L1_FIG_DIR / "umap_overview.png"

# show=False keeps the figure open so it can be written to disk below.
sc.pl.embedding(adata, basis=UMAP_BASIS, color=color_vars, ncols=3, frameon=False, show=False)
plt.savefig(umap_overview_path, bbox_inches="tight", dpi=200)
plt.close()
print("Figura guardada:", umap_overview_path)

### 6. Dotplot de marcadores por cluster + guardar figura

In [None]:
# Dotplot of the Level1 marker panels per Leiden cluster.
# return_fig=True is required here: with show=False alone, sc.pl.dotplot
# returns a dict of matplotlib Axes, which has no .savefig(); only the
# DotPlot object returned by return_fig=True can be saved this way.
dp = sc.pl.dotplot(
    adata,
    var_names=varnames_level1,
    groupby=LEIDEN_L1_KEY,
    standard_scale="var",
    dendrogram=False,
    return_fig=True,
)
dotplot_path = L1_FIG_DIR / "dotplot_level1_markers_by_cluster.png"
# DotPlot.savefig renders the figure and saves it (bbox_inches="tight" by
# default); dpi matches the other figures written by this notebook.
dp.savefig(dotplot_path, dpi=200)
# Close the figure like the other plotting cells do, so figures do not pile up.
plt.close()
print("Figura guardada:", dotplot_path)

### 7. Autosuggest para rellenar config/level1_map.json

In [None]:
# Score simple por cluster: media de expresión del panel (excluye UTC)
clusters = adata.obs[LEIDEN_L1_KEY].astype(str)
unique_clusters = clusters.unique().tolist()

panel_scores = pd.DataFrame(index=unique_clusters, columns=[k for k in varnames_level1.keys() if k != "UTC"], dtype=float)

for l1 in panel_scores.columns:
    genes = [g for g in varnames_level1[l1] if g in adata.var_names]
    if not genes:
        continue
    for cl in unique_clusters:
        m = (clusters == cl).to_numpy()
        panel_scores.loc[cl, l1] = float(np.asarray(adata[m, genes].X.mean()).ravel()[0])

auto_map = panel_scores.idxmax(axis=1).to_dict()

autosuggest_path = ANN_RESULTS_DIR / "level1_map_autosuggested.json"
autosuggest_path.write_text(json.dumps(auto_map, indent=2, ensure_ascii=False), encoding="utf-8")
print("Mapa sugerido (local) guardado en:", autosuggest_path)

### 8. Aplicar LEVEL1_MAP

In [None]:
# Every cluster present in the data must have an entry in LEVEL1_MAP.
cluster_ids = adata.obs[LEIDEN_L1_KEY].astype(str)
clusters_present = sorted(set(cluster_ids))
missing = sorted(cluster for cluster in clusters_present if cluster not in LEVEL1_MAP)

if missing:
    incomplete_map_msg = (
        "LEVEL1_MAP está incompleto.\n"
        f"Clusters sin asignación: {missing}\n\n"
        "Usa la propuesta local y pega el mapa definitivo en config/level1_map.json:\n"
        f"- {ANN_RESULTS_DIR / 'level1_map_autosuggested.json'}"
    )
    raise KeyError(incomplete_map_msg)

# Translate cluster ids into Level1 labels and store them as a categorical.
level1_labels = cluster_ids.map(LEVEL1_MAP)
adata.obs[LEVEL1_KEY] = level1_labels.astype("category")

print("\nDistribución Level1 (antes de eliminar labels):")
print(adata.obs[LEVEL1_KEY].value_counts())

### 9. Eliminar labels (Platelets) + UMAP final Level1

In [None]:
# Only labels that are configured AND exist as categories can be removed.
to_remove = [
    label for label in REMOVE_LABELS if label in adata.obs[LEVEL1_KEY].cat.categories
]
rm_mask = adata.obs[LEVEL1_KEY].isin(to_remove) if to_remove else None
n_rm = int(rm_mask.sum()) if rm_mask is not None else 0

if n_rm > 0:
    # Drop the flagged cells and forget the now-empty categories.
    adata = adata[~rm_mask].copy()
    adata.obs[LEVEL1_KEY] = adata.obs[LEVEL1_KEY].cat.remove_unused_categories()

print("\nDistribución Level1 (final):")
print(adata.obs[LEVEL1_KEY].value_counts())

umap_final_path = L1_FIG_DIR / "umap_level1_final.png"

# show=False keeps the figure open so it can be written to disk below.
sc.pl.embedding(adata, basis=UMAP_BASIS, color=[LEVEL1_KEY], frameon=False, show=False)
plt.savefig(umap_final_path, bbox_inches="tight", dpi=200)
plt.close()
print("Figura guardada:", umap_final_path)

### 10. Rank genes por Level1 + figura

In [None]:
print("\nCalculando rank_genes_groups por Level1...")

# Wilcoxon test per Level1 group on adata.X (use_raw=False); results are
# stored under adata.uns[RANK_KEY], including expression fractions (pts).
rank_settings = dict(
    groupby=LEVEL1_KEY,
    method="wilcoxon",
    use_raw=False,
    n_genes=RANK_N_GENES,
    pts=True,
    key_added=RANK_KEY,
)
sc.tl.rank_genes_groups(adata, **rank_settings)

rank_fig_path = L1_FIG_DIR / "rank_genes_level1_top20.png"

# Top-20 genes per group, each panel with its own y axis.
sc.pl.rank_genes_groups(adata, key=RANK_KEY, n_genes=20, sharey=False, show=False)
plt.savefig(rank_fig_path, bbox_inches="tight", dpi=200)
plt.close()
print("Figura guardada:", rank_fig_path)

### 11. Exportar CSV de marcadores

In [None]:
def _optional_group_values(rg, field: str, group: str, names):
    """Return the values of an optional result field for one group.

    Args:
        rg: Mapping holding the rank_genes_groups results.
        field: Name of the optional entry (e.g. ``"pts"``).
        group: Group whose values are requested.
        names: Ranked gene names of ``group``; defines order and length.

    Returns:
        One value per entry of ``names``. NaNs when ``field`` is absent.
    """
    table = rg.get(field, None)
    if table is None:
        return [np.nan] * len(names)

    values = table[group]
    if isinstance(values, pd.Series):
        # Gene-indexed values (the layout of "pts" / "pts_rest") are not in
        # rank order, so they must be aligned by gene name; taking them
        # positionally would pair each ranked gene with another gene's value.
        if not values.index.is_unique:
            # reindex() refuses duplicated labels; keep the first occurrence.
            values = values[~values.index.duplicated(keep="first")]
        return values.reindex(pd.Index(names)).to_numpy()

    # Per-rank arrays (e.g. "logfoldchanges") are already in rank order.
    return values


def rank_genes_to_long_df(adata, key: str, groupby: str) -> pd.DataFrame:
    """Flatten ``adata.uns[key]`` (rank_genes_groups output) to long format.

    Args:
        adata: Object whose ``uns[key]`` holds ``names``, ``scores``,
            ``pvals`` and ``pvals_adj`` as structured arrays with one field
            per group, and optionally ``logfoldchanges``, ``pts``,
            ``pts_rest``.
        key: Entry of ``adata.uns`` to read.
        groupby: Name given to the output column that stores the group.

    Returns:
        DataFrame with one row per (group, ranked gene) and the columns
        ``groupby``, ``gene``, ``scores``, ``pvals``, ``pvals_adj``,
        ``logfoldchanges``, ``pct_in_group``, ``pct_out_group`` and ``rank``
        (1-based). Optional fields that are missing are filled with NaN.

    Raises:
        KeyError: If ``key`` or one of the mandatory fields is missing.
    """
    rg = adata.uns[key]
    records = []

    for group in rg["names"].dtype.names:
        names = rg["names"][group]
        per_rank_columns = zip(
            names,
            rg["scores"][group],
            rg["pvals"][group],
            rg["pvals_adj"][group],
            _optional_group_values(rg, "logfoldchanges", group, names),
            _optional_group_values(rg, "pts", group, names),
            _optional_group_values(rg, "pts_rest", group, names),
        )

        for rank, (gene, s, pv, pva, lf, pi, po) in enumerate(per_rank_columns, start=1):
            records.append({
                groupby: group,
                "gene": gene,
                "scores": s,
                "pvals": pv,
                "pvals_adj": pva,
                "logfoldchanges": lf,
                "pct_in_group": pi,
                "pct_out_group": po,
                "rank": rank,
            })

    return pd.DataFrame.from_records(records)

# Long-format marker table for Level1, written next to the other annotation results.
out_csv = ANN_RESULTS_DIR / "TFM_CIRRHOSIS_Level1_markers_rank_genes.csv"
markers_L1_df = rank_genes_to_long_df(adata, RANK_KEY, LEVEL1_KEY)
markers_L1_df.to_csv(out_csv, index=False)
print("CSV guardado en:", out_csv)

### 12. Split por Level1

In [None]:
# Write one .h5ad per Level1 lineage.
cats = list(adata.obs[LEVEL1_KEY].cat.categories)
print("Linajes Level1 a exportar:", cats)

for l1 in cats:
    mask = (adata.obs[LEVEL1_KEY] == l1).to_numpy()
    n = int(mask.sum())
    if n == 0:
        # Category without cells: nothing to export.
        continue

    # Build a file-name-safe version of the label. Spaces become "_" as
    # before; path separators become "-" because a "/" or "\" inside the
    # label would otherwise be treated as a sub-directory that does not
    # exist and the write would fail. str() tolerates non-string categories.
    safe = str(l1).replace(" ", "_").replace("/", "-").replace("\\", "-")
    out_path = LINEAGES_DIR / f"TFM_CIRRHOSIS_Level1_{safe}.h5ad"
    adata_l1 = adata[mask].copy()
    adata_l1.write_h5ad(out_path)

    print(f"- {l1}: {n} células -> {out_path}")

### 13. Guardar objeto principal con Level1

In [None]:
# Keep the mapping that produced the labels inside the object, then persist it.
out_main = ANN_RESULTS_DIR / OUT_FILENAME
adata.uns["LEVEL1_MAP_used"] = LEVEL1_MAP
adata.write_h5ad(out_main)

print("\nObjeto principal con Level1 guardado en:", out_main, sep="\n")