### 1. Imports + paths

In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import scanpy as sc

from src.paths import project_paths

print("Scanpy:", sc.__version__)

# Resolve the project directory layout from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[name]
    for name in ("PROJECT_ROOT", "CONFIG_DIR", "DATA_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# Output locations used by this notebook: one folder for the dotplot figure,
# one for the accompanying QA/summary tables.
FIG_DIR = FIGURES_DIR / "dotplots"
OUT_SUMMARY = RESULTS_DIR / "summary_tables" / "dotplot_global_level2_final"

# Create every output directory up front (no-op when it already exists).
for out_dir in (RESULTS_DIR, FIGURES_DIR, FIG_DIR, OUT_SUMMARY):
    out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the run log documents where files go.
for label, location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("CONFIG_DIR  :", CONFIG_DIR),
    ("DATA_DIR    :", DATA_DIR),
    ("RESULTS_DIR :", RESULTS_DIR),
    ("FIG_DIR     :", FIG_DIR),
    ("OUT_SUMMARY :", OUT_SUMMARY),
):
    print(label, location)

### 2. Leer config + resolver rutas de input + Level2_final_map.json

In [None]:
def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` config file into a dict of strings.

    Only a minimal YAML subset is handled: one scalar per line, no nesting
    and no lists. Blank lines, full-line ``#`` comments and lines without a
    colon are skipped. The line is split on the FIRST colon, so values may
    themselves contain colons (e.g. ``C:/data``).

    Trailing comments are removed from values: for an unquoted value a ``#``
    at the start or preceded by whitespace begins a comment; for a quoted
    value anything after the closing quote is ignored when it is empty or a
    comment. Surrounding quotes are stripped.

    Args:
        path: Location of the config file; it is read as UTF-8.

    Returns:
        Mapping of key to value. Every value is a string; a key with nothing
        after the colon maps to the empty string.
    """

    def _clean_value(raw: str) -> str:
        text = raw.strip()
        if text[:1] in ('"', "'"):
            # Quoted scalar: keep exactly what sits between the quote pair
            # when the remainder of the line is empty or a comment.
            closing = text.find(text[0], 1)
            if closing != -1:
                tail = text[closing + 1:].strip()
                if not tail or tail.startswith("#"):
                    return text[1:closing]
        else:
            # Unquoted scalar: cut at the first comment marker. A '#' glued
            # to preceding text (e.g. "a#b") is part of the value.
            for pos, char in enumerate(text):
                if char == "#" and (pos == 0 or text[pos - 1].isspace()):
                    text = text[:pos].rstrip()
                    break
        # Legacy fallback: drop stray leading/trailing quote characters.
        return text.strip('"').strip("'")

    cfg = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or ":" not in line:
            continue
        key, _, value = line.partition(":")
        cfg[key.strip()] = _clean_value(value)
    return cfg

# Load the project configuration; fail early when the file is absent.
cfg_path = CONFIG_DIR / "config.yaml"
if not cfg_path.exists():
    raise FileNotFoundError(f"Falta {cfg_path}")
CFG = load_simple_yaml(cfg_path)

# Main input: the filtered object written by NB10 (RBC already removed).
OUT_FILTER_NAME = CFG.get(
    "main_filtered_for_analysis_h5ad_filename",
    "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",
)
OUT_FILTER = RESULTS_DIR / OUT_FILTER_NAME

# obs column names and the expression layer are configurable; each falls back
# to the project default when the config does not define it.
LEVEL2_KEY, LEVEL1_KEY, LEVEL1_REFINED_KEY, LAYER = (
    CFG.get(option, fallback)
    for option, fallback in (
        ("level2_key", "Level2"),
        ("level1_key", "Level1"),
        ("level1_refined_key", "Level1_refined"),
        ("analysis_layer", "log1p_10k"),
    )
)

for label, setting in (
    ("OUT_FILTER        :", OUT_FILTER),
    ("LEVEL2_KEY        :", LEVEL2_KEY),
    ("LEVEL1_KEY        :", LEVEL1_KEY),
    ("LEVEL1_REFINED_KEY:", LEVEL1_REFINED_KEY),
    ("LAYER             :", LAYER),
):
    print(label, setting)

# The input must already exist: it is produced by the cleanup notebook.
if not OUT_FILTER.exists():
    raise FileNotFoundError(
        f"No existe OUT_FILTER:\n{OUT_FILTER}\n¿Has ejecutado el notebook de limpieza (NB10) y guardado el objeto filtrado?"
    )

# Level2_final_map.json (Conv_T_other -> CD4_Memory). The file is looked up in
# several known locations under results/summary_tables/, in priority order.
_map_name = "Level2_final_map.json"
_summary_root = RESULTS_DIR / "summary_tables"
candidate_maps = [
    _summary_root / "conv_t_other_cleanup" / _map_name,
    _summary_root / _map_name,
    _summary_root / "Conv_T_other_cleanup" / _map_name,
]

# First existing candidate wins; None means the map is nowhere to be found.
MAP_PATH = next((candidate for candidate in candidate_maps if candidate.exists()), None)

if MAP_PATH is None:
    raise FileNotFoundError(
        "No encuentro Level2_final_map.json en ubicaciones esperadas dentro de results/summary_tables/.\n"
        "Probé:\n" + "\n".join([f"- {x}" for x in candidate_maps])
    )

level2_map = json.loads(MAP_PATH.read_text(encoding="utf-8"))

print("MAP_PATH:", MAP_PATH)
print("Level2_final_map.json loaded:")
print(level2_map)

### 3. Abrir OUT_FILTER en backed + checks mínimos

In [None]:
adata_b = sc.read_h5ad(OUT_FILTER, backed="r")
print("Loaded OUT_FILTER (backed):", adata_b)


def _release_backed_file():
    """Close the backed file handle, ignoring any error (best-effort cleanup)."""
    try:
        adata_b.file.close()
    except Exception:
        pass


# Minimal checks: the three annotation columns must be present in obs.
# Only the first missing column is reported.
_required_obs = (LEVEL2_KEY, LEVEL1_KEY, LEVEL1_REFINED_KEY)
_absent = [name for name in _required_obs if name not in adata_b.obs.columns]
if _absent:
    _release_backed_file()
    raise KeyError(f"Falta columna requerida en obs: '{_absent[0]}'")

# The expression layer used for plotting must exist as well.
if LAYER not in adata_b.layers.keys():
    _release_backed_file()
    raise KeyError(f"No existe layer '{LAYER}' en el objeto. layers={list(adata_b.layers.keys())}")

if "doublet_like" in adata_b.obs.columns:
    print("doublet_like True (debería ser 0):", int(adata_b.obs["doublet_like"].sum()))

# RBC-out sanity check (informational only; every count is expected to be 0).
rbc_l2, rbc_l1, rbc_l1r = (
    int((adata_b.obs[key].astype(str) == "RBC").sum())
    for key in (LEVEL2_KEY, LEVEL1_KEY, LEVEL1_REFINED_KEY)
)
print("RBC counts (Level2/Level1/Level1_refined) deberían ser 0:", (rbc_l2, rbc_l1, rbc_l1r))

### 4. Import markers + construir dict Level2 + overrides

In [None]:
from src import markers as mk

geneMarkers_level2 = getattr(mk, "geneMarkers_level2", {})
if not geneMarkers_level2:
    # Nothing to plot without marker panels; release the backed file first.
    try:
        adata_b.file.close()
    except Exception:
        pass
    raise RuntimeError("src/markers.py no expone geneMarkers_level2 (vacío/no definido).")

# Base mapping: Level2 label -> list of marker genes, flattened across the
# per-lineage sub-dicts. Non-dict entries and empty panels are skipped; when a
# label occurs under several lineages, the last one seen wins.
lvl2_to_symbols = {}
for lineage_panels in geneMarkers_level2.values():
    if isinstance(lineage_panels, dict):
        lvl2_to_symbols.update(
            {label: list(panel) for label, panel in lineage_panels.items() if panel}
        )

# Minimal two-gene fallbacks for problematic populations and for labels that
# only exist after applying Level2_final_map (RBC intentionally absent).
OVERRIDE_2MARKERS = {
    "B_Other":      ["MS4A1", "CD74"],
    "CD4_Memory":   ["IL7R", "CCR7"],     # key label after Conv_T_other -> CD4_Memory
    "ISG_Myeloid":  ["ISG15", "IFIT3"],
    "MonoDC_Other": ["LYZ", "FCER1G"],
    "DC3":          ["CD1C", "S100A8"],   # make sure DC3 has a panel
    "DC4":          ["FCGR3A", "LST1"],   # fallback
    "HSCs":         ["CD34", "KIT"],
    "Plasma":       ["MZB1", "JCHAIN"],
    "pDC":          ["IL3RA", "IRF7"],
}

# The fallback applies only when the label has no panel at all or fewer than
# two non-empty genes in its existing panel.
for label, fallback_pair in OVERRIDE_2MARKERS.items():
    usable = [gene for gene in lvl2_to_symbols.get(label, []) if gene]
    if len(usable) < 2:
        lvl2_to_symbols[label] = fallback_pair

print("Markers dict construido. N Level2 con panel:", len(lvl2_to_symbols))

### 5. Preparar obs + Level2_final + orden por bloques

In [None]:
obs = adata_b.obs.copy()

# Level2_final = Level2 with the JSON remapping applied, stored as categorical.
_remapped = obs[LEVEL2_KEY].astype(str).replace(level2_map).astype(str)
obs["Level2_final"] = pd.Categorical(_remapped)

_final_labels = obs["Level2_final"].astype(str)
print("[CHECK] Conv_T_other remaining en Level2_final (debe ser 0):",
      int((_final_labels == "Conv_T_other").sum()))
print("[CHECK] CD4_Memory count en Level2_final:",
      int((_final_labels == "CD4_Memory").sum()))

# Display order, grouped in blocks (no RBC), expressed in Level2_final labels.
order_by_group = {
    "B":     ["B_Naive", "B_Memory", "B_Activated", "B_Atypical", "B_Other"],
    "Plasma":["Plasma"],
    "pDC":   ["pDC"],
    "T":     ["CD4_Naive","CD4_Memory","CD8_Naive","CD8_Effector_Cytotoxic","Treg","MAIT","GammaDelta_T","Proliferative_T","Exhausted_T"],
    "NK":    ["NK"],
    "Mono":  ["Classical_Mono","NonClassical_Mono","ISG_Myeloid","MonoDC_Other"],
    "DC":    ["cDC1","cDC2","DC3","DC4","aDC"],  # DC3 included
    "HSCs":  ["HSCs"],
}

present_l2 = sorted(set(_final_labels.dropna().unique()))

# Keep the curated order for labels that are actually present in the data ...
level2_order = [
    label
    for members in order_by_group.values()
    for label in members
    if label in present_l2
]

# ... then append any label not covered by the blocks, alphabetically.
extras = [x for x in present_l2 if x not in level2_order]
level2_order = level2_order + sorted(extras)

def group_of_l2(l2: str) -> str:
    """Return the display block that lists ``l2`` in ``order_by_group``.

    The first block (in dict order) whose member list contains the label is
    returned; a label listed in no block yields ``"Other"``.
    """
    return next(
        (block for block, members in order_by_group.items() if l2 in members),
        "Other",
    )

# Final plot label: "Group | Level2_final". Each label is computed once per
# Level2 value and then looked up per cell.
_plot_label_of = {l2: f"{group_of_l2(l2)} | {l2}" for l2 in level2_order}
level2_plot_order = [_plot_label_of[l2] for l2 in level2_order]

# Values without an entry in the lookup become NaN, which is also what the
# ordered categorical yields for any label outside its category list.
obs["Level2_plot"] = pd.Categorical(
    obs["Level2_final"].astype(str).map(_plot_label_of),
    categories=level2_plot_order,
    ordered=True,
)

print("[CHECK] Level2_final presentes:", len(present_l2))
print("[CHECK] Primeros 30:", present_l2[:30])

### 6. Lista final de genes + symbols -> varnames

In [None]:
# Final gene list: at most the first two markers per Level2_final label, in
# plot order, de-duplicated while keeping the first occurrence of each symbol.
gene_symbols = list(
    dict.fromkeys(
        symbol
        for l2 in level2_order
        for symbol in lvl2_to_symbols.get(l2, [])[:2]
    )
)

# Translate symbols into var_names with the repo helper.
# NOTE(review): the code below assumes the helper returns one entry per input
# symbol, with None for symbols it cannot resolve -- confirm in src/markers.py.
_resolved = mk.symbols_to_varnames(adata_b, gene_symbols)

missing = [symbol for symbol, varname in zip(gene_symbols, _resolved) if varname is None]
gene_varnames = [varname for varname in _resolved if varname is not None]

print("Markers symbols total:", len(gene_symbols))
print("Markers genes found :", len(gene_varnames))
if missing:
    print("[WARN] Símbolos no encontrados (omitidos):", missing)

if not gene_varnames:
    # Nothing to plot; release the backed file before aborting.
    try:
        adata_b.file.close()
    except Exception:
        pass
    raise RuntimeError("No se encontró ningún gen marcador en adata.var_names. Revisa var_names / var['symbol'].")

### 7. Cargar SOLO esos genes a RAM + dotplot FINAL (figura en figures/dotplots/)

In [None]:
# Load ONLY the dotplot genes into RAM (the full matrix stays on disk).
adata_plot = adata_b[:, gene_varnames].to_memory()

# The backed handle is no longer needed. Closing is best-effort: a failure
# here must not abort the plot.
try:
    adata_b.file.close()
except Exception:
    pass

# Attach the ordered "Group | Level2_final" label to the small object. The
# values are aligned by cell name and the category order is re-applied so the
# dotplot rows follow level2_plot_order.
adata_plot.obs["Level2_plot"] = obs.loc[adata_plot.obs_names, "Level2_plot"].values
adata_plot.obs["Level2_plot"] = pd.Categorical(
    adata_plot.obs["Level2_plot"], categories=level2_plot_order, ordered=True
)

sc.settings.autoshow = False

# return_fig=True is required: without it, show=False makes scanpy return a
# dict of matplotlib axes, which has no add_totals()/style()/savefig().
dp = sc.pl.dotplot(
    adata_plot,
    var_names=gene_varnames,
    groupby="Level2_plot",
    layer=LAYER,
    use_raw=False,
    dendrogram=False,
    standard_scale="var",
    show=False,
    return_fig=True,
)

# Add the per-group cell totals and outline every dot.
dp = dp.add_totals().style(dot_edge_color="black", dot_edge_lw=0.5)

out_png = FIG_DIR / "Fig1C_Dotplot_Global_Level2_clean_FINAL.png"
dp.savefig(out_png, dpi=300)
print("Saved:", out_png)

### 8. Guardar tablas auxiliares: totales + marcadores usados

In [None]:
# Table 1: number of cells per plotted group, in plot order.
_cell_counts = obs["Level2_plot"].value_counts()
totals = _cell_counts.reindex(level2_plot_order).dropna().astype(int).reset_index()
totals.columns = ["Level2_plot", "n_cells"]

# Split "Group | Level2_final" back into its two components (split once,
# reuse the pieces for both columns).
_label_parts = totals["Level2_plot"].str.split(" \\| ")
totals["group"] = _label_parts.str[0]
totals["Level2_final"] = _label_parts.str[1]

totals_path = OUT_SUMMARY / "QA_dotplot_totals_by_Level2_plot_noRBC.csv"
totals.to_csv(totals_path, index=False)
print("Saved:", totals_path)

# Table 2: the (up to) two markers used for each Level2_final label; a missing
# marker is recorded as None.
marker_rows = []
for l2 in level2_order:
    first_marker, second_marker = (list(lvl2_to_symbols.get(l2, [])) + [None, None])[:2]
    block = group_of_l2(l2)
    marker_rows.append(
        {
            "Level2_final": l2,
            "group": block,
            "marker1": first_marker,
            "marker2": second_marker,
            "Level2_plot": f"{block} | {l2}",
        }
    )

markers_df = pd.DataFrame(marker_rows)

markers_path = OUT_SUMMARY / "QA_dotplot_markers_2perLevel2_noRBC.csv"
markers_df.to_csv(markers_path, index=False)
print("Saved:", markers_path)

print("[OK] Dotplot global + tablas QA guardadas.")

### 9. QA “Dotplot en números”: mean_log1p + frac_nonzero por Level2_final

In [None]:
import scipy.sparse as sp

# Genes used = the dotplot genes, restricted (as a safety net) to those that
# really exist in the in-memory object.
genes_used = [g for g in gene_varnames if g in adata_plot.var_names]
if not genes_used:
    raise RuntimeError("genes_used quedó vacío. Revisa gene_varnames vs var_names.")

# Rebuild Level2_final on the small object, aligned by cell name with the
# in-RAM obs table.
lvl2_final_small = obs.loc[adata_plot.obs_names, "Level2_final"].astype(str).values
adata_plot.obs["Level2_final"] = lvl2_final_small

# Expression matrix: the configured layer when present, otherwise X.
X = adata_plot.layers[LAYER] if LAYER in adata_plot.layers.keys() else adata_plot.X
if sp.issparse(X):
    X = X.tocsr()

# Select the matrix columns by gene NAME so that column j always belongs to
# genes_used[j]. Indexing by position alone would mislabel every statistic as
# soon as the safety filter above dropped or reordered a gene.
if list(adata_plot.var_names) != genes_used:
    var_position = {name: pos for pos, name in enumerate(adata_plot.var_names)}
    X = X[:, np.array([var_position[g] for g in genes_used], dtype=int)]

groups = pd.Series(adata_plot.obs["Level2_final"].astype(str).values, index=adata_plot.obs_names)
uniq = sorted(groups.unique())

rows = []
for g in uniq:
    # Row positions of the cells in this group (never empty: g comes from
    # the observed labels).
    idx = np.where(groups.values == g)[0]
    Xg = X[idx, :]

    if sp.issparse(Xg):
        # Sparse reductions return matrices; flatten to 1-D per-gene vectors.
        mean = np.asarray(Xg.mean(axis=0)).ravel()
        nnz = np.asarray((Xg > 0).mean(axis=0)).ravel()
    else:
        Xg = np.asarray(Xg)
        mean = np.mean(Xg, axis=0)
        nnz = np.mean((Xg > 0), axis=0)

    # One wide row per group: mean expression and fraction of cells with a
    # value > 0, for every gene.
    row = {"Level2_final": g, "n_cells": int(idx.size)}
    for j, gene in enumerate(genes_used):
        row[f"{gene}__mean_log1p"] = float(mean[j])
        row[f"{gene}__frac_nonzero"] = float(nnz[j])
    rows.append(row)

# Largest populations first.
df_num = pd.DataFrame(rows).sort_values("n_cells", ascending=False)

out_path = OUT_SUMMARY / "QA_dotplot_numeric_matrix_Level2final.csv"
df_num.to_csv(out_path, index=False)

print("Saved:", out_path)
print("Shape:", df_num.shape)
print(df_num.head(8).to_string(index=False))