In [None]:
CELDA 1 — Imports + rutas del repo + outputs

In [None]:
from pathlib import Path
import json
import re
from typing import Optional, List, Dict, Tuple

import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

# stats
try:
    from scipy.stats import ttest_ind
    HAVE_SCIPY = True
except Exception:
    HAVE_SCIPY = False

from src.paths import project_paths

# Resolve project directories starting from the current working directory.
# project_paths comes from the project-local src.paths module; it is indexed below by
# the keys "PROJECT_ROOT", "RESULTS_DIR" and "FIGURES_DIR".
P = project_paths(Path.cwd())
PROJECT_ROOT = P["PROJECT_ROOT"]
RESULTS_DIR  = P["RESULTS_DIR"]
FIGURES_DIR  = P["FIGURES_DIR"]

# Output locations for this notebook: tables under results, plots under figures.
OUT_SUMMARY = RESULTS_DIR / "summary_tables" / "pseudobulk_de"
OUT_FIG     = FIGURES_DIR / "pseudobulk_de"
# parents=True / exist_ok=True make re-running this cell safe on an existing tree.
OUT_SUMMARY.mkdir(parents=True, exist_ok=True)
OUT_FIG.mkdir(parents=True, exist_ok=True)

# Echo the resolved configuration so the run log records where outputs go.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("RESULTS_DIR :", RESULTS_DIR)
print("FIGURES_DIR :", FIGURES_DIR)
print("OUT_SUMMARY :", OUT_SUMMARY)
print("OUT_FIG     :", OUT_FIG)
# When this prints False, the DE cell falls back to p-values of 1.0 for every gene.
print("SciPy ttest :", HAVE_SCIPY)


In [None]:
CELDA 2 — Localizar input h5ad + Level2_final_map.json (robusto) + parámetros

In [None]:
# -----------------------------
# Inputs (tolerant of legacy locations)
# -----------------------------
# The first existing candidate wins; current locations are listed before the legacy one.
IN_PATH_CANDIDATES = [
    RESULTS_DIR / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",
    RESULTS_DIR / "main" / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",
    PROJECT_ROOT / "data_processed" / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",  # legacy
]
IN_PATH = next((p for p in IN_PATH_CANDIDATES if p.exists()), None)
print("IN_PATH:", IN_PATH)
if IN_PATH is None:
    raise FileNotFoundError("No encuentro el .h5ad filtrado. Probé:\n" + "\n".join([f"- {x}" for x in IN_PATH_CANDIDATES]))

# NOTE(review): unlike IN_PATH_CANDIDATES, a path marked legacy is listed FIRST here, so it
# takes precedence over RESULTS_DIR/summary_tables when both files exist — confirm intended.
MAP_PATH_CANDIDATES = [
    RESULTS_DIR / "summary_tables_final" / "Level2_final_map.json",      # legacy
    RESULTS_DIR / "summary_tables" / "Level2_final_map.json",
    PROJECT_ROOT / "summary_tables_final" / "Level2_final_map.json",     # legacy (outside results)
]
# MAP_PATH may be None here; the missing-map error is raised later, and only when
# ANALYSIS_LEVEL == "Level2_final".
MAP_PATH = next((p for p in MAP_PATH_CANDIDATES if p.exists()), None)
print("MAP_PATH:", MAP_PATH)

# -----------------------------
# Parameters
# -----------------------------
# Name of the AnnData layer that is averaged per patient to build the pseudobulk.
LAYER = "log1p_10k"

# Mode:
ANALYSIS_LEVEL = "Level2_final"   # "Level2_final" (recommended) or "Level1_refined"

# Disease labels (must exist verbatim in obs['disease'])
CONTROL_LABEL = "Healthy"
CASE_LABEL    = "Cirrhosis"

# Targets to run (when ANALYSIS_LEVEL == "Level2_final")
LEVEL2_FINAL_TO_RUN = [
    # B / Plasma
    "B_Naive", "B_Memory", "B_Activated", "B_Atypical", "Plasma",
    # T
    "CD4_Naive", "CD4_Memory", "CD8_Naive", "CD8_Effector_Cytotoxic", "Treg", "MAIT", "GammaDelta_T",
    # NK
    "NK",
    # Mono / DC
    "Classical_Mono", "NonClassical_Mono", "ISG_Myeloid", "MonoDC_Other",
    "cDC1", "cDC2", "DC3", "DC4", "aDC",
    # pDC / HSCs (if present and wanted)
    "pDC", "HSCs",
]

# Legacy targets (when ANALYSIS_LEVEL == "Level1_refined")
LEVEL1_TO_RUN = ["T", "Mono", "NK", "B", "DC"]

# Genes
GENE_MODE = "HVG"     # "HVG" recommended
MAX_GENES = 2000      # upper bound on the number of genes tested
GENE_CHUNK = 200      # genes read per slice of the backed matrix

# Statistics / thresholds
PSEUDOCOUNT = 1e-6    # added to both group means before taking the log2 ratio
ALPHA_FDR = 0.05

# Per-target patient filtering
MIN_CELLS_PER_PATIENT = 20     # a patient needs at least this many cells of the target
MIN_PATIENTS_PER_GROUP = 4     # minimum patients in each disease group to run DE
MIN_PATIENTS_PER_TARGET = 6    # minimum retained patients overall to build the pseudobulk

# Robust RBC exclusion (applied on both annotation levels)
EXCLUDE_LEVEL1REFINED = {"RBC"}
EXCLUDE_LEVEL2 = {"RBC"}

# Volcano “FOR_FIGURE” highlight filters
MIN_ABS_LOG2FC = 0.5
MIN_MEAN_LOG1P = 0.20
MIN_FRAC_PATIENTS = 0.50
TOP_N_LABEL = 20

# Optional blacklist for lymphoid targets (applied together with the highlight filters)
AMBIENT_MYELOID = {"S100A8","S100A9","S100A12","LYZ","LST1","TREM1","CXCL8"}
APPLY_BLACKLIST_FOR_GROUPS = {"T", "NK", "B"}  # lymphoid

# Column name of the fold change in every DE table. It is a fixed string, so it does NOT
# follow CONTROL_LABEL / CASE_LABEL if those are changed.
FC_COL = "log2FC_Healthy_vs_Cirrhosis"


In [None]:
CELDA 3 — Helpers (FDR, tags, genes, volcán, grupos Level2)

In [None]:
def bh_fdr(pvals: np.ndarray) -> np.ndarray:
    """Benjamini-Hochberg adjusted p-values (q-values), returned in input order.

    Parameters
    ----------
    pvals : array-like of float
        Raw p-values. NaN entries are treated as "not tested": they are excluded
        from the number of hypotheses and stay NaN in the output.

    Returns
    -------
    np.ndarray
        Float array with the same shape as `pvals`, holding q-values clipped to [0, 1].
    """
    p_in = np.asarray(pvals, dtype=float)
    p = p_in.ravel()
    out = np.full(p.shape, np.nan, dtype=float)

    # A NaN would sort last and then propagate through minimum.accumulate,
    # turning every q-value into NaN, so adjust the non-NaN entries only.
    tested = ~np.isnan(p)
    n = int(tested.sum())
    if n == 0:
        return out.reshape(p_in.shape)

    p_tested = p[tested]
    order = np.argsort(p_tested)
    ranked = p_tested[order]
    q = ranked * n / np.arange(1, n + 1)
    # Enforce monotonicity from the largest p-value downwards.
    q = np.minimum.accumulate(q[::-1])[::-1]

    adjusted = np.empty(n, dtype=float)
    adjusted[order] = np.clip(q, 0, 1)
    out[tested] = adjusted
    return out.reshape(p_in.shape)

def safe_tag(name: str) -> str:
    """Turn an arbitrary label into a filename-safe tag.

    The label is stringified and stripped, spaces become underscores, every run of
    characters outside ``A-Z a-z 0-9 _ - .`` becomes a single underscore, and
    consecutive underscores are collapsed into one.
    """
    underscored = str(name).strip().replace(" ", "_")
    sanitized = re.sub(r"[^A-Za-z0-9_\-\.]+", "_", underscored)
    return re.sub(r"_+", "_", sanitized)

def get_gene_list(adata_b) -> List[str]:
    """Choose the genes to test, capped at MAX_GENES.

    When GENE_MODE is "HVG" and ``adata_b.var`` has a ``highly_variable`` column, the
    flagged genes are returned in ``var_names`` order, truncated to MAX_GENES. If the
    column exists but flags nothing, a warning is printed and the first MAX_GENES
    ``var_names`` are used. In every other case the first MAX_GENES ``var_names`` are
    returned.
    """
    wants_hvg = GENE_MODE == "HVG" and "highly_variable" in adata_b.var.columns
    if not wants_hvg:
        return adata_b.var_names[:MAX_GENES].tolist()

    hvg_flags = adata_b.var["highly_variable"].values.astype(bool)
    hvg_names = adata_b.var_names[hvg_flags].tolist()
    if len(hvg_names) != 0:
        return hvg_names[:MAX_GENES]

    # Column present but nothing flagged: fall back to the leading genes.
    print("[WARN] highly_variable existe pero vacío; uso primeras MAX_GENES.")
    return adata_b.var_names[:MAX_GENES].tolist()

def ensure_fc_col(df: pd.DataFrame, where: str = "") -> pd.DataFrame:
    """Return `df` with the fold-change column named FC_COL.

    If FC_COL is already a column, `df` is returned untouched. Otherwise columns whose
    lower-cased name contains "log2fc", "logfc", or both "fold" and "log" are considered
    candidates; when exactly one exists it is renamed to FC_COL (on a new frame) and a
    warning is printed.

    Parameters
    ----------
    df : table expected to hold a log fold-change column.
    where : free-text context included in the warning / error message.

    Raises
    ------
    KeyError
        If FC_COL is absent and there is not exactly one candidate column.
    """
    if FC_COL in df.columns:
        return df

    def _looks_like_fold_change(col) -> bool:
        lowered = str(col).lower()
        return "log2fc" in lowered or "logfc" in lowered or ("fold" in lowered and "log" in lowered)

    matches = [col for col in df.columns if _looks_like_fold_change(col)]
    if len(matches) != 1:
        # Zero or several candidates: refuse to guess.
        raise KeyError(f"[{where}] No encuentro '{FC_COL}'. Columnas={df.columns.tolist()}")

    only_match = matches[0]
    print(f"[WARN] {where}: renombrando columna '{only_match}' -> '{FC_COL}'")
    return df.rename(columns={only_match: FC_COL})

def volcano_plot_base(df_de: pd.DataFrame, out_png: Path, title: str, alpha_fdr: float = 0.05):
    """Save a plain volcano plot (log2FC vs -log10 FDR) for one DE table.

    Parameters
    ----------
    df_de : DE table with an "FDR" column and a fold-change column (normalised to FC_COL
        through ensure_fc_col).
    out_png : destination image path; written at 300 dpi.
    title : axes title.
    alpha_fdr : FDR level drawn as a dashed horizontal line.
    """
    df_de = ensure_fc_col(df_de, where=f"volcano_plot_base({out_png.name})")
    log2fc = df_de[FC_COL].to_numpy(dtype=float)
    fdr = df_de["FDR"].to_numpy(dtype=float)
    # Clip so that FDR == 0 maps to a finite height instead of +inf.
    neg_log10_fdr = -np.log10(np.clip(fdr, 1e-300, 1.0))

    fig, ax = plt.subplots(figsize=(7.5, 5.5))
    ax.scatter(log2fc, neg_log10_fdr, s=10, alpha=0.8)
    ax.axhline(-np.log10(alpha_fdr), linestyle="--", linewidth=1)
    ax.axvline(0, linestyle=":", linewidth=1)
    ax.set(
        xlabel="log2FC (Healthy vs Cirrhosis) [pseudobulk per patient]",
        ylabel="-log10(FDR)",
        title=title,
    )
    fig.tight_layout()
    fig.savefig(out_png, dpi=300)
    # Release the figure; this function is called once per target in a loop.
    plt.close(fig)

def volcano_for_figure(
    df_all: pd.DataFrame,
    out_png: Path,
    title: str,
    alpha_fdr: float = 0.05,
    highlight_mask: Optional[np.ndarray] = None,
    genes_to_label: Optional[List[str]] = None,
    abs_log2fc_line: Optional[float] = None,
):
    """Save a publication-style volcano plot showing every gene of a DE table.

    Genes are coloured by significance and direction, optionally ringed when selected by
    `highlight_mask`, and optionally annotated by name.

    Parameters
    ----------
    df_all : DE table with "gene" and "FDR" columns and a fold-change column
        (normalised to FC_COL through ensure_fc_col).
    out_png : destination image path; written at 300 dpi.
    title : axes title.
    alpha_fdr : FDR threshold used for colouring and for the dashed horizontal line.
    highlight_mask : optional boolean array aligned row-by-row with `df_all`; selected
        points with finite coordinates get a black outline.
    genes_to_label : optional gene names to annotate (matched as strings on "gene").
    abs_log2fc_line : if given and > 0, dashed vertical guides are drawn at +/- this value.
    """
    df_all = ensure_fc_col(df_all, where=f"volcano_for_figure({out_png.name})")
    x = df_all[FC_COL].to_numpy(dtype=float)
    q = df_all["FDR"].to_numpy(dtype=float)
    # Clip so that FDR == 0 maps to a finite height instead of +inf.
    y = -np.log10(np.clip(q, 1e-300, 1.0))

    # Positive log2FC means higher in Healthy (FC is Healthy over Cirrhosis).
    sig = q < alpha_fdr
    up = sig & (x > 0)
    down = sig & (x < 0)
    ns = ~sig

    cycle = plt.rcParams["axes.prop_cycle"].by_key().get("color", ["C0","C1","C2","C3"])
    col_down = cycle[0 % len(cycle)]
    col_up   = cycle[1 % len(cycle)]
    col_ns   = "0.75"

    fig, ax = plt.subplots(figsize=(7.8, 5.8))
    ax.scatter(x[ns], y[ns], s=10, alpha=0.6, c=col_ns, edgecolors="none", label="Not significant")
    ax.scatter(x[down], y[down], s=12, alpha=0.85, c=col_down, edgecolors="none",
               label=f"FDR<{alpha_fdr} (Higher in Cirrhosis)")
    ax.scatter(x[up], y[up], s=12, alpha=0.85, c=col_up, edgecolors="none",
               label=f"FDR<{alpha_fdr} (Higher in Healthy)")

    if highlight_mask is not None:
        hm = np.asarray(highlight_mask, dtype=bool)
        # Rows with NaN fold change or FDR cannot be drawn; drop them from the overlay.
        hm = hm & np.isfinite(x) & np.isfinite(y)
        ax.scatter(x[hm], y[hm], s=26, alpha=0.95, facecolors="none", edgecolors="k", linewidths=0.7,
                   label="Highlighted (filters)")

    ax.axhline(-np.log10(alpha_fdr), linestyle="--", linewidth=1)
    ax.axvline(0, linestyle=":", linewidth=1)
    if abs_log2fc_line is not None and abs_log2fc_line > 0:
        ax.axvline(+abs_log2fc_line, linestyle="--", linewidth=0.8)
        ax.axvline(-abs_log2fc_line, linestyle="--", linewidth=0.8)

    ax.set_xlabel("log2FC (Healthy vs Cirrhosis) [pseudobulk per patient]")
    ax.set_ylabel("-log10(FDR)")
    ax.set_title(title)
    ax.legend(frameon=False, fontsize=8, loc="upper right")

    if genes_to_label:
        df_lab = df_all[df_all["gene"].astype(str).isin([str(g) for g in genes_to_label])].copy()
        df_lab = df_lab.sort_values("FDR", ascending=True)
        # Cycle through four label positions around the point to reduce label overlap.
        offsets = [(6, 6), (6, -10), (-18, 6), (-18, -10)]
        for k, (_, r) in enumerate(df_lab.iterrows()):
            gx = float(r[FC_COL])
            gy = -np.log10(max(float(r["FDR"]), 1e-300))
            ox, oy = offsets[k % len(offsets)]
            ax.annotate(
                str(r["gene"]), (gx, gy),
                textcoords="offset points", xytext=(ox, oy),
                # Anchor the text on the side facing away from the point: a left-offset
                # label that is left-aligned would run back over its own marker.
                ha="left" if ox >= 0 else "right", fontsize=8,
                arrowprops=dict(arrowstyle="-", lw=0.4, alpha=0.6),
            )

    fig.tight_layout()
    fig.savefig(out_png, dpi=300)
    # Release the figure; this function is called once per target in a loop.
    plt.close(fig)

# Lineage groups of the Level2_final labels (used to decide whether the ambient-myeloid
# blacklist applies to a target's highlights/labels).
order_by_group = {
    "B": ["B_Naive", "B_Memory", "B_Activated", "B_Atypical", "B_Other"],
    "Plasma": ["Plasma"],
    "pDC": ["pDC"],
    "T": [
        "CD4_Naive", "CD4_Memory", "CD8_Naive", "CD8_Effector_Cytotoxic",
        "Treg", "MAIT", "GammaDelta_T", "Proliferative_T", "Exhausted_T",
    ],
    "NK": ["NK"],
    "Mono": ["Classical_Mono", "NonClassical_Mono", "ISG_Myeloid", "MonoDC_Other"],
    "DC": ["cDC1", "cDC2", "DC3", "DC4", "aDC"],
    "HSCs": ["HSCs"],
}

def group_of_l2(l2: str) -> str:
    """Return the lineage group whose member list contains `l2`, or "Other" if none does.

    Groups are scanned in the insertion order of `order_by_group`; the first match wins.
    """
    return next(
        (group for group, members in order_by_group.items() if l2 in members),
        "Other",
    )


In [None]:
CELDA 4 — Abrir objeto backed + preparar obs + Level2_final + seleccionar targets/genes

In [None]:
# 0) Load the Level2 -> Level2_final relabelling map (only needed in Level2_final mode)
level2_map = {}
if ANALYSIS_LEVEL == "Level2_final":
    if MAP_PATH is None:
        raise FileNotFoundError(
            "ANALYSIS_LEVEL=Level2_final pero no encuentro Level2_final_map.json.\nProbé:\n"
            + "\n".join([f"- {x}" for x in MAP_PATH_CANDIDATES])
        )
    with open(MAP_PATH, "r", encoding="utf-8") as f:
        level2_map = json.load(f)
    print("Loaded Level2_final_map:", level2_map)

# 1) Open the h5ad in backed (read-only) mode; the handle stays open for the DE cell.
adata_b = sc.read_h5ad(IN_PATH, backed="r")
print("Loaded:", adata_b)

required_obs = ["patientID", "disease", "Level1_refined"]
if ANALYSIS_LEVEL == "Level2_final":
    required_obs.append("Level2")

# Validate inputs up front, closing the backed file before raising so no handle leaks.
for col in required_obs:
    if col not in adata_b.obs.columns:
        adata_b.file.close()
        raise KeyError(f"Falta columna en obs: {col}")

if LAYER not in adata_b.layers.keys():
    adata_b.file.close()
    raise KeyError(f"No existe layer '{LAYER}' en adata.layers. Layers: {list(adata_b.layers.keys())}")

# Work on an in-memory copy of just the needed obs columns.
obs = adata_b.obs[required_obs].copy()
obs["patientID"] = obs["patientID"].astype(str)
obs["disease"]   = obs["disease"].astype(str)
obs["Level1_refined"] = obs["Level1_refined"].astype(str)

if ANALYSIS_LEVEL == "Level2_final":
    # Do NOT cast to str before mapping: that would turn NaN into the string "nan".
    l2_obj = obs["Level2"].astype("object")
    obs["Level2_final"] = l2_obj.replace(level2_map).astype("object")

# Robust RBC exclusion: drop cells flagged as RBC on either annotation level.
valid_mask_global = ~obs["Level1_refined"].isin(EXCLUDE_LEVEL1REFINED)
if ANALYSIS_LEVEL == "Level2_final":
    valid_mask_global = valid_mask_global & (~obs["Level2_final"].astype(str).isin(EXCLUDE_LEVEL2))

# Patient-level metadata: one row per patient, indexed by patientID.
patient_meta = (
    obs.loc[valid_mask_global, ["patientID", "disease"]]
       .drop_duplicates(["patientID", "disease"])
       .set_index("patientID")
)

# A patient with more than one disease label would produce a duplicated index, and
# patient_meta.loc[pid, "disease"] would then return a Series instead of a single label
# when the pseudobulk is built. Fail here with an explicit message instead.
ambiguous_patients = patient_meta.index[patient_meta.index.duplicated()].unique().tolist()
if ambiguous_patients:
    adata_b.file.close()
    raise ValueError(
        "Pacientes con más de una etiqueta en obs['disease'] (se esperaba una por paciente): "
        + ", ".join(map(str, ambiguous_patients))
    )

patients = patient_meta.index.tolist()

print("\nPatients:", len(patients))
print(patient_meta["disease"].value_counts())

# Genes to test
genes = get_gene_list(adata_b)
print("\nGenes used:", len(genes), f"(mode={GENE_MODE}, MAX_GENES={MAX_GENES})")

# Targets: requested labels restricted to those actually present among valid cells
if ANALYSIS_LEVEL == "Level2_final":
    target_col = "Level2_final"
    targets = list(LEVEL2_FINAL_TO_RUN)
else:
    target_col = "Level1_refined"
    targets = list(LEVEL1_TO_RUN)

present_targets = set(obs.loc[valid_mask_global, target_col].dropna().astype(str).unique().tolist())
missing_targets = [t for t in targets if t not in present_targets]
targets = [t for t in targets if t in present_targets]

print("\nANALYSIS_LEVEL:", ANALYSIS_LEVEL)
print("target_col    :", target_col)
print("Targets presentes:", len(targets))
if missing_targets:
    print("[INFO] Targets no presentes (omitidos):", missing_targets)

# Sanity check on disease labels. This only warns: with mismatched labels every target
# is later skipped by the per-group patient count check.
disease_levels = sorted(patient_meta["disease"].unique().tolist())
print("\nDisease levels:", disease_levels)
if CONTROL_LABEL not in disease_levels or CASE_LABEL not in disease_levels:
    print("[WARN] CONTROL_LABEL/CASE_LABEL no encontrados tal cual en disease_levels.")
    print("       Ajusta CONTROL_LABEL y CASE_LABEL en parámetros si hace falta.")


In [None]:
CELDA 5 — Pseudobulk + DE + volcano base (por target)

In [None]:
# One (target, n_patients_kept, status) tuple per processed target.
results_summary: List[Tuple[str, int, str]] = []

# Loop-invariant: per-cell patient IDs as a plain object array.
cell_patient_ids = obs["patientID"].to_numpy(dtype=object)

# The backed file is closed in `finally`, so the handle is released even when a target
# fails midway (previously it was only closed after a fully successful loop).
try:
    for target in targets:
        print("\n==============================")
        print(f"{ANALYSIS_LEVEL} target:", target)

        mask_target = valid_mask_global & (obs[target_col].astype(str).values == str(target))

        # Positions of this target's cells, computed once; each patient then only scans
        # the target's cells instead of the whole dataset.
        target_cell_idx = np.flatnonzero(mask_target.values)
        target_cell_pids = cell_patient_ids[target_cell_idx]

        # Cell indices per patient (ascending, as required for slicing the backed matrix).
        idx_by_patient: Dict[str, np.ndarray] = {}
        n_cells_by_patient: Dict[str, int] = {}
        for pid in patients:
            idx = target_cell_idx[target_cell_pids == pid]
            idx_by_patient[pid] = idx
            n_cells_by_patient[pid] = int(idx.size)

        keep_pids = [pid for pid, n in n_cells_by_patient.items() if n >= MIN_CELLS_PER_PATIENT]
        print(f"Pacientes con >={MIN_CELLS_PER_PATIENT} células:", len(keep_pids), "/", len(patients))

        if len(keep_pids) < MIN_PATIENTS_PER_TARGET:
            print("[SKIP] Muy pocos pacientes con suficientes células para este target.")
            results_summary.append((str(target), len(keep_pids), "SKIP_low_n"))
            continue

        # Build the pseudobulk: mean of the LAYER values per patient, read in gene chunks
        # to bound memory use.
        pb = np.full((len(keep_pids), len(genes)), np.nan, dtype=np.float32)
        diseases: List[str] = []

        for i, pid in enumerate(keep_pids):
            idx_cells = idx_by_patient[pid]
            n = idx_cells.size
            diseases.append(patient_meta.loc[pid, "disease"])

            sums = np.zeros(len(genes), dtype=np.float64)

            for start in range(0, len(genes), GENE_CHUNK):
                gchunk = genes[start:start + GENE_CHUNK]
                view = adata_b[idx_cells, gchunk]
                X = view.layers[LAYER]
                # np.asarray(...).ravel() flattens the column sums to 1-D
                # (a sparse matrix sum returns a 2-D np.matrix).
                chunk_sum = np.asarray(X.sum(axis=0)).ravel()
                sums[start:start + len(gchunk)] = chunk_sum

            pb[i, :] = (sums / float(n)).astype(np.float32)

        diseases = np.array(diseases, dtype=str)
        keep_pids = np.array(keep_pids, dtype=str)

        df_pb = pd.DataFrame(pb, index=keep_pids, columns=genes)
        df_pb.insert(0, "disease", diseases)

        tag = safe_tag(target)
        out_pb_csv = OUT_SUMMARY / f"pseudobulk_{ANALYSIS_LEVEL}_{tag}_mean_{LAYER}.csv"
        df_pb.to_csv(out_pb_csv)
        print("Saved pseudobulk:", out_pb_csv)

        # DE at patient level
        A_case = (df_pb["disease"].values == CASE_LABEL)      # Cirrhosis
        B_ctrl = (df_pb["disease"].values == CONTROL_LABEL)   # Healthy
        nA, nB = int(A_case.sum()), int(B_ctrl.sum())
        print(f"n patients {CASE_LABEL}:", nA, "|", CONTROL_LABEL, ":", nB)

        if nA < MIN_PATIENTS_PER_GROUP or nB < MIN_PATIENTS_PER_GROUP:
            print("[SKIP] Muy pocos pacientes por grupo para DE en este target.")
            results_summary.append((str(target), len(keep_pids), "SKIP_group_n"))
            continue

        # log2FC on an approximately linear scale: expm1(mean log1p) per patient
        X_lin = np.expm1(df_pb[genes].values.astype(np.float64))
        meanA = np.nanmean(X_lin[A_case, :], axis=0)  # CASE
        meanB = np.nanmean(X_lin[B_ctrl, :], axis=0)  # CONTROL
        log2fc = np.log2((meanB + PSEUDOCOUNT) / (meanA + PSEUDOCOUNT))  # Healthy vs Cirrhosis

        # p-values: Welch t-test on the per-patient log1p pseudobulk
        X_log = df_pb[genes].values.astype(np.float64)
        if HAVE_SCIPY:
            _, pvals = ttest_ind(X_log[B_ctrl, :], X_log[A_case, :], axis=0, equal_var=False, nan_policy="omit")
            # Undefined tests (e.g. zero variance in both groups) count as non-significant.
            pvals = np.nan_to_num(pvals, nan=1.0, posinf=1.0, neginf=1.0)
        else:
            # Without SciPy no test is run: every gene gets p = 1.
            pvals = np.ones_like(log2fc, dtype=float)

        fdr = bh_fdr(pvals)

        df_de = pd.DataFrame({
            "gene": genes,
            f"mean_lin_{CASE_LABEL}": meanA,
            f"mean_lin_{CONTROL_LABEL}": meanB,
            FC_COL: log2fc,
            "pval": pvals,
            "FDR": fdr,
        }).sort_values("FDR", ascending=True)

        out_de_csv = OUT_SUMMARY / f"DE_pseudobulk_{ANALYSIS_LEVEL}_{tag}_{CONTROL_LABEL}_vs_{CASE_LABEL}.csv"
        df_de.to_csv(out_de_csv, index=False)
        print("Saved DE:", out_de_csv)

        out_png = OUT_FIG / f"Volcano_pseudobulk_{ANALYSIS_LEVEL}_{tag}_{CONTROL_LABEL}_vs_{CASE_LABEL}.png"
        volcano_plot_base(
            df_de,
            out_png,
            title=f"{ANALYSIS_LEVEL}={target}: {CONTROL_LABEL} vs {CASE_LABEL} (pseudobulk, per patient)",
            alpha_fdr=ALPHA_FDR
        )
        print("Saved volcano (base):", out_png)

        n_sig = int((df_de["FDR"].values < ALPHA_FDR).sum())
        results_summary.append((str(target), len(keep_pids), f"OK_sigFDR<{ALPHA_FDR}:{n_sig}"))
finally:
    # Close the backed file on success and on failure alike.
    adata_b.file.close()

print("\n=== RESUMEN (base) ===")
for row in results_summary:
    print(row)

print("\n[OK] Pseudobulk + DE + volcano (base) terminado.")


In [None]:
CELDA 6 — Volcano “FOR_FIGURE” + CSV filtrado por target (all genes + highlights + labels)

In [None]:
# For every target, rebuild a "FOR_FIGURE" volcano from the CSVs written by the DE cell:
# all genes are plotted, genes passing the filters below are outlined, and the top genes
# per direction are labelled. The filtered table is also saved as a *_FOR_FIGURE.csv file.
print("\n[SEC6] Generando volcanos FOR_FIGURE (all genes + highlights + TOP labels) ...")

sec6_summary: List[Tuple[str, int, int]] = []          # (target, n_highlighted, n_labels)
targets_for_figure_generated: List[str] = []           # read by the top10 aggregation cell

for target in targets:
    tag = safe_tag(target)

    de_path = OUT_SUMMARY / f"DE_pseudobulk_{ANALYSIS_LEVEL}_{tag}_{CONTROL_LABEL}_vs_{CASE_LABEL}.csv"
    pb_path = OUT_SUMMARY / f"pseudobulk_{ANALYSIS_LEVEL}_{tag}_mean_{LAYER}.csv"

    # NOTE(review): only file existence is checked, so CSVs left over from an earlier run
    # are used even if this run skipped the target — confirm this is acceptable.
    if not de_path.exists() or not pb_path.exists():
        print("[SKIP] faltan archivos para", target)
        continue

    df_de = pd.read_csv(de_path)
    df_de = ensure_fc_col(df_de, where=str(de_path))

    df_pb = pd.read_csv(pb_path, index_col=0)  # patientID as index
    gene_cols = [c for c in df_pb.columns if c != "disease"]

    # Per-gene expression summaries across the patients in the pseudobulk table.
    X = df_pb[gene_cols].astype(float).values
    mean_log1p = X.mean(axis=0)
    frac_pat = (X > 0.10).mean(axis=0)  # fraction of patients with mean_log1p > 0.10

    # Left merge keeps the row order of df_de, so masks built on df_f align with it.
    df_f = df_de.merge(
        pd.DataFrame({"gene": gene_cols, "mean_log1p": mean_log1p, "frac_patients": frac_pat}),
        on="gene",
        how="left",
    )
    df_f = ensure_fc_col(df_f, where=f"merge(df_de, df_pb) target={target}")

    # Highlight filters: significant, sizeable effect, expressed, and seen in enough patients
    keep = (
        (df_f["FDR"] < ALPHA_FDR) &
        (df_f[FC_COL].abs() >= MIN_ABS_LOG2FC) &
        (df_f["mean_log1p"] >= MIN_MEAN_LOG1P) &
        (df_f["frac_patients"] >= MIN_FRAC_PATIENTS)
    )

    keep2 = keep.copy()

    # Blacklist for lymphoid targets, decided by lineage group. It removes genes from
    # keep2, which drives the highlights, the labels AND the FOR_FIGURE CSV below.
    if ANALYSIS_LEVEL == "Level1_refined":
        apply_blacklist = (str(target) in {"T", "NK", "B"})
    else:
        apply_blacklist = (group_of_l2(str(target)) in APPLY_BLACKLIST_FOR_GROUPS)

    if apply_blacklist:
        keep2 = keep2 & (~df_f["gene"].isin(AMBIENT_MYELOID))

    df_plot = df_f.loc[keep2].copy().sort_values("FDR", ascending=True)

    out_csv = OUT_SUMMARY / f"DE_pseudobulk_{ANALYSIS_LEVEL}_{tag}_{CONTROL_LABEL}_vs_{CASE_LABEL}_FOR_FIGURE.csv"
    df_plot.to_csv(out_csv, index=False)

    # Genes to label: balanced up/down by FDR (cand is built the same way as df_plot)
    cand = df_f.loc[keep2].copy().sort_values("FDR", ascending=True)
    n_each = TOP_N_LABEL // 2
    cand_up = cand[cand[FC_COL] > 0].head(n_each)
    cand_dn = cand[cand[FC_COL] < 0].head(n_each)

    genes_to_label = pd.concat([cand_up, cand_dn], axis=0)["gene"].astype(str).tolist()
    # If one direction had fewer than n_each genes, top up with the next best candidates.
    if len(genes_to_label) < TOP_N_LABEL:
        extra = cand[~cand["gene"].astype(str).isin(genes_to_label)].head(TOP_N_LABEL - len(genes_to_label))
        genes_to_label += extra["gene"].astype(str).tolist()

    out_png = OUT_FIG / f"Volcano_pseudobulk_{ANALYSIS_LEVEL}_{tag}_{CONTROL_LABEL}_vs_{CASE_LABEL}_FOR_FIGURE.png"
    volcano_for_figure(
        df_all=df_f,
        out_png=out_png,
        title=f"{ANALYSIS_LEVEL}={target}: {CONTROL_LABEL} vs {CASE_LABEL} (pseudobulk) — volcano (all genes)",
        alpha_fdr=ALPHA_FDR,
        highlight_mask=keep2.to_numpy(),
        genes_to_label=genes_to_label,
        abs_log2fc_line=MIN_ABS_LOG2FC,
    )

    n_high = int(np.sum(keep2.to_numpy()))
    sec6_summary.append((str(target), n_high, len(genes_to_label)))
    targets_for_figure_generated.append(str(target))

    print(f"{target}: highlights={n_high} | labels={len(genes_to_label)}")
    print("  Saved:", out_png)
    print("  Saved:", out_csv)

print("\n[OK] Volcanos 'FOR_FIGURE' generados.")
print("Targets generados (FOR_FIGURE):", len(targets_for_figure_generated))


In [None]:
CELDA 7 — Tablas agregadas top10 (FOR_FIGURE / FOR_REPORT / FOR_REPORT_LINFO_CLEAN)

In [None]:
print("\n[SEC7] Generando tablas top10 agregadas ...")

# Gene symbols excluded from the report tables regardless of prefix.
HOUSEKEEPING = {
    "GAPDH", "ACTB", "ACTG1", "B2M", "MALAT1", "EEF1A1",
    "RPLP0", "RPSA", "TMSB10", "FTH1", "FTL",
}

def is_bad_gene(g: str) -> bool:
    """Return True for genes excluded from the report tables.

    A gene is excluded when its symbol is in HOUSEKEEPING or starts with "MT-", "RPL"
    or "RPS". The prefix test is a plain ``startswith``, so any symbol beginning with
    those letters matches (e.g. "RPS6KA1").
    """
    symbol = str(g)
    return symbol in HOUSEKEEPING or symbol.startswith(("MT-", "RPL", "RPS"))

def _top_rows_by_direction(target: str, df_sorted: pd.DataFrame, n_top: int = 10) -> List[List[object]]:
    """Format the first `n_top` genes per direction of `df_sorted` as summary-table rows.

    `df_sorted` must already be ordered by ascending FDR. Rows with positive FC_COL
    ("Higher_in_Healthy") are emitted first, followed by rows with negative FC_COL
    ("Higher_in_Cirrhosis"). Each row is [target, direction, gene, log2FC, FDR].
    """
    rows: List[List[object]] = []
    for direction, in_direction in (
        ("Higher_in_Healthy", df_sorted[FC_COL] > 0),
        ("Higher_in_Cirrhosis", df_sorted[FC_COL] < 0),
    ):
        for _, r in df_sorted[in_direction].head(n_top).iterrows():
            rows.append([str(target), direction, r["gene"], float(r[FC_COL]), float(r["FDR"])])
    return rows


rows_fig: List[List[object]] = []         # top10 among the FOR_FIGURE genes
rows_rep: List[List[object]] = []         # same, minus housekeeping / ribosomal / MT genes
rows_rep_linfo: List[List[object]] = []   # same, additionally minus ambient-myeloid genes in lymphoid targets

# Prefer the targets for which the FOR_FIGURE cell actually produced output; fall back to
# all targets when that cell has not been run in this session.
targets_iter = list(targets_for_figure_generated) if "targets_for_figure_generated" in globals() else list(targets)

for target in targets_iter:
    tag = safe_tag(target)
    path = OUT_SUMMARY / f"DE_pseudobulk_{ANALYSIS_LEVEL}_{tag}_{CONTROL_LABEL}_vs_{CASE_LABEL}_FOR_FIGURE.csv"

    if not path.exists():
        continue

    df = pd.read_csv(path)
    if df.shape[0] == 0:
        continue

    df = ensure_fc_col(df, where=str(path))
    df = df.sort_values("FDR", ascending=True)
    df["gene"] = df["gene"].astype(str)

    # FOR_FIGURE: top10 per direction within the filtered genes
    rows_fig.extend(_top_rows_by_direction(target, df))

    # FOR_REPORT: drop housekeeping / ribosomal / mitochondrial genes
    df_rep0 = df.loc[~df["gene"].map(is_bad_gene)].copy()
    rows_rep.extend(_top_rows_by_direction(target, df_rep0))

    # FOR_REPORT_LINFO_CLEAN: additionally drop "ambient myeloid" genes in lymphoid targets
    if ANALYSIS_LEVEL == "Level1_refined":
        apply_blacklist = (str(target) in {"T", "NK", "B"})
    else:
        apply_blacklist = (group_of_l2(str(target)) in APPLY_BLACKLIST_FOR_GROUPS)

    # Named differently from the final aggregated table (df_rep_l) to avoid shadowing it.
    df_rep_linfo_target = df_rep0
    if apply_blacklist:
        df_rep_linfo_target = df_rep0[~df_rep0["gene"].isin(AMBIENT_MYELOID)].copy()
    rows_rep_linfo.extend(_top_rows_by_direction(target, df_rep_linfo_target))

# The first column is named after the analysis level and holds the target label.
summary_columns = [ANALYSIS_LEVEL, "direction", "gene", FC_COL, "FDR"]
df_fig = pd.DataFrame(rows_fig, columns=summary_columns)
df_rep = pd.DataFrame(rows_rep, columns=summary_columns)
df_rep_l = pd.DataFrame(rows_rep_linfo, columns=summary_columns)

out_fig = OUT_SUMMARY / f"DE_pseudobulk_{ANALYSIS_LEVEL}_{CONTROL_LABEL}_vs_{CASE_LABEL}_FOR_FIGURE_top10_by_target.csv"
out_rep = OUT_SUMMARY / f"DE_pseudobulk_{ANALYSIS_LEVEL}_{CONTROL_LABEL}_vs_{CASE_LABEL}_FOR_REPORT_top10_by_target.csv"
out_rep_l = OUT_SUMMARY / f"DE_pseudobulk_{ANALYSIS_LEVEL}_{CONTROL_LABEL}_vs_{CASE_LABEL}_FOR_REPORT_LINFO_CLEAN_top10_by_target.csv"

df_fig.to_csv(out_fig, index=False)
df_rep.to_csv(out_rep, index=False)
df_rep_l.to_csv(out_rep_l, index=False)

print("Saved:", out_fig)
print("Saved:", out_rep)
print("Saved:", out_rep_l)
print("\n[OK] Tablas top10 generadas.")
