# In [None]:
# NBXX – Limpieza final, Level2 completo y Level1_refined
# (MISMO NOTEBOOK que funcionaba, con el ÚNICO cambio de eliminar RBC de los outputs)
# + HOTFIX post-NB09: excluir RBC_and_HSC y mapear DC3 como DC en Level1_refined

# ============================================================
# 0) Imports y paths
# ============================================================
from pathlib import Path
import sys
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np

# Print the scanpy version in use.
print("Scanpy:", sc.__version__)

# Current working directory; it is the starting point for the project-root search below.
NOTEBOOK_DIR = Path.cwd()

def find_project_root(start: Path) -> Path:
    """Return the nearest directory, from *start* upwards, that holds ``data_processed/``.

    *start* is resolved to an absolute path before walking, so a relative
    argument such as ``Path(".")`` is searched all the way up to the
    filesystem root instead of stopping at the current working directory.
    Only a real directory named ``data_processed`` counts as a match; a
    regular file with that name is ignored.

    Args:
        start: Directory where the upward search begins (checked first).

    Returns:
        The absolute path of the first matching directory.

    Raises:
        FileNotFoundError: If neither *start* nor any of its ancestors
            contains a ``data_processed`` directory.
    """
    origin = Path(start).resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / "data_processed").is_dir():
            return candidate
    raise FileNotFoundError(
        "No pude localizar la raíz del proyecto (no encuentro carpeta 'data_processed' en parents). "
        f"Start={start}"
    )

# Locate the project layout starting from the notebook's working directory.
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
DATA_PROCESSED = PROJECT_ROOT / "data_processed"

# Input file, then the two output files written at the end of the notebook.
IN_PATH, OUT_FULL, OUT_FILTER = (
    DATA_PROCESSED / fname
    for fname in (
        "TFM_CIRRHOSIS_main_annotated.h5ad",
        "TFM_CIRRHOSIS_main_annotated_clean.h5ad",
        "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad",
    )
)

# Echo every resolved path so the run log shows which files are used.
for _label, _path in (
    ("NOTEBOOK_DIR :", NOTEBOOK_DIR),
    ("PROJECT_ROOT :", PROJECT_ROOT),
    ("IN_PATH      :", IN_PATH),
    ("OUT_FULL     :", OUT_FULL),
    ("OUT_FILTER   :", OUT_FILTER),
):
    print(_label, _path)

# Stop right away when the input is missing.
if not IN_PATH.exists():
    raise FileNotFoundError(f"No existe IN_PATH:\n{IN_PATH}")


# ============================================================
# 1) Load the annotated object and inspect its current state
# ============================================================
adata = sc.read_h5ad(IN_PATH)
print(adata)

print("\nColumnas obs:")
print(adata.obs.columns.tolist())

print("\nDistribución Level1:")
print(adata.obs["Level1"].value_counts())

if "Level2" not in adata.obs.columns:
    print("\n[AVISO] No existe columna 'Level2' en adata.obs.")
else:
    print("\nDistribución Level2 (top 30):")
    print(adata.obs["Level2"].value_counts().head(30))

    # Number of cells with missing Level2, broken down by Level1.
    # Grouping the boolean Series directly gives the same table without
    # copying the whole obs frame.
    print("\nConteo de NaN en Level2 por Level1:")
    level2_missing = adata.obs["Level2"].isna().rename("is_Level2_NA")
    print(level2_missing.groupby(adata.obs["Level1"]).sum())


# ============================================================
# 2) Fill in Level2 for RBC / HSCs / Plasma / pDC
# ============================================================
# Level1 label -> Level2 label written where Level2 is missing
# (here each lineage simply repeats its own name).
LEVEL1_TO_LEVEL2_FILL = {name: name for name in ("RBC", "HSCs", "Plasma", "pDC")}

# Make sure the column exists so the fill below works on any input.
if "Level2" not in adata.obs.columns:
    adata.obs["Level2"] = np.nan

level1_labels = adata.obs["Level1"]
# Work on an object-dtype copy so labels outside the current categories can be assigned.
level2_filled = adata.obs["Level2"].astype("object")

for lvl1, lvl2_name in LEVEL1_TO_LEVEL2_FILL.items():
    to_fill = (level1_labels == lvl1) & level2_filled.isna()
    n_before = to_fill.sum()
    if n_before == 0:
        print(f"No hay células con Level1='{lvl1}' y Level2 NaN (n=0)")
    else:
        print(f"Asignando Level2='{lvl2_name}' a {n_before} células con Level1='{lvl1}' y Level2 NaN")
        level2_filled.loc[to_fill] = lvl2_name

# Store the result back as a categorical column.
adata.obs["Level2"] = pd.Categorical(level2_filled)

print("\nDistribución Level2 tras rellenar linajes simples:")
print(adata.obs["Level2"].value_counts())

# Remaining missing values, in total and per Level1.
print("NaN total en Level2 (DESPUÉS del fill):", adata.obs["Level2"].isna().sum())
still_missing = adata.obs["Level2"].isna().rename("is_Level2_NA")
print(still_missing.groupby(adata.obs["Level1"]).sum())


# ============================================================
# 3) Flag doublet-like / artefact populations
# ============================================================
# Level2 labels treated as doublet-like.
DOUBLETS_LEVEL2 = [
    "Platelet_like_T",
    "T_NK_doublets",
    "Myeloid_like_T",
]

# Report when any listed label is absent from the Level2 categories.
level2_categories = adata.obs["Level2"].cat.categories
absent_labels = set(DOUBLETS_LEVEL2) - set(level2_categories)
if absent_labels:
    print("\n[INFO] Alguna categoría de DOUBLETS_LEVEL2 no está en Level2. Revisa la lista si hace falta.")
    print("Categorías Level2 actuales:", level2_categories)

# Boolean flag: True for cells whose Level2 is one of the doublet-like labels.
adata.obs["doublet_like"] = adata.obs["Level2"].isin(DOUBLETS_LEVEL2)

# Cast to str before counting so zero-count categories are not listed.
print("\nConteo de doublet_like=True por Level2 (limpio, sin ceros):")
flagged_level2 = adata.obs["Level2"].loc[adata.obs["doublet_like"]]
print(flagged_level2.astype(str).value_counts())

print("\nTotal doublets-like:", int(adata.obs["doublet_like"].sum()))
print("Total células:", adata.n_obs)

# IMPORTANT: nothing is filtered here (to avoid duplicating memory).
# The filtering happens in the next section ("Build the filtered object…").


# ============================================================
# 4) Build the filtered object: no doublets and NO RBC (RBC-out)
# ============================================================
adata_full = adata  # explicit alias, for readability

# HOTFIX post-NB09:
# - exclude both RBC and RBC_and_HSC (robust RBC-out)
RBC_LIKE_L1 = ["RBC", "RBC_and_HSC"]

# A cell is kept when it is neither doublet-like nor RBC-like.
is_doublet_like = adata_full.obs["doublet_like"].to_numpy()
is_rbc_like = adata_full.obs["Level1"].astype(str).isin(RBC_LIKE_L1).to_numpy()
mask_keep = ~(is_doublet_like | is_rbc_like)

# Plain indexing without .copy(): the intent is a view, not a second copy in memory.
adata_filt = adata_full[mask_keep]

print("\n=== Resumen tras filtrar doublets + RBC-out (SIN COPY, no duplica RAM) ===")
print(adata_filt)
print("\nLevel1 (filtrado):")
print(adata_filt.obs["Level1"].value_counts())
print("\nLevel2 (filtrado, top 30):")
print(adata_filt.obs["Level2"].value_counts().head(30))

# Sanity checks: both counts are expected to be 0 after filtering.
n_doublet_left = int(adata_filt.obs["Level2"].isin(DOUBLETS_LEVEL2).sum())
print("Doublet categories after filter (should be 0):")
print(n_doublet_left)

n_rbc_left = int(adata_filt.obs["Level1"].astype(str).isin(RBC_LIKE_L1).sum())
print("RBC-like presentes en filtrado (debería ser 0):")
print(n_rbc_left)


# ============================================================
# 5) Build Level1_refined (T vs NK, Mono vs DC…)
# ============================================================
# Writing a new column into .obs of an AnnData *view* makes anndata copy the
# view into a standalone object behind the scenes (with an
# ImplicitModificationWarning). Make that copy explicit so the memory cost is
# visible here and the rest of this section works on a regular object.
if adata_filt.is_view:
    adata_filt = adata_filt.copy()

# 1) Start from Level1_refined = Level1 (object dtype so new labels can be assigned)
adata_filt.obs["Level1_refined"] = adata_filt.obs["Level1"].astype("object")

# Level2 labels, within T_and_NK, that map to "T"
T_L2 = [
    "CD4_Naive",
    "CD8_Naive",
    "CD8_Effector_Cytotoxic",
    "Conv_T_other",
    "MAIT",
    "GammaDelta_T",
    "Treg",
    "Proliferative_T",
    "Exhausted_T",
]

# Level2 labels, within T_and_NK, that map to "NK"
NK_L2 = ["NK"]

# Level2 labels, within Mono_and_DC, that map to "Mono"
MONO_L2 = [
    "Classical_Mono",
    "NonClassical_Mono",
    "ISG_Myeloid",
    "MonoDC_Other",
]

# Level2 labels, within Mono_and_DC, that map to "DC"
# HOTFIX post-NB09: DC3 is counted as DC
DC_L2 = [
    "cDC1",
    "cDC2",
    "DC4",
    "aDC",
    "DC3",
]

# Simple lineages that keep their Level1 label unchanged
# (RBC is listed for consistency with earlier versions; RBC-out already removed it from adata_filt)
SIMPLE_L1 = ["B", "Plasma", "pDC", "RBC", "HSCs"]

# a) Simple lineages: only counted for the log, their label is not modified
mask_simple = adata_filt.obs["Level1"].isin(SIMPLE_L1)
print(f"[INFO] Linajes simples (B/Plasma/pDC/RBC/HSCs): {mask_simple.sum()} células")

# b) Inside T_and_NK: split T vs NK using Level2
mask_TNK = adata_filt.obs["Level1"] == "T_and_NK"
mask_T = mask_TNK & adata_filt.obs["Level2"].isin(T_L2)
mask_NK = mask_TNK & adata_filt.obs["Level2"].isin(NK_L2)

print(f"[INFO] T subtypes dentro de T_and_NK      → 'T'  : {mask_T.sum()} células")
print(f"[INFO] NK subtypes dentro de T_and_NK     → 'NK' : {mask_NK.sum()} células")

adata_filt.obs.loc[mask_T, "Level1_refined"] = "T"
adata_filt.obs.loc[mask_NK, "Level1_refined"] = "NK"

# c) Inside Mono_and_DC: split Mono vs DC using Level2
mask_MonoDC = adata_filt.obs["Level1"] == "Mono_and_DC"
mask_Mono = mask_MonoDC & adata_filt.obs["Level2"].isin(MONO_L2)
mask_DC   = mask_MonoDC & adata_filt.obs["Level2"].isin(DC_L2)

print(f"[INFO] Mono subtypes dentro de Mono_and_DC → 'Mono': {mask_Mono.sum()} células")
print(f"[INFO] DC subtypes dentro de Mono_and_DC   → 'DC'  : {mask_DC.sum()} células")

adata_filt.obs.loc[mask_Mono, "Level1_refined"] = "Mono"
adata_filt.obs.loc[mask_DC,   "Level1_refined"] = "DC"

print("\n[CHECK] Distribución final de Level1_refined:")
print(adata_filt.obs["Level1_refined"].value_counts())

# Every Level2 label this notebook expects (DC3 already comes in through DC_L2).
ALL_KNOWN_L2 = set(
    T_L2 + NK_L2 + MONO_L2 + DC_L2 +
    [
        "B_Naive", "B_Memory", "B_Activated", "B_Atypical", "B_Other",
        "RBC", "HSCs", "Plasma", "pDC",
    ]
)

# Level2 labels present in the data but missing from the expected lists.
l2_present = set(pd.Series(adata_filt.obs["Level2"]).dropna().unique())
unknown_l2 = sorted(l2_present - ALL_KNOWN_L2)

print("\n[CHECK] Level2 presentes en el objeto que NO están en las listas esperadas:")
print(unknown_l2)

# Cells whose Level2 matched none of the lists keep the combined Level1 label; count them.
bad = adata_filt.obs["Level1_refined"].isin(["T_and_NK", "Mono_and_DC"]).sum()
print("Cells still labeled T_and_NK / Mono_and_DC in Level1_refined:", int(bad))


# ============================================================
# 6) (Optional) Drop the Type_L1L2 column
# ============================================================
# Both spellings of the column name are checked; the column is removed only
# from the object(s) where it is actually present, and each removal is logged.
for col in ["Type_L1L2", "TypeL1L2"]:
    if col in adata_full.obs.columns:
        print(f"Eliminando columna '{col}' de adata_full.obs")
        del adata_full.obs[col]
    if col in adata_filt.obs.columns:
        print(f"Eliminando columna '{col}' de adata_filt.obs")
        del adata_filt.obs[col]


# ============================================================
# 7) Save the final objects (OUT_FULL without RBC; OUT_FILTER already without RBC)
# ============================================================
# Make sure the new columns have the intended dtypes before writing.
adata_full.obs["Level2"] = adata_full.obs["Level2"].astype("category")
adata_full.obs["doublet_like"] = adata_full.obs["doublet_like"].astype(bool)

# IMPORTANT (RBC-out): drop unused categories in the filtered object so RBC does
# not linger as an empty category. Casting first makes this valid whatever the
# current dtype is, and avoids the deprecated pd.api.types.is_categorical_dtype.
adata_filt.obs["Level2"] = (
    adata_filt.obs["Level2"].astype("category").cat.remove_unused_categories()
)
adata_filt.obs["Level1_refined"] = adata_filt.obs["Level1_refined"].astype("category")
adata_filt.obs["doublet_like"] = adata_filt.obs["doublet_like"].astype(bool)

print("\nGuardando objeto completo (con doublets marcados) en (RBC-out):")
print(" ", OUT_FULL)
# The full object keeps the flagged doublets; only RBC-like cells are excluded on write.
adata_full[~adata_full.obs["Level1"].astype(str).isin(RBC_LIKE_L1)].write_h5ad(OUT_FULL, compression="gzip")

print("\nGuardando objeto filtrado (sin doublets, con Level1_refined) en (ya RBC-out):")
print(" ", OUT_FILTER)
adata_filt.write_h5ad(OUT_FILTER, compression="gzip")

print("\n[OK] Guardado completado.")


# ============================================================
# 8) Final mini-check (quick sanity check)
# ============================================================
print("=== CHECK FINAL ===")
print("Objeto completo (en RAM) :", adata_full)
print("Objeto filtrado (en RAM) :", adata_filt)

print("\nLevel1_refined en adata_filt:")
print(adata_filt.obs["Level1_refined"].value_counts())

# Doublet flags: expected > 0 in the full object and 0 in the filtered one.
n_doublets_full = int(adata_full.obs["doublet_like"].sum())
n_doublets_filt = int(adata_filt.obs["doublet_like"].sum())
print("\nDoublets en adata_full (deberían ser > 0) y en adata_filt (debería ser 0):")
print("doublet_like True en full :", n_doublets_full)
print("doublet_like True en filt :", n_doublets_filt)

# RBC-like cells left in the filtered object (expected 0).
n_rbc_like_filt = int(adata_filt.obs["Level1"].astype(str).isin(RBC_LIKE_L1).sum())
print("\nRBC-like en adata_filt (debería ser 0):", n_rbc_like_filt)

# Counts of two specific Level2 labels in the filtered object (0 when the label is absent).
level2_counts = adata_filt.obs["Level2"].value_counts()
print("Conv_T_other count (filtrado):", int(level2_counts.get("Conv_T_other", 0)))
print("DC3 count (filtrado):", int(level2_counts.get("DC3", 0)))