### 1. Imports + paths

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import scanpy as sc

from src.paths import project_paths

# Report which scanpy version this run is using.
print("Scanpy:", sc.__version__)

# Resolve the project directory layout starting from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P[_key]
    for _key in ("PROJECT_ROOT", "CONFIG_DIR", "DATA_DIR", "RESULTS_DIR", "FIGURES_DIR")
)

# This notebook writes its outputs under RESULTS_DIR, so make sure it exists.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the run log shows where data is read and written.
for _label, _location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("CONFIG_DIR  :", CONFIG_DIR),
    ("DATA_DIR    :", DATA_DIR),
    ("RESULTS_DIR :", RESULTS_DIR),
    ("FIGURES_DIR :", FIGURES_DIR),
):
    print(_label, _location)


### 2. Leer config + parámetros

In [None]:
def _strip_yaml_scalar(raw: str) -> str:
    """Return the scalar text of a ``key: value`` right-hand side.

    Handles a single- or double-quoted value (optionally followed by an
    inline ``# comment``) and an unquoted value with a trailing inline
    comment. Anything else falls back to trimming whitespace and stray
    surrounding quote characters.
    """
    text = raw.strip()

    # Quoted scalar: keep exactly what is between the matching quotes, so a
    # "#" or the other quote character inside the value is preserved.
    if text[:1] in ('"', "'"):
        closing = text.find(text[0], 1)
        if closing != -1:
            trailer = text[closing + 1:]
            if not trailer.strip() or (
                trailer[0].isspace() and trailer.lstrip().startswith("#")
            ):
                return text[1:closing]

    # Unquoted scalar: a "#" preceded by whitespace starts a comment.
    # The search runs on the untrimmed text so "key: # note" yields "".
    cut_points = [pos for pos in (raw.find(" #"), raw.find("\t#")) if pos != -1]
    if cut_points:
        raw = raw[: min(cut_points)]
    return raw.strip().strip('"').strip("'")


def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` file into a dict of strings.

    This is a minimal line-based reader, not a full YAML parser: every
    line is split on its first ``:``; blank lines, lines starting with
    ``#`` and lines without a colon are skipped. Values are returned as
    strings with surrounding quotes and inline ``# comments`` removed.
    Later occurrences of a key overwrite earlier ones.

    Args:
        path: Location of the UTF-8 encoded config file.

    Returns:
        Mapping of stripped keys to stripped string values.
    """
    cfg = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or ":" not in line:
            continue
        key, _, value = line.partition(":")
        cfg[key.strip()] = _strip_yaml_scalar(value)
    return cfg

# Locate and load the project configuration; it must be a regular file.
cfg_path = CONFIG_DIR / "config.yaml"
if not cfg_path.is_file():
    raise FileNotFoundError(f"Falta {cfg_path}")

CFG = load_simple_yaml(cfg_path)


def _cfg_str(key: str, default: str) -> str:
    """Return ``CFG[key]``, using *default* when the key is missing or blank.

    The line-based config reader stores a blank entry (``key:``) as an empty
    string; treating that as "not set" avoids empty obs keys and file names.
    """
    return CFG.get(key) or default


LEVEL1_KEY = _cfg_str("level1_key", "Level1")
LEVEL2_KEY = _cfg_str("level2_key", "Level2")  # normally "Level2"

# Input: output of NB09 (globally annotated object)
MAIN_ANNOTATED_FILENAME = _cfg_str("main_annotated_h5ad_filename", "TFM_CIRRHOSIS_main_annotated.h5ad")

# Outputs (RBC-out)
OUT_FULL_NAME   = _cfg_str("main_annotated_clean_h5ad_filename", "TFM_CIRRHOSIS_main_annotated_clean.h5ad")
OUT_FILTER_NAME = _cfg_str("main_filtered_for_analysis_h5ad_filename", "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad")

IN_PATH    = RESULTS_DIR / MAIN_ANNOTATED_FILENAME
OUT_FULL   = RESULTS_DIR / OUT_FULL_NAME
OUT_FILTER = RESULTS_DIR / OUT_FILTER_NAME

print("LEVEL1_KEY :", LEVEL1_KEY)
print("LEVEL2_KEY :", LEVEL2_KEY)
print("IN_PATH    :", IN_PATH)
print("OUT_FULL   :", OUT_FULL)
print("OUT_FILTER :", OUT_FILTER)

# is_file() rather than exists(): a directory at this location is not a
# usable input, so fail here with the clear message.
if not IN_PATH.is_file():
    raise FileNotFoundError(f"No existe IN_PATH:\n{IN_PATH}")


### 3. Cargar objeto anotado y revisar estado actual

In [None]:
# Load the annotated object and print its summary.
adata = sc.read_h5ad(IN_PATH)
print(adata)

print("\nColumnas obs:")
print(adata.obs.columns.tolist())

# Level1 is required by every later step, so stop here if it is absent.
if LEVEL1_KEY not in adata.obs.columns:
    raise KeyError(f"Falta obs['{LEVEL1_KEY}'] en el objeto de entrada.")

print("\nDistribución Level1:")
print(adata.obs[LEVEL1_KEY].value_counts())

if LEVEL2_KEY not in adata.obs.columns:
    print(f"\n[AVISO] No existe columna '{LEVEL2_KEY}' en adata.obs.")
else:
    # Force a categorical dtype so later ``.cat`` accessors do not fail.
    adata.obs[LEVEL2_KEY] = adata.obs[LEVEL2_KEY].astype("category")
    print("\nDistribución Level2 (top 30):")
    print(adata.obs[LEVEL2_KEY].value_counts().head(30))

    # Count missing Level2 labels within each Level1 group.
    print("\nConteo de NaN en Level2 por Level1:")
    tmp = adata.obs.assign(is_Level2_NA=adata.obs[LEVEL2_KEY].isna())
    print(tmp.groupby(LEVEL1_KEY)["is_Level2_NA"].sum())


### 4. Completar Level2 para HSCs / Plasma / pDC (y RBC si existiera en input)

In [None]:
# Fill Level2 for single-population lineages, where it is sometimes left as NaN:
# the Level2 label is simply the Level1 name.
LEVEL1_TO_LEVEL2_FILL = {name: name for name in ("RBC", "HSCs", "Plasma", "pDC")}

# Create an all-NaN Level2 column when the input has none.
if LEVEL2_KEY not in adata.obs.columns:
    adata.obs[LEVEL2_KEY] = np.nan

# Work on an object-dtype copy so arbitrary new labels can be written.
level2_values = adata.obs[LEVEL2_KEY].astype("object")
level1_text = adata.obs[LEVEL1_KEY].astype(str)
# Snapshot of the missing entries: each lineage selects a disjoint set of
# cells, so filling one lineage never changes the mask of another.
level2_missing = level2_values.isna()

for lineage, fill_label in LEVEL1_TO_LEVEL2_FILL.items():
    to_fill = (level1_text == lineage) & level2_missing
    n_cells = int(to_fill.sum())
    if n_cells == 0:
        print(f"No hay células con {LEVEL1_KEY}='{lineage}' y {LEVEL2_KEY} NaN (n=0)")
        continue
    print(f"Asignando {LEVEL2_KEY}='{fill_label}' a {n_cells} células con {LEVEL1_KEY}='{lineage}' y {LEVEL2_KEY} NaN")
    level2_values.loc[to_fill] = fill_label

adata.obs[LEVEL2_KEY] = pd.Categorical(level2_values)

print("\nDistribución Level2 tras rellenar linajes simples:")
print(adata.obs[LEVEL2_KEY].value_counts())

# Report what is still missing after the fill, in total and per Level1 group.
still_missing = adata.obs[LEVEL2_KEY].isna()
print("NaN total en Level2 (DESPUÉS del fill):", int(still_missing.sum()))
tmp = adata.obs.assign(is_Level2_NA=still_missing)
print(tmp.groupby(LEVEL1_KEY)["is_Level2_NA"].sum())


### 5. Marcar poblaciones doublet-like / artefacto

In [None]:
# Level2 labels treated as doublet-like / artefact populations.
DOUBLETS_LEVEL2 = [
    "Platelet_like_T",
    "T_NK_doublets",
    "Myeloid_like_T",
]

level2_col = adata.obs[LEVEL2_KEY]
# String view of Level2, computed once and reused below (NaN becomes "nan").
level2_str = level2_col.astype(str)

# Informational only: report doublet labels that do not occur in this dataset.
# isinstance(..., pd.CategoricalDtype) replaces the deprecated
# pd.api.types.is_categorical_dtype.
if isinstance(level2_col.dtype, pd.CategoricalDtype):
    current_cats = set(level2_col.cat.categories.astype(str))
else:
    current_cats = set(level2_str.unique())

missing_doublets = [x for x in DOUBLETS_LEVEL2 if x not in current_cats]
if missing_doublets:
    print("\n[INFO] Alguna categoría de DOUBLETS_LEVEL2 no está en Level2. (Puede ser normal según dataset.)")
    print("Faltan:", missing_doublets)

# Boolean flag per cell: True when its Level2 label is one of the doublet labels.
adata.obs["doublet_like"] = level2_str.isin(DOUBLETS_LEVEL2)

print("\nConteo de doublet_like=True por Level2 (sin ceros):")
print(level2_str[adata.obs["doublet_like"]].value_counts())

print("\nTotal doublets-like:", int(adata.obs["doublet_like"].sum()))
print("Total células:", adata.n_obs)


### 6. Crear objeto filtrado sin doublets y SIN RBC + hotfix RBC_and_HSC

In [None]:
adata_full = adata  # explicit alias: same object, not a copy

# HOTFIX post-NB09:
# - drop both RBC and RBC_and_HSC from the filtered object (robust RBC-out)
RBC_LIKE_L1 = ["RBC", "RBC_and_HSC"]

# Keep a cell only when it is neither doublet-like nor RBC-like.
_is_doublet = adata_full.obs["doublet_like"].to_numpy()
_is_rbc_like = adata_full.obs[LEVEL1_KEY].astype(str).isin(RBC_LIKE_L1).to_numpy()
mask_keep = ~(_is_doublet | _is_rbc_like)

# Subset without .copy() so the data is not duplicated in RAM at this point.
adata_filt = adata_full[mask_keep]

print("\n=== Resumen tras filtrar doublets + RBC-out (SIN COPY) ===")
print(adata_filt)

print("\nLevel1 (filtrado):")
print(adata_filt.obs[LEVEL1_KEY].value_counts())

print("\nLevel2 (filtrado, top 30):")
print(adata_filt.obs[LEVEL2_KEY].value_counts().head(30))

# Sanity checks: both counts are expected to be zero after filtering.
_doublets_left = adata_filt.obs[LEVEL2_KEY].astype(str).isin(DOUBLETS_LEVEL2)
print("Doublet categories after filter (should be 0):")
print(int(_doublets_left.sum()))

_rbc_left = adata_filt.obs[LEVEL1_KEY].astype(str).isin(RBC_LIKE_L1)
print("RBC-like presentes en filtrado (debería ser 0):")
print(int(_rbc_left.sum()))


### 7. Construir Level1_refined

In [None]:
# Build obs["Level1_refined"]: start from Level1 and split the mixed lineages
# T_and_NK and Mono_and_DC according to each cell's Level2 label.

# Writing to .obs of an AnnData view makes AnnData copy the view implicitly
# and emit an ImplicitModificationWarning. Materialise it explicitly instead,
# so the copy is deliberate and visible at the point where it happens.
if adata_filt.is_view:
    adata_filt = adata_filt.copy()

# String views of both annotation levels, computed once and reused by all masks.
level1_str = adata_filt.obs[LEVEL1_KEY].astype(str)
level2_str = adata_filt.obs[LEVEL2_KEY].astype(str)

# Start with Level1_refined = Level1 (object dtype so new labels can be written).
adata_filt.obs["Level1_refined"] = adata_filt.obs[LEVEL1_KEY].astype("object")

T_L2 = [
    "CD4_Naive",
    "CD8_Naive",
    "CD8_Effector_Cytotoxic",
    "Conv_T_other",
    "MAIT",
    "GammaDelta_T",
    "Treg",
    "Proliferative_T",
    "Exhausted_T",
]

NK_L2 = ["NK"]

MONO_L2 = [
    "Classical_Mono",
    "NonClassical_Mono",
    "ISG_Myeloid",
    "MonoDC_Other",
]

# HOTFIX post-NB09: DC3 counts as DC
DC_L2 = [
    "cDC1",
    "cDC2",
    "DC4",
    "aDC",
    "DC3",
]

# Simple lineages keep their Level1 label unchanged (RBC is listed because it
# may exist in the input, although it was filtered out earlier).
SIMPLE_L1 = ["B", "Plasma", "pDC", "RBC", "HSCs"]

mask_simple = level1_str.isin(SIMPLE_L1)
print(f"[INFO] Linajes simples (B/Plasma/pDC/RBC/HSCs): {int(mask_simple.sum())} células")

# T_and_NK -> T vs NK according to Level2
mask_TNK = level1_str == "T_and_NK"
mask_T = mask_TNK & level2_str.isin(T_L2)
mask_NK = mask_TNK & level2_str.isin(NK_L2)

print(f"[INFO] T subtypes dentro de T_and_NK  → 'T'  : {int(mask_T.sum())} células")
print(f"[INFO] NK subtypes dentro de T_and_NK → 'NK' : {int(mask_NK.sum())} células")

adata_filt.obs.loc[mask_T, "Level1_refined"] = "T"
adata_filt.obs.loc[mask_NK, "Level1_refined"] = "NK"

# Mono_and_DC -> Mono vs DC according to Level2
mask_MonoDC = level1_str == "Mono_and_DC"
mask_Mono = mask_MonoDC & level2_str.isin(MONO_L2)
mask_DC   = mask_MonoDC & level2_str.isin(DC_L2)

print(f"[INFO] Mono subtypes dentro de Mono_and_DC → 'Mono': {int(mask_Mono.sum())} células")
print(f"[INFO] DC subtypes dentro de Mono_and_DC   → 'DC'  : {int(mask_DC.sum())} células")

adata_filt.obs.loc[mask_Mono, "Level1_refined"] = "Mono"
adata_filt.obs.loc[mask_DC,   "Level1_refined"] = "DC"

print("\n[CHECK] Distribución final de Level1_refined:")
print(adata_filt.obs["Level1_refined"].value_counts())

# Every Level2 label this notebook knows how to handle ("DC3" is already in DC_L2).
ALL_KNOWN_L2 = set(
    T_L2 + NK_L2 + MONO_L2 + DC_L2 +
    ["B_Naive", "B_Memory", "B_Activated", "B_Atypical", "B_Other", "RBC", "HSCs", "Plasma", "pDC"]
)

# Report Level2 labels present in the data that are in none of the lists above.
l2_present = set(adata_filt.obs[LEVEL2_KEY].dropna().astype(str).unique())
unknown_l2 = sorted(l2_present - ALL_KNOWN_L2)

print("\n[CHECK] Level2 presentes en el objeto que NO están en las listas esperadas:")
print(unknown_l2)

# Cells whose mixed Level1 label could not be resolved by any Level2 list.
bad = int(adata_filt.obs["Level1_refined"].astype(str).isin(["T_and_NK", "Mono_and_DC"]).sum())
print("Cells still labeled T_and_NK / Mono_and_DC in Level1_refined:", bad)


### 8. Eliminar columna Type_L1L2

In [None]:
# Remove the Type_L1L2 column (either spelling) from both objects when present.
for col in ("Type_L1L2", "TypeL1L2"):
    for _obj_name, _obj in (("adata_full", adata_full), ("adata_filt", adata_filt)):
        if col not in _obj.obs.columns:
            continue
        print(f"Eliminando columna '{col}' de {_obj_name}.obs")
        del _obj.obs[col]


### 9. Guardar objetos finales en results/

In [None]:
# Recommended dtypes before writing to disk.
adata_full.obs[LEVEL2_KEY] = adata_full.obs[LEVEL2_KEY].astype("category")
adata_full.obs["doublet_like"] = adata_full.obs["doublet_like"].astype(bool)

# IMPORTANT for RBC-out: drop unused categories in the filtered object so RBC
# does not survive as an empty category. Casting to category first makes this
# valid for any input dtype and avoids the deprecated
# pd.api.types.is_categorical_dtype check.
adata_filt.obs[LEVEL2_KEY] = (
    adata_filt.obs[LEVEL2_KEY].astype("category").cat.remove_unused_categories()
)
adata_filt.obs["Level1_refined"] = adata_filt.obs["Level1_refined"].astype("category")
adata_filt.obs["doublet_like"] = adata_filt.obs["doublet_like"].astype(bool)

# The "full" output keeps the flagged doublets but excludes the RBC-like cells.
print("\nGuardando objeto completo (con doublets marcados) en (RBC-out):")
print(" ", OUT_FULL)
_is_rbc_like_full = adata_full.obs[LEVEL1_KEY].astype(str).isin(RBC_LIKE_L1)
adata_full[~_is_rbc_like_full].write_h5ad(OUT_FULL, compression="gzip")

print("\nGuardando objeto filtrado (sin doublets, con Level1_refined) en (ya RBC-out):")
print(" ", OUT_FILTER)
adata_filt.write_h5ad(OUT_FILTER, compression="gzip")

print("\n[OK] Guardado completado.")


### 10. Check final

In [None]:
# Final summary of the two in-memory objects.
print("=== CHECK FINAL ===")
print("Objeto completo (en RAM) :", adata_full)
print("Objeto filtrado (en RAM) :", adata_filt)

print("\nLevel1_refined en adata_filt:")
print(adata_filt.obs["Level1_refined"].value_counts())

# Doublet flags: expected to be present in the full object and absent in the filtered one.
print("\nDoublets en adata_full (deberían ser > 0) y en adata_filt (debería ser 0):")
for _tag, _obj in (("full", adata_full), ("filt", adata_filt)):
    print(f"doublet_like True en {_tag} :", int(_obj.obs["doublet_like"].sum()))

_rbc_like_left = adata_filt.obs[LEVEL1_KEY].astype(str).isin(RBC_LIKE_L1)
print("\nRBC-like en adata_filt (debería ser 0):", int(_rbc_like_left.sum()))

# Spot-check two Level2 labels; a label that does not occur counts as 0.
_level2_counts = adata_filt.obs[LEVEL2_KEY].astype(str).value_counts()
print("Conv_T_other count (filtrado):", int(_level2_counts.get("Conv_T_other", 0)))
print("DC3 count (filtrado):", int(_level2_counts.get("DC3", 0)))
