In [None]:
# ============================================================
# Conv_T_other_cleanup.ipynb  (ACTUALIZADO post-NBXX)
# Conv_T_other — caracterización + renombrado automático (Level2_final_map)
# - NO re-etiqueta todo el Level2: solo resuelve Conv_T_other
# - Memory-safe: abre backed="r" y solo carga T-cells x panel genes a RAM
# - ROBUSTO a var_names != símbolos: usa adata.var['symbol'] si existe
# - Salidas: QA tables + JSON mapping para usar en figuras/Downstream
# ============================================================

from pathlib import Path
import json
import numpy as np
import pandas as pd
import scanpy as sc

# Current working directory at run time; used as the starting point when
# searching upwards for the project root.
NOTEBOOK_DIR = Path.cwd()

def find_project_root(start: Path) -> Path:
    """Return the nearest directory, at or above *start*, containing 'data_processed'.

    The search checks *start* first and then each of its parents, closest first.

    Raises:
        FileNotFoundError: if no directory on that path has a 'data_processed' entry.
    """
    candidates = (start, *start.parents)
    root = next((d for d in candidates if (d / "data_processed").exists()), None)
    if root is None:
        raise FileNotFoundError(f"No encuentro 'data_processed' subiendo desde: {start}")
    return root

# Project layout: the input .h5ad lives under data_processed/, and the QA
# outputs of this notebook go to summary_tables_final/ (created if absent).
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
DATA_PROCESSED = PROJECT_ROOT / "data_processed"
IN_PATH = DATA_PROCESSED / "TFM_CIRRHOSIS_main_filtered_for_analysis.h5ad"

OUT_SUMMARY = PROJECT_ROOT / "summary_tables_final"
OUT_SUMMARY.mkdir(exist_ok=True)

# Echo the resolved paths so the run log shows what was used.
for _label, _path in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("IN_PATH     :", IN_PATH),
    ("OUT_SUMMARY :", OUT_SUMMARY),
):
    print(_label, _path)

# Stop here if the input file is not where it is expected.
if not IN_PATH.exists():
    raise FileNotFoundError(f"No existe IN_PATH:\n{IN_PATH}")

# -----------------------------
# Helpers: mapping símbolo -> varname
# -----------------------------
def symbol_to_varname(adata, symbol: str):
    """Resolve a gene symbol to the matching entry of ``adata.var_names``.

    Lookup order:
      1. *symbol* itself, when it is already one of ``adata.var_names``.
      2. The first row whose ``adata.var['symbol']`` (compared as str) equals
         *symbol*, when that column exists.

    Returns:
        The var_name for *symbol*, or None when neither lookup matches.
    """
    if symbol in adata.var_names:
        return symbol
    if "symbol" not in adata.var.columns:
        return None

    matches = np.flatnonzero(adata.var["symbol"].astype(str).to_numpy() == symbol)
    # Several rows may share a symbol; the first one wins.
    return adata.var_names[matches[0]] if matches.size > 0 else None

def symbols_to_varnames(adata, symbols):
    """Map every symbol in *symbols* to a var_name through ``symbol_to_varname``.

    Returns:
        A list aligned with *symbols*; positions that could not be resolved
        hold None.
    """
    return [symbol_to_varname(adata, sym) for sym in symbols]

# -----------------------------
# Open the input in backed (read-only) mode
# -----------------------------
adata_b = sc.read_h5ad(IN_PATH, backed="r")
# NOTE(review): backed mode keeps X on disk; confirm whether layers are also
# loaded lazily with the installed anndata version — TODO verify.

try:
    # Sanity check: every obs column used below must be present.
    needed = ["patientID", "Level1_refined", "Level2"]
    missing = [c for c in needed if c not in adata_b.obs.columns]
    if missing:
        raise KeyError(f"Faltan columnas en obs: {missing}")

    # RBC count (informational only; nothing is raised on a non-zero value).
    rbc_n = int((adata_b.obs["Level2"].astype(str) == "RBC").sum())
    print("RBC cells (should be 0):", rbc_n)

    # Independent copy of the needed obs columns, all cast to str.
    obs = adata_b.obs[needed].astype(str)

    # Subset: T cells ONLY (Level1_refined == "T").
    mask_T = (obs["Level1_refined"].values == "T")
    nT = int(mask_T.sum())
    print("T cells:", nT)
    if nT == 0:
        raise ValueError("No hay células T en Level1_refined; esto no debería pasar.")

    # Is there any Conv_T_other among the T cells?
    n_conv = int((obs.loc[mask_T, "Level2"].values == "Conv_T_other").sum())
    print("Conv_T_other in T:", n_conv)
    if n_conv == 0:
        raise ValueError("No hay Conv_T_other en el objeto. Si ya lo eliminaste/renombraste, este notebook no aplica.")

    # -----------------------------
    # Gene panel (signatures) used to decide what Conv_T_other is
    # -----------------------------
    # NOTE(review): MALAT1 is usually highly expressed in most cells — confirm
    # it does not dominate the mean-based Naive_Memory_like score.
    signatures = {
        "Naive_Memory_like": ["IL7R","CCR7","LTB","TCF7","LEF1","MALAT1"],
        "Cytotoxic_like":    ["NKG7","GNLY","PRF1","GZMB","FGFBP2"],
        "Treg_like":         ["FOXP3","IL2RA","CTLA4","IKZF2","TNFRSF18"],
        "Exhausted_like":    ["PDCD1","LAG3","TIGIT","TOX","HAVCR2"],
        "Proliferative_like":["MKI67","TOP2A","STMN1","HMGB2","TYMS"],
        "ISG_like":          ["ISG15","IFIT1","IFIT3","MX1","OAS1"],
    }

    # Extra markers loaded alongside the signatures; they are not part of any
    # signature list.
    markers_extra = [
        "TRAC","TRBC1","TRBC2","CD3D","CD3E",
        "CD4","CD8A","CD8B",
        "KLRD1","FCGR3A",
        "JUN","FOS","IL32","HLA-DRA"
    ]

    # Flatten signatures + extras; dict.fromkeys drops duplicates while
    # keeping first-seen order.
    panel_symbols = [g for genes in signatures.values() for g in genes]
    panel_symbols += markers_extra
    panel_symbols = list(dict.fromkeys(panel_symbols))

    # Map symbol -> var_name (falls back to var['symbol'] when var_names are
    # not gene symbols).
    panel_varnames = symbols_to_varnames(adata_b, panel_symbols)
    kept = [(s, v) for s, v in zip(panel_symbols, panel_varnames) if v is not None]
    missing_genes = [s for s, v in zip(panel_symbols, panel_varnames) if v is None]

    if not kept:
        raise RuntimeError(
            "No se pudo mapear ningún gen del panel a var_names (ni por var['symbol']). "
            "Revisa adata.var_names / adata.var['symbol']."
        )

    present_symbols = [s for s, _ in kept]
    present_varnames = [v for _, v in kept]
    varname_to_symbol = {v: s for s, v in kept}

    print("Panel genes mapped:", len(present_symbols), "| missing:", len(missing_genes))
    if missing_genes:
        print("[INFO] Missing genes (ok):", missing_genes)

    # -----------------------------
    # Load into memory ONLY T cells x panel genes
    # -----------------------------
    idx_T = np.where(mask_T)[0]
    adata_small = adata_b[idx_T, present_varnames].to_memory()

finally:
    # Always release the backed file handle. Closing stays best-effort so it
    # cannot mask an exception raised above, but a failure is reported rather
    # than silently swallowed.
    try:
        adata_b.file.close()
    except Exception as close_err:
        print("[WARN] Could not close backed file:", repr(close_err))

# Preferred expression layer; abort if the in-memory object does not carry it.
LAYER = "log1p_10k"
if LAYER not in adata_small.layers.keys():
    raise KeyError(f"No existe layer {LAYER} en el objeto; layers={list(adata_small.layers.keys())}")

# Per-cell expression table for the panel: fetched by var_name, columns then
# renamed to gene symbols, with Level2 and patientID appended as str columns.
df_expr = (
    sc.get.obs_df(adata_small, keys=present_varnames, layer=LAYER, use_raw=False)
    .rename(columns=varname_to_symbol)
    .assign(
        Level2=adata_small.obs["Level2"].astype(str).values,
        patientID=adata_small.obs["patientID"].astype(str).values,
    )
)

# -----------------------------
# 1) Mean expression per Level2 (within T cells) for the panel markers
# -----------------------------
means_by_l2 = df_expr.groupby("Level2")[present_symbols].mean()

# Long-format rows: for each Level2 label, its (at most) 25 panel genes with
# the highest mean, in descending order.
top_rows = [
    [l2, g, float(v)]
    for l2, row in means_by_l2.iterrows()
    for g, v in row.sort_values(ascending=False).head(25).items()
]

out_top = OUT_SUMMARY / "QA_ConvT_other_topmarkers_meanlog1p.csv"
top_df = pd.DataFrame(top_rows, columns=["Level2", "gene", "mean_log1p"])
top_df.to_csv(out_top, index=False)
print("Saved:", out_top)

# -----------------------------
# 2) Signature scores per Level2 (mean over the genes of each signature)
# -----------------------------
# Genes of each signature that actually made it into the means table.
sig_genes_present = {
    sig: [g for g in genes if g in means_by_l2.columns]
    for sig, genes in signatures.items()
}

# One row per (Level2, signature); the score is NaN when no gene is available.
score_rows = [
    [
        l2,
        sig,
        float(means_by_l2.loc[l2, genes_ok].mean()) if genes_ok else np.nan,
        len(genes_ok),
    ]
    for l2 in means_by_l2.index
    for sig, genes_ok in sig_genes_present.items()
]

scores = pd.DataFrame(score_rows, columns=["Level2","signature","score_meanlog1p","n_genes_used"])
scores_pivot = scores.pivot(index="Level2", columns="signature", values="score_meanlog1p")

out_scores = OUT_SUMMARY / "QA_ConvT_other_signature_scores.csv"
scores_pivot.to_csv(out_scores)
print("Saved:", out_scores)

print("\n=== Signature scores (Conv_T_other row) ===")
if "Conv_T_other" not in scores_pivot.index:
    raise RuntimeError("No se encontró fila 'Conv_T_other' en scores_pivot (esto no debería pasar).")
print(scores_pivot.loc[["Conv_T_other"]])

# -----------------------------
# 3) Automatic (reproducible) decision on what to do with Conv_T_other
# -----------------------------
conv_scores = scores_pivot.loc["Conv_T_other"].dropna()
if conv_scores.empty:
    raise RuntimeError("Conv_T_other tiene todos los scores NaN; no se puede decidir mapping.")

# The winning signature is the one with the highest score for Conv_T_other.
best_sig = conv_scores.sort_values(ascending=False).index[0]
best_val = float(conv_scores.loc[best_sig])

# Final mapping, ONLY for Conv_T_other: winning signature -> Level2 label.
label_by_signature = {
    "Naive_Memory_like": "CD4_Memory",
    "Cytotoxic_like": "CD8_Effector_Cytotoxic",  # merge
    "Treg_like": "Treg",                         # merge
    "Exhausted_like": "Exhausted_T",             # merge
    "Proliferative_like": "Proliferative_T",     # merge
    "ISG_like": "ISG_T",
}
# Any signature outside the table falls back to the generic label.
new_label = label_by_signature.get(best_sig, "T_Other")

level2_map = {"Conv_T_other": new_label}

# Persist the mapping as JSON.
out_map = OUT_SUMMARY / "Level2_final_map.json"
out_map.write_text(json.dumps(level2_map, indent=2), encoding="utf-8")

# Persist a human-readable record of the decision.
out_dec = OUT_SUMMARY / "ConvT_other_decision.txt"
decision_lines = [
    "Conv_T_other decision\n",
    f"- best_signature: {best_sig}\n",
    f"- best_score_meanlog1p: {best_val:.6f}\n",
    f"- new_label: {new_label}\n",
    f"- n_cells_Conv_T_other (in T): {n_conv}\n",
]
out_dec.write_text("".join(decision_lines), encoding="utf-8")

print("\n=== DECISIÓN FINAL ===")
print("best_signature:", best_sig)
print("best_score_meanlog1p:", best_val)
print("Conv_T_other ->", new_label)
print("Saved:", out_map)
print("Saved:", out_dec)

print("\n[OK] Conv_T_other caracterizado + mapping final generado.")