### 1. Imports + rutas del repo

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

import matplotlib.pyplot as plt

from src.paths import project_paths

# Report the versions of the two core libraries for reproducibility.
for lib_label, lib_module in (("Scanpy", sc), ("AnnData", ad)):
    print(f"{lib_label} version:", lib_module.__version__)

# Resolve the repository layout from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT = P["PROJECT_ROOT"]
CONFIG_DIR, DATA_DIR = P["CONFIG_DIR"], P["DATA_DIR"]
RESULTS_DIR, FIGURES_DIR = P["RESULTS_DIR"], P["FIGURES_DIR"]

# Sub-directories used by this notebook.
QC_RESULTS_DIR = RESULTS_DIR / "qc"
PREP_RESULTS_DIR = RESULTS_DIR / "preprocess"
HVG_FIG_DIR = FIGURES_DIR / "hvg"

# Create every directory up front (parents first); existing ones are left alone.
for directory in (
    CONFIG_DIR,
    DATA_DIR,
    RESULTS_DIR,
    FIGURES_DIR,
    QC_RESULTS_DIR,
    PREP_RESULTS_DIR,
    HVG_FIG_DIR,
):
    directory.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the notebook output documents them.
print("\nPROJECT_ROOT:", PROJECT_ROOT)
for dir_label, dir_path in (
    ("DATA_DIR", DATA_DIR),
    ("QC_RESULTS_DIR", QC_RESULTS_DIR),
    ("PREP_RESULTS_DIR", PREP_RESULTS_DIR),
    ("HVG_FIG_DIR", HVG_FIG_DIR),
):
    print(f"{dir_label}:", dir_path)

### 2. Leer config/config.yaml

In [None]:
def _strip_inline_comment(value: str) -> str:
    """Remove a trailing YAML-style ``# comment`` from an already-stripped value.

    A ``#`` starts a comment only when it is the first character of the value
    or is preceded by a space/tab. If the value begins with a quote character,
    everything up to the matching closing quote is skipped so that a ``#``
    inside the quoted text is preserved. A value with an unterminated leading
    quote is returned unchanged.
    """
    scan_from = 0
    if value[:1] in ('"', "'"):
        closing = value.find(value[0], 1)
        if closing == -1:
            return value
        scan_from = closing + 1

    for pos in range(scan_from, len(value)):
        if value[pos] == "#" and (pos == 0 or value[pos - 1] in " \t"):
            return value[:pos]
    return value


def load_simple_yaml(path: Path) -> dict:
    """Parse a flat ``key: value`` YAML file into a dict of strings.

    Only single-line scalar mappings are understood: blank lines, full-line
    comments and lines without a ``:`` are skipped. Each remaining line is
    split on the first ``:``; the value has any trailing ``# comment``
    removed, is stripped of surrounding whitespace and then of surrounding
    double/single quotes. No type conversion is performed.

    Args:
        path: Location of the YAML file, read as UTF-8.

    Returns:
        Mapping from key to raw string value. A later duplicate key
        overwrites an earlier one.
    """
    cfg = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if ":" not in line:
            continue
        k, v = line.split(":", 1)
        # Drop inline comments first so "10000  # note" parses as "10000".
        v = _strip_inline_comment(v.strip()).strip()
        cfg[k.strip()] = v.strip('"').strip("'")
    return cfg

def as_int(x: str) -> int:
    """Convert a numeric string to ``int``, going through ``float`` first.

    The float round-trip lets values such as ``"1e4"`` or ``"2000.0"`` be
    accepted; any fractional part is truncated toward zero by ``int``.
    """
    numeric_value = float(x)
    return int(numeric_value)

# The configuration file is mandatory; fail early with a clear message.
cfg_path = CONFIG_DIR / "config.yaml"
config_is_missing = not cfg_path.exists()
if config_is_missing:
    raise FileNotFoundError(f"Falta {cfg_path}. Debe existir en el repo.")

CFG = load_simple_yaml(cfg_path)

# Required entries: a missing key raises KeyError here.
AFTER_QC_FILENAME, NORMALIZED_FILENAME = (
    CFG["after_qc_h5ad_filename"],
    CFG["normalized_h5ad_filename"],
)
NORM_TARGET_SUM = as_int(CFG["norm_target_sum"])
HVG_N_TOP = as_int(CFG["hvg_n_top_genes"])

# Optional entries fall back to defaults when absent from the file.
HVG_FLAVOR = CFG.get("hvg_flavor", "seurat_v3")
_raw_batch_key = CFG.get("hvg_batch_key", "")
HVG_BATCH_KEY = _raw_batch_key.strip()

# Bare expression: displays the parsed config as the notebook cell output.
CFG

### 3. Cargar input (salida del QC)

In [None]:
# Input of this notebook: the AnnData file written by the QC step.
qc_h5ad_path = QC_RESULTS_DIR / AFTER_QC_FILENAME

qc_output_available = qc_h5ad_path.exists()
if not qc_output_available:
    missing_input_msg = (
        f"No se encuentra el archivo tras QC en:\n{qc_h5ad_path}\n\n"
        "Ejecuta primero el Notebook 02 (QC) para generarlo."
    )
    raise FileNotFoundError(missing_input_msg)

print("Leyendo input tras QC:", qc_h5ad_path)
adata = sc.read_h5ad(qc_h5ad_path)
# Print the AnnData summary so its contents are visible in the output.
print(adata)

### 4. Guardar cuentas crudas en layers["counts"]

In [None]:
# Keep a snapshot of adata.X in layers["counts"] before X is transformed.
# An existing "counts" layer is never overwritten.
if "counts" in adata.layers:
    print("Capa 'counts' ya existe; no se sobrescribe.")
else:
    X = adata.X
    # Use the matrix's own .copy() when it has one; otherwise copy through numpy.
    if hasattr(X, "copy"):
        counts_snapshot = X.copy()
    else:
        counts_snapshot = np.array(X, copy=True)
    adata.layers["counts"] = counts_snapshot
    print("Capa 'counts' creada a partir de adata.X.")

### 5. Normalización + log1p + guardar en layers["log1p_10k"]

In [None]:
# Normalize library size to NORM_TARGET_SUM, apply log1p, and keep a copy of
# the result in layers["log1p_10k"].
#
# Always start from the stored counts so this cell is idempotent: without the
# reset, re-running it would normalize and log-transform a matrix that is
# already log-normalized.
if "counts" in adata.layers:
    adata.X = adata.layers["counts"].copy()
    # X holds the counts layer again, so drop any bookkeeping left by a
    # previous log1p run (scanpy records it under uns["log1p"]).
    adata.uns.pop("log1p", None)

sc.pp.normalize_total(adata, target_sum=NORM_TARGET_SUM, inplace=True)
sc.pp.log1p(adata)

# NOTE: the layer name is fixed even though the target sum comes from the
# config; it is kept unchanged so existing consumers of the file keep working.
adata.layers["log1p_10k"] = adata.X.copy()

print("Normalización + log1p completadas.")
print("Capa 'log1p_10k' creada.")

### 6. HVG

In [None]:
# Highly variable gene (HVG) selection.
#
# scanpy expects raw counts only for the 'seurat_v3' / 'seurat_v3_paper'
# flavors; the other flavors ('seurat', 'cell_ranger') expect logarithmized
# data. The input matrix is therefore chosen from the configured flavor
# instead of always passing the counts layer.
COUNT_BASED_HVG_FLAVORS = ("seurat_v3", "seurat_v3_paper")
hvg_uses_counts = HVG_FLAVOR in COUNT_BASED_HVG_FLAVORS

if hvg_uses_counts:
    # The count-based flavors rely on the optional 'scikit-misc' package.
    try:
        import skmisc  # noqa: F401
    except ImportError as e:
        raise ImportError(
            f"Falta la dependencia 'scikit-misc' necesaria para flavor={HVG_FLAVOR!r}.\n"
            "Instala en tu entorno (fuera del notebook) y reintenta:\n"
            "  pip install scikit-misc\n"
            "o\n"
            "  conda install -c conda-forge scikit-misc"
        ) from e

    if "counts" not in adata.layers:
        raise RuntimeError("No existe adata.layers['counts']. Debe contener las cuentas crudas.")

# Use the configured batch key only when it exists in obs and has more than
# one distinct value; otherwise say so explicitly rather than dropping it
# without notice.
batch_key = None
if HVG_BATCH_KEY and HVG_BATCH_KEY.lower() != "none":
    if HVG_BATCH_KEY not in adata.obs.columns:
        print(f"AVISO: hvg_batch_key={HVG_BATCH_KEY!r} no existe en adata.obs; se ignora.")
    elif adata.obs[HVG_BATCH_KEY].nunique() > 1:
        batch_key = HVG_BATCH_KEY
    else:
        print(f"AVISO: hvg_batch_key={HVG_BATCH_KEY!r} tiene un único valor; se ignora.")

hvg_kwargs = dict(
    flavor=HVG_FLAVOR,
    n_top_genes=HVG_N_TOP,
    inplace=True,
)
if hvg_uses_counts:
    hvg_kwargs["layer"] = "counts"
elif "log1p_10k" in adata.layers:
    # Log-normalized copy stored by the normalization step.
    hvg_kwargs["layer"] = "log1p_10k"
# Otherwise no layer is passed and scanpy reads adata.X, which is assumed to
# be log-normalized at this point in the notebook.
if batch_key is not None:
    hvg_kwargs["batch_key"] = batch_key

print("HVG flavor:", HVG_FLAVOR)
print("HVG n_top_genes:", HVG_N_TOP)
print("HVG batch_key usado:", batch_key)
print("HVG layer usada:", hvg_kwargs.get("layer", "X"))

sc.pp.highly_variable_genes(adata, **hvg_kwargs)

print("Columnas HVG en var:")
print([c for c in adata.var.columns if "highly_variable" in c])
print("HVG iniciales:", int(adata.var["highly_variable"].sum()))

### 7. Excluir MT/RPL/RPS/HB/HLA de HVG

In [None]:
# Remove mitochondrial, ribosomal (RPL/RPS), hemoglobin and HLA genes from the
# HVG set. Only the "highly_variable" flag is cleared; the genes themselves
# stay in the object.
var = adata.var
# Upper-case the names so the prefix tests are case-insensitive.
gene_names = var.index.to_series().astype(str).str.upper()

# Prefer an existing boolean "mt" annotation over the name-based test.
if "mt" in var.columns:
    mt_genes = var["mt"].astype(bool)
else:
    mt_genes = gene_names.str.startswith("MT-")

rpl_genes = gene_names.str.startswith("RPL")
rps_genes = gene_names.str.startswith("RPS")
# Hemoglobin subunits only: "HB" + subunit letter, then a digit, a hyphen or
# the end of the name (e.g. HBB, HBA1, HBA-A1). A bare "HB" prefix would also
# exclude unrelated genes such as HBEGF, HBP1 or HBS1L.
hb_genes = gene_names.str.match(r"HB[ABDEGMQZ](?:\d|-|$)")
hla_genes = gene_names.str.startswith("HLA-")

exclude_mask = mt_genes | rpl_genes | rps_genes | hb_genes | hla_genes

n_before = int(adata.var["highly_variable"].sum())
adata.var.loc[exclude_mask, "highly_variable"] = False
# Re-read from adata.var so the count reflects the updated flags.
n_after = int(adata.var["highly_variable"].sum())

print("HVG antes excluir:", n_before)
print("HVG después excluir:", n_after)
# This counts every gene in the excluded families, HVG or not.
print("Genes marcados como excluibles (MT/RPL/RPS/HB/HLA):", int(exclude_mask.sum()))

hvgs = adata.var.index[adata.var["highly_variable"]].tolist()
print("\nPrimeros 20 HVG finales:")
print(hvgs[:20])

### 8. Figura HVG

In [None]:
# Draw the HVG summary without showing it, then save the current
# matplotlib figure to disk and close it.
hvg_figure_path = HVG_FIG_DIR / "hvg_summary.png"

sc.pl.highly_variable_genes(adata, show=False)
plt.savefig(hvg_figure_path, bbox_inches="tight", dpi=200)
plt.close()

print("Figura guardada en:", hvg_figure_path)

### 9. Guardar salida normalizada

In [None]:
# Persist the processed AnnData object under the preprocess results directory.
out_path = PREP_RESULTS_DIR / NORMALIZED_FILENAME
adata.write_h5ad(out_path)

# Message and path go on separate lines, as two consecutive prints would.
print("Objeto normalizado + HVG guardado en:", out_path, sep="\n")