### Imports + rutas del repo

In [None]:
from pathlib import Path
import scanpy as sc
import anndata as ad

from src.paths import project_paths

# Show the library versions in use so a run can be reproduced later.
print("Scanpy version:", sc.__version__)
print("AnnData version:", ad.__version__)

# project_paths comes from src.paths; it is handed the current working
# directory and returns a mapping with the project directories used below.
P = project_paths(Path.cwd())

PROJECT_ROOT, CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR = (
    P["PROJECT_ROOT"],
    P["CONFIG_DIR"],
    P["DATA_DIR"],
    P["RESULTS_DIR"],
    P["FIGURES_DIR"],
)

# Create the working directories when missing (PROJECT_ROOT itself is not created here).
for d in (CONFIG_DIR, DATA_DIR, RESULTS_DIR, FIGURES_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the reader can verify them at a glance.
print("\nPROJECT_ROOT:", PROJECT_ROOT)
print("CONFIG_DIR:", CONFIG_DIR)
print("DATA_DIR:", DATA_DIR)
print("RESULTS_DIR:", RESULTS_DIR)
print("FIGURES_DIR:", FIGURES_DIR)

### Cargar config/config.yaml

def _strip_inline_comment(text: str) -> str:
    """Return *text* without a trailing comment (a ``#`` at the start or after whitespace)."""
    for idx, ch in enumerate(text):
        if ch == "#" and (idx == 0 or text[idx - 1].isspace()):
            return text[:idx].rstrip()
    return text


def load_simple_yaml(path: Path) -> dict:
    """
    Parse a flat ``key: value`` YAML file into a dict of strings.

    This is a deliberately minimal parser: it understands one mapping entry
    per line, with values that are either bare or wrapped in single/double
    quotes. Lists, nested mappings, multi-line scalars and escape sequences
    are not supported.

    Rules applied per line:
      * blank lines, full-line ``#`` comments and lines without ``:`` are skipped;
      * the line is split on the first ``:``, so values may contain colons;
      * a cleanly quoted value (``"..."`` or ``'...'``, optionally followed by
        a ``# comment``) yields exactly the text between the quotes;
      * an unquoted value has any trailing ``# comment`` removed (the ``#``
        must be preceded by whitespace, so ``a#b`` is kept intact);
      * later duplicates of a key overwrite earlier ones.

    Args:
        path: Location of the YAML file; read as UTF-8 (a leading BOM is ignored).

    Returns:
        Mapping of key to value, both as stripped strings.
    """
    cfg = {}
    # "utf-8-sig" also decodes plain UTF-8 but drops a BOM, which would
    # otherwise become part of the first key.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or ":" not in line:
            continue

        key, _, value = line.partition(":")
        key = key.strip()
        value = value.strip()

        if value[:1] in ('"', "'"):
            closing = value.find(value[0], 1)
            trailer = value[closing + 1:].strip() if closing != -1 else None
            # Accept the quoted form only when nothing but a comment follows
            # the closing quote; the content is then taken verbatim.
            if trailer is not None and (not trailer or trailer.startswith("#")):
                cfg[key] = value[1:closing]
                continue
        else:
            value = _strip_inline_comment(value)

        # Bare or malformed values keep the historical lenient quote stripping.
        cfg[key] = value.strip('"').strip("'")
    return cfg

# The configuration file is mandatory: load it when present, otherwise stop
# right away with a message that says how to fix the problem.
cfg_path = CONFIG_DIR / "config.yaml"
if cfg_path.exists():
    CFG = load_simple_yaml(cfg_path)
else:
    raise FileNotFoundError(
        f"Falta {cfg_path}. Debe existir en el repo.\n"
        "Crea config/config.yaml (lo tienes en la guía de cambios)."
    )

# Bare expression so the notebook displays the parsed configuration.
CFG

### Localizar .h5ad

In [None]:
# "raw_h5ad_filename" is required (a missing key raises KeyError), whereas
# "backed_mode" falls back to "r" when it is not configured.
RAW_H5AD_FILENAME = CFG["raw_h5ad_filename"]
BACKED_MODE = CFG.get("backed_mode", "r")

# The data file is looked up directly inside DATA_DIR.
RAW_H5AD_PATH = DATA_DIR / RAW_H5AD_FILENAME

print("Archivo esperado .h5ad (LOCAL, NO se sube):", RAW_H5AD_PATH, sep="\n")
print("¿Existe?:", RAW_H5AD_PATH.exists())

### Carga “backed” y ficha técnica

In [None]:
# Stays None when the data file is unavailable so later code can test for it.
adata_raw = None

if not RAW_H5AD_PATH.exists():
    # Missing data is not an error here: only an informational message is printed.
    print(
        "\n[INFO] No se encontró el .h5ad. Este notebook sigue siendo válido como ficha técnica.\n"
        "Para ejecutarlo completo, coloca el .h5ad en la carpeta data/ (local) con el nombre exacto:\n"
        f"- {RAW_H5AD_FILENAME}"
    )
else:
    print("\nLeyendo .h5ad en modo backed =", BACKED_MODE)
    adata_raw = sc.read_h5ad(RAW_H5AD_PATH, backed=BACKED_MODE)

    # Everything after the open runs under try/finally so the backing file is
    # released even if one of the summaries below raises.
    try:
        print("\nResumen AnnData:")
        print(adata_raw)

        print("\nNº células:", adata_raw.n_obs)
        print("Nº genes:", adata_raw.n_vars)

        # Column names only (no sensitive values are listed).
        print("\nColumnas en obs (primeras 30):")
        print(list(adata_raw.obs.columns)[:30])

        print("\nColumnas en var (primeras 30):")
        print(list(adata_raw.var.columns)[:30])

        # Aggregated summaries, each printed only when its column is present.
        for column in ("disease", "scrublet_predicted_doublet"):
            if column in adata_raw.obs.columns:
                print(f"\nDistribución de {column}:")
                print(adata_raw.obs[column].value_counts(dropna=False))

        # Only the count of distinct patients is shown, never the identifiers.
        if "patientID" in adata_raw.obs.columns:
            print("\nNúmero de pacientes únicos (NO se listan IDs):")
            print(adata_raw.obs["patientID"].nunique())
    finally:
        # Closing remains best-effort, but a failure is reported instead of
        # being silently discarded.
        try:
            adata_raw.file.close()
        except Exception as exc:
            print("[WARN] No se pudo cerrar el archivo backed:", exc)