In [None]:
import pandas as pd
from pathlib import Path

from src.paths import project_paths

# Resolve the project layout starting from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT, RESULTS_DIR = P.PROJECT_ROOT, P.RESULTS_DIR

# Folder for the summary tables, inside the results directory; created
# (together with any missing parents) if it is not there yet.
SUM_DIR = RESULTS_DIR / "summary_tables"
SUM_DIR.mkdir(exist_ok=True, parents=True)

# Second folder, directly under the project root, that is also searched for
# the input table.
legacy_sum = PROJECT_ROOT / "summary_tables_final"

def pick_first_existing(candidates, label: str) -> Path:
    """Return the first candidate path that exists on disk.

    Parameters
    ----------
    candidates : iterable of path-like or None
        Paths to try, in priority order. ``None`` entries are ignored. Any
        iterable is accepted, including one-shot iterators and generators.
    label : str
        Human-readable name of what is being looked for; it only appears in
        the error message.

    Returns
    -------
    Path
        The first candidate for which ``Path.exists()`` is true.

    Raises
    ------
    FileNotFoundError
        If none of the candidates exists. The message lists every path that
        was tried.
    """
    # Remember what was tried during the single pass over `candidates`, so the
    # error message stays complete even when `candidates` is a generator
    # (iterating it a second time would yield nothing).
    tried = []
    for candidate in candidates:
        if candidate is None:
            continue
        path = Path(candidate)
        if path.exists():
            return path
        tried.append(path)
    raise FileNotFoundError(
        f"No encuentro {label}. Probé:\n" + "\n".join(f" - {path}" for path in tried)
    )

# Input table: prefer Level1refined and fall back to Level2final. For each
# level the current summary folder is tried before the legacy one.
TABLE_TAGS = ("Level1refined", "Level2final")

csv_path = pick_first_existing(
    [
        folder / f"cell_proportions_{tag}_by_patient.csv"
        for tag in TABLE_TAGS
        for folder in (SUM_DIR, legacy_sum)
    ],
    label="cell_proportions_{Level1refined,Level2final}_by_patient.csv",
)

# Tag the outputs with the level that was actually loaded, so that a
# Level2final fallback is not exported under a Level1refined file name.
TABLE_TAG = next(
    tag for tag in TABLE_TAGS
    if csv_path.name == f"cell_proportions_{tag}_by_patient.csv"
)
out_dir = SUM_DIR

print("csv_path:", csv_path)
print("out_dir :", out_dir)

In [None]:
# Read the wide table; every column other than patientID/disease is melted
# into a `celltype` row below.
df_wide = pd.read_csv(csv_path)

# Stop early if an identifier column is absent.
needed = {"patientID", "disease"}
missing = needed.difference(df_wide.columns)
if missing:
    raise KeyError(f"Faltan columnas en {csv_path}: {missing}")

# wide -> long: columns (patientID, disease, celltype, proportion)
long = pd.melt(
    df_wide,
    id_vars=["patientID", "disease"],
    var_name="celltype",
    value_name="proportion",
)

# Entries that cannot be parsed as numbers become NaN.
long["proportion"] = pd.to_numeric(long["proportion"], errors="coerce")


def _q25(values):
    """First quartile of a group of proportions."""
    return values.quantile(0.25)


def _q75(values):
    """Third quartile of a group of proportions."""
    return values.quantile(0.75)


# Summary statistics per (celltype, disease): mean, median, q25, q75.
summary_stats = {
    "mean_prop": "mean",
    "median_prop": "median",
    "q25_prop": _q25,
    "q75_prop": _q75,
}
proportions_by_group = long.groupby(["celltype", "disease"], as_index=False)["proportion"]
df = proportions_by_group.agg(**summary_stats)

# Display options read by fmt(): scale values by 100, and number of decimals.
AS_PERCENT = True
DECIMALS = 1


def fmt(x):
    """Format one number for the table.

    Missing values (as detected by ``pd.isna``) give an empty string.
    Otherwise the value is multiplied by 100 when ``AS_PERCENT`` is true and
    rendered in fixed-point notation with ``DECIMALS`` decimals.
    """
    if pd.isna(x):
        return ""
    if AS_PERCENT:
        x = x * 100
    return format(x, f".{DECIMALS}f")

def _median_iqr_label(row):
    """Build the 'median (q25–q75)' text for one row of the summary frame."""
    median, q25, q75 = (
        fmt(row[col]) for col in ("median_prop", "q25_prop", "q75_prop")
    )
    return f"{median} ({q25}–{q75})"


# One formatted label per (celltype, disease) row.
df["median_IQR"] = df.apply(_median_iqr_label, axis=1)

In [None]:
# Reshape to one row per cell type and one column per disease group.
tab = df.pivot(index="celltype", columns="disease", values="median_IQR").reset_index()

# Column order: celltype first, then Healthy and Cirrhosis (when present),
# then every remaining group in its existing order.
leading = ["Healthy", "Cirrhosis"]
col_order = ["celltype"]
col_order += [name for name in leading if name in tab.columns]
col_order += [
    name for name in tab.columns
    if name != "celltype" and name not in leading
]
tab = tab[col_order].sort_values("celltype")

# Export the table as HTML and as tab-separated text.
html_path = out_dir / f"Table_median_IQR_{TABLE_TAG}.html"
tsv_path  = out_dir / f"Table_median_IQR_{TABLE_TAG}.tsv"

tab.to_html(html_path, index=False)
tab.to_csv(tsv_path, index=False, sep="\t")

print("OK:")
print("HTML:", html_path)
print("TSV :", tsv_path)

# Preview of the first rows.
display(tab.head(10))

In [None]:
# Word (DOCX) export. Best effort: any failure inside the block (for example
# the `docx` import not being available) is reported and the cell carries on.
try:
    from docx import Document

    docx_path = out_dir / f"Table_median_IQR_{TABLE_TAG}.docx"
    doc = Document()
    doc.add_paragraph(
        "Tabla. Proporciones por paciente (mediana [q25–q75]) por subpoblación y condición."
    )

    # The table starts with a single row, used for the column headers.
    table = doc.add_table(rows=1, cols=len(tab.columns))
    for header_cell, column_name in zip(table.rows[0].cells, tab.columns):
        header_cell.text = str(column_name)

    # One appended row per row of `tab`, every value written as text.
    for values in tab.itertuples(index=False, name=None):
        for cell, value in zip(table.add_row().cells, values):
            cell.text = str(value)

    doc.save(docx_path)
    print("DOCX:", docx_path)
except Exception as e:
    print("DOCX export skipped:", e)