In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from src.paths import project_paths

# Repository layout, resolved from the current working directory.
P = project_paths(Path.cwd())
PROJECT_ROOT, RESULTS_DIR, FIGURES_DIR = P.PROJECT_ROOT, P.RESULTS_DIR, P.FIGURES_DIR

# Directory holding the summary tables, and the directory for final outputs.
SUM_DIR = RESULTS_DIR / "summary_tables"
OUT_DIR = PROJECT_ROOT / "final_deliverables"

# Legacy directory names located directly under the project root.
legacy_sum = PROJECT_ROOT / "summary_tables_final"
legacy_fig = PROJECT_ROOT / "figures_final"

# SUM_DIR is created together with any missing parents; OUT_DIR only itself.
SUM_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(exist_ok=True)

def pick_first_existing(candidates, label: str) -> Path:
    """Return the first candidate path that exists on disk.

    Parameters
    ----------
    candidates : iterable of path-like or None
        Locations to try, in priority order. ``None`` entries are skipped.
        Any iterable works, including one-shot generators.
    label : str
        Human-readable name of the wanted file, used in the error message.

    Returns
    -------
    Path
        The first candidate for which ``Path.exists()`` is true.

    Raises
    ------
    FileNotFoundError
        If no candidate exists. The message lists every path that was tried.
    """
    # Remember what was tried during the single pass: iterating `candidates`
    # a second time for the error message yields nothing when it is a generator.
    tried = []
    for candidate in candidates:
        if candidate is None:
            continue
        path = Path(candidate)
        tried.append(path)
        if path.exists():
            return path
    raise FileNotFoundError(
        f"No encuentro {label}. Probé:\n" + "\n".join(f" - {path}" for path in tried)
    )

# Report the resolved locations and whether each directory is present.
print("PROJECT_ROOT:", PROJECT_ROOT)
for tag, folder in (
    ("SUM_DIR      :", SUM_DIR),
    ("FIGURES_DIR  :", FIGURES_DIR),
    ("OUT_DIR      :", OUT_DIR),
    ("legacy_sum   :", legacy_sum),
    ("legacy_fig   :", legacy_fig),
):
    print(tag, folder, "| exists:", folder.exists())

In [None]:
from IPython.display import display


# Inputs: each table is looked up in SUM_DIR first, then in legacy_sum.
s1_name = "DE_pseudobulk_FOR_REPORT_LINFO_CLEAN_top10_by_celltype.csv"
s2_name = "cell_counts_Level1refined_by_patient.csv"

S1_PATH = pick_first_existing(
    [folder / s1_name for folder in (SUM_DIR, legacy_sum)],
    label=s1_name,
)
S2_PATH = pick_first_existing(
    [folder / s2_name for folder in (SUM_DIR, legacy_sum)],
    label=s2_name,
)

print("S1_PATH:", S1_PATH)
print("S2_PATH:", S2_PATH)


# Load Table S1 (DE top10)

df_s1 = pd.read_csv(S1_PATH)

# Replace a decimal comma with a point in the numeric columns, then parse them
# as floats. Values that cannot be parsed (including missing ones) become NaN.
num_cols_s1 = [c for c in ["log2FC_Healthy_vs_Cirrhosis", "FDR"] if c in df_s1.columns]
for c in num_cols_s1:
    df_s1[c] = pd.to_numeric(
        df_s1[c].astype(str).str.replace(",", ".", regex=False),
        errors="coerce",
    )

# Reading order: Level1_refined -> direction -> FDR, using whichever columns exist.
# `ascending` is left at its default (True for every key): a fixed three-element
# list makes sort_values raise ValueError as soon as one of the keys is missing.
sort_cols = [c for c in ["Level1_refined", "direction", "FDR"] if c in df_s1.columns]
if sort_cols:
    df_s1 = df_s1.sort_values(sort_cols).reset_index(drop=True)

# Load Table S2 (counts per patient)

df_s2 = pd.read_csv(S2_PATH)

# Column layout: identifier columns first, then the listed cell types in this
# fixed order, then every remaining column with the per-patient total last.
preferred_ct = ["B", "DC", "HSCs", "Mono", "NK", "Plasma", "T", "pDC"]
available = set(df_s2.columns)
front = [c for c in ("patientID", "disease") if c in available]
cts = [c for c in preferred_ct if c in available]
already_placed = set(front) | set(cts)
rest = [c for c in df_s2.columns if c not in already_placed]
if "total_cells_patient" in rest:
    rest.remove("total_cells_patient")
    rest.append("total_cells_patient")

df_s2 = df_s2[front + cts + rest]

# Sort rows by disease, then patient, when both columns are available.
if {"disease", "patientID"} <= set(df_s2.columns):
    df_s2 = df_s2.sort_values(["disease", "patientID"]).reset_index(drop=True)


# Notebook display

# Per-column number formats for Table S1, limited to the columns that exist.
s1_number_formats = {}
if "log2FC_Healthy_vs_Cirrhosis" in df_s1.columns:
    s1_number_formats["log2FC_Healthy_vs_Cirrhosis"] = "{:.3f}"
if "FDR" in df_s1.columns:
    s1_number_formats["FDR"] = "{:.2e}"

style_s1 = df_s1.style.format(s1_number_formats, na_rep="")
style_s1 = style_s1.set_caption("Table S1. Top differentially expressed genes by lineage (pseudobulk)")

style_s2 = df_s2.style.set_caption("Table S2. Per-patient cell counts by lineage (Level1_refined)")
style_s2 = style_s2.format(na_rep="")

# Render both styled tables, S1 first.
for styled in (style_s1, style_s2):
    display(styled)

# Excel export (best effort: a failure is reported and the notebook continues)

PRETTY_XLSX = OUT_DIR / "Tables_S1_S2_pretty.xlsx"
sheets = {
    "TableS1_DE_top10": df_s1,
    "TableS2_counts_by_patient": df_s2,
}
try:
    with pd.ExcelWriter(PRETTY_XLSX, engine="openpyxl") as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name, index=False)
except Exception as e:
    print("Excel export skipped:", e)
else:
    print("Saved pretty Excel:", PRETTY_XLSX)

In [None]:
# Export: Table S1 - 1 page per lineage (PDF)

required_cols = {"Level1_refined", "direction", "gene"}
missing = required_cols - set(df_s1.columns)
if missing:
    raise KeyError(f"df_s1 debe tener columnas {required_cols}. Faltan: {missing}")

pdf_path = OUT_DIR / "TableS1_pretty_by_lineage.pdf"

# Lineage labels are compared as strings; the string key is built once here
# instead of once per page.
lineage_key = df_s1["Level1_refined"].astype(str)
lineages = df_s1["Level1_refined"].dropna().astype(str).unique().tolist()

# Columns to print (only those present). "direction" and "gene" are guaranteed
# by the required-columns check above, so this list is never empty.
cols_keep = [c for c in ["direction", "gene", "log2FC_Healthy_vs_Cirrhosis", "FDR"] if c in df_s1.columns]

# Number formats matching the styled notebook display of Table S1, so the PDF
# shows rounded values and empty cells instead of raw float reprs and "nan".
pdf_formats = {"log2FC_Healthy_vs_Cirrhosis": "{:.3f}", "FDR": "{:.2e}"}

with PdfPages(pdf_path) as pdf:
    for lin in lineages:
        sub = df_s1.loc[lineage_key == lin, cols_keep].copy()

        for col, fmt in pdf_formats.items():
            if col in sub.columns:
                # fmt is bound as a default argument so each lambda keeps its own format.
                sub[col] = sub[col].map(lambda x, fmt=fmt: "" if pd.isna(x) else fmt.format(x))

        # One landscape page per lineage, with the axes hidden so only the table shows.
        fig, ax = plt.subplots(figsize=(11.7, 8.3))
        ax.axis("off")
        ax.set_title(f"Table S1 — {lin} (pseudobulk Healthy vs Cirrhosis)", pad=12)

        table = ax.table(
            cellText=sub.values,
            colLabels=sub.columns,
            loc="center",
            cellLoc="left",
            colLoc="left",
        )
        # Fixed font size; rows stretched vertically for readability.
        table.auto_set_font_size(False)
        table.set_fontsize(8)
        table.scale(1, 1.2)

        pdf.savefig(fig, bbox_inches="tight")
        # Close each figure so open figures do not accumulate across pages.
        plt.close(fig)

print("Saved:", pdf_path)

In [None]:
# Export: Table S1 representative (top N per direction) PNG

N_PER_DIRECTION = 3

need = {"Level1_refined", "direction", "FDR"}
missing = need - set(df_s1.columns)
if missing:
    raise KeyError(f"df_s1 debe tener columnas {need}. Faltan: {missing}")

# Keep the N rows with the smallest FDR inside every (lineage, direction) group.
df_repr = (
    df_s1.sort_values(["Level1_refined", "direction", "FDR"], ascending=[True, True, True])
        .groupby(["Level1_refined", "direction"], as_index=False)
        .head(N_PER_DIRECTION)
        .copy()
)

# Display order for the direction labels. pd.Categorical turns every value that
# is not listed in `categories` into NaN, so labels outside dir_order are
# appended (sorted, after the known ones) instead of being silently blanked.
dir_order = ["Higher_in_Healthy", "Higher_in_Cirrhosis"]
extra_dirs = sorted(set(df_repr["direction"].dropna().astype(str)) - set(dir_order))
df_repr["direction"] = pd.Categorical(
    df_repr["direction"].astype(str),
    categories=dir_order + extra_dirs,
    ordered=True,
)
df_repr = df_repr.sort_values(["Level1_refined", "direction", "FDR"])

# Human-readable column headers for the rendered table.
rename_map = {
    "Level1_refined": "Lineage",
    "direction": "Direction",
    "gene": "Gene",
    "log2FC_Healthy_vs_Cirrhosis": "log2FC (Healthy vs Cirrhosis)",
    "FDR": "FDR",
}
cols_final = [c for c in ["Level1_refined", "direction", "gene", "log2FC_Healthy_vs_Cirrhosis", "FDR"] if c in df_repr.columns]
df_repr = df_repr[cols_final].rename(columns=rename_map)

# Pre-format the numbers as text (missing -> empty string), because the table
# renderer receives raw cell values and would otherwise print full float reprs.
if "log2FC (Healthy vs Cirrhosis)" in df_repr.columns:
    df_repr["log2FC (Healthy vs Cirrhosis)"] = df_repr["log2FC (Healthy vs Cirrhosis)"].map(lambda x: "" if pd.isna(x) else f"{x:.3f}")
if "FDR" in df_repr.columns:
    df_repr["FDR"] = df_repr["FDR"].map(lambda x: "" if pd.isna(x) else f"{x:.2e}")

display(df_repr)

# Render df_repr as a table on a landscape figure with the axes hidden.
fig, ax = plt.subplots(figsize=(11.7, 8.3))
ax.axis("off")
repr_title = f"Representative DE genes per lineage (pseudobulk; top {N_PER_DIRECTION} per direction by FDR)"
ax.set_title(repr_title, pad=12)

table = ax.table(cellText=df_repr.values, colLabels=df_repr.columns, loc="center", cellLoc="left", colLoc="left")

# Fixed 8 pt font; rows stretched vertically by 20 %.
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1, 1.2)

# Save through the figure object itself (it is also pyplot's current figure).
out_png = OUT_DIR / f"TableS1_representative_top{N_PER_DIRECTION}_per_direction.png"
fig.savefig(out_png, dpi=300, bbox_inches="tight")
plt.close(fig)

print("Saved:", out_png)

In [None]:
# Export: Table S2 (cell counts) PNG
# Work on a copy so the df_s2 frame shown and exported earlier is left untouched.
df = df_s2.copy()

# Row order for the disease groups. pd.Categorical turns every value that is
# not listed in `categories` into NaN, so labels outside disease_order are
# appended (sorted, after the known ones) instead of being silently blanked.
disease_order = ["Cirrhosis", "Healthy"]
if "disease" in df.columns:
    extra_diseases = sorted(set(df["disease"].dropna().astype(str)) - set(disease_order))
    df["disease"] = pd.Categorical(
        df["disease"].astype(str),
        categories=disease_order + extra_diseases,
        ordered=True,
    )
    if "patientID" in df.columns:
        df = df.sort_values(["disease", "patientID"])

# Preferred columns first (in this order), followed by any other column present.
preferred_cols = ["patientID", "disease", "total_cells_patient", "B", "Plasma", "pDC", "T", "NK", "Mono", "DC", "HSCs"]
present_cols = [c for c in preferred_cols if c in df.columns] + [c for c in df.columns if c not in preferred_cols]
df = df[present_cols]

display(df)

# Render the per-patient counts as a centred table with the axes hidden.
fig, ax = plt.subplots(figsize=(12, 4.5))
ax.axis("off")
ax.set_title("Per-patient cell counts (Level1_refined)", pad=12)

table = ax.table(cellText=df.values, colLabels=df.columns, loc="center", cellLoc="center", colLoc="center")

# Fixed 8 pt font; rows stretched vertically by 25 %.
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1, 1.25)

# Save through the figure object itself (it is also pyplot's current figure).
out_png = OUT_DIR / "TableS2_cell_counts_Level1refined_by_patient.png"
fig.savefig(out_png, dpi=300, bbox_inches="tight")
plt.close(fig)

print("Saved:", out_png)