In [8]:
# -*- coding: utf-8 -*-
# ============================================================
# Unified Performance Report (HINT→CTOD & HINT→HINT)
# - Gera TODAS as figuras e tabelas
# - Exporta CSVs/PNGs e compila um HTML final (estático)
# ============================================================

from pathlib import Path
import json, joblib, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse as sp
import xgboost as xgb
from string import Template
import base64
import re

from sklearn.metrics import (
    roc_auc_score, average_precision_score, accuracy_score,
    precision_recall_fscore_support, confusion_matrix,
    roc_curve, precision_recall_curve
)
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

# ---------------- CONFIG ----------------
# Phase names; each is used as a sub-directory name under the artifact/model folders.
PHASES = ["phase_I", "phase_II", "phase_III"]

# Adjust these paths for your environment:
MODEL_BASE     = Path("/Users/antoniocortes/Tese/MyModel(hybrid)/xgb_model_package")
CTOD_ARTIFACTS = Path("/Users/antoniocortes/Tese/MyModel(hybrid)/ctod_xgb_artifacts")
HINT_ARTIFACTS = Path("/Users/antoniocortes/Tese/MyModel(hybrid)/hint_xgb_artifacts")

OUT_BASE   = Path("report")              # report folder (relative to the working directory)
FIG_DIR    = OUT_BASE / "fig"
TAB_DIR    = OUT_BASE / "tables"
# NOTE: the output tree is created as soon as this code runs, not lazily on first write.
OUT_BASE.mkdir(parents=True, exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)
TAB_DIR.mkdir(parents=True, exist_ok=True)

# ---------------- HELPERS (dados/modelos) ----------------
def _first_existing(*paths: Path) -> Path:
    """Return the first of *paths* that exists on disk.

    Raises:
        FileNotFoundError: if none of the candidates exists; the message
            lists every path that was checked.
    """
    found = next((candidate for candidate in paths if candidate.exists()), None)
    if found is None:
        raise FileNotFoundError("Artifact path not found:\n" + "\n".join(map(str, paths)))
    return found

def load_ctod_test(phase: str):
    """Load the CTOD test split for *phase*.

    Returns:
        (X, y): the sparse matrix stored in ``X_test_joined_reduced.npz`` and
        the array stored in ``y_test_joined.npy`` under ``CTOD_ARTIFACTS/<phase>``.
    """
    phase_dir = CTOD_ARTIFACTS / phase
    features = sp.load_npz(phase_dir / "X_test_joined_reduced.npz")
    labels = np.load(phase_dir / "y_test_joined.npy")
    return features, labels

def load_hint_test(phase: str):
    """Load the HINT test split for *phase*.

    Each file is looked up first in ``HINT_ARTIFACTS/<phase>`` and then in
    ``HINT_ARTIFACTS/phases_reduced/<phase>``; the first existing copy wins.

    Returns:
        (X, y): sparse feature matrix and label array.

    Raises:
        FileNotFoundError: if a file is missing from both locations.
    """
    search_dirs = (HINT_ARTIFACTS / phase, HINT_ARTIFACTS / "phases_reduced" / phase)
    features_path = _first_existing(*(d / "X_test_reduced.npz" for d in search_dirs))
    labels_path = _first_existing(*(d / "y_test.npy" for d in search_dirs))
    return sp.load_npz(features_path), np.load(labels_path)

def load_hint_model(phase: str):
    """Load the HINT-trained XGBoost model for *phase* from ``MODEL_BASE/<phase>``.

    The joblib dump (``hint_xgb_model.joblib``) is preferred. When it is
    absent, the raw booster in ``xgb_model.json`` is loaded and attached to a
    fresh ``XGBClassifier`` so callers can still use the sklearn-style API.
    """
    model_dir = MODEL_BASE / phase
    pickled = model_dir / "hint_xgb_model.joblib"
    if not pickled.exists():
        # Fallback: wrap the bare booster in an sklearn-style classifier.
        booster = xgb.Booster()
        booster.load_model(str(model_dir / "xgb_model.json"))
        wrapper = xgb.XGBClassifier()
        wrapper._Booster = booster
        return wrapper
    return joblib.load(pickled)

# ---- Feature-name loader (tenta vários nomes de ficheiro) ----
def _strip_leading_index(s: str) -> str:
    """Drop a leading numeric index (and optional quotes) from a feature-name line.

    Recognised shapes are ``<digits><whitespace>name``, ``<digits> "name"`` and
    ``<digits>"name"``. The index must be separated from the name by
    whitespace or an opening quote, so names that merely *start* with digits
    (``"5ht_receptor"``) or consist only of digits (``"123"``) are returned
    intact instead of losing their leading characters.

    Args:
        s: one raw line / feature name.

    Returns:
        The name without index and surrounding double quotes. Input with no
        recognisable index is returned stripped of whitespace and quotes.
    """
    s = s.strip()
    # index, then a mandatory separator (spaces and/or an opening quote), then the name
    m = re.match(r'^\d+(?:\s+"?|")(.+?)"?\s*$', s)
    if m:
        return m.group(1)
    return s.strip('"')

def load_feature_names(phase: str):
    """Load the ordered feature-name list for *phase* from ``MODEL_BASE/<phase>``.

    Several candidate file names are tried in order. The first candidate that
    exists and can be read decides the result:

    * if it parses as a JSON list of strings, that list is returned as-is;
    * otherwise it is treated as a line-per-feature text file: blank lines are
      skipped and a leading numeric index / quotes are stripped from each line.

    Files are read once, as UTF-8. A candidate that cannot be read (I/O or
    decoding error) is skipped in favour of the next one.

    Returns:
        list[str] of feature names, or ``None`` (after printing a warning)
        when no candidate could be used.
    """
    base = MODEL_BASE / phase
    candidates = [
        base / "feature_names_reduced_with_top_original",
        base / "feature_names_reduced_with_top_original.json",
        base / "feature_names_reduced.json",
        base / "feature_names.json",
        base / "feature_names.txt",
    ]
    for path in candidates:
        if not path.exists():
            continue
        try:
            text = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Unreadable candidate (e.g. a directory or a non-UTF-8 file): try the next one.
            continue

        # Try JSON first ...
        try:
            data = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            data = None
        if isinstance(data, list) and all(isinstance(x, str) for x in data):
            return data

        # ... then fall back to one feature name per line.
        lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
        return [_strip_leading_index(ln) for ln in lines]

    print(f"[WARN] Feature names file not found for {phase} (checked common candidates).")
    return None

def map_f_to_name(fkey: str, feat_names):
    """Translate a booster key of the form ``f<N>`` into ``feat_names[N]``.

    The key is returned unchanged when it is empty, does not start with
    ``"f"``, has a non-integer suffix, points outside ``feat_names``, or when
    ``feat_names`` is ``None``.
    """
    if not fkey or not fkey.startswith("f") or feat_names is None:
        return fkey
    try:
        position = int(fkey[1:])
        in_range = 0 <= position < len(feat_names)
        resolved = feat_names[position] if in_range else fkey
    except Exception:
        resolved = fkey
    return resolved

# ---------------- NAME CLEANUP (for the top-feature plots) ----------------
# Matches a whole token that is only an SVD marker, case-insensitively:
# "svd", "svd_component", optionally followed by a number with or without an
# underscore (e.g. "svd12", "svd_12", "svd_component_3").
SVD_TOKENS_RE = re.compile(r'^(svd(_component)?(_?\d+)?)$', re.IGNORECASE)

def _collapse_adjacent_dupes(parts):
    """Return *parts* as a new list with runs of equal neighbours reduced to one item."""
    items = list(parts)
    return [
        item
        for pos, item in enumerate(items)
        if pos == 0 or items[pos - 1] != item
    ]

def _collapse_full_repeat(tokens):
    """Reduce a list that is one unit repeated end-to-end to that single unit.

    The shortest repeating prefix wins (``[a, b, a, b] -> [a, b]``). Lists
    that are not an exact whole-number repetition are returned unchanged.
    """
    total = len(tokens)
    if total > 1:
        for period in range(1, total // 2 + 1):
            repeats, leftover = divmod(total, period)
            if leftover:
                continue
            unit = tokens[:period]
            if tokens == unit * repeats:
                return unit
    return tokens

def clean_readable(name: str) -> str:
    """
    Make a technical feature name shorter and more readable for plot labels.

    Steps, in order:
      - split on '__', drop empty parts, collapse adjacent duplicates and
        whole-sequence repetitions
      - drop parts that are only SVD markers or component tags like 'd29'
      - upper-case known block prefixes (pos/sem/icd/...) and shorten some
        of them (e.g. 'brief_title' -> 'TITLE')
      - turn '__' into ' · ', '_svd_N' into ' · SVD N' and a standalone
        'dN' into 'comp N'
      - de-duplicate the resulting ' · ' tokens again
      - wrap very short all-lowercase alphabetic tokens in double quotes

    Returns the cleaned label with runs of whitespace collapsed.
    """
    s = str(name).strip().strip('"')
    parts = [p for p in s.split("__") if p]
    parts = _collapse_adjacent_dupes(parts)
    parts = _collapse_full_repeat(parts)

    # drop SVD markers and dN component tags
    cleaned = []
    for p in parts:
        if SVD_TOKENS_RE.match(p):
            continue
        if re.fullmatch(r'[dD]\d+', p):
            continue
        cleaned.append(p)
    parts = cleaned

    s = "__".join(parts)
    # "pretty" block labels; pos/sem are only relabelled at the very start of the name
    s = re.sub(r'^pos__', 'POS__', s)
    s = re.sub(r'^sem__', 'SEM__', s)
    s = s.replace("drugs_fusion__", "DRUGS__")
    s = s.replace("diseases_dual__", "DISEASES__")
    s = s.replace("icd__", "ICD__")
    s = s.replace("smiles__", "SMILES__")
    s = s.replace("brief_title__", "TITLE__")
    s = s.replace("description__", "DESC__")

    s = s.replace("__", " · ")
    # an embedded "_svd_N" suffix (not a whole token, so not removed above) becomes its own token
    s = re.sub(r'\s*_svd[_\- ]?(\d+)', r' · SVD \1', s, flags=re.IGNORECASE)
    # a lowercase "dN" that is not glued to other alphanumerics becomes "comp N"
    s = re.sub(r'(?<![A-Za-z0-9])d(\d+)(?![A-Za-z0-9])', r'comp \1', s)

    # de-duplicate the final tokens and repeated sub-sequences
    tokens = [t.strip() for t in s.split("·")]
    tokens = [t for t in tokens if t]
    tokens = _collapse_adjacent_dupes(tokens)
    tokens = _collapse_full_repeat(tokens)

    BLOCK_TAGS = {"POS", "SEM", "ICD", "DRUGS", "DISEASES", "SMILES", "TITLE", "DESC", "SVD"}
    pretty = []
    for t in tokens:
        tt = re.sub(r'\s{2,}', ' ', t.strip())
        # quote 1-3 letter lowercase tokens that are not block tags so they stand out in the label
        if 1 <= len(tt) <= 3 and tt.isalpha() and tt.upper() not in BLOCK_TAGS and tt.islower():
            tt = f'"{tt}"'
        pretty.append(tt)
    out = " · ".join(pretty)
    return re.sub(r'\s{2,}', ' ', out).strip()

# ---------------- METRICS UTILS ----------------
def metrics_at_threshold(y_true, proba, thr=0.5):
    """Compute binary classification metrics after thresholding probabilities.

    Args:
        y_true: ground-truth labels (assumed to be encoded as 0/1, matching
            the 0/1 predictions produced here).
        proba: NumPy array of positive-class probabilities.
        thr: decision threshold; ``proba >= thr`` is predicted positive.

    Returns:
        dict with ``threshold``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``tnr`` (specificity), ``bal_acc`` (mean of TNR and recall)
        and ``cm`` (a dict with the integer counts ``tn``, ``fp``, ``fn``,
        ``tp``).
    """
    pred = (proba >= thr).astype(int)
    acc = accuracy_score(y_true, pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, pred, average="binary", zero_division=0
    )
    # Pin the label set: without it the matrix collapses to 1x1 when only one
    # class occurs in y_true and pred, and the 4-way unpack below would fail.
    tn, fp, fn, tp = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()
    tnr = tn / (tn + fp + 1e-9)  # epsilon avoids 0/0 when there are no negatives
    bal = 0.5 * (tnr + rec)
    return dict(threshold=float(thr), accuracy=float(acc), precision=float(prec), recall=float(rec),
                f1=float(f1), tnr=float(tnr), bal_acc=float(bal),
                cm=dict(tn=int(tn), fp=int(fp), fn=int(fn), tp=int(tp)))

def pick_threshold_balacc_max(y_true, proba):
    """Pick the decision threshold that maximises balanced accuracy.

    Candidate thresholds are the ones reported by ``roc_curve`` plus 0.5.
    Each candidate is scored as the mean of TNR and TPR for the prediction
    ``proba >= t``. On ties the smallest candidate wins (candidates are
    sorted ascending and ``argmax`` returns the first maximum).

    Args:
        y_true: ground-truth labels (assumed 0/1).
        proba: NumPy array of positive-class probabilities.

    Returns:
        (threshold, balanced_accuracy) as floats.
    """
    _, _, thr = roc_curve(y_true, proba)
    candidates = np.unique(np.r_[thr, 0.5])

    def balacc_at(t):
        pred = (proba >= t).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even if only one class is present
        tn, fp, fn, tp = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()
        tnr = tn / (tn + fp + 1e-9)
        tpr_ = tp / (tp + fn + 1e-9)
        return 0.5 * (tnr + tpr_)

    scores = np.array([balacc_at(t) for t in candidates])
    i = int(np.argmax(scores))
    return float(candidates[i]), float(scores[i])

def best_f1_from_pr(y_true, proba):
    """Return the highest F1 found along the precision-recall curve.

    The final curve point returned by ``precision_recall_curve`` is excluded
    from the search. ``nan`` is returned when no point remains or the maximum
    is not finite.
    """
    precision, recall, _ = precision_recall_curve(y_true, proba)
    f1_curve = (2 * precision * recall / (precision + recall + 1e-9))[:-1]
    if f1_curve.size == 0:
        return float("nan")
    peak = np.nanmax(f1_curve)
    if np.isfinite(peak):
        return float(peak)
    return float("nan")

# ---------------- FIGURE HELPERS ----------------
def _savefig(fig, path: Path):
    """Write *fig* to *path* (150 dpi, tight bounding box) and close the figure."""
    save_options = {"dpi": 150, "bbox_inches": "tight"}
    fig.savefig(path, **save_options)
    plt.close(fig)

# ---------------- BUILDERS ----------------
def build_summary_tables():
    """Compute AP, best F1 and ROC AUC per phase for the CTOD and HINT test sets.

    Writes ``summary_ctod.csv``, ``summary_hint.csv`` and
    ``summary_combined.csv`` into ``TAB_DIR``.

    Returns:
        (df_ctod, df_hint, combined): per-dataset tables indexed by phase and
        a side-by-side table with a top column level of "CTOD" / "HINT".
    """
    def _score_row(phase, y_true, proba):
        return {"Phase": phase,
                "PR (AP)": average_precision_score(y_true, proba),
                "F1 (Best)": best_f1_from_pr(y_true, proba),
                "ROC AUC": roc_auc_score(y_true, proba)}

    ctod_rows = []
    hint_rows = []
    for phase in PHASES:
        model = load_hint_model(phase)

        X_ctod, y_ctod = load_ctod_test(phase)
        ctod_rows.append(_score_row(phase, y_ctod, model.predict_proba(X_ctod)[:, 1]))

        X_hint, y_hint = load_hint_test(phase)
        hint_rows.append(_score_row(phase, y_hint, model.predict_proba(X_hint)[:, 1]))

    df_ctod = pd.DataFrame(ctod_rows).set_index("Phase")
    df_hint = pd.DataFrame(hint_rows).set_index("Phase")
    combined = pd.concat({"CTOD": df_ctod, "HINT": df_hint}, axis=1)

    for table, filename in ((df_ctod, "summary_ctod.csv"),
                            (df_hint, "summary_hint.csv"),
                            (combined, "summary_combined.csv")):
        table.to_csv(TAB_DIR / filename)
    return df_ctod, df_hint, combined

def build_confusion_grid(thr=0.5):
    """Draw one confusion matrix per phase (rows) and dataset (CTOD, HINT columns).

    Each panel shows TN/FP/FN/TP counts plus a side box with accuracy,
    precision, recall, TNR and balanced accuracy at threshold *thr*.
    The figure is saved as ``confusion_grid.png`` in ``FIG_DIR``.

    Args:
        thr: decision threshold applied to the positive-class probability.
    """
    nrows, ncols = len(PHASES), 2
    # squeeze=False keeps `axes` 2-D so axes[i, j] also works with a single phase.
    fig, axes = plt.subplots(nrows, ncols, figsize=(11, 4.8 * nrows), sharex=True, sharey=True,
                             squeeze=False,
                             gridspec_kw={"wspace": -0.3, "hspace": 0.9})

    def _plot_cm(ax, y_true, proba, title):
        y_pred = (proba >= thr).astype(int)
        # labels=[0, 1] guarantees a 2x2 matrix even when only one class occurs,
        # which both the unpack and the TN/FP/FN/TP annotation grid rely on.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        tpr = tp / (tp + fn + 1e-9)
        tnr = tn / (tn + fp + 1e-9)
        prec = tp / (tp + fp + 1e-9)
        acc = accuracy_score(y_true, y_pred)
        bal = 0.5 * (tpr + tnr)

        ax.imshow(cm, cmap=plt.cm.Blues, alpha=0.6, interpolation="nearest")
        ax.set_title(title, fontsize=12, fontweight="bold", pad=8)
        ax.set_xlabel("Predicted"); ax.set_xticks([0,1]); ax.set_xticklabels(["0 (neg)","1 (pos)"])
        ax.set_yticks([0,1]); ax.set_yticklabels(["0 (neg)","1 (pos)"])
        labels = np.array([["TN","FP"],["FN","TP"]])
        for (r,c), val in np.ndenumerate(cm):
            ax.text(c, r, f"{labels[r,c]}\n{val:,}", ha="center", va="center", fontsize=11, fontweight="bold")

        # metric summary placed just outside the right edge of the panel
        ax.text(1.03, 0.5,
                f"Acc={acc:.3f}\nPrec={prec:.3f}\nRec={tpr:.3f}\nTNR={tnr:.3f}\nBal={bal:.3f}",
                ha="left", va="center", fontsize=10, transform=ax.transAxes,
                bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.85, linewidth=0))

        # hide the frame and draw white separators between the four cells
        for s in ax.spines.values(): s.set_visible(False)
        ax.set_xticks(np.arange(-0.5, 2, 1), minor=True)
        ax.set_yticks(np.arange(-0.5, 2, 1), minor=True)
        ax.grid(which="minor", color="white", linestyle="-", linewidth=2)

    for i, phase in enumerate(PHASES):
        mdl = load_hint_model(phase)

        Xc, yc = load_ctod_test(phase); pc = mdl.predict_proba(Xc)[:,1]
        _plot_cm(axes[i,0], yc, pc, f"{phase.upper()} — CTOD")

        Xh, yh = load_hint_test(phase); ph = mdl.predict_proba(Xh)[:,1]
        _plot_cm(axes[i,1], yh, ph, f"{phase.upper()} — HINT")

        axes[i,0].text(-0.8, 1.0, phase.upper(), va="bottom", ha="center",
                       rotation=90, fontsize=12, fontweight="bold", transform=axes[i,0].transAxes)

    # The first row's per-panel titles are replaced by the column headers.
    axes[0,0].set_title("CTOD (HINT → CTOD)", fontsize=13, fontweight="bold")
    axes[0,1].set_title("HINT (HINT → HINT)", fontsize=13, fontweight="bold")
    fig.suptitle("Confusion Matrices by Phase and Dataset", fontsize=15, fontweight="bold", y=0.98)
    _savefig(fig, FIG_DIR / "confusion_grid.png")

def build_roc_grid():
    """Plot ROC curves in a phases x {CTOD, HINT} grid and save ``roc_grid.png``.

    Each panel shows the curve, its AUC in the legend and the chance diagonal.
    """
    # (column header, test-set loader, line colour) for each column
    columns = (
        ("HINT → CTOD", load_ctod_test, "#ff7f0e"),
        ("HINT → HINT", load_hint_test, "#1f77b4"),
    )
    fig, axes = plt.subplots(len(PHASES), 2, figsize=(12, 12), sharex=True, sharey=True)
    bottom_row = len(PHASES) - 1

    for row, phase in enumerate(PHASES):
        model = load_hint_model(phase)
        for col, (_header, loader, colour) in enumerate(columns):
            panel = axes[row, col]
            X, y = loader(phase)
            scores = model.predict_proba(X)[:, 1]
            fpr, tpr, _ = roc_curve(y, scores)
            auc = roc_auc_score(y, scores)

            panel.plot(fpr, tpr, label=f"AUC = {auc:.3f}", linewidth=2, color=colour)
            panel.plot([0,1],[0,1],'--', color="gray", linewidth=1)
            panel.grid(True, linestyle=":", linewidth=0.5)
            if row == bottom_row:
                panel.set_xlabel("False Positive Rate")
            if col == 0:
                # only the left column carries the phase name and the y label
                panel.set_ylabel(f"{phase}\nTrue Positive Rate (Recall)")
            panel.legend(loc="lower right", fontsize=9)

    for col, (header, _loader, _colour) in enumerate(columns):
        axes[0, col].set_title(header, fontsize=13, fontweight="bold")
    fig.suptitle("ROC Curves by Phase and Dataset", fontsize=15, fontweight="bold", y=1.02)
    _savefig(fig, FIG_DIR / "roc_grid.png")

def build_pr_grid():
    """Plot precision-recall curves: CTOD on the top row, HINT on the bottom row.

    There is one column per entry in ``PHASES`` (the grid width follows the
    phase list instead of being fixed at three). Each panel's legend shows
    the average precision. The figure is saved as ``pr_grid.png``.
    """
    # (row label, test-set loader, extra line style) for each row
    rows = (
        ("CTOD", load_ctod_test, {}),
        ("HINT", load_hint_test, {"color": "#1f77b4"}),
    )
    # squeeze=False keeps `axes` 2-D regardless of how many phases there are.
    fig, axes = plt.subplots(len(rows), len(PHASES), figsize=(15, 10),
                             sharex=True, sharey=True, squeeze=False)
    last_row = len(rows) - 1

    for r, (row_label, loader, line_style) in enumerate(rows):
        for i, phase in enumerate(PHASES):
            ax = axes[r, i]
            mdl = load_hint_model(phase)
            X, y = loader(phase)
            proba = mdl.predict_proba(X)[:, 1]
            prec, rec, _ = precision_recall_curve(y, proba)
            ap = average_precision_score(y, proba)

            ax.plot(rec, prec, label=f"AP = {ap:.3f}", linewidth=2, **line_style)
            if r == 0:
                ax.set_title(f"{phase.upper()}", fontsize=12)  # phase headers only on the top row
            if r == last_row:
                ax.set_xlabel("Recall")
            ax.legend(loc="lower left", fontsize=9)
            ax.grid(True, linestyle=":", linewidth=0.5)
            if i == 0:
                ax.set_ylabel("Precision")
                ax.text(-0.3, 0.5, row_label, va="center", ha="center", rotation=90,
                        fontsize=12, fontweight="bold", transform=ax.transAxes)

    fig.suptitle("Precision–Recall — HINT→CTOD (top) & HINT→HINT (bottom)", fontsize=14, fontweight="bold", y=1.02)
    _savefig(fig, FIG_DIR / "pr_grid.png")

def build_calibration_grid():
    """Plot calibration curves: CTOD on the top row, HINT on the bottom row.

    There is one column per entry in ``PHASES`` (the grid width follows the
    phase list instead of being fixed at three). Curves use 10 quantile bins;
    each legend shows the Brier score. Saved as ``calibration_grid.png``.
    """
    # (row label, test-set loader, extra line style) for each row
    rows = (
        ("CTOD", load_ctod_test, {}),
        ("HINT", load_hint_test, {"color": "#1f77b4"}),
    )
    # squeeze=False keeps `axes` 2-D regardless of how many phases there are.
    fig, axes = plt.subplots(len(rows), len(PHASES), figsize=(15, 10),
                             sharex=True, sharey=True, squeeze=False)
    last_row = len(rows) - 1

    for r, (row_label, loader, line_style) in enumerate(rows):
        for i, phase in enumerate(PHASES):
            ax = axes[r, i]
            mdl = load_hint_model(phase)
            X, y = loader(phase)
            proba = mdl.predict_proba(X)[:, 1]
            brier = brier_score_loss(y, proba)
            prob_true, prob_pred = calibration_curve(y, proba, n_bins=10, strategy="quantile")

            ax.plot(prob_pred, prob_true, marker='o', linewidth=2,
                    label=f"Brier = {brier:.3f}", **line_style)
            ax.plot([0,1],[0,1],'--', linewidth=1, color="gray")  # perfect-calibration diagonal
            if r == 0:
                ax.set_title(f"{phase.upper()}", fontsize=12)
            if r == last_row:
                ax.set_xlabel("Predicted probability")
            ax.legend(loc="upper left", fontsize=9)
            ax.grid(True, linestyle=":", linewidth=0.5)
            if i == 0:
                ax.set_ylabel("Observed frequency")
                ax.text(-0.3, 0.5, row_label, va="center", ha="center", rotation=90,
                        fontsize=12, fontweight="bold", transform=ax.transAxes)

    fig.suptitle("Calibration — HINT→CTOD (top) & HINT→HINT (bottom)", fontsize=14, fontweight="bold", y=1.02)
    _savefig(fig, FIG_DIR / "calibration_grid.png")

def _threshold_sweep_curves(y_true, proba, thresholds):
    """Return (balanced_acc, tnr, tpr) lists, one value per threshold."""
    bal, tnr_list, tpr_list = [], [], []
    for t in thresholds:
        pred = (proba >= t).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a threshold (or the
        # data) leaves only one class present.
        tn, fp, fn, tp = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()
        tnr = tn / (tn + fp + 1e-9)
        tpr = tp / (tp + fn + 1e-9)
        tnr_list.append(tnr)
        tpr_list.append(tpr)
        bal.append(0.5 * (tnr + tpr))
    return bal, tnr_list, tpr_list


def build_threshold_sweep_grid():
    """Plot balanced accuracy, TNR and TPR against the decision threshold.

    CTOD is on the top row and HINT on the bottom row, with one column per
    entry in ``PHASES``. Thresholds run from 0.01 to 0.99 in steps of 0.01; a
    dotted vertical line marks the threshold with the highest balanced
    accuracy. Both rows draw the legend and grid, and both take their line
    colours from the axes colour cycle so the three curves are always
    distinguishable. The figure is saved as ``threshold_sweep_grid.png``.
    """
    rows = (("CTOD", load_ctod_test), ("HINT", load_hint_test))
    # squeeze=False keeps `axes` 2-D regardless of how many phases there are.
    fig, axes = plt.subplots(len(rows), len(PHASES), figsize=(15, 10),
                             sharex=True, sharey=True, squeeze=False)
    last_row = len(rows) - 1
    ths = np.linspace(0.01, 0.99, 99)

    for r, (row_label, loader) in enumerate(rows):
        for i, phase in enumerate(PHASES):
            ax = axes[r, i]
            mdl = load_hint_model(phase)
            X, y = loader(phase)
            proba = mdl.predict_proba(X)[:, 1]

            bal, tnr_list, tpr_list = _threshold_sweep_curves(y, proba, ths)
            t_bal = float(ths[np.argmax(bal)])

            ax.plot(ths, bal, label="Balanced Acc", linewidth=2)
            ax.plot(ths, tnr_list, linestyle="--", label="TNR (Spec.)")
            ax.plot(ths, tpr_list, linestyle="--", label="TPR (Recall)")
            ax.axvline(t_bal, linestyle=":", color="black", label=f"BalAcc-max @ {t_bal:.3f}")
            if r == 0:
                ax.set_title(f"{phase.upper()}")
            if r == last_row:
                ax.set_xlabel("Threshold")
            # The legend carries the tuned-threshold value, so every panel needs it.
            ax.legend(fontsize=8)
            ax.grid(True, linestyle=":", linewidth=0.5)
            if i == 0:
                ax.set_ylabel("Score")
                ax.text(-0.25, 0.5, row_label, va="center", ha="center", rotation=90,
                        fontsize=12, fontweight="bold", transform=ax.transAxes)

    fig.suptitle("Threshold Sweep — HINT→CTOD (top) & HINT→HINT (bottom)", fontsize=14, fontweight="bold", y=1.02)
    _savefig(fig, FIG_DIR / "threshold_sweep_grid.png")

def feature_importance_plots():
    """Export per-feature importances and plot the top 15 by gain for each phase.

    For every phase this writes ``feature_importance_<phase>.csv`` (gain,
    cover, weight, raw and cleaned names, normalised gain; sorted by gain)
    and ``feature_importance_<phase>.png``. Phases whose booster reports no
    gain scores are skipped with a message.
    """
    for phase in PHASES:
        model = load_hint_model(phase)
        booster = model.get_booster()
        feat_names = load_feature_names(phase)

        by_gain = booster.get_score(importance_type="gain")
        by_cover = booster.get_score(importance_type="cover")
        by_weight = booster.get_score(importance_type="weight")
        if not by_gain:
            print(f"[{phase}] No importance available.")
            continue

        # use the union of keys so features that only have cover/weight are kept
        keys = list(set(by_gain) | set(by_cover) | set(by_weight))
        df = pd.DataFrame({"Feature": keys})
        for column, scores in (("Gain", by_gain), ("Cover", by_cover), ("Weight", by_weight)):
            df[column] = df["Feature"].map(lambda f, scores=scores: scores.get(f, 0.0))

        # map booster keys to stored names, then clean them for display
        df["ReadableFeature_raw"] = df["Feature"].map(lambda f: map_f_to_name(f, feat_names))
        df["ReadableFeature"] = df["ReadableFeature_raw"].map(clean_readable)

        gain_total = df["Gain"].sum()
        if gain_total > 0:
            df["Gain_norm"] = df["Gain"] / gain_total
        else:
            df["Gain_norm"] = 0.0
        df = df.sort_values("Gain", ascending=False).reset_index(drop=True)
        df.to_csv(TAB_DIR / f"feature_importance_{phase}.csv", index=False)

        top = df.head(15)
        positions = range(len(top))
        fig = plt.figure(figsize=(9, 6))
        # reversed so the most important feature ends up at the top of the chart
        plt.barh(positions, top["Gain_norm"][::-1])
        plt.yticks(positions, top["ReadableFeature"][::-1], fontsize=9)
        plt.xlabel("Normalized Gain")
        plt.title(f"Feature Importance — {phase.upper()}")
        plt.grid(True, axis="x", linestyle=":", linewidth=0.5)
        _savefig(fig, FIG_DIR / f"feature_importance_{phase}.png")

# --------- Aggregation by ORIGINAL COLUMN (with the requested renames) ---------
# Maps the lower-cased first "__"-separated token of a feature name to the
# number of leading tokens that together form its origin-column name
# (see extract_origin_column). Heads not listed here use the first token only.
HEAD_WIDTH = {
    "pos": 2,
    "sem": 2,
    "diseases_dual": 1,
    "drugs_fusion": 1,
    "icd": 1,
    "smiles": 1,
    "brief_title": 1,
    "description": 1,
    "title": 1,
    "desc": 1,
    "num": 1,  # e.g. num__enrollment
}

def extract_origin_column(raw_name: str) -> str:
    """
    Derive the 'origin column' from a stored (post-SVD) feature name.

    The name is stripped of any leading index, split on '__', de-duplicated,
    and cleared of SVD markers and dN component tags. The origin is then the
    first token, or the first HEAD_WIDTH[head] tokens for known heads.
    Display renames are applied last:
      - sem__criteria* -> llm__criteria
      - sem__smiles*   -> llm__smiles
      - num*           -> enrollment

    If nothing is left after cleaning, str(raw_name) is returned unchanged.
    """
    text = _strip_leading_index(str(raw_name)).strip().strip('"')
    pieces = _collapse_full_repeat(
        _collapse_adjacent_dupes([p for p in text.split("__") if p])
    )

    # drop SVD markers and dN component tags
    pieces = [
        p for p in pieces
        if not SVD_TOKENS_RE.match(p) and not re.fullmatch(r'[dD]\d+', p)
    ]
    if not pieces:
        return str(raw_name)

    width = HEAD_WIDTH.get(pieces[0].lower())
    if width is None:
        origin = pieces[0]
    else:
        origin = "__".join(pieces[:min(width, len(pieces))])

    # renames used for display / CSV output; first matching prefix wins
    renames = (
        ("sem__criteria", "llm__criteria"),
        ("sem__smiles", "llm__smiles"),
        ("num", "enrollment"),
    )
    for prefix, alias in renames:
        if origin.startswith(prefix):
            return alias
    return origin

def feature_importance_by_column():
    """Aggregate booster importances by origin column and plot the top 20 per phase.

    For every phase, each booster feature is mapped to its stored name and
    then to its origin column. Gain, cover and weight are summed per origin
    column, and gain is normalised to a fraction and a percentage. Writes
    ``feature_importance_by_column_<phase>.csv`` and the matching PNG. Phases
    with no importance scores are skipped with a message.
    """
    for phase in PHASES:
        mdl = load_hint_model(phase); booster = mdl.get_booster()
        feat_names = load_feature_names(phase)
        imp_gain = booster.get_score(importance_type="gain")
        cover    = booster.get_score(importance_type="cover")
        weight   = booster.get_score(importance_type="weight")

        all_keys = set(imp_gain) | set(cover) | set(weight)
        rows = []
        for fkey in all_keys:
            # Shared key->name mapping: tolerates keys that start with "f" but
            # are not of the form f<N> (a bare int(fkey[1:]) would raise).
            readable = map_f_to_name(fkey, feat_names)
            col = extract_origin_column(readable)
            rows.append({
                "BoosterKey": fkey,
                "Feature": readable,
                "OriginColumn": col,
                "Gain": float(imp_gain.get(fkey, 0.0)),
                "Cover": float(cover.get(fkey, 0.0)),
                "Weight": float(weight.get(fkey, 0.0)),
            })
        df_feat = pd.DataFrame(rows)
        if df_feat.empty:
            print(f"[{phase}] No importance by column.")
            continue

        agg = (df_feat.groupby("OriginColumn", as_index=False)
               .agg(Gain=("Gain","sum"), Cover=("Cover","sum"), Weight=("Weight","sum"), Features=("BoosterKey","count"))
               .sort_values("Gain", ascending=False).reset_index(drop=True))
        total_gain = agg["Gain"].sum()
        agg["Gain_norm"] = agg["Gain"] / total_gain if total_gain > 0 else 0.0
        agg["Gain_pct"]  = agg["Gain_norm"] * 100.0

        agg.to_csv(TAB_DIR / f"feature_importance_by_column_{phase}.csv", index=False)

        top = agg.head(20)
        # figure height grows with the number of bars
        fig = plt.figure(figsize=(8, 0.35*len(top) + 2))
        # reversed so the most important column ends up at the top of the chart
        plt.barh(range(len(top)), top["Gain_norm"][::-1])
        plt.yticks(range(len(top)), top["OriginColumn"][::-1], fontsize=10)
        plt.xlabel("Normalized importance (gain)")
        plt.title(f"Feature importance by ORIGINAL COLUMN — {phase.upper()}")
        plt.grid(True, axis="x", linestyle=":", linewidth=0.5)
        _savefig(fig, FIG_DIR / f"feature_importance_by_column_{phase}.png")

# ---------------- HTML ----------------
def build_html_report(df_ctod, df_hint, df_combined, quick_tables_files):
    """Assemble the static HTML report and write ``performance_report.html``.

    The report contains the combined summary table, one default-vs-tuned
    threshold table per phase (read back from the quick-metrics CSVs) and all
    figures from ``FIG_DIR`` embedded as base64 PNGs. Missing tables or images
    are replaced by a red placeholder note instead of aborting the report.

    Args:
        df_ctod: per-phase CTOD summary (accepted for interface symmetry; not
            rendered on its own).
        df_hint: per-phase HINT summary (accepted for interface symmetry; not
            rendered on its own).
        df_combined: side-by-side CTOD/HINT summary table.
        quick_tables_files: mapping phase -> path of its quick-metrics CSV.
    """
    from html import escape  # local import: only needed for error placeholders

    def _tbl(df: pd.DataFrame, title: str) -> str:
        try:
            df_disp = df.copy()
            # round numeric columns only
            num_cols = df_disp.select_dtypes(include=[np.number]).columns
            df_disp[num_cols] = df_disp[num_cols].round(4)
            return f"<h3>{title}</h3>\n" + df_disp.to_html(classes='table', border=0)
        except Exception as e:
            return f"<h3>{title}</h3>\n<div class='caption' style='color:#b00'>[table render error: {escape(str(e))}]</div>"

    def inline_img(png_path: Path) -> str:
        if not png_path.exists():
            return f"<div class='caption' style='color:#b00'>[missing image: {png_path.name}]</div>"
        b64 = base64.b64encode(png_path.read_bytes()).decode("ascii")
        return f"<img src='data:image/png;base64,{b64}' style='max-width:100%'/>"

    phases_str = ", ".join(PHASES)

    head_tpl = Template(r"""
<!DOCTYPE html>
<html lang="en"><head>
<meta charset="utf-8"/>
<title>Performance Evaluation Report</title>
<style>
body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Arial,sans-serif; margin:24px; line-height:1.45}
h1,h2,h3{margin-top:1.2em}
.figure{border:1px solid #eee; padding:12px; border-radius:8px; background:#fafafa}
.caption{font-size:0.9em; color:#555; margin-top:6px}
.table{border-collapse:collapse}
.table th,.table td{padding:6px 10px; border-bottom:1px solid #eee; text-align:right}
.table th:first-child,.table td:first-child{text-align:left}
</style>
</head><body>
<h1>Performance Evaluation Report</h1>
<p><b>Pipelines:</b> HINT→CTOD e HINT→HINT | Fases: ${phases}</p>
""")

    html = []
    html.append(head_tpl.substitute(phases=phases_str))

    # 1) Summaries
    html.append("<h2>1) Summary Metrics</h2>")
    html.append(_tbl(df_combined, "CTOD vs HINT (AP, F1-best, ROC AUC)"))

    # 2) Per-phase tables
    html.append("<h2>2) Threshold (0.5 vs Tuned)</h2>")
    for phase in PHASES:
        csv_path = quick_tables_files.get(phase)
        if not csv_path or not Path(csv_path).exists():
            html.append(f"<div class='caption' style='color:#b00'>[missing table: quick_metrics_{phase}.csv]</div>")
            continue
        try:
            # The CSVs carry two header rows (dataset, metric). Reading both
            # restores the column MultiIndex and keeps the values numeric, so
            # the rounding in _tbl actually applies.
            df = pd.read_csv(csv_path, header=[0, 1], index_col=0)
        except Exception as e:
            html.append(f"<div class='caption' style='color:#b00'>[error reading {escape(str(csv_path))}: {escape(str(e))}]</div>")
            continue
        html.append(_tbl(df, f"Phase {phase.upper()} — CTOD & HINT"))

    # helper for sections made of a figure plus a caption
    def fig_card(png_name: str, caption: str) -> str:
        return f"""
<div class="figure">
  {inline_img(FIG_DIR / png_name)}
  <div class="caption">{caption}</div>
</div>
"""

    # 3) ROC / PR
    html.append("<h2>3) ROC / PR</h2>")
    html.append(fig_card("roc_grid.png", "ROC curves by phase (esq: CTOD, dir: HINT)"))
    html.append(fig_card("pr_grid.png", "Precision–Recall — topo: CTOD, baixo: HINT"))

    # 4) Confusion & Calibration
    html.append("<h2>4) Confusion & Calibration</h2>")
    html.append(fig_card("confusion_grid.png", "Confusion matrices (thr=0.5)"))
    html.append(fig_card("calibration_grid.png", "Calibration curves (Brier score em legenda)"))

    # 5) Threshold Sweep
    html.append("<h2>5) Threshold Sweep</h2>")
    html.append(fig_card("threshold_sweep_grid.png", "BalancedAcc / TNR / TPR vs threshold"))

    # 6) Feature Importance
    html.append("<h2>6) Feature Importance</h2>")
    for phase in PHASES:
        html.append(fig_card(f"feature_importance_{phase}.png", f"Top features — {phase.upper()} (gain normalizado)"))
        html.append(fig_card(f"feature_importance_by_column_{phase}.png", f"Por coluna original — {phase.upper()}"))

    html.append("""
<hr/>
<p><small>Gerado automaticamente por unified_performance_report.py</small></p>
</body></html>""")

    (OUT_BASE / "performance_report.html").write_text("\n".join(html), encoding="utf-8")
    print(f"✔ Report HTML salvo em: {(OUT_BASE / 'performance_report.html').resolve()}")

# ---------------- MAIN ----------------
if __name__ == "__main__":
    print("==> Building summary tables...")
    df_ctod, df_hint, df_combined = build_summary_tables()

    print("==> Building figures...")
    build_roc_grid()
    build_pr_grid()
    build_confusion_grid(thr=0.5)
    build_calibration_grid()
    build_threshold_sweep_grid()

    print("==> Building quick metric tables...")
    qtbl_files = {}
    qtbl = {}

    def quick_metric_tables():
        """Write one default-vs-tuned metrics CSV per phase; return {phase: csv path}.

        Each CSV holds, side by side for CTOD and HINT, the metrics at the
        default 0.5 threshold, at the balanced-accuracy-maximising threshold,
        and the difference between the two.
        """
        scalar_columns = (
            ("Threshold", "threshold"), ("Accuracy", "accuracy"), ("Precision", "precision"),
            ("Recall (TPR)", "recall"), ("Specificity (TNR)", "tnr"),
            ("BalancedAcc", "bal_acc"), ("F1", "f1"),
        )
        count_columns = (("TP", "tp"), ("FP", "fp"), ("FN", "fn"), ("TN", "tn"))

        def _as_row(m):
            row = {label: m[key] for label, key in scalar_columns}
            row.update({label: m["cm"][key] for label, key in count_columns})
            return row

        def _default_vs_tuned(y_true, proba):
            at_default = metrics_at_threshold(y_true, proba, thr=0.5)
            tuned_thr, _ = pick_threshold_balacc_max(y_true, proba)
            at_tuned = metrics_at_threshold(y_true, proba, thr=tuned_thr)
            table = pd.DataFrame(
                [_as_row(at_default), _as_row(at_tuned)],
                index=["Default (0.5)", f"Tuned (BalAcc max @ {tuned_thr:.3f})"],
            )
            delta = (table.iloc[1] - table.iloc[0]).to_frame().T
            delta.index = ["Δ (tuned − default)"]
            return pd.concat([table, delta], axis=0)

        written = {}
        for phase in PHASES:
            model = load_hint_model(phase)

            # CTOD
            X_ctod, y_ctod = load_ctod_test(phase)
            ctod_table = _default_vs_tuned(y_ctod, model.predict_proba(X_ctod)[:, 1])

            # HINT
            X_hint, y_hint = load_hint_test(phase)
            hint_table = _default_vs_tuned(y_hint, model.predict_proba(X_hint)[:, 1])

            both = pd.concat({"CTOD": ctod_table, "HINT": hint_table}, axis=1)
            csv_path = TAB_DIR / f"quick_metrics_{phase}.csv"
            both.to_csv(csv_path)
            written[phase] = csv_path
        return written

    # keep the per-phase CSV paths as plain strings for the HTML builder
    qtbl_files = {phase: str(csv_path) for phase, csv_path in quick_metric_tables().items()}

    print("==> Feature importance...")
    feature_importance_plots()
    feature_importance_by_column()

    print("==> Assembling HTML report...")
    build_html_report(df_ctod, df_hint, df_combined, qtbl_files)

    print("\n✅ Done. Abra: report/performance_report.html")


==> Building summary tables...
==> Building figures...
==> Building quick metric tables...
==> Feature importance...
==> Assembling HTML report...
✔ Report HTML salvo em: /Users/antoniocortes/Tese/MyModel(hybrid)/report/performance_report.html

✅ Done. Abra: report/performance_report.html


In [2]:
# Separate notebook cell: mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
