In [1]:
# Mount Google Drive at /content/drive (the `google.colab` package only exists on Colab).
# NOTE(review): the training cell reads ARTIFACTS_ROOT / MODEL_BASE from a local
# /Users/... path rather than /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# =======================================================
# HINT — Train XGBoost per phase (lossguide, chunked)
# (aligned with latest Data-Transformation artifacts)
# With: TNR-targeted threshold (default) + Platt calibration
# =======================================================
# Trains on per-phase, SVD-reduced matrices saved by the builder:
#   /hint_xgb_artifacts/<phase>/
# Saves model packages to:
#   /xgb_model_package/<phase>/
#
# This version is reorganised to produce clean, well-structured
# console reports, suitable for screenshots in the thesis:
#  - Clear phase banners
#  - Compact model configuration blocks
#  - Validation summary tables
#  - TNR-constrained threshold summary
#  - Global cross-phase summary at the end
# =======================================================

from pathlib import Path
import os, sys, json, time, csv, math, signal, platform, shutil
from datetime import datetime
import numpy as np
import pandas as pd
from scipy import sparse as sp
import joblib

from sklearn.metrics import (
    classification_report, roc_auc_score, accuracy_score,
    precision_recall_fscore_support, average_precision_score,
    confusion_matrix, roc_curve
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import xgboost as xgb

# =======================================================
# GLOBAL CONFIG
# =======================================================

# Input/output roots. The defaults are the original local folders; set the
# HINT_ARTIFACTS_ROOT / HINT_MODEL_BASE environment variables to run the
# notebook on another machine (e.g. Colab) without editing this cell.
ARTIFACTS_ROOT = Path(os.environ.get(
    "HINT_ARTIFACTS_ROOT",
    "/Users/antoniocortes/Tese/MyModel(hybrid)/hint_xgb_artifacts",
))
MODEL_BASE = Path(os.environ.get(
    "HINT_MODEL_BASE",
    "/Users/antoniocortes/Tese/MyModel(hybrid)/xgb_model_package",
))

# (Optional) per-phase preprocessor saved by the builder; first existing name wins
PREPROC_FILENAME_CANDIDATES = [
    "shared_preprocessor_hint_train.joblib",
    "shared_blocks_hint_train.joblib",
    "hint_preprocessor_train.joblib",
]

# XGBoost knobs (conservative & comparable across phases)
TOTAL_TREES   = 500    # upper bound on boosting rounds
CHUNK_SIZE    = 50     # trees added per chunk (validation + checkpoint after each)
LEARNING_RATE = 0.03
MAX_LEAVES    = 64
SUBSAMPLE     = 0.8
COLSAMPLE     = 0.7
REG_L2        = 2.0
MIN_CHILD_W   = 12
GAMMA         = 1.0
MAX_BIN       = 256
N_JOBS        = -1
RANDOM_STATE  = 42

# Optional per-phase overrides (small, safe nudges)
PHASE_OVERRIDES = {
    # "phase_I":  {"min_child_weight": 12},
    # "phase_II": {"min_child_weight": 10},
    # "phase_III":{"min_child_weight": 14},
}

# Early stop on BalancedAcc plateau (per chunk)
BAL_PATIENCE           = 3      # chunks without improvement before stopping
BAL_IMPROVE_MIN_DELTA  = 1e-4   # minimum gain that counts as an improvement

# Class weighting: multiplier applied on top of the balanced negative-class weight
ALPHA_NEG   = 1.4

# Reporting threshold (now TNR-targeted by default)
THRESHOLD   = 0.5
TARGET_TNR  = 0.80   # <— enabled by default; set to None to keep the fixed threshold

# Phase tags (as built for HINT)
PHASES = ["phase_I", "phase_II", "phase_III"]

# Preferred feature-name files (in order)
FEATURE_NAME_PREFERENCE = [
    "feature_names_reduced_with_top_original.json",
    "feature_names_reduced_with_prefix.json",
    "feature_names_reduced.json",
    "feature_names.json",
]

# =======================================================
# PRETTY PRINT HELPERS (for thesis-friendly logs)
# =======================================================

def banner(title: str, char: str = "="):
    """Print `title` framed by two rules of `char` (at least 50 wide), with blank lines around."""
    width = max(50, len(title) + 4)
    rule = char * width
    # One print call; the trailing "\n" plus print's own newline gives the blank line after.
    print(f"\n{rule}\n  {title}\n{rule}\n")

def sub_banner(title: str, char: str = "-"):
    """Print `title` framed by two rules of `char` (at least 40 wide), with blank lines around."""
    width = max(40, len(title) + 4)
    rule = char * width
    # One print call; the trailing "\n" plus print's own newline gives the blank line after.
    print(f"\n{rule}\n  {title}\n{rule}\n")

def kv_block(pairs, indent: int = 2):
    """Print aligned ``key : value`` lines followed by one blank line.

    pairs:  iterable of (key, value); keys must be strings (they are padded
            to the longest key so the colons line up).
    indent: number of leading spaces on every line.

    An empty `pairs` prints only the trailing blank line instead of raising.
    """
    # Materialise once: the data is walked twice (width, then printing), which
    # would silently print nothing for a generator argument.
    pairs = list(pairs)
    pad = " " * indent
    max_key = max((len(k) for k, _ in pairs), default=0)
    for k, v in pairs:
        print(f"{pad}{k:<{max_key}} : {v}")
    print()

def metrics_table(title: str, rows, indent: int = 2):
    """Print a titled block of aligned ``label : value`` lines.

    title:  heading, printed between two dashed rules of len(title) + 4.
    rows:   iterable of (label, value_str); labels must be strings.
    indent: number of leading spaces on every line.

    An empty `rows` prints just the heading and the trailing blank line
    instead of raising.
    """
    # Materialise once: rows are walked twice (width, then printing).
    rows = list(rows)
    pad = " " * indent
    max_label = max((len(lbl) for lbl, _ in rows), default=0)
    rule = "-" * (len(title) + 4)
    print(f"{pad}{rule}")
    print(f"{pad}{title}")
    print(f"{pad}{rule}")
    for lbl, val in rows:
        print(f"{pad}{lbl:<{max_label}} : {val}")
    print()

# =======================================================
# UTILS
# =======================================================

def _print_tree(root: Path, title: str, only_phase: str | None = None):
    print(f"\n=== {title}: {root.resolve()} ===")
    if not root.exists():
        print("  (directory not found)")
        return
    total_files = 0
    total_bytes = 0
    phases = [only_phase] if only_phase else PHASES
    for phase in phases:
        phase_dir = root / phase
        if not phase_dir.exists():
            continue
        print(f"\n  ── Phase: {phase}  →  {phase_dir.resolve()}")
        for path in sorted(phase_dir.rglob("*")):
            if path.is_file():
                total_files += 1
                size = path.stat().st_size
                total_bytes += size
                mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat(
                    sep=' ', timespec='seconds'
                )
                rel = path.relative_to(root)
                print(f"     • {rel}  | {size/1024:.1f} KB  | mtime: {mtime}")
    print(f"\n  >> Summary for {title}: {total_files} files, {total_bytes/1024:.1f} KB total")

def _require(path: Path) -> Path:
    if not path.exists():
        raise FileNotFoundError(f"Required file not found: {path}")
    return path

def _find_first_existing(dir_path: Path, candidates: list[str]) -> Path | None:
    for name in candidates:
        p = dir_path / name
        if p.exists():
            return p
    return None

def pick_threshold_tnr_target(y_true, y_prob, target_tnr=0.80):
    """Pick the lowest ROC threshold whose TNR still meets `target_tnr`.

    y_true:     binary ground-truth labels.
    y_prob:     scores for the positive class (higher = more positive).
    target_tnr: minimum true-negative rate the threshold must achieve.

    Returns (threshold, info) where info holds the ROC point that was chosen:
    ``tpr``, ``tnr``, ``idx`` (index into the roc_curve arrays) and ``note``.
    The threshold is meant to be applied as ``y_prob >= threshold``.
    """
    fpr, tpr, thr = roc_curve(y_true, y_prob)
    tnr = 1.0 - fpr
    idx_ok = np.where(tnr >= target_tnr)[0]
    if len(idx_ok):
        # FPR is non-decreasing along the ROC arrays, so the last qualifying
        # index is the lowest threshold (highest TPR) that still meets the target.
        i = int(idx_ok[-1])
        note = f"TNR≥{target_tnr:.2f}"
    else:
        # np.lexsort treats the LAST key as primary: sort by TNR, break ties by
        # TPR, and take the final element -> highest TNR, then highest TPR.
        # (The previous key `-tpr` selected the lowest TPR among the ties.)
        i = int(np.lexsort((tpr, tnr))[-1])
        note = f"TNR target {target_tnr:.2f} not reachable; picked best feasible (TNR={tnr[i]:.3f})"
    chosen = float(thr[i])
    if not np.isfinite(chosen):
        # The first ROC threshold can be a +inf sentinel ("predict nothing
        # positive"). Swap it for the smallest finite value above every score:
        # `y_prob >= chosen` gives the same predictions, and the value stays
        # representable when the threshold is later written to JSON.
        chosen = float(np.nextafter(np.max(np.asarray(y_prob, dtype=float)), np.inf))
    return chosen, dict(tpr=float(tpr[i]), tnr=float(tnr[i]), idx=int(i), note=note)

def _load_feature_names_with_fallback(phase_dir: Path, n_expected: int) -> list[str]:
    """Load feature names for a phase, falling back to generic ``f0..fN``.

    Files are tried in FEATURE_NAME_PREFERENCE order. A file is accepted only
    if it parses as a JSON list with exactly `n_expected` entries; anything
    else is skipped with a one-line reason so a silent fallback to generic
    names can be diagnosed from the log.
    """
    for fname in FEATURE_NAME_PREFERENCE:
        p = phase_dir / fname
        if not p.exists():
            continue
        try:
            names = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSON syntax errors and undecodable bytes.
            print(f"  [features] skipping {fname}: unreadable ({exc})")
            continue
        if not isinstance(names, list):
            print(f"  [features] skipping {fname}: expected a JSON list, got {type(names).__name__}")
            continue
        if len(names) != n_expected:
            print(f"  [features] skipping {fname}: {len(names)} names, expected {n_expected}")
            continue
        print(f"  [features] using {fname}")
        return names
    print("  [features] no compatible name file found → using generic f0..fN")
    return [f"f{i}" for i in range(n_expected)]

def _copy_if_exists(src_dir: Path, dst_dir: Path, filenames: list[str]):
    dst_dir.mkdir(parents=True, exist_ok=True)
    for name in filenames:
        p = src_dir / name
        if p.exists():
            try:
                shutil.copy2(p, dst_dir / name)
            except Exception:
                pass

def _find_preprocessor(phase_dir: Path):
    """Return the first existing preprocessor file in `phase_dir`, or None.

    Names are tried in PREPROC_FILENAME_CANDIDATES order.
    """
    for filename in PREPROC_FILENAME_CANDIDATES:
        located = phase_dir / filename
        if located.exists():
            return located
    return None

def _load_reduced_split(phase_dir: Path, split: str) -> sp.csr_matrix:
    fname_candidates = [
        f"X_{split}_reduced.npz",
        f"X_{split[:3]}_reduced.npz",
    ]
    for cand in fname_candidates:
        p = phase_dir / cand
        if p.exists():
            return sp.load_npz(p)
    p_fallback = phase_dir / f"X_{split}.npz"
    if p_fallback.exists():
        return sp.load_npz(p_fallback)
    raise FileNotFoundError(f"Reduced matrix for split='{split}' not found in {phase_dir}")

def load_phase_data(phase: str):
    """Load every artifact needed to train one phase from ARTIFACTS_ROOT/<phase>.

    Returns the 8-tuple
    ``(X_train, X_val, y_train, y_val, feature_names, preprocessor, phase_dir, svd_meta)``.
    `preprocessor` is None and `svd_meta` is {} when the optional file is
    absent or cannot be loaded (a warning is printed in the latter case).

    Raises:
        FileNotFoundError: a required matrix or label file is missing.
        ValueError: matrices and labels disagree on row/column counts.
    """
    phase_dir = ARTIFACTS_ROOT / phase
    X_train = _load_reduced_split(phase_dir, "train")
    try:
        X_val = _load_reduced_split(phase_dir, "valid")
    except FileNotFoundError:
        X_val = _load_reduced_split(phase_dir, "val")

    y_train = np.load(_require(phase_dir / "y_train.npy"))
    y_val_path = _find_first_existing(phase_dir, ["y_valid.npy", "y_val.npy"])
    if y_val_path is None:
        raise FileNotFoundError(f"No validation labels found in {phase_dir}")
    y_val = np.load(y_val_path)

    # Fail early with a readable message instead of deep inside model.fit().
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError(
            f"{phase}: X_train has {X_train.shape[0]} rows but y_train has {y_train.shape[0]} labels"
        )
    if X_val.shape[0] != y_val.shape[0]:
        raise ValueError(
            f"{phase}: X_val has {X_val.shape[0]} rows but y_val has {y_val.shape[0]} labels"
        )
    if X_train.shape[1] != X_val.shape[1]:
        raise ValueError(
            f"{phase}: train has {X_train.shape[1]} features but validation has {X_val.shape[1]}"
        )

    feature_names = _load_feature_names_with_fallback(phase_dir, X_train.shape[1])

    # The preprocessor is optional (it is only stored alongside the model), so a
    # load failure is reported but does not abort training.
    # NOTE: joblib.load unpickles the file — only point this at trusted artifacts.
    preproc_path = _find_preprocessor(phase_dir)
    preprocessor = None
    if preproc_path is not None:
        try:
            preprocessor = joblib.load(preproc_path)
        except Exception as exc:
            print(f"  [preprocessor] could not load {preproc_path.name}: {exc}")
            preprocessor = None

    svd_meta = {}
    meta_file = phase_dir / "svd_meta.json"
    if meta_file.exists():
        try:
            svd_meta = json.loads(meta_file.read_text())
        except (OSError, ValueError) as exc:
            print(f"  [svd_meta] could not read {meta_file.name}: {exc}")
            svd_meta = {}

    return X_train, X_val, y_train, y_val, feature_names, preprocessor, phase_dir, svd_meta

def pick_tree_method_safe():
    """Choose the XGBoost ``tree_method``: env override, else GPU if usable, else 'hist'.

    If the XGB_TREE_METHOD environment variable holds one of the accepted
    names it is returned as-is. Otherwise a one-row, one-round training job is
    run with ``gpu_hist``; if that raises for any reason, 'hist' is returned.
    """
    override = os.environ.get("XGB_TREE_METHOD")
    if override in {"hist", "approx", "auto", "exact", "gpu_hist"}:
        print(f"  [tree_method] using override: {override}")
        return override
    try:
        # The probe needs a label: training on a label-less DMatrix is expected
        # to be rejected by the objective whether or not a GPU is present, which
        # would make this check always fall through to 'hist'.
        # NOTE(review): recent XGBoost releases deprecate `gpu_hist` in favour of
        # tree_method="hist" + device="cuda" — confirm against the pinned version.
        dm = xgb.DMatrix(
            sp.csr_matrix(np.array([[0.0]], dtype=np.float32)),
            label=np.array([0.0], dtype=np.float32),
        )
        xgb.train(
            params={"tree_method": "gpu_hist", "max_depth": 1, "verbosity": 0},
            dtrain=dm,
            num_boost_round=1,
        )
        print("  [tree_method] gpu_hist available.")
        return "gpu_hist"
    except Exception:
        print("  [tree_method] gpu_hist NOT available → using 'hist'.")
        return "hist"

# =======================================================
# TRAIN ONE PHASE
# =======================================================

def _confusion_rates(y_true, y_pred):
    """Binary confusion counts and rates: (tn, fp, fn, tp, tnr, tpr, balanced_acc).

    ``labels=[0, 1]`` forces a 2x2 matrix even when only one class is present,
    so the four-way unpacking cannot fail. The 1e-9 term avoids division by zero.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    tnr = tn / (tn + fp + 1e-9)
    tpr = tp / (tp + fn + 1e-9)
    return tn, fp, fn, tp, tnr, tpr, 0.5 * (tpr + tnr)


def train_one_phase(phase: str):
    """Train, evaluate, calibrate and package the XGBoost model for one phase.

    Steps: load the phase artifacts, train in chunks of CHUNK_SIZE trees with a
    checkpoint and validation metrics after each chunk (early stop on a
    balanced-accuracy plateau, STOP.flag, or Ctrl-C), reload the chunk with the
    best balanced accuracy, fit a Platt calibrator on the validation scores,
    report metrics at THRESHOLD and at the TNR-targeted threshold, and write
    the model package to MODEL_BASE/<phase>.

    Returns a dict with the phase name, best per-chunk AUC / balanced accuracy,
    calibrated validation AUC / balanced accuracy / TNR at THRESHOLD, the
    train/val shapes and the output directory.

    Raises:
        RuntimeError: no training chunk completed, so there is nothing to evaluate.
    """
    # --- Load data & artifacts ---
    (X_train, X_val, y_train, y_val,
     feature_names, preprocessor, phase_dir, svd_meta) = load_phase_data(phase)

    # --- Prepare model dir ---
    out_dir = MODEL_BASE / phase
    out_dir.mkdir(parents=True, exist_ok=True)
    chk_dir   = out_dir / "checkpoints"
    chk_dir.mkdir(parents=True, exist_ok=True)
    chk_path  = chk_dir / "xgb_chunk.json"     # booster after the latest chunk
    best_path = chk_dir / "xgb_best.json"      # booster with best balanced accuracy
    trace_json = chk_dir / "train_trace.json"
    trace_csv  = chk_dir / "train_trace.csv"
    stop_flag  = chk_dir / "STOP.flag"         # create this file to stop after the current chunk

    # --- Phase banner ---
    banner(f"PHASE {phase.replace('phase_', '').upper()} – HINT TRAINING & VALIDATION")

    # --- Print environment & data info ---
    kv_block([
        ("Python",        f"{sys.version.split()[0]} on {platform.system()} {platform.release()}"),
        ("XGBoost",       xgb.__version__),
        ("Train matrix",  f"{X_train.shape}"),
        ("Val matrix",    f"{X_val.shape}"),
        ("Effective dims", svd_meta.get("global_cap_effective", X_train.shape[1])),
    ])

    # --- Class-balanced sample weights (NEG boosted) ---
    classes = np.array([0, 1])
    class_w = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
    w0, w1 = float(class_w[0]) * ALPHA_NEG, float(class_w[1])
    # Per-sample weights do not change between chunks, so build them once.
    train_weights = np.where(y_train == 0, w0, w1)
    print("  [class weights]")
    kv_block([
        ("alpha_neg",       ALPHA_NEG),
        ("weight_neg (w0)", f"{w0:.3f}"),
        ("weight_pos (w1)", f"{w1:.3f}"),
    ])

    # --- CSV header for detailed trace (if needed later) ---
    # The same column list drives the header and every appended row.
    trace_columns = [
        "chunk", "trees", "val_auc", "val_pr_auc",
        "acc", "prec", "rec", "f1",
        "cm_tn", "cm_fp", "cm_fn", "cm_tp",
        "auc_delta", "elapsed_min", "chunk_min", "eta_total_min",
        "note",
    ]
    if not trace_csv.exists():
        with open(trace_csv, "w", newline="") as f:
            csv.writer(f).writerow(trace_columns)

    # --- Choose tree_method ---
    tree_method = pick_tree_method_safe()

    # --- Per-phase overrides ---
    ov = PHASE_OVERRIDES.get(phase, {})

    # --- Model definition ---
    # NOTE(review): with max_depth=4 a tree has at most 2**4 = 16 leaves, so
    # max_leaves=64 does not look like the binding limit — confirm this is intended.
    model = XGBClassifier(
        tree_method=tree_method,
        grow_policy="lossguide",
        max_leaves=MAX_LEAVES,
        max_depth=4,
        n_estimators=0,            # placeholder; set per chunk in the loop below
        learning_rate=LEARNING_RATE,
        subsample=SUBSAMPLE,
        colsample_bytree=COLSAMPLE,
        reg_lambda=REG_L2,
        reg_alpha=0.0,
        min_child_weight=ov.get("min_child_weight", MIN_CHILD_W),
        gamma=GAMMA,
        max_bin=MAX_BIN,
        objective="binary:logistic",
        eval_metric=["aucpr","auc"],
        n_jobs=N_JOBS,
        random_state=RANDOM_STATE,
        scale_pos_weight=1.0,      # imbalance is handled through sample_weight instead
        max_delta_step=1,
    )

    # --- Print compact configuration block (good for thesis screenshot) ---
    sub_banner("XGBOOST CONFIGURATION (PHASE TEMPLATE)")
    kv_block([
        ("tree_method",       tree_method),
        ("grow_policy",       "lossguide"),
        ("max_depth",         4),
        ("max_leaves",        MAX_LEAVES),
        ("learning_rate",     LEARNING_RATE),
        ("total_trees",       TOTAL_TREES),
        ("chunk_size",        CHUNK_SIZE),
        ("subsample",         SUBSAMPLE),
        ("colsample_bytree",  COLSAMPLE),
        ("reg_lambda (L2)",   REG_L2),
        ("min_child_weight",  model.get_params()["min_child_weight"]),
        ("gamma",             GAMMA),
        ("max_bin",           MAX_BIN),
        ("bal_patience",      BAL_PATIENCE),
        ("target_TNR",        TARGET_TNR),
        ("random_state",      RANDOM_STATE),
    ])

    # --- Training loop (chunked) ---
    start_time = time.time()
    trained_trees = 0
    best_auc = -np.inf
    best_bal_acc = -np.inf
    best_trees_auc = 0
    best_trees_bal = 0
    last_auc = None
    chunk_index = 0
    bal_no_improve = 0
    trace = []

    def _elapsed_min():
        return (time.time() - start_time) / 60.0

    def _handle_sigint(sig, frame):
        print("\n Interrupt received. Will stop after this chunk (checkpoint saved).")
        raise KeyboardInterrupt()

    sub_banner("INCREMENTAL TRAINING (CHUNKED)")
    print(f"  → total_trees={TOTAL_TREES}, chunk={CHUNK_SIZE}, max_leaves={MAX_LEAVES}, "
          f"lr={LEARNING_RATE}, method={tree_method}, min_child_weight={model.get_params()['min_child_weight']}")

    # Install the handler only for the duration of the loop and put the previous
    # one back afterwards, so Ctrl-C behaves normally once this function returns.
    prev_sigint = signal.signal(signal.SIGINT, _handle_sigint)
    try:
        while trained_trees < TOTAL_TREES:
            if stop_flag.exists():
                print("  STOP.flag detected. Ending after this chunk.")
            next_target = min(trained_trees + CHUNK_SIZE, TOTAL_TREES)
            add_trees = next_target - trained_trees
            chunk_index += 1
            print(f"\n  Chunk {chunk_index:02d}: trees {trained_trees + 1}..{next_target} (add {add_trees})")

            t0 = time.time()
            # NOTE(review): fit() is called without `xgb_model=`, so each chunk
            # appears to retrain all `next_target` trees from scratch rather than
            # appending `add_trees` to the previous booster. The per-chunk metrics
            # still describe a model with `next_target` trees, but the total cost
            # grows quadratically with the number of chunks — confirm whether true
            # continuation is wanted before changing it (results may shift).
            model.set_params(n_estimators=next_target)
            model.fit(
                X_train, y_train,
                sample_weight=train_weights,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

            # Save checkpoint
            model.get_booster().save_model(str(chk_path))
            trained_trees = next_target

            # ---- Per-chunk validation metrics (raw scores) ----
            proba_val_raw = model.predict_proba(X_val)[:, 1]
            pred_val_raw  = (proba_val_raw >= THRESHOLD).astype(int)

            auc  = roc_auc_score(y_val, proba_val_raw)
            ap   = average_precision_score(y_val, proba_val_raw)
            acc  = accuracy_score(y_val, pred_val_raw)
            prec, rec, f1, _ = precision_recall_fscore_support(
                y_val, pred_val_raw, average="binary", zero_division=0)

            tn, fp, fn, tp, tnr, tpr, bal_acc = _confusion_rates(y_val, pred_val_raw)

            print(f"   • AUC={auc:.4f} | PR-AUC={ap:.4f} | TNR={tnr:.4f} | BalancedAcc={bal_acc:.4f}")

            # Track best-by-AUC
            auc_delta = auc - last_auc if last_auc is not None else None
            if auc > best_auc + 1e-9:
                best_auc = auc
                best_trees_auc = trained_trees
            last_auc = auc

            # Keep best-by-BalAcc
            if bal_acc > best_bal_acc + BAL_IMPROVE_MIN_DELTA:
                best_bal_acc = float(bal_acc)
                best_trees_bal = int(trained_trees)
                model.get_booster().save_model(str(best_path))
                print(f"   ✓ New BEST (BalancedAcc): {best_bal_acc:.4f} at trees={best_trees_bal}")
                bal_no_improve = 0
            else:
                bal_no_improve += 1

            chunk_min = (time.time() - t0) / 60.0
            elapsed_min = _elapsed_min()
            chunks_done = math.ceil(trained_trees / CHUNK_SIZE)
            chunks_total = math.ceil(TOTAL_TREES / CHUNK_SIZE)
            avg_per_chunk = (elapsed_min / max(1, chunks_done))
            # Projected duration of the whole run (not the time remaining).
            eta_total_min = avg_per_chunk * chunks_total

            # log CSV/JSON
            entry = {
                "chunk": chunk_index, "trees": trained_trees,
                "val_auc": float(auc), "val_pr_auc": float(ap),
                "acc": float(acc), "prec": float(prec), "rec": float(rec), "f1": float(f1),
                "cm_tn": int(tn), "cm_fp": int(fp), "cm_fn": int(fn), "cm_tp": int(tp),
                "auc_delta": float(auc_delta) if auc_delta is not None else None,
                "elapsed_min": float(elapsed_min), "chunk_min": float(chunk_min),
                "eta_total_min": float(eta_total_min),
                "note": f"best_bal={best_bal_acc:.4f} at {best_trees_bal}"
            }
            trace.append(entry)
            with open(trace_json, "w") as f:
                json.dump(trace, f, indent=2)
            with open(trace_csv, "a", newline="") as f:
                csv.writer(f).writerow([entry[col] for col in trace_columns])

            # plateau early-stop
            if BAL_PATIENCE and bal_no_improve >= BAL_PATIENCE:
                print(f"   Early stop: BalancedAcc plateau for {BAL_PATIENCE} chunks.")
                break

            if stop_flag.exists():
                print("   STOP.flag honored. Ending now (checkpoint saved).")
                break

        print("\n  Training loop finished.")

    except KeyboardInterrupt:
        print(f"\n  ⏸ Paused at trees={trained_trees}. Checkpoint at: {chk_path}")
    finally:
        # signal.signal() returns None when the old handler was not installed
        # from Python; in that case there is nothing that can be restored.
        if prev_sigint is not None:
            signal.signal(signal.SIGINT, prev_sigint)

    if trained_trees == 0:
        # Without this guard an xgb_best.json left over from an earlier run
        # would be loaded below and packaged as if it had just been trained.
        raise RuntimeError(
            f"{phase}: no training chunk completed; nothing to evaluate or package"
        )

    print()
    metrics_table(
        "BEST CHECKPOINTS (VALIDATION – HINT)",
        [
            ("Best-by-AUC trees",     f"{best_trees_auc}"),
            ("Best Val AUC",          f"{best_auc:.5f}"),
            ("Best-by-BalAcc trees",  f"{best_trees_bal}"),
            ("Best Val BalancedAcc",  f"{best_bal_acc:.5f}"),
        ],
        indent=2,
    )
    print(f"   Checkpoints dir : {chk_dir}")

    # Load best booster (BalancedAcc) for final eval
    if best_path.exists():
        booster = xgb.Booster()
        booster.load_model(str(best_path))
        # Swap the booster inside the fitted sklearn wrapper so predict_proba()
        # and the saved package use the best chunk rather than the last one.
        model._Booster = booster
        print(f"   Loaded best booster (BalancedAcc): {best_path}")

    # ---- Final eval @ calibrated scores ----
    sub_banner("VALIDATION SUMMARY – HINT (CALIBRATED PROBABILITIES)")

    proba_val_raw = model.predict_proba(X_val)[:, 1]

    # Post-hoc calibration (Platt) on validation
    # NOTE(review): the calibrator is fitted and evaluated on the same validation
    # split, so the calibrated metrics below are in-sample for the calibrator.
    platt = LogisticRegression(max_iter=1000)
    platt.fit(proba_val_raw.reshape(-1, 1), y_val)

    def apply_calibrated(p_raw: np.ndarray) -> np.ndarray:
        return platt.predict_proba(p_raw.reshape(-1, 1))[:, 1]

    proba_val_cal = apply_calibrated(proba_val_raw)

    # Use calibrated scores for final validation metrics @ default threshold
    proba_for_eval = proba_val_cal
    pred_val  = (proba_for_eval >= THRESHOLD).astype(int)

    auc  = roc_auc_score(y_val, proba_for_eval)
    ap   = average_precision_score(y_val, proba_for_eval)
    acc  = accuracy_score(y_val, pred_val)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_val, pred_val, average="binary", zero_division=0)
    tn, fp, fn, tp, tnr, tpr, bal_acc = _confusion_rates(y_val, pred_val)

    # Compact summary block – PERFECT for a screenshot in 6.3/6.4
    # The heading is derived from THRESHOLD so it cannot drift from the value used.
    metrics_table(
        f"VALIDATION @ THRESHOLD = {THRESHOLD:.2f} (HINT – {phase.replace('phase_', 'Phase ')})",
        [
            ("AUROC",        f"{auc:.4f}"),
            ("AUPRC",        f"{ap:.4f}"),
            ("Accuracy",     f"{acc:.4f}"),
            ("Precision",    f"{prec:.4f}"),
            ("Recall (TPR)", f"{rec:.4f}"),
            ("F1",           f"{f1:.4f}"),
            ("TNR",          f"{tnr:.4f}"),
            ("BalancedAcc",  f"{bal_acc:.4f}"),
            ("TN, FP",       f"{tn} , {fp}"),
            ("FN, TP",       f"{fn} , {tp}"),
        ],
        indent=2,
    )

    print(f"  Classification report (HINT validation, calibrated, thr={THRESHOLD}):")
    print("  " + classification_report(
        y_val, pred_val, digits=4, zero_division=0).replace("\n", "\n  "))

    # Optional (default ON): tune threshold to TNR target using calibrated scores
    tuned = None
    if TARGET_TNR is not None:
        thr, info = pick_threshold_tnr_target(y_val, proba_for_eval, target_tnr=TARGET_TNR)
        pred_tuned = (proba_for_eval >= thr).astype(int)
        tn2, fp2, fn2, tp2, tnr_, tpr_, bal_acc_ = _confusion_rates(y_val, pred_tuned)
        ap_pos = average_precision_score(y_val, proba_for_eval)
        # Average precision with class 0 treated as the positive class.
        ap_neg = average_precision_score(y_val, 1 - proba_for_eval, pos_label=0)

        sub_banner("VALIDATION @ TNR-CONSTRAINED THRESHOLD (HINT)")

        metrics_table(
            f"TNR TARGET = {TARGET_TNR:.2f} (HINT – {phase.replace('phase_', 'Phase ')})",
            [
                ("Strategy note",   info["note"]),
                ("Chosen thr",      f"{thr:.6f}"),
                ("TNR",             f"{tnr_:.4f}"),
                ("TPR",             f"{tpr_:.4f}"),
                ("BalancedAcc",     f"{bal_acc_:.4f}"),
                ("AP (positive)",   f"{ap_pos:.4f}"),
                ("AP (negative)",   f"{ap_neg:.4f}"),
                ("TN, FP",          f"{tn2} , {fp2}"),
                ("FN, TP",          f"{fn2} , {tp2}"),
            ],
            indent=2,
        )

        tuned = {"threshold": float(thr), **info, "bal_acc": float(bal_acc_)}

    # --- Save package per phase ---
    sub_banner("SAVING MODEL PACKAGE")
    print("  Saving model package to disk...")

    # Save sklearn wrapper
    joblib.dump(model, out_dir / "hint_xgb_model.joblib")
    # Save booster JSON
    model.save_model(str(out_dir / "xgb_model.json"))
    # Save calibrator
    joblib.dump(platt, out_dir / "prob_calibrator.joblib")

    # Save pipeline (preprocessor + model) if available (for provenance)
    if preprocessor is not None:
        joblib.dump(
            {"preprocessor": preprocessor, "model": model, "calibrator": platt},
            out_dir / "hint_pipeline_full.joblib"
        )

    # Save feature importance by gain (use reduced feature names)
    booster = model.get_booster()
    importance = booster.get_score(importance_type="gain")
    feat_imp = {}
    for k, v in importance.items():
        try:
            idx = int(k[1:])  # f0, f1, ...
            fname = feature_names[idx] if idx < len(feature_names) else k
        except Exception:
            # Key is not of the form f<int>; keep the booster's own name.
            fname = k
        feat_imp[fname] = float(v)
    with open(out_dir / "feature_importance_gain.json", "w") as f:
        json.dump(feat_imp, f, indent=2)

    # Write feature-name files into the model package dir (copy any that exist)
    name_files_to_copy = [
        "feature_names_reduced_with_top_original.json",
        "feature_names_reduced_with_prefix.json",
        "feature_names_reduced.json",
        "feature_names.json",
        "svd_meta.json",
        "preprocessor_meta.json",
    ]
    _copy_if_exists(phase_dir, out_dir, name_files_to_copy)

    # Save evals_result & metadata
    try:
        evals_result = model.evals_result()
    except Exception:
        evals_result = {}
    with open(out_dir / "evals_result.json", "w") as f:
        json.dump(evals_result, f, indent=2)

    meta = {
        "phase": phase,
        "framework": "xgboost",
        "growth_policy": "lossguide",
        "n_features": int(X_train.shape[1]),
        "n_train_samples": int(X_train.shape[0]),
        "n_val_samples": int(X_val.shape[0]),
        "random_state": RANDOM_STATE,
        "class_weighting": {"alpha_neg": ALPHA_NEG, "computed_w0": float(w0), "computed_w1": float(w1)},
        "best_chunk_trees_auc": int(best_trees_auc),
        "best_val_auc": float(best_auc),
        "best_chunk_trees_bal": int(best_trees_bal),
        "best_val_bal_acc": float(best_bal_acc),
        "training": {
            "total_trees": TOTAL_TREES,
            "chunk_size": CHUNK_SIZE,
            "learning_rate": LEARNING_RATE,
            "max_leaves": MAX_LEAVES,
            "subsample": SUBSAMPLE,
            "colsample_bytree": COLSAMPLE,
            "reg_lambda": REG_L2,
            "reg_alpha": 0.0,
            "min_child_weight": int(model.get_params()["min_child_weight"]),
            "gamma": GAMMA,
            "max_bin": MAX_BIN,
            "n_jobs": N_JOBS,
            "bal_patience": BAL_PATIENCE,
            "target_tnr": TARGET_TNR,
        },
        "artifacts_used": {
            "data_dir": str(phase_dir),
            "feature_names_source": "preferred by order: " + " → ".join(FEATURE_NAME_PREFERENCE),
            "best_booster_json": str(best_path.relative_to(out_dir)) if best_path.exists() else None,
            "last_checkpoint_json": str(chk_path.relative_to(out_dir)) if chk_path.exists() else None,
            "trace_json": str(trace_json.relative_to(out_dir)),
            "trace_csv": str(trace_csv.relative_to(out_dir)),
        },
        "inference_threshold": (
            {"strategy": "tnr_target", **tuned} if tuned is not None
            else {"strategy": "fixed", "threshold": THRESHOLD}
        ),
        "calibration": {"type": "platt", "fitted_on": "validation", "file": "prob_calibrator.joblib"},
        "files": {
            "sklearn_joblib": "hint_xgb_model.joblib",
            "xgb_json": "xgb_model.json",
            "pipeline_full": "hint_pipeline_full.joblib" if preprocessor is not None else None,
            "feature_importance_gain": "feature_importance_gain.json",
            "evals_result": "evals_result.json",
            "model_metadata": "model_metadata.json",
            "prob_calibrator": "prob_calibrator.joblib",
        },
    }
    with open(out_dir / "model_metadata.json", "w") as f:
        json.dump(meta, f, indent=2)

    print(f"\n  Saved phase package to: {out_dir}")
    _print_tree(MODEL_BASE, f"Model package artifacts (phase: {phase})", only_phase=phase)

    return {
        "phase": phase,
        "best_auc": best_auc,
        "best_bal_acc": best_bal_acc,
        "val_auc_calibrated": auc,
        "val_balacc_calibrated": bal_acc,
        "train_shape": tuple(X_train.shape),
        "val_shape": tuple(X_val.shape),
        "out_dir": str(out_dir),
        "tnr_thr05": tnr,
    }

# =======================================================
# MAIN
# =======================================================

if __name__ == "__main__":
    # Train every phase; a failing phase is reported and skipped so the
    # remaining phases still run.
    results = []
    for phase in PHASES:
        try:
            results.append(train_one_phase(phase))
        except Exception as e:
            print(f"\n[!] Skipped {phase} due to error: {e}")

    if results:
        banner("GLOBAL SUMMARY ACROSS PHASES (HINT VALIDATION)")
        # Nice compact cross-phase table
        header = (
            "Phase", "AUC(best chunk)", "BalAcc(best chunk)",
            "AUC(calibrated, thr=0.5)", "BalAcc(calibrated, thr=0.5)",
            "TNR(thr=0.5)", "Train shape", "Val shape"
        )
        rows = [
            [
                r["phase"].replace("phase_", "Phase "),
                f"{r['best_auc']:.4f}",
                f"{r['best_bal_acc']:.4f}",
                f"{r['val_auc_calibrated']:.4f}",
                f"{r['val_balacc_calibrated']:.4f}",
                f"{r['tnr_thr05']:.4f}",
                f"{r['train_shape']}",
                f"{r['val_shape']}",
            ]
            for r in results
        ]
        # Size each column to its widest cell: several headers are longer than
        # a fixed width of 20, which pushed the columns out of alignment.
        widths = [max(len(cell) for cell in column) for column in zip(header, *rows)]
        for line_no, cells in enumerate([header, *rows]):
            print("  " + " | ".join(f"{c:^{w}}" for c, w in zip(cells, widths)))
            if line_no == 0:
                # Rule under the header; 3 = len(" | ") between adjacent columns.
                print("  " + "-" * (sum(widths) + 3 * (len(widths) - 1)))
        print()
        _print_tree(MODEL_BASE, "All model package artifacts")
    else:
        print("\nNo phase was trained. Check errors above.")


  [features] using feature_names_reduced_with_top_original.json

  PHASE I – HINT TRAINING & VALIDATION

  Python         : 3.11.11 on Darwin 22.6.0
  XGBoost        : 3.0.5
  Train matrix   : (5417, 1000)
  Val matrix     : (1159, 1000)
  Effective dims : 1000

  [class weights]
  alpha_neg       : 1.4
  weight_neg (w0) : 4.880
  weight_pos (w1) : 0.584

  [tree_method] gpu_hist NOT available → using 'hist'.

------------------------------------------
  XGBOOST CONFIGURATION (PHASE TEMPLATE)
------------------------------------------

  tree_method      : hist
  grow_policy      : lossguide
  max_depth        : 4
  max_leaves       : 64
  learning_rate    : 0.03
  total_trees      : 500
  chunk_size       : 50
  subsample        : 0.8
  colsample_bytree : 0.7
  reg_lambda (L2)  : 2.0
  min_child_weight : 12
  gamma            : 1.0
  max_bin          : 256
  bal_patience     : 3
  target_TNR       : 0.8
  random_state     : 42


----------------------------------------
  INCREMENTAL T