In [None]:
# Colab-only: mount Google Drive so the project folders under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

import os, numpy as np, pandas as pd
# Seeds NumPy's legacy global RNG; the estimators below set their own random_state.
np.random.seed(42)

# Project folder layout on the mounted drive.
BASE = "/content/drive/MyDrive/MLBA_Project"
RAW  = f"{BASE}/data/raw"        # defined here but not referenced in the cells visible in this notebook
PRO  = f"{BASE}/data/processed"  # inputs are read from here and all CSV/PNG outputs are written here
MDIR = f"{BASE}/models"          # pickled models + meta.json
RPT  = f"{BASE}/reports"         # report tables
# Create the output folders if they are missing (RAW is intentionally not created here).
for d in (PRO, MDIR, RPT): os.makedirs(d, exist_ok=True)

In [None]:
# Build the modelling frame: the feature table joined to a forward-looking label
# (fund forward return minus benchmark forward return over H rows).
feat = pd.read_csv(f"{PRO}/clean_with_features.csv", parse_dates=["date"])
feat.columns = feat.columns.str.lower()

mst = pd.read_csv(f"{PRO}/clean_master_union.csv", parse_dates=["date"])
mst.columns = mst.columns.str.lower()
mst = mst.sort_values(["fund_id","date"]).reset_index(drop=True)

# Label horizon, counted in ROWS per fund (shift works on row position, not on calendar time).
# NOTE(review): this is 63 trading days only if every fund has exactly one row per trading day — confirm.
H = 63
nav_fwd = mst.groupby("fund_id")["nav"].shift(-H)
mst["fwd_ret_63d_fund"] = nav_fwd / mst["nav"] - 1.0

# Exactly one benchmark row per date. De-duplicating on the (date, tri) pair would keep
# several rows for a date whenever funds disagree on `tri` (e.g. NaN for one fund, a value
# for another); that would stretch the row-based shift(-H) and duplicate fund rows in the
# merge below. Sorting puts NaN last, so a non-missing value wins when one exists.
tri_daily = (mst[["date","tri"]]
             .sort_values(["date","tri"])
             .drop_duplicates(subset="date", keep="first")
             .reset_index(drop=True))
tri_daily["tri_fwd"] = tri_daily["tri"].shift(-H)
tri_daily["fwd_ret_63d_bmk"] = tri_daily["tri_fwd"] / tri_daily["tri"] - 1.0

# validate="m:1" makes pandas raise if the benchmark table ever has duplicate dates.
mst = mst.merge(tri_daily[["date","fwd_ret_63d_bmk"]], on="date", how="left", validate="m:1")
mst["y_excess_63d"] = mst["fwd_ret_63d_fund"] - mst["fwd_ret_63d_bmk"]

# Rows without a complete forward window (the last H rows of each series) have NaN labels and are dropped.
lab = mst[["date","fund_id","y_excess_63d"]].dropna().copy()
dfm = feat.merge(lab, on=["date","fund_id"], how="inner").sort_values(["fund_id","date"]).reset_index(drop=True)
# Binary target: 1 when the fund beats the benchmark over the horizon.
dfm["y_bin"] = (dfm["y_excess_63d"] > 0).astype(int)

print("Rows:", len(dfm), "| Funds:", dfm.fund_id.nunique(),
      "| Range:", dfm.date.min().date(), "→", dfm.date.max().date(),
      "| Pos rate:", round(dfm["y_bin"].mean(), 3))

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Identifier, raw price and label columns must never be used as model inputs.
ban = {"date","fund_id","nav","tri","y_excess_63d","y_bin"}
feat_cols = [c for c in dfm.columns if (c not in ban) and pd.api.types.is_numeric_dtype(dfm[c])]

mask = dfm[feat_cols + ["y_bin"]].notna().all(axis=1)
# TimeSeriesSplit cuts on ROW POSITION. The frame arrives ordered by (fund_id, date), which
# would turn every "time" fold into a fold over funds and let training rows post-date the
# test rows. Ordering by date first makes each fold train strictly on the past.
dfm = (dfm.loc[mask]
          .sort_values(["date","fund_id"], kind="mergesort")
          .reset_index(drop=True))

X = dfm[feat_cols].astype(float)
y = dfm["y_bin"].astype(int)
meta_idx = dfm[["date","fund_id"]].reset_index(drop=True)
assert meta_idx["date"].is_monotonic_increasing, "rows must be in chronological order for TimeSeriesSplit"

print("Features:", len(feat_cols), "| Rows:", len(X))

# NOTE(review): labels look H rows ahead, so the last training rows of each fold still
# overlap the start of its test window; consider an embargo via TimeSeriesSplit(gap=...).
# A fold boundary may also fall inside a single date, since the cut is by row count.
tscv = TimeSeriesSplit(n_splits=5)

# Record the date span and size of every fold so the split can be audited later.
fold_rows = []
for k, (tr, te) in enumerate(tscv.split(X), 1):
    dtr = meta_idx.iloc[tr]["date"]; dte = meta_idx.iloc[te]["date"]
    assert dtr.max() <= dte.min(), f"fold {k}: training data extends past the test start"
    fold_rows.append({
        "fold": k,
        "train_start": str(dtr.min().date()), "train_end": str(dtr.max().date()),
        "test_start":  str(dte.min().date()), "test_end":  str(dte.max().date()),
        "train_n": len(tr), "test_n": len(te)
    })
split_table = pd.DataFrame(fold_rows)
split_table.to_csv(f"{PRO}/split_table_classification.csv", index=False)
split_table

In [None]:
import numpy as np, pandas as pd
from sklearn.metrics import (
    average_precision_score, precision_recall_curve, confusion_matrix,
    precision_score, recall_score, f1_score
)

def save_fold_preds(model_name, fold_id, test_idx, scores, y_true_fold, out_dir=PRO):
    """Write one fold's test-set predictions to ``<out_dir>/clf_fold_<model_name>_<fold_id>.csv``.

    The output holds the rows of the module-level ``meta_idx`` selected positionally by
    ``test_idx``, plus three columns: ``y_true``, a score column named after
    ``model_name``, and ``fold``.
    """
    extra_cols = {"y_true": y_true_fold, model_name: scores, "fold": fold_id}
    fold_frame = meta_idx.iloc[test_idx].assign(**extra_cols)
    target = f"{out_dir}/clf_fold_{model_name}_{fold_id}.csv"
    fold_frame.to_csv(target, index=False)

def pr_auc_and_sweep(y_true, scores):
    """Return ``(average precision, threshold sweep table)`` for one set of scores.

    The sweep table has one row per point of the precision-recall curve. The curve has
    one more point than it has thresholds, so the final row gets ``inf`` as its threshold.
    """
    precision, recall, cutoffs = precision_recall_curve(y_true, scores)
    sweep = pd.DataFrame({
        "threshold": np.append(cutoffs, np.inf),
        "precision": precision,
        "recall": recall,
    })
    return average_precision_score(y_true, scores), sweep

def business_aware_threshold(y_true, scores, min_precision=0.65, min_recall=0.30):
    """Pick a decision threshold that satisfies a precision and a recall floor.

    Scans the precision-recall curve in the order ``precision_recall_curve`` returns it
    and takes the first threshold whose precision is at least ``min_precision`` and
    whose recall is at least ``min_recall``. If no point qualifies, falls back to the
    threshold with the highest F1 and recomputes precision/recall/F1 from the hard
    predictions at that threshold.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        True binary labels.
    scores : array-like of float
        Predicted scores; any array-like is accepted and converted to an ndarray.
    min_precision : float, default 0.65
        Precision floor a threshold must meet to qualify.
    min_recall : float, default 0.30
        Recall floor a threshold must meet to qualify (previously fixed at 0.30).

    Returns
    -------
    tuple of float
        ``(threshold, precision, recall, f1)``, always plain Python floats.
    """
    scores = np.asarray(scores, dtype=float)
    prec, rec, th = precision_recall_curve(y_true, scores)

    # prec/rec carry one trailing point that has no threshold; only look at the rest.
    qualifies = (prec[:len(th)] >= min_precision) & (rec[:len(th)] >= min_recall)
    if qualifies.any():
        i = int(np.argmax(qualifies))  # index of the first qualifying point
        f1 = 2 * prec[i] * rec[i] / (prec[i] + rec[i] + 1e-12)
        return float(th[i]), float(prec[i]), float(rec[i]), float(f1)

    # Fallback: best-F1 threshold (the 1e-12 keeps the ratio finite when prec + rec == 0).
    f1_all = 2 * prec * rec / (prec + rec + 1e-12)
    idx = int(np.nanargmax(f1_all[:-1])) if len(f1_all) > 1 else 0
    th_fb = th[idx] if idx < len(th) else 0.5
    pred_fb = (scores >= th_fb).astype(int)
    return (float(th_fb),
            float(precision_score(y_true, pred_fb, zero_division=0)),
            float(recall_score(y_true, pred_fb, zero_division=0)),
            float(f1_score(y_true, pred_fb, zero_division=0)))

def cm_at_threshold(y_true, scores, th):
    """Return ``(tn, fp, fn, tp)`` for the hard predictions ``scores >= th``.

    ``labels=[0, 1]`` forces a 2x2 matrix. Without it, a slice in which only one class
    occurs in both truth and prediction yields a 1x1 matrix and the 4-way unpack fails.
    ``scores`` may be any array-like.
    """
    yhat = (np.asarray(scores) >= th).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, yhat, labels=[0, 1]).ravel()
    return tn, fp, fn, tp

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Baseline model: standardised features feeding a class-balanced logistic regression,
# scored out-of-fold on the shared `tscv` splitter.
logit = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    LogisticRegression(max_iter=500, class_weight="balanced", random_state=42),
)

cv_rows = []
oof_logit = np.full(len(y), np.nan, dtype=float)  # stays NaN for rows never placed in a test fold

for k, (tr, te) in enumerate(tscv.split(X), start=1):
    X_tr, y_tr = X.iloc[tr], y.iloc[tr]
    X_te, y_te = X.iloc[te], y.iloc[te]
    # The same pipeline object is refitted on every fold; fit() returns the pipeline itself.
    s = logit.fit(X_tr, y_tr).predict_proba(X_te)[:, 1]
    oof_logit[te] = s
    pr_auc, _ = pr_auc_and_sweep(y_te, s)
    cv_rows.append(dict(fold=k, PR_AUC=pr_auc, test_n=len(te)))
    save_fold_preds("logit", k, te, s, y_te.values)

cv_logit = pd.DataFrame(cv_rows)
cv_logit.to_csv(f"{PRO}/cv_logistic_folds.csv", index=False)
print("Logit mean PR-AUC:", round(cv_logit["PR_AUC"].mean(), 4))

In [None]:
!pip install -q xgboost
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost), scored out-of-fold on the shared `tscv` splitter.
xgb = XGBClassifier(
    n_estimators=700,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss",
)

cv_rows = []
oof_xgb = np.full(len(y), np.nan, dtype=float)  # stays NaN for rows never placed in a test fold

for k, (tr, te) in enumerate(tscv.split(X), start=1):
    X_tr, y_tr = X.iloc[tr], y.iloc[tr]
    X_te, y_te = X.iloc[te], y.iloc[te]
    # The same estimator object is refitted on every fold; fit() returns the estimator itself.
    s = xgb.fit(X_tr, y_tr).predict_proba(X_te)[:, 1]
    oof_xgb[te] = s
    pr_auc, _ = pr_auc_and_sweep(y_te, s)
    cv_rows.append(dict(fold=k, PR_AUC=pr_auc, test_n=len(te)))
    save_fold_preds("xgb", k, te, s, y_te.values)

cv_xgb = pd.DataFrame(cv_rows)
cv_xgb.to_csv(f"{PRO}/cv_xgb_folds.csv", index=False)
print("XGB mean PR-AUC:", round(cv_xgb["PR_AUC"].mean(), 4))

In [None]:
!pip install -q lightgbm
from lightgbm import LGBMClassifier

# Gradient-boosted trees (LightGBM), scored out-of-fold on the shared `tscv` splitter.
# subsample_freq=1 is required for subsample=0.8 to take effect: LightGBM only performs
# row bagging when the bagging frequency is non-zero, and the wrapper's default is 0.
lgb = LGBMClassifier(
    n_estimators=1200, learning_rate=0.03,
    num_leaves=31, subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
    reg_lambda=1.0, random_state=42, n_jobs=-1
)

oof_lgb = np.full(len(y), np.nan, dtype=float)  # stays NaN for rows never placed in a test fold
cv_rows = []

for k, (tr, te) in enumerate(tscv.split(X), 1):
    # The same estimator object is refitted on every fold.
    lgb.fit(X.iloc[tr], y.iloc[tr])
    s = lgb.predict_proba(X.iloc[te])[:,1]
    oof_lgb[te] = s
    pr_auc, _ = pr_auc_and_sweep(y.iloc[te], s)
    cv_rows.append({"fold": k, "PR_AUC": pr_auc, "test_n": len(te)})
    save_fold_preds("lgb", k, te, s, y.iloc[te].values)

cv_lgb = pd.DataFrame(cv_rows)
cv_lgb.to_csv(f"{PRO}/cv_lgb_folds.csv", index=False)
print("LGB mean PR-AUC:", round(cv_lgb["PR_AUC"].mean(), 4))

In [None]:
# Collect the out-of-fold scores of all three models next to the row identifiers.
oof = meta_idx.copy()
oof["y_true"] = y.values
oof["logit"]  = oof_logit
oof["xgb"]    = oof_xgb
oof["lgb"]    = oof_lgb

# Fold membership comes straight from the splitter (its splits are deterministic), instead
# of re-reading the per-fold CSVs and merging on (date, fund_id): that round-trip breaks if
# fund_id changes dtype when parsed back or if a (date, fund_id) key is duplicated.
# Rows that never appear in a test fold keep NaN.
n_folds = tscv.get_n_splits()
fold_id = np.full(len(oof), np.nan)
for k, (_, te) in enumerate(tscv.split(X), 1):
    fold_id[te] = k
oof["fold"] = fold_id

# Equal-weight ensemble. The row-wise mean skips NaN and is NaN only where every model is
# NaN — same values as np.nanmean, without the "mean of empty slice" warning.
oof["ens_tab"] = oof[["logit","xgb","lgb"]].mean(axis=1)

oof_path = f"{PRO}/clf_oof_predictions.csv"
oof.to_csv(oof_path, index=False)
print("Saved:", oof_path)

# Choose the operating threshold on all scored OOF rows.
# NOTE(review): the threshold is tuned and reported on the same OOF rows, so P/R/F1 are optimistic.
valid_mask = oof[["y_true","ens_tab"]].notna().all(axis=1)
if valid_mask.sum() == 0:
    th_best = 0.30
    print("No valid rows; using fallback th_best=0.30")
else:
    yv, sv = oof.loc[valid_mask, "y_true"].values, oof.loc[valid_mask, "ens_tab"].values
    th_best, P, R, F1 = business_aware_threshold(yv, sv, min_precision=0.7)
    print(f"Chosen th_best={th_best:.4f} | P={P:.3f} R={R:.3f} F1={F1:.3f}")

# Per-fold PR-AUC of the ensemble; the fold count follows the splitter rather than a literal 5.
ens_rows = []
for k in range(1, n_folds + 1):
    m = (oof["fold"] == k) & oof["ens_tab"].notna()
    if m.sum():
        pr_auc, _ = pr_auc_and_sweep(oof.loc[m, "y_true"].values, oof.loc[m, "ens_tab"].values)
        ens_rows.append({"fold": k, "PR_AUC": pr_auc, "test_n": int(m.sum())})

# Explicit columns keep the frame well-formed (and the mean NaN) even if no fold was scored.
cv_ens = pd.DataFrame(ens_rows, columns=["fold","PR_AUC","test_n"])
cv_ens.to_csv(f"{PRO}/cv_ensemble_folds.csv", index=False)
print("Ensemble mean PR-AUC:", round(cv_ens["PR_AUC"].mean(), 4))

In [None]:
import joblib, json, numpy as np, pandas as pd

# Refit all three classifiers on every labelled row (no hold-out).
logit.fit(X, y)
xgb.fit(X, y)
lgb.fit(X, y)

# Score the most recent date present in the labelled frame.
# NOTE(review): these rows are part of the training data just fitted, so the scores below
# are in-sample. Because labels need H forward rows, this date presumably lags the newest
# rows in `feat` — confirm whether unlabelled latest features should be scored instead.
as_of = meta_idx["date"].max()
mask_last = (meta_idx["date"] == as_of).values
X_last = X.loc[mask_last]
idx_last = meta_idx.loc[mask_last].reset_index(drop=True)

# Positive-class probability from each model, then their equal-weight mean.
s_logit = logit.predict_proba(X_last)[:,1]
s_xgb   = xgb.predict_proba(X_last)[:,1]
s_lgb   = lgb.predict_proba(X_last)[:,1]
s_ens   = np.nanmean(np.c_[s_logit, s_xgb, s_lgb], axis=1)

# One row per fund on `as_of`, with the individual and ensemble scores.
pred = idx_last.copy()
pred["score_logit"] = s_logit
pred["score_xgb"]   = s_xgb
pred["score_lgb"]   = s_lgb
pred["score_ens"]   = s_ens

def to_5class(prob, base):
    """Map a probability to one of five signals, using bands of width 0.10 around ``base``.

    Bands are tested from the top down, so each test only needs the band's lower edge:
    ``>= base+0.20`` Strong Buy, ``>= base+0.10`` Buy, ``>= base-0.10`` Neutral,
    ``>= base-0.20`` Sell, anything else Strong Sell. A NaN ``prob`` or ``base`` fails
    every comparison and therefore also returns "Strong Sell".
    """
    bands = (
        (base + 0.20, "Strong Buy"),
        (base + 0.10, "Buy"),
        (base - 0.10, "Neutral"),
        (base - 0.20, "Sell"),
    )
    for lower_edge, label in bands:
        if prob >= lower_edge:
            return label
    return "Strong Sell"

# Turn the ensemble score into a 5-level signal, centred on the threshold chosen from the OOF data.
pred["signal"] = pred["score_ens"].apply(lambda p: to_5class(p, th_best))
# The same table is written under two file names.
pred.to_csv(f"{PRO}/leaderboard_model.csv", index=False)
pred.to_csv(f"{PRO}/model_predictions.csv", index=False)
print("Saved latest predictions →", f"{PRO}/leaderboard_model.csv")

# Persist the fitted estimators.
joblib.dump(logit, f"{MDIR}/logit.pkl")
joblib.dump(xgb,   f"{MDIR}/xgb.pkl")
joblib.dump(lgb,   f"{MDIR}/lgb.pkl")

# Run metadata: the feature list and threshold recorded next to the pickled models.
with open(f"{MDIR}/meta.json","w") as f:
    json.dump({
        "seed": 42,
        "horizon_days": int(H),
        "features": feat_cols,
        "cv_splits": 5,
        "as_of": str(as_of.date()),
        "business_threshold": float(th_best)
    }, f, indent=2)

print("Saved models and meta.")
print("Signal distribution on latest date:")
print(pred["signal"].value_counts())
pred.head(10)

In [None]:
from lightgbm import LGBMRegressor

# Regression view of the same problem: predict the excess return itself at three quantiles.
reg_mask = dfm[feat_cols + ["y_excess_63d"]].notna().all(axis=1)
Xr = dfm.loc[reg_mask, feat_cols].astype(float)
yr = dfm.loc[reg_mask, "y_excess_63d"].astype(float).values
idx_r = dfm.loc[reg_mask, ["date","fund_id"]].reset_index(drop=True)

# Settings shared by the three quantile models; only `alpha` (the target quantile) differs.
# subsample_freq=1 is required for subsample=0.8 to take effect: LightGBM only performs
# row bagging when the bagging frequency is non-zero, and the wrapper's default is 0.
q_params = dict(objective="quantile", n_estimators=800, learning_rate=0.05,
                num_leaves=31, subsample=0.8, subsample_freq=1,
                colsample_bytree=0.8, random_state=42, n_jobs=-1)
lgb_m = LGBMRegressor(alpha=0.5, **q_params)  # median
lgb_l = LGBMRegressor(alpha=0.1, **q_params)  # lower bound (10th percentile)
lgb_u = LGBMRegressor(alpha=0.9, **q_params)  # upper bound (90th percentile)

lgb_m.fit(Xr, yr); lgb_l.fit(Xr, yr); lgb_u.fit(Xr, yr)

# Predict for the most recent labelled date.
# NOTE(review): these rows were part of the fit above, so the interval is in-sample.
as_of = idx_r["date"].max()
ix = (idx_r["date"] == as_of).values
Xr_last = Xr.loc[ix]
idx_last_r = idx_r.loc[ix].reset_index(drop=True)

p50 = lgb_m.predict(Xr_last)
p10 = lgb_l.predict(Xr_last)
p90 = lgb_u.predict(Xr_last)

# The three models are fitted independently, so p10 <= p50 <= p90 is not guaranteed row by row.
pi = idx_last_r.copy()
pi["pred_med"]    = p50
pi["pred_lo_p10"] = p10
pi["pred_hi_p90"] = p90

def decide(lo, hi):
    """Turn a predicted interval into a policy label.

    "Buy" when the lower bound is above zero, otherwise "Sell" when the upper bound is
    below zero, otherwise "Hold". The lower bound is tested first, so a crossed interval
    with ``lo > 0`` is a "Buy" whatever ``hi`` is. NaN bounds fall through to "Hold".
    """
    return "Buy" if lo > 0 else ("Sell" if hi < 0 else "Hold")

# Label every fund from its predicted p10/p90 bounds, then persist and preview the table.
pi["policy"] = list(map(decide, pi["pred_lo_p10"], pi["pred_hi_p90"]))
pi.to_csv(f"{PRO}/policy_simulation.csv", index=False)
pi.head(10)

In [None]:
import numpy as np, pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

def smape(y_true, y_pred, eps=1e-9):
    """Symmetric MAPE as a fraction: ``mean(2*|pred - true| / (|true| + |pred|))``.

    The denominator is floored at ``eps`` so that pairs where both values are zero
    contribute 0 instead of dividing by zero. The result is not multiplied by 100.
    """
    actual = np.asarray(y_true, float)
    forecast = np.asarray(y_pred, float)
    scale = np.maximum(np.abs(actual) + np.abs(forecast), eps)
    ratios = 2.0 * np.abs(forecast - actual) / scale
    return float(ratios.mean())

def bootstrap_ci(metric_fn, y, yhat, B=1000, seed=42):
    """Bootstrap a metric: return ``(mean, 2.5th percentile, 97.5th percentile)``.

    Draws ``B`` resamples of the (y, yhat) pairs with replacement, each the same length
    as the input, and evaluates ``metric_fn(y_sample, yhat_sample)`` on every one. The
    first element is the mean of the bootstrap statistics, not the metric on the
    original sample. Results are reproducible for a given ``seed``.
    """
    rng = np.random.default_rng(seed)
    y_arr = np.asarray(y, float)
    yhat_arr = np.asarray(yhat, float)
    n = len(y_arr)
    # Lazy generator: each index vector is drawn right before its metric is evaluated.
    draws = (rng.integers(0, n, n) for _ in range(B))
    stats = [metric_fn(y_arr[pick], yhat_arr[pick]) for pick in draws]
    lower, upper = np.percentile(stats, [2.5, 97.5])
    return float(np.mean(stats)), float(lower), float(upper)

# The naive baseline is a per-fund lag, so it has to be computed in (fund_id, date) order:
# the forecast for a row is the realised label 63 rows earlier for the same fund.
# NOTE(review): rows with missing features were dropped earlier, so "63 rows back" may
# span more than 63 trading days for some funds — confirm this is acceptable.
dfm = dfm.sort_values(["fund_id","date"]).reset_index(drop=True)
dfm["y_naive_63"] = dfm.groupby("fund_id")["y_excess_63d"].shift(63)

# TimeSeriesSplit cuts on ROW POSITION. Splitting the fund-ordered frame would make each
# fold a split over funds, with training rows that post-date the test rows. Re-order by
# date so every fold trains strictly on the past.
reg_df = dfm.sort_values(["date","fund_id"], kind="mergesort").reset_index(drop=True)
reg_mask = reg_df[feat_cols + ["y_excess_63d"]].notna().all(axis=1)
Xr = reg_df.loc[reg_mask, feat_cols].astype(float)
yr = reg_df.loc[reg_mask, "y_excess_63d"].astype(float).values
y_naive_all = reg_df.loc[reg_mask, "y_naive_63"].reset_index(drop=True).values
idx_r = reg_df.loc[reg_mask, ["date","fund_id"]].reset_index(drop=True)
assert idx_r["date"].is_monotonic_increasing, "rows must be in chronological order for TimeSeriesSplit"

tscv_reg = TimeSeriesSplit(n_splits=5)
oof_p50 = np.full(len(Xr), np.nan, dtype=float)  # stays NaN for rows never placed in a test fold

for k, (tr, te) in enumerate(tscv_reg.split(Xr), 1):
    # subsample_freq=1 is required for subsample=0.8 to take effect: LightGBM only performs
    # row bagging when the bagging frequency is non-zero, and the wrapper's default is 0.
    m50 = LGBMRegressor(objective="quantile", alpha=0.5, n_estimators=800, learning_rate=0.05,
                        num_leaves=31, subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
                        random_state=42, n_jobs=-1)
    m50.fit(Xr.iloc[tr], yr[tr])
    pred_te = m50.predict(Xr.iloc[te])
    oof_p50[te] = pred_te

    # Persist the fold's predictions next to the truth and the naive forecast.
    reg_fold = idx_r.iloc[te].copy()
    reg_fold["y_true"] = yr[te]
    reg_fold["y_hat_p50"] = pred_te
    reg_fold["y_hat_naive"] = y_naive_all[te]
    reg_fold["fold"] = k
    reg_fold.to_csv(f"{PRO}/reg_fold_p50_{k}.csv", index=False)

# Compare both forecasts on exactly the same rows: scored out-of-fold AND with a naive value.
valid = (~np.isnan(oof_p50)) & (~np.isnan(yr)) & (~np.isnan(y_naive_all))
y_true_eval   = yr[valid]
y_model_eval  = oof_p50[valid]
y_naive_eval  = y_naive_all[valid]

mae_n, mae_n_lo, mae_n_hi = bootstrap_ci(mean_absolute_error, y_true_eval, y_naive_eval)
sm_n,  sm_n_lo,  sm_n_hi  = bootstrap_ci(smape,              y_true_eval, y_naive_eval)

mae_m, mae_m_lo, mae_m_hi = bootstrap_ci(mean_absolute_error, y_true_eval, y_model_eval)
sm_m,  sm_m_lo,  sm_m_hi  = bootstrap_ci(smape,              y_true_eval, y_model_eval)

reg_cmp = pd.DataFrame({
    "model":        ["Seasonal-Naive(63d)", "Quantile-LGB p50 (OOF)"],
    "n_samples":    [int(valid.sum())] * 2,
    "MAE":          [mae_n,  mae_m],
    "MAE_lo95":     [mae_n_lo, mae_m_lo],
    "MAE_hi95":     [mae_n_hi, mae_m_hi],
    "SMAPE":        [sm_n,   sm_m],
    "SMAPE_lo95":   [sm_n_lo, sm_m_lo],
    "SMAPE_hi95":   [sm_n_hi, sm_m_hi],
}).round(6)

reg_cmp_path = f"{RPT}/regression_baseline_vs_model.csv"
reg_cmp.to_csv(reg_cmp_path, index=False)
print("Saved →", reg_cmp_path)
reg_cmp

In [None]:
import os, numpy as np, pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix, precision_recall_curve
)

def mean_pr_from_candidates(paths):
    """Return the mean of the ``PR_AUC`` column from the first usable file in ``paths``.

    Files are tried in order. A file is usable when it exists, has at least one row and
    has a ``PR_AUC`` column; all later candidates are then ignored. Returns NaN when no
    candidate is usable.
    """
    for candidate in paths:
        if not os.path.exists(candidate):
            continue
        table = pd.read_csv(candidate)
        if len(table) and "PR_AUC" in table.columns:
            return float(table["PR_AUC"].mean())
    return np.nan

# Mean per-fold PR-AUC of each model, read back from the fold tables written earlier.
# per_fold_eval_summary.csv is deliberately NOT a fallback: it holds rows for every model,
# so the plain mean of its PR_AUC column is not the score of any single model.
pr_logit = mean_pr_from_candidates([f"{PRO}/cv_logistic.csv", f"{PRO}/cv_logistic_folds.csv", f"{PRO}/cv_logit_folds.csv"])
pr_xgb   = mean_pr_from_candidates([f"{PRO}/cv_xgb.csv",      f"{PRO}/cv_xgb_folds.csv"])
pr_lgb   = mean_pr_from_candidates([f"{PRO}/cv_lgb.csv",      f"{PRO}/cv_lgb_folds.csv"])

# Ensemble: prefer the mean of its per-fold PR-AUCs so the figure is aggregated the same
# way as the three models above. Fall back to the pooled score over all OOF rows only when
# the fold table is unavailable.
oof_path = f"{PRO}/clf_oof_predictions.csv"
pr_ens = mean_pr_from_candidates([f"{PRO}/cv_ensemble_folds.csv"])
if np.isnan(pr_ens) and os.path.exists(oof_path):
    oof_df_tmp = pd.read_csv(oof_path, usecols=["y_true","ens_tab"])
    m = oof_df_tmp.dropna(subset=["y_true","ens_tab"])
    if len(m): pr_ens = float(average_precision_score(m["y_true"].values, m["ens_tab"].values))

res = pd.DataFrame({
    "model":  ["Logistic","XGBoost","LightGBM","Ensemble"],
    "PR_AUC": [pr_logit, pr_xgb,   pr_lgb,     pr_ens]
})
res.to_csv(f"{PRO}/results_summary_classification.csv", index=False)
print("Saved →", f"{PRO}/results_summary_classification.csv")
display(res.round(4))

In [None]:
# Reload the saved OOF table; `oof_path` is the variable set in the previous cell.
oof_df = pd.read_csv(oof_path, parse_dates=["date"])
# Fold ids are required to evaluate fold by fold, so fail loudly if they are absent.
if "fold" not in oof_df.columns:
    raise ValueError("`fold` column missing in OOF file.")

# Evaluate only the score columns that are actually present in the file.
model_cols = [c for c in ["logit","xgb","lgb","ens_tab"] if c in oof_df.columns]

def best_threshold(y_true, scores):
    """Return ``(threshold, precision, recall, f1)`` at the F1-maximising curve point.

    The last precision-recall point has no threshold and is excluded from the search.
    If the curve has a single point, index 0 is used; if that index has no threshold,
    0.5 is returned as the threshold.
    """
    precision, recall, cutoffs = precision_recall_curve(y_true, scores)
    # The 1e-12 keeps the ratio finite where precision + recall == 0.
    f_scores = (2 * precision * recall) / (precision + recall + 1e-12)

    best = 0
    if len(f_scores) > 1:
        best = int(np.nanargmax(f_scores[:-1]))

    if best < len(cutoffs):
        chosen = cutoffs[best]
    else:
        chosen = 0.5
    return chosen, float(precision[best]), float(recall[best]), float(f_scores[best])

# Evaluate every model on every fold at its own F1-optimal threshold.
# NOTE(review): each threshold is tuned on the fold it is scored on, so P/R/F1 are optimistic.
rows = []
for k in sorted(oof_df["fold"].dropna().unique()):
    # `fold` is read back as float (1.0, 2.0, ...); cast it once so the confusion-matrix
    # files are named "..._fold1.csv" rather than "..._fold1.0.csv".
    fold_no = int(k)
    part = oof_df.loc[oof_df["fold"] == k].copy()
    y_true_k = part["y_true"].values.astype(int)
    for m in model_cols:
        s = part[m].values.astype(float)
        valid = ~np.isnan(s)
        if valid.sum() == 0:
            continue
        yv, sv = y_true_k[valid], s[valid]
        pr_auc = average_precision_score(yv, sv)
        th, p, r, f1 = best_threshold(yv, sv)
        yhat = (sv >= th).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a fold contains a single class;
        # without it the 4-way unpack raises.
        tn, fp, fn, tp = confusion_matrix(yv, yhat, labels=[0, 1]).ravel()
        cm_df = pd.DataFrame([[tn, fp],[fn, tp]],
                             columns=["Pred 0","Pred 1"], index=["Actual 0","Actual 1"])
        cm_df.to_csv(os.path.join(PRO, f"cm_{m}_fold{fold_no}.csv"))
        rows.append({
            "model": m, "fold": fold_no, "n": int(len(yv)),
            "PR_AUC": float(pr_auc), "threshold": float(th),
            "Precision": float(p), "Recall": float(r), "F1": float(f1),
            "TN": int(tn), "FP": int(fp), "FN": int(fn), "TP": int(tp)
        })

if rows:
    per_fold = pd.DataFrame(rows).sort_values(["model","fold"]).reset_index(drop=True)
    per_fold.to_csv(f"{PRO}/per_fold_eval_summary.csv", index=False)
    # Sample-size-weighted averages per model. Built as explicit records so the two count
    # columns stay integers (a Series-returning groupby.apply would upcast them to float).
    avg_rows = []
    for model_name, g in per_fold.groupby("model"):
        avg_rows.append({
            "model": model_name,
            "folds": int(g["fold"].nunique()),
            "total_n": int(g["n"].sum()),
            "PR_AUC_mean": float(np.average(g["PR_AUC"], weights=g["n"])),
            "Precision_mean": float(np.average(g["Precision"], weights=g["n"])),
            "Recall_mean": float(np.average(g["Recall"], weights=g["n"])),
            "F1_mean": float(np.average(g["F1"], weights=g["n"]))
        })
    avg = pd.DataFrame(avg_rows)
    avg.to_csv(f"{PRO}/per_fold_eval_averaged.csv", index=False)
    print("Saved →", f"{PRO}/per_fold_eval_summary.csv")
    print("Saved →", f"{PRO}/per_fold_eval_averaged.csv")
    display(avg.round(4))
else:
    print("No per-fold results produced.")

In [None]:
import pandas as pd, numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

# Reload the OOF scores and the per-fold PR-AUC tables.
# NOTE: this rebinds the notebook-level names `oof` and `th_best` used by earlier cells.
oof = pd.read_csv(f"{PRO}/clf_oof_predictions.csv")
cv_logit = pd.read_csv(f"{PRO}/cv_logistic_folds.csv")
cv_xgb   = pd.read_csv(f"{PRO}/cv_xgb_folds.csv")
cv_lgb   = pd.read_csv(f"{PRO}/cv_lgb_folds.csv")

# Ensemble weights proportional to each model's mean fold PR-AUC, normalised to sum to 1.
w = {
    "logit": cv_logit["PR_AUC"].mean(),
    "xgb":   cv_xgb["PR_AUC"].mean(),
    "lgb":   cv_lgb["PR_AUC"].mean()
}
w_sum = sum(w.values())
for k in w: w[k] /= w_sum

oof["ens_weighted"] = (
    w["logit"] * oof["logit"] +
    w["xgb"]   * oof["xgb"] +
    w["lgb"]   * oof["lgb"]
)

# Rows that were never in a test fold have NaN scores (the first TimeSeriesSplit chunk is
# train-only). sklearn's metric functions reject NaN, so all diagnostics use scored rows only.
scored = oof[["y_true","ens_weighted"]].notna().all(axis=1)
if not scored.any():
    raise ValueError("No out-of-fold rows with a weighted-ensemble score to evaluate.")
y_true = oof.loc[scored, "y_true"].values
scores = oof.loc[scored, "ens_weighted"].values
prec, rec, th = precision_recall_curve(y_true, scores)

# Prefer the best-F1 point among thresholds with precision >= 0.70 and recall >= 0.30;
# otherwise fall back to the overall best-F1 threshold.
# NOTE(review): the threshold is tuned and reported on the same OOF rows, so P/R are optimistic.
candidates = [(t, P, R) for P, R, t in zip(prec, rec, th) if P>=0.70 and R>=0.30]
if candidates:
    best = max(candidates, key=lambda x: 2*x[1]*x[2]/(x[1]+x[2]+1e-12))
else:
    f1 = 2*prec*rec/(prec+rec+1e-12)
    idx = np.nanargmax(f1[:-1])
    best = (th[idx], prec[idx], rec[idx])

th_best, P_best, R_best = best[:3]
pr_auc = average_precision_score(y_true, scores)
print(f"Weighted Ensemble PR_AUC={pr_auc:.3f} | τ={th_best:.3f} | P={P_best:.3f} | R={R_best:.3f}")

pd.DataFrame({
    "model":["WeightedEnsemble"],
    "PR_AUC":[pr_auc],
    "threshold":[th_best],
    "precision":[P_best],
    "recall":[R_best]
}).to_csv(f"{PRO}/weighted_ensemble_metrics.csv", index=False)

# Precision-recall curve per model, each on the rows where that model has a score.
# Loop-local names are used so the notebook-level label vector `y` is not overwritten.
plt.figure(figsize=(6,4))
for m in ["logit","xgb","lgb","ens_tab","ens_weighted"]:
    if m not in oof.columns: continue
    has_score = oof[m].notna()
    if not has_score.any(): continue
    Prec, Rec, _ = precision_recall_curve(oof.loc[has_score, "y_true"], oof.loc[has_score, m])
    plt.plot(Rec, Prec, label=m)
plt.title("Precision–Recall Curves (OOF)")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.legend(); plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(f"{PRO}/pr_curve_models.png", dpi=160)

# Reliability diagram of the weighted ensemble on the scored rows.
plt.figure(figsize=(5,4))
prob_true, prob_pred = calibration_curve(y_true, scores, n_bins=15)
plt.plot(prob_pred, prob_true, marker='o', label='Ensemble (weighted)')
plt.plot([0,1],[0,1],'--',color='gray')
plt.xlabel("Predicted probability"); plt.ylabel("True positive rate")
plt.title("Calibration Curve")
plt.legend(); plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(f"{PRO}/calibration_plot.png", dpi=160)
plt.show()

# Confusion matrix at the chosen threshold, on scored rows only: comparing NaN against
# the threshold is False, which would count every unscored row as a predicted negative.
yhat = (scores >= th_best).astype(int)
tn, fp, fn, tp = confusion_matrix(y_true, yhat, labels=[0, 1]).ravel()
print(f"Confusion: TP={tp} FP={fp} TN={tn} FN={fn}")

# Export the 20 highest-scored misclassified rows for manual inspection.
err = oof.loc[scored].copy()
err["pred_bin"] = yhat
err["error_type"] = np.select(
    [(y_true==1)&(yhat==0), (y_true==0)&(yhat==1)],
    ["False Negative","False Positive"],
    default="Correct"
)
sample_err = err.loc[err["error_type"]!="Correct"]\
    .sort_values("ens_weighted", ascending=False)\
    .head(20)
sample_err.to_csv(f"{PRO}/error_ablation_top20.csv", index=False)
print("Saved representative FP/FN examples → error_ablation_top20.csv")