In [None]:
# Mount Google Drive at /content/drive; the BASE path defined below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import os, json, numpy as np, pandas as pd
import matplotlib.pyplot as plt

# --- Project locations on the mounted Drive ---------------------------------
BASE = "/content/drive/MyDrive/MLBA_Project"
PRO  = f"{BASE}/data/processed"
RPT  = f"{BASE}/reports"
for d in [PRO, RPT]:
    os.makedirs(d, exist_ok=True)

# --- Out-of-fold classifier predictions, keyed by (date, fund_id) ------------
oof = pd.read_csv(f"{PRO}/clf_oof_predictions.csv", parse_dates=["date"])
oof.columns = oof.columns.str.lower()

# Prefer the weighted ensemble score; fall back to "ens_tab" when it is absent.
score_col = "ens_weighted" if "ens_weighted" in oof.columns else "ens_tab"
assert set(["date","fund_id","y_true"]).issubset(oof.columns), "OOF missing key columns"
assert score_col in oof.columns, f"Missing ensemble column: {score_col}"

# --- Rebuild the forward excess return when the OOF file does not carry it ---
if "y_excess_63d" not in oof.columns:
    mst = pd.read_csv(f"{PRO}/clean_master_union.csv", parse_dates=["date"])
    mst.columns = mst.columns.str.lower()
    mst = mst.sort_values(["fund_id","date"]).reset_index(drop=True)

    # Horizon is counted in rows, not calendar days.
    # NOTE(review): assumes one row per fund per trading day with no gaps — confirm.
    H = 63
    mst["fwd_ret_63d_fund"] = mst.groupby("fund_id")["nav"].shift(-H) / mst["nav"] - 1.0

    # Exactly one benchmark row per date. De-duplicating on (date, tri) would keep
    # several rows for a date whenever `tri` is missing or differs across funds,
    # which shifts the H-row look-ahead and multiplies rows in the merge below.
    # NOTE(review): assumes a single benchmark series shared by all funds — confirm.
    tri_daily = (
        mst[["date","tri"]]
        .dropna(subset=["tri"])
        .drop_duplicates(subset="date")
        .sort_values("date")
        .copy()
    )
    tri_daily["tri_fwd"] = tri_daily["tri"].shift(-H)
    tri_daily["fwd_ret_63d_bmk"] = tri_daily["tri_fwd"] / tri_daily["tri"] - 1.0
    mst = mst.merge(tri_daily[["date","fwd_ret_63d_bmk"]], on="date", how="left")
    mst["y_excess_63d"] = mst["fwd_ret_63d_fund"] - mst["fwd_ret_63d_bmk"]

    # Keep one label row per key so the left join cannot duplicate OOF rows.
    excess = mst[["date","fund_id","y_excess_63d"]].drop_duplicates(subset=["date","fund_id"])
    oof = oof.merge(excess, on=["date","fund_id"], how="left")

# --- Engineered features used by the ablation cell ---------------------------
feat = pd.read_csv(f"{PRO}/clean_with_features.csv", parse_dates=["date"])
feat.columns = feat.columns.str.lower()

# Convenience arrays for the selected ensemble score.
y_true = oof["y_true"].values
scores = oof[score_col].values

In [None]:
from sklearn.metrics import average_precision_score

def bootstrap_ci_prauc(y, s, B=1000, seed=42):
    """Bootstrap the average precision (PR-AUC) of scores `s` against labels `y`.

    Rows where either the label or the score is NaN are dropped first.

    Parameters
    ----------
    y : array-like of 0/1 labels (NaN allowed, dropped).
    s : array-like of scores (NaN allowed, dropped).
    B : number of bootstrap resamples.
    seed : seed for the NumPy random generator.

    Returns
    -------
    (mean, lo, hi) : bootstrap mean and the 2.5th / 97.5th percentiles of the
        resampled average precision. All three are NaN when there are no valid
        rows, no positive labels, or no usable resample.
    """
    rng = np.random.default_rng(seed)
    # Casting to float up front lets np.isnan work on int/object/nullable inputs.
    y = np.asarray(y, dtype=float)
    s = np.asarray(s, dtype=float)
    valid = ~(np.isnan(y) | np.isnan(s))
    y, s = y[valid].astype(int), s[valid]
    n = len(y)
    # Average precision is undefined without at least one positive label.
    if n == 0 or not (y == 1).any():
        return np.nan, np.nan, np.nan

    vals = []
    for _ in range(B):
        # Draw one resample at a time: a (B, n) index matrix does not fit in
        # memory for large prediction sets.
        idx = rng.integers(0, n, size=n)
        y_b = y[idx]
        if not (y_b == 1).any():
            # Skip resamples without positives rather than record a degenerate score.
            continue
        vals.append(average_precision_score(y_b, s[idx]))

    if not vals:
        return np.nan, np.nan, np.nan
    mean = float(np.mean(vals))
    lo, hi = np.percentile(vals, [2.5, 97.5])
    return mean, float(lo), float(hi)

# Bootstrap PR-AUC (mean plus 95% percentile bounds) for each model column that
# exists in the OOF frame, then persist the table to the reports folder.
ensemble_labels = {"ens_weighted": "Ensemble(Weighted)", "ens_tab": "Ensemble"}

rows = []
for name in ("logit", "xgb", "lgb", score_col):
    if name in oof.columns:
        m, lo, hi = bootstrap_ci_prauc(oof["y_true"], oof[name])
        rows.append({
            "model": ensemble_labels.get(name, name.capitalize()),
            "prauc_mean": m,
            "prauc_lo95": lo,
            "prauc_hi95": hi,
        })

ci_df = pd.DataFrame(rows)
ci_path = f"{RPT}/pr_auc_ci.csv"
ci_df.to_csv(ci_path, index=False)
print("Saved →", ci_path)
ci_df.round(4)

In [None]:
from sklearn.metrics import precision_recall_curve

def threshold_sweep_with_cost(y, s, realized_excess=None, tx=0.002):
    """Sweep decision thresholds and attach precision, recall and a net payoff.

    Rows where the label or the score is NaN are dropped. Thresholds come from
    `precision_recall_curve`, plus a final `inf` row matching the curve's last
    precision/recall point (nothing selected).

    Parameters
    ----------
    y : array-like of 0/1 labels.
    s : array-like of scores; a row is selected when `score >= threshold`.
    realized_excess : optional array-like aligned with `y`/`s`. When given, the
        payoff of a selected row is its realized value minus `tx`; missing
        realized values count as 0.0 (and still pay `tx`). When omitted, a
        selected positive earns 0.05, a selected negative loses 0.02, and every
        selected row pays `tx`.
    tx : per-selection transaction cost.

    Returns
    -------
    DataFrame with columns `threshold`, `precision`, `recall`, `cost`; empty
    (same columns) when no valid rows remain.
    """
    y = np.asarray(y, dtype=float)
    s = np.asarray(s, dtype=float)
    valid = ~(np.isnan(y) | np.isnan(s))
    yv, sv = y[valid].astype(int), s[valid]
    if len(yv) == 0:
        return pd.DataFrame(columns=["threshold", "precision", "recall", "cost"], dtype=float)

    prec, rec, th = precision_recall_curve(yv, sv)
    sweep = pd.DataFrame({"threshold": np.append(th, np.inf), "precision": prec, "recall": rec})

    # Rank rows by descending score: the rows selected at any threshold are then
    # a prefix of this ordering, so cumulative sums replace a full pass per
    # threshold (the per-threshold loop was quadratic in the number of rows).
    order = np.argsort(-sv, kind="stable")
    pos = yv[order] == 1
    neg = yv[order] == 0
    tp_cnt = np.concatenate(([0], np.cumsum(pos)))
    fp_cnt = np.concatenate(([0], np.cumsum(neg)))

    # k[i] = number of rows with score >= threshold[i] (0 for the inf row).
    k = len(sv) - np.searchsorted(np.sort(sv), sweep["threshold"].to_numpy(), side="left")

    if realized_excess is not None:
        r = np.nan_to_num(np.asarray(realized_excess, dtype=float), nan=0.0)[valid][order]
        tp_ret = np.concatenate(([0.0], np.cumsum(np.where(pos, r, 0.0))))
        fp_ret = np.concatenate(([0.0], np.cumsum(np.where(neg, r, 0.0))))
        cost = (tp_ret[k] - tx * tp_cnt[k]) + (fp_ret[k] - tx * fp_cnt[k])
    else:
        cost = tp_cnt[k] * 0.05 - fp_cnt[k] * 0.02 - tx * k

    sweep["cost"] = np.asarray(cost, dtype=float)
    return sweep

# Threshold sweep for the selected ensemble score. Realized excess returns are
# passed through only when the OOF frame carries them.
realized = oof["y_excess_63d"].values if "y_excess_63d" in oof.columns else None
ens_sweep = threshold_sweep_with_cost(
    oof["y_true"].values,
    oof[score_col].values,
    realized_excess=realized,
)
ens_sweep_path = f"{RPT}/threshold_sweep_{score_col}.csv"
ens_sweep.to_csv(ens_sweep_path, index=False)
print("Saved →", ens_sweep_path)

# Cost curve: the trailing inf threshold is replaced by NaN so it is not drawn.
x = ens_sweep["threshold"].replace(np.inf, np.nan)
curve_name = "Weighted Ensemble" if score_col == "ens_weighted" else "Ensemble"

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(x, ens_sweep["cost"])
ax.set_xlabel("Threshold")
ax.set_ylabel("Net payoff (arb. units)")
ax.set_title(f"Cost Curve — {curve_name}")
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(f"{RPT}/cost_curve_{score_col}.png", dpi=160)
plt.show()

# Five thresholds with the highest net payoff.
ens_sweep.sort_values("cost", ascending=False).head(5)

In [None]:
def lift_and_topdecile(df, date_col, label_col, score_col, tag):
    """Write a decile lift table/plot and per-year top-10% precision for one score.

    Rows with a missing date, label or score are dropped. Scores are converted
    to a pooled percentile rank (over all remaining rows), bucketed into ten
    bins 0..9, and the mean label per bin is saved as CSV and PNG under `RPT`.
    The mean label of rows with percentile rank >= 0.90 is then reported per
    calendar year and saved as CSV.

    Parameters
    ----------
    df : DataFrame containing `date_col`, `label_col` and `score_col`.
    date_col, label_col, score_col : column names to use.
    tag : suffix used in the output file names and log messages.

    Returns
    -------
    (lift, by_year) DataFrames; two empty DataFrames when no valid rows remain.
    """
    data = df[[date_col, label_col, score_col]].dropna().copy()
    if data.empty:
        print(f"[{tag}] No valid rows.")
        return pd.DataFrame(), pd.DataFrame()

    # Percentile rank in (0, 1]; a rank of exactly 1.0 is clipped into bucket 9.
    data["score_q"] = data[score_col].rank(pct=True)
    data["bucket"] = (data["score_q"] * 10).astype(int).clip(0, 9)

    lift = data.groupby("bucket", as_index=False)[label_col].mean()
    lift = lift.rename(columns={label_col: "positive_rate"})
    lift_path = f"{RPT}/lift_by_bucket_{tag}.csv"
    lift.to_csv(lift_path, index=False)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(lift["bucket"], lift["positive_rate"], marker="o")
    ax.set_xticks(range(0, 10))
    ax.set_xlabel("Score Decile (0=low, 9=high)")
    ax.set_ylabel("Positive Rate")
    ax.set_title(f"Lift by Decile — {tag}")
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    lift_png = f"{RPT}/lift_by_bucket_{tag}.png"
    fig.savefig(lift_png, dpi=160)
    plt.show()
    print(f"[{tag}] Saved → {lift_path}, {lift_png}")

    # Precision of the pooled top 10% of scores, broken out by calendar year.
    data["year"] = pd.to_datetime(data[date_col]).dt.year
    top_rows = data[data["score_q"] >= 0.90]
    by_year = top_rows.groupby("year", as_index=False)[label_col].mean()
    by_year = by_year.rename(columns={label_col: "precision_top10"})
    by_year_path = f"{RPT}/precision_by_year_top10_{tag}.csv"
    by_year.to_csv(by_year_path, index=False)
    print(f"[{tag}] Saved → {by_year_path}")
    return lift, by_year

# File-name tag for the selected ensemble, then lift / top-decile diagnostics
# on the OOF frame with the score column renamed to "score".
if score_col == "ens_weighted":
    tag = "ensemble_weighted"
else:
    tag = "ensemble"

ens_lift, ens_by_year = lift_and_topdecile(
    df=oof.rename(columns={score_col: "score"}),
    date_col="date",
    label_col="y_true",
    score_col="score",
    tag=tag,
)

In [None]:
from sklearn.metrics import precision_recall_curve, confusion_matrix, average_precision_score

def balanced_business_threshold(y_true, scores, target_precision=0.70, min_recall=0.30):
    """Pick a decision threshold that meets a precision and a recall floor.

    Rows with a NaN label or score are ignored. The first threshold returned by
    `precision_recall_curve` whose precision is >= `target_precision` and whose
    recall is >= `min_recall` wins. If none qualifies, the threshold with the
    highest F1 is used instead.

    Returns
    -------
    (threshold, precision, recall). With no valid rows: (0.5, nan, nan).
    """
    labels = np.asarray(y_true, dtype=float)
    vals = np.asarray(scores, dtype=float)
    keep = ~np.isnan(labels) & ~np.isnan(vals)
    if keep.sum() == 0:
        return 0.5, np.nan, np.nan

    prec, rec, th = precision_recall_curve(labels[keep].astype(int), vals[keep])

    # prec/rec carry one more entry than th; only the first len(th) are paired
    # with an actual threshold.
    n_th = len(th)
    feasible = np.flatnonzero((prec[:n_th] >= target_precision) & (rec[:n_th] >= min_recall))
    if feasible.size:
        first = feasible[0]
        return th[first], float(prec[first]), float(rec[first])

    # Fallback: best F1 over the points that have a threshold (epsilon avoids 0/0).
    f1 = 2 * prec * rec / (prec + rec + 1e-12)
    best = np.nanargmax(f1[:-1])
    return th[best], float(prec[best]), float(rec[best])

def dump_fp_fn(tag, df_scored, score_col, label_col="y_true", top_k=25,
               target_precision=0.70, min_recall=0.30):
    """Save the most confident false positives / false negatives and a confusion matrix.

    Rows with a missing label or score are excluded from every statistic, so the
    threshold search, the confusion matrix and PR-AUC all use the same rows.
    The threshold comes from `balanced_business_threshold`. Three CSV files are
    written under `RPT` (`representative_fp_*`, `representative_fn_*`, `cm_*`)
    and a one-line summary plus the confusion matrix are printed.

    Parameters
    ----------
    tag : suffix for output file names and the printed header.
    df_scored : DataFrame holding `score_col` and `label_col`.
    score_col, label_col : column names of the score and the 0/1 label.
    top_k : number of FP and FN rows to keep.
    target_precision, min_recall : forwarded to `balanced_business_threshold`.

    Returns
    -------
    None.
    """
    valid = df_scored[score_col].notna() & df_scored[label_col].notna()
    out = df_scored.loc[valid].copy()
    if out.empty:
        print(f"[{tag}] No valid rows.")
        return None

    scores = out[score_col].to_numpy(dtype=float)
    labels = out[label_col].to_numpy(dtype=float).astype(int)

    th_star, P, R = balanced_business_threshold(labels, scores, target_precision, min_recall)
    pred = (scores >= th_star).astype(int)
    # labels=[0, 1] forces a 2x2 matrix even when only one class appears in
    # the labels and predictions; otherwise the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(labels, pred, labels=[0, 1]).ravel()
    pr_auc = average_precision_score(labels, scores)

    out["pred"] = pred
    # Confidence in the predicted class: score for predicted 1, (1 - score) for
    # predicted 0. NOTE(review): assumes scores are probabilities in [0, 1] — confirm.
    out["margin"] = np.where(out["pred"]==1, out[score_col], 1.0 - out[score_col])

    FP = out[(out["pred"]==1) & (out[label_col]==0)].sort_values("margin", ascending=False).head(top_k)
    FN = out[(out["pred"]==0) & (out[label_col]==1)].sort_values("margin", ascending=False).head(top_k)

    FP.to_csv(f"{RPT}/representative_fp_{tag}.csv", index=False)
    FN.to_csv(f"{RPT}/representative_fn_{tag}.csv", index=False)

    cm_df = pd.DataFrame([[tn, fp],[fn, tp]], columns=["Pred 0","Pred 1"], index=["Actual 0","Actual 1"])
    cm_df.to_csv(f"{RPT}/cm_{tag}.csv")

    print(f"\n[{tag.upper()}] PR-AUC={pr_auc:.4f} | thr={th_star:.3f} | P={P:.3f} R={R:.3f}")
    print(cm_df)

# FP/FN dump for the selected ensemble: keep rows that have both a label and a
# score, and expose the score under the generic column name "score".
required = ["y_true", score_col]
base = oof[["date", "fund_id", *required]].dropna(subset=required).copy()
dump_fp_fn(
    tag,
    base.rename(columns={score_col: "score"}),
    score_col="score",
    label_col="y_true",
    top_k=25,
)

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMClassifier
from sklearn.metrics import average_precision_score

# Join engineered features with the OOF labels on (date, fund_id). A `y_true`
# column already present in the feature file is dropped first: otherwise the
# merge would suffix both copies, the plain `y_true` lookup below would fail,
# and the suffixed label would slip into the feature list.
dfm = (
    feat.drop(columns=["y_true"], errors="ignore")
        .merge(oof[["date","fund_id","y_true"]], on=["date","fund_id"], how="inner")
        .copy()
)
# TimeSeriesSplit (used by run_lgb) cuts folds by row position, so rows must be
# in chronological order. Sorting by fund first would train on some funds'
# later dates and test on other funds' earlier dates.
# NOTE(review): folds can still split inside one date and forward-looking labels
# may overlap across the train/test boundary — consider a gap/embargo.
dfm = dfm.sort_values(["date","fund_id"]).reset_index(drop=True)

# Identifier, price and target columns are never used as model inputs.
ban = {"date","fund_id","nav","tri","y_true","y_excess_63d"}
feat_cols_all = [c for c in dfm.columns if (c not in ban) and pd.api.types.is_numeric_dtype(dfm[c])]

# Complete cases only: every feature and the label must be present.
mask = dfm[feat_cols_all + ["y_true"]].notna().all(axis=1)
X0 = dfm.loc[mask, feat_cols_all].astype(float)
y0 = dfm.loc[mask, "y_true"].astype(int)

def run_lgb(X, y, n_splits=3, seed=42):
    """Mean average precision of a LightGBM classifier over TimeSeriesSplit folds.

    Folds are taken by row position, so `X`/`y` must already be in
    chronological order.

    Parameters
    ----------
    X : DataFrame of features.
    y : Series of 0/1 labels aligned with `X`.
    n_splits : number of TimeSeriesSplit folds.
    seed : random_state passed to LGBMClassifier.

    Returns
    -------
    float : mean average precision over the usable folds; NaN when there are too
        few rows to split or no fold is usable.
    """
    # TimeSeriesSplit needs more samples than splits; report NaN like the
    # "no folds" case instead of raising.
    if len(X) <= n_splits:
        return np.nan

    tscv = TimeSeriesSplit(n_splits=n_splits)
    ap = []
    for tr, te in tscv.split(X):
        y_tr, y_te = y.iloc[tr], y.iloc[te]
        # A single-class training fold cannot yield a positive-class probability,
        # and average precision is undefined without positives in the test fold.
        if y_tr.nunique() < 2 or not (y_te == 1).any():
            continue
        # NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0;
        # with the default frequency row bagging looks inactive — confirm intent.
        clf = LGBMClassifier(
            n_estimators=600, learning_rate=0.05, num_leaves=31,
            subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
            random_state=seed, n_jobs=-1
        )
        clf.fit(X.iloc[tr], y_tr)
        s = clf.predict_proba(X.iloc[te])[:, 1]
        ap.append(average_precision_score(y_te, s))
    return float(np.mean(ap)) if len(ap) else np.nan

rows_abl = []

# A model can only be fitted when there are rows and both classes are present.
# The same guard applies to every run in this cell, not just the baseline.
can_fit = bool(len(X0)) and len(np.unique(y0)) > 1

# Baseline: every numeric feature.
pr_base = run_lgb(X0, y0) if can_fit else np.nan
rows_abl.append({"ablation":"baseline_all", "features":len(feat_cols_all), "PR_AUC": pr_base})

# Ablation: drop every feature whose name contains one of the macro series keys.
macro_keys = ["india_vix","usd_inr","gsec_10y","gold_inr","brent_usd"]
feat_nomacro = [c for c in feat_cols_all if not any(k in c for k in macro_keys)]
pr_nomacro = (
    run_lgb(dfm.loc[mask, feat_nomacro].astype(float), y0)
    if (can_fit and len(feat_nomacro)) else np.nan
)
rows_abl.append({"ablation":"no_macro", "features":len(feat_nomacro), "PR_AUC": pr_nomacro})

def alt_window_run(include_keys, window_keys=None):
    """Score a model that keeps only the requested look-back window features.

    All features whose name contains any of `window_keys` are removed, then the
    ones matching `include_keys` are added back, so each run isolates a single
    window. Removing only the 21d/63d features left both the 7d and 14d
    features in every run, which made the window ablations identical.

    Parameters
    ----------
    include_keys : substrings identifying the window features to keep.
    window_keys : substrings identifying all window features; defaults to the
        7d/14d/21d/63d `ret_*` and `excess_*` families.

    Returns
    -------
    (pr_auc, n_features); (nan, 0) when none of `include_keys` matches a column.
    """
    if window_keys is None:
        window_keys = ["ret_7d", "excess_7d", "ret_14d", "excess_14d",
                       "ret_21d", "excess_21d", "ret_63d", "excess_63d"]

    include_keys = [k for k in include_keys if any(k in c for c in feat_cols_all)]
    if not include_keys:
        return np.nan, 0

    def _matches(col, keys):
        # Substring match of any key against the column name.
        return any(k in col for k in keys)

    subset = sorted({
        c for c in feat_cols_all
        if _matches(c, include_keys) or not _matches(c, window_keys)
    })
    Xm = dfm.loc[mask, subset].astype(float)
    return run_lgb(Xm, y0), len(subset)

# Alternative look-back windows: one ablation row per window family.
window_ablations = {
    "windows_7d": ["ret_7d", "excess_7d"],
    "windows_14d": ["ret_14d", "excess_14d"],
}
for tag_alt, keys in window_ablations.items():
    pr_alt, nfeat = alt_window_run(keys)
    rows_abl.append({"ablation": tag_alt, "features": nfeat, "PR_AUC": pr_alt})

# Persist the ablation table and show it rounded.
abl_df = pd.DataFrame(rows_abl)
abl_path = f"{RPT}/ablation_window_macro.csv"
abl_df.to_csv(abl_path, index=False)
print("Saved →", abl_path)
abl_df.round(4)