In [None]:
# Mount Google Drive so the project folders below are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Project layout on Drive: processed inputs are read from PRO, outputs go to RPT.
BASE = "/content/drive/MyDrive/MLBA_Project"
PRO  = f"{BASE}/data/processed"
RPT  = f"{BASE}/reports"
os.makedirs(PRO, exist_ok=True)
os.makedirs(RPT, exist_ok=True)

# Prediction table (file name suggests out-of-fold classifier scores — confirm).
# Column names are lower-cased so later lookups are case-insensitive.
oof = (pd.read_csv(f"{PRO}/clf_oof_predictions.csv", parse_dates=["date"])
         .rename(columns=str.lower))
assert {"date", "fund_id", "y_true"} <= set(oof.columns)

# Prefer the weighted ensemble score; otherwise fall back to `ens_tab`.
if "ens_weighted" in oof.columns:
    score_col = "ens_weighted"
else:
    score_col = "ens_tab"
if score_col not in oof.columns:
    raise ValueError("Missing ensemble column (ens_weighted or ens_tab) in OOF.")
print("Using score column:", score_col)

# Master panel, sorted by (fund_id, date) so per-fund shifts walk forward in time.
mst = pd.read_csv(f"{PRO}/clean_master_union.csv", parse_dates=["date"])
mst.columns = mst.columns.str.lower()
mst = mst.sort_values(["fund_id","date"]).reset_index(drop=True)

# Forward horizon, counted in ROWS of each series (not calendar days).
H = 63
if "y_excess_63d" not in mst.columns:
    # Fund leg: NAV H rows ahead within the same fund.
    nav_fwd = mst.groupby("fund_id")["nav"].shift(-H)
    mst["fwd_ret_63d_fund"] = nav_fwd / mst["nav"] - 1.0

    # Benchmark leg: the shift below counts rows, so there must be exactly one
    # `tri` level per date. De-duplicating on (date, tri) alone leaves two rows
    # for any date that carries both a missing and a valid level, which would
    # miscount the horizon and duplicate master rows in the merge.
    # `tri` is presumably a single benchmark total-return index — confirm.
    tri_daily = (mst[["date","tri"]]
                 .drop_duplicates()
                 .sort_values(["date","tri"], na_position="last"))
    if tri_daily.dropna(subset=["tri"])["date"].duplicated().any():
        raise ValueError("Conflicting 'tri' values on the same date; "
                         "expected a single benchmark level per date.")
    # Sorted with NaN last, so keep="first" prefers the valid level on each date.
    tri_daily = tri_daily.drop_duplicates(subset="date", keep="first").copy()
    tri_daily["tri_fwd"] = tri_daily["tri"].shift(-H)
    tri_daily["fwd_ret_63d_bmk"] = tri_daily["tri_fwd"] / tri_daily["tri"] - 1.0

    # validate= makes any remaining key duplication fail loudly instead of
    # silently multiplying rows of the master panel.
    mst = mst.merge(tri_daily[["date","fwd_ret_63d_bmk"]], on="date", how="left",
                    validate="many_to_one")
    mst["y_excess_63d"] = mst["fwd_ret_63d_fund"] - mst["fwd_ret_63d_bmk"]

# Decision threshold τ: use the value saved in the reports folder when it can
# be read, otherwise keep the default below.
th_best = 0.30
th_path = f"{RPT}/threshold_choice.json"
if os.path.exists(th_path):
    import json
    try:
        # Context manager closes the handle even if parsing fails.
        with open(th_path) as fh:
            th_best = float(json.load(fh)["threshold"])
        print(f"Loaded τ from reports: {th_best:.3f}")
    except (OSError, ValueError, KeyError, TypeError) as err:
        # Still best-effort, but say so: a silent fallback would make the
        # back-test run on a different τ than the reader expects.
        print(f"Could not read τ from {th_path} ({err!r}); using default {th_best:.3f}")

# Baseline back-test configuration; the cells below read these keys.
CFG = {
    "rebalance_freq": "W-FRI",   # pandas offset alias used for the decision calendar
    "portfolio_size": 8,         # maximum names per decision date
    "threshold": th_best,        # minimum score for a name to be eligible
    "max_weight": 0.25,          # per-name weight cap passed to the selector
    "tx_cost": 0.002,            # one-way cost; the simulator charges it twice
    "holding_days": H,
}

print("CFG:", CFG)

In [None]:
def decision_calendar(df_dates: pd.Series, freq: str) -> pd.DatetimeIndex:
    """Return the last *observed* date of every `freq` period in `df_dates`.

    Parameters
    ----------
    df_dates : dates that actually occur in the data (duplicates and missing
        values are ignored).
    freq : pandas offset alias for the rebalance period, e.g. ``"W-FRI"``.
        ``"BM"`` and ``"BME"`` (any case) are both treated as ``"BME"``.

    Returns
    -------
    pd.DatetimeIndex named ``"date"``, sorted ascending, containing one real
    data date per non-empty period.

    Resampling a frame that has no columns only yields the bin labels (e.g.
    every Friday), so a period whose label date is missing from the data — a
    holiday, say — would never match a row downstream. Carrying the date as
    the resampled value keeps the last date that was really seen and lets
    empty periods drop out as NaT.
    """
    dates = (pd.Series(pd.to_datetime(df_dates))
               .dropna().drop_duplicates().sort_values())
    if dates.empty:
        return pd.DatetimeIndex([], name="date")

    # Index and values are the same timestamps: the index drives the binning,
    # the values are what `.last()` reports for each bin.
    stamped = dates.copy()
    stamped.index = pd.DatetimeIndex(dates)

    rule = "BME" if freq.upper() in ("BM", "BME") else freq
    last_seen = stamped.resample(rule).last().dropna()
    return pd.DatetimeIndex(last_seen, name="date")

def select_topk(day_slice: pd.DataFrame, K: int, use_threshold: bool, thr: float, max_w: float,
                score: str = None):
    """Pick up to `K` highest-scoring funds from one date and weight them equally.

    Parameters
    ----------
    day_slice : rows for a single decision date; needs ``fund_id`` and the
        score column.
    K : maximum number of names to keep.
    use_threshold : when True, only rows with score >= `thr` are eligible.
    thr : score threshold (ignored when `use_threshold` is False).
    max_w : per-name weight cap.
    score : name of the score column; defaults to the notebook-level
        ``score_col``.

    Returns
    -------
    DataFrame with columns ``fund_id``, the score column and ``weight``, sorted
    by score descending. When nothing qualifies, the (empty) filtered slice is
    returned as-is, without a ``weight`` column.

    Each name gets ``min(1/n, max_w)``. When ``n * max_w < 1`` the weights sum
    to less than 1 and the remainder is left uninvested: rescaling equal
    weights back to 1 would restore 1/n and cancel the cap.
    """
    col = score_col if score is None else score

    g = day_slice.dropna(subset=[col])
    if use_threshold:
        g = g.loc[g[col] >= thr]
    g = g.sort_values(col, ascending=False).head(K)
    if g.empty:
        return g

    capped_equal = min(1.0 / len(g), max_w)
    return g.assign(weight=capped_equal)[["fund_id", col, "weight"]].copy()

def build_trades(oof_df: pd.DataFrame, freq: str, K: int, use_threshold: bool, thr: float, max_w: float):
    """Build the trade list: one basket of weighted funds per decision date.

    Parameters
    ----------
    oof_df : prediction table with ``date``, ``fund_id`` and the notebook-level
        ``score_col`` column; rows with a missing value in any of them are
        ignored.
    freq : rebalance frequency passed to `decision_calendar`.
    K, use_threshold, thr, max_w : forwarded unchanged to `select_topk`.

    Returns
    -------
    DataFrame with columns ``decision_date``, ``fund_id``, ``weight`` and the
    score column, sorted by (decision_date, fund_id). An empty frame with the
    same columns is returned when no date produces a basket.
    """
    out_cols = ["decision_date", "fund_id", "weight", score_col]

    o = oof_df[["date","fund_id",score_col]].dropna().copy()
    dates = decision_calendar(o["date"], freq=freq)

    # One pass over the rows that fall on a decision date, instead of a
    # full-frame boolean scan for every date.
    on_calendar = o.loc[o["date"].isin(dates)]

    picks = []
    for d, day in on_calendar.groupby("date"):
        sel = select_topk(day, K=K, use_threshold=use_threshold, thr=thr, max_w=max_w)
        if sel.empty:
            continue
        picks.append(sel.assign(decision_date=d)[out_cols])

    if not picks:
        # Same schema as the non-empty result so callers can index columns
        # without first checking `.empty`.
        return (o.iloc[0:0]
                 .rename(columns={"date": "decision_date"})
                 .assign(weight=np.nan)[out_cols])

    trades = pd.concat(picks, ignore_index=True)
    return trades.sort_values(["decision_date","fund_id"]).reset_index(drop=True)

# Baseline trade list from the configured frequency, basket size and threshold.
trades = build_trades(
    oof_df=oof,
    freq=CFG["rebalance_freq"],
    K=CFG["portfolio_size"],
    use_threshold=True,
    thr=CFG["threshold"],
    max_w=CFG["max_weight"],
)
print("Trades head:\n", trades.head(10))

# Quick sanity summary: number of rebalances and average basket size.
n_decisions = trades["decision_date"].nunique()
names_per_decision = trades.groupby("decision_date")["fund_id"].nunique()
print("Decision dates:", n_decisions, "| avg names/dec:",
      round(names_per_decision.mean(), 2))

In [None]:
class PortfolioSimulator:
    """Tranche-level back-test of selected baskets on realised excess returns.

    Every decision date in the trade list is one tranche. Each position's
    realised excess return is looked up in a ``y_excess_63d`` label table,
    charged a round-trip transaction cost, weighted, and compounded into an
    equity curve.

    NOTE(review): tranche returns are compounded back-to-back at the rebalance
    gap. The label is named as a 63-day forward return, so with a weekly gap
    the tranches would overlap in time and the curve / annualised figures
    would overstate what one pool of capital could earn — confirm whether
    overlapping tranches should be capital-split instead.

    Parameters
    ----------
    holding_days : stored as ``self.H``; not used by `run` itself.
    tx_cost : one-way transaction cost as a fraction of position value; `run`
        charges it twice per position (entry and exit).
    labels : optional frame with ``date``, ``fund_id`` and ``y_excess_63d``.
        Defaults to the notebook-level ``mst`` frame.

    Attributes
    ----------
    tranche_log : one row per decision date (filled by `run`).
    position_log : list of per-tranche position frames from the latest `run`.
    metrics : summary statistics from the latest `run`.
    """

    def __init__(self, holding_days=63, tx_cost=0.002, labels=None):
        self.H = int(holding_days)
        self.tx = float(tx_cost)
        self.tranche_log = pd.DataFrame(columns=[
            "decision_date","n_pos","ret_excess","ret_excess_net","equity"
        ])
        self.position_log = []
        self.metrics = pd.Series(dtype=float)

        source = mst if labels is None else labels
        # (date, fund_id) -> realised excess return; rows without a label are
        # dropped so a lookup miss means "no realised value available".
        self._y_map = (source[["date","fund_id","y_excess_63d"]]
                        .dropna().set_index(["date","fund_id"])["y_excess_63d"]
                        .sort_index())

    def _realized_excess(self, d, f):
        """Return the label for fund `f` on date `d`, or NaN when there is none."""
        try:
            return float(self._y_map.loc[(d, f)])
        except KeyError:
            return np.nan

    def run(self, trades_df: pd.DataFrame):
        """Simulate every tranche in `trades_df` and compute summary metrics.

        Parameters
        ----------
        trades_df : frame with ``decision_date``, ``fund_id`` and ``weight``.

        Returns
        -------
        The tranche log: one row per decision date with ``n_pos``,
        ``ret_excess`` (gross), ``ret_excess_net``, ``turnover``, ``hits`` and
        the running ``equity``. Summary statistics are stored on
        ``self.metrics``.

        Raises
        ------
        ValueError if `trades_df` is empty.
        """
        if trades_df.empty:
            raise ValueError("No trades to simulate.")

        # Start clean so the position log always matches the tranche log.
        self.position_log = []
        round_trip = 2.0 * self.tx
        equity = 1.0
        rows = []
        prev_set = set()

        for d in sorted(trades_df["decision_date"].drop_duplicates()):
            basket = trades_df.loc[trades_df["decision_date"] == d].copy()
            if basket.empty:
                continue

            basket["ret_excess"] = [self._realized_excess(d, f) for f in basket["fund_id"]]
            # Cost is a per-position return drag; the weight is applied once,
            # in the portfolio sum below. Scaling the cost by weight here as
            # well would charge 2*tx*sum(w^2) instead of 2*tx*sum(w).
            basket["ret_excess_net"] = basket["ret_excess"] - round_trip

            # Positions without a label are NaN and are skipped by .sum().
            r_raw = float((basket["weight"] * basket["ret_excess"]).sum())
            r_net = float((basket["weight"] * basket["ret_excess_net"]).sum())
            equity *= (1.0 + r_net)

            # Name turnover vs. the previous tranche: names added plus names
            # dropped, relative to the average basket size.
            cur_set = set(basket["fund_id"])
            if prev_set:
                changed = len(cur_set - prev_set) + len(prev_set - cur_set)
                denom = max((len(cur_set) + len(prev_set)) / 2.0, 1.0)
                turnover = changed / denom
            else:
                turnover = np.nan
            prev_set = cur_set

            is_hit = basket["ret_excess_net"] > 0
            self.position_log.append(basket.assign(hit=is_hit.astype(int)))

            rows.append(dict(
                decision_date=d, n_pos=int(len(basket)),
                ret_excess=r_raw, ret_excess_net=r_net,
                turnover=turnover, hits=int(is_hit.sum()), equity=equity
            ))

        self.tranche_log = pd.DataFrame(rows).sort_values("decision_date").reset_index(drop=True)

        tr = self.tranche_log
        if len(tr):
            r = tr["ret_excess_net"].fillna(0.0).values
            n_obs = int(len(r))

            # Gaps are CALENDAR days, so periods-per-year uses a calendar
            # year; dividing 252 trading days by a calendar gap would
            # undercount (e.g. 36 "weeks" per year).
            gaps = tr["decision_date"].diff().dt.days.dropna()
            if len(gaps):
                avg_gap = float(gaps.mean())
                ann_factor = 365.25 / max(avg_gap, 1.0)
            else:
                # Single tranche: no gap to measure, keep the monthly default.
                avg_gap = 21.0
                ann_factor = 12.0

            mu = np.mean(r)
            sd = np.std(r, ddof=1) if n_obs > 1 else 0.0
            sharpe = (mu * ann_factor) / (sd * np.sqrt(ann_factor) + 1e-12) if sd > 0 else np.nan

            eq = tr["equity"].values
            peak = np.maximum.accumulate(eq)
            dd = (eq / peak) - 1.0
            mdd = float(np.min(dd)) if len(dd) else np.nan

            # Geometric annualisation: total growth over n periods, rescaled
            # to ann_factor periods. The exponent must be ann_factor / n;
            # raising total growth to ann_factor alone explodes with n.
            growth = float(np.prod(1.0 + r))
            ann_ret = growth ** (ann_factor / n_obs) - 1.0 if growth > 0 else -1.0

            self.metrics = pd.Series({
                "annualized_excess_return": ann_ret,
                "sharpe": sharpe,
                "max_drawdown": mdd,
                "hit_rate": float((r > 0).mean()),
                "observations": n_obs,
                "avg_gap_days": avg_gap,
                # Series.mean skips NaN and returns NaN (no warning) when the
                # only tranche has no turnover value.
                "avg_turnover": float(tr["turnover"].mean()) if "turnover" in tr else np.nan
            })
        return self.tranche_log

In [None]:
# Baseline back-test: simulate the configured trade list, then persist the
# tranche log, the summary metrics and the equity chart to the reports folder.
sim = PortfolioSimulator(holding_days=CFG["holding_days"], tx_cost=CFG["tx_cost"])
curve = sim.run(trades)

bt_tranche_path = f"{RPT}/bt_tranche_log.csv"
curve.to_csv(bt_tranche_path, index=False)
print("Saved →", bt_tranche_path)

import json
bt_metrics_path = f"{RPT}/bt_metrics.json"
# Context manager guarantees the file is flushed and closed before the
# "Saved" message is printed.
with open(bt_metrics_path, "w") as fh:
    json.dump(sim.metrics.to_dict(), fh, indent=2)
print("Metrics:\n", sim.metrics.round(4))
print("Saved →", bt_metrics_path)

plt.figure(figsize=(7,4))
plt.plot(curve["decision_date"], curve["equity"])
plt.title(f"Equity — {CFG['rebalance_freq']}, K={CFG['portfolio_size']}, τ={CFG['threshold']:.2f}")
plt.xlabel("Decision Date"); plt.ylabel("Equity (excess, cum)")
plt.grid(True, alpha=0.3); plt.tight_layout()
plt.savefig(f"{RPT}/bt_equity_base.png", dpi=160)
plt.show()

In [None]:
# Per-tranche diagnostics: basket size, name turnover and tranche returns.
sel = trades.sort_values(["decision_date","fund_id"]).copy()

names_by_date = sel.groupby("decision_date")["fund_id"]
per_tranche = names_by_date.nunique().rename("positions").reset_index()

# Turnover between consecutive baskets: names that entered or left (the
# symmetric difference) relative to the average size of the two baskets.
# The first basket has nothing to compare against and is recorded as NaN.
turn_vals = []
held_before = set()
for when, grp in sel.groupby("decision_date"):
    held_now = set(grp["fund_id"])
    if held_before:
        n_changed = len(held_now ^ held_before)
        avg_size = max((len(held_now) + len(held_before)) / 2.0, 1.0)
        turn_vals.append((when, n_changed / avg_size))
    else:
        turn_vals.append((when, np.nan))
    held_before = held_now
turn_df = pd.DataFrame(turn_vals, columns=["decision_date","turnover"])

# Attach the simulator's tranche returns to the size / turnover table.
tl = sim.tranche_log[["decision_date","n_pos","ret_excess","ret_excess_net"]]
turn_stats = per_tranche.merge(turn_df, on="decision_date", how="left")
turn_stats = turn_stats.merge(tl, on="decision_date", how="left")

outp = f"{RPT}/bt_turnover_stats.csv"
turn_stats.to_csv(outp, index=False)
print("Saved →", outp)
print(turn_stats.describe([0.25,0.5,0.75]).round(3))

# Equity curve against a flat line at 1.0 (zero cumulative excess return).
plt.figure(figsize=(7,4))
plt.plot(curve["decision_date"], curve["equity"], label="Strategy (excess)")
plt.plot(curve["decision_date"], np.ones(len(curve)), label="Zero-excess baseline")
plt.legend()
plt.title("Excess Equity vs Baseline")
plt.xlabel("Decision Date")
plt.ylabel("Equity")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f"{RPT}/bt_equity_vs_zero.png", dpi=160)
plt.show()

In [None]:
from itertools import product

def run_bt(freq, K, use_thr, thr, max_w=0.25, tx=0.002):
    """Run one back-test configuration and return ``(metrics, tranche_log)``.

    Builds the trade list from the notebook-level ``oof`` frame, simulates it
    with holding period ``H`` and cost `tx`, and tags the metrics with the
    configuration. When the configuration yields no trades, the metrics are
    NaN (with ``observations`` = 0) and the log is an empty DataFrame.
    Otherwise the log carries a ``cfg`` column describing the configuration.
    """
    ident = {"freq": freq, "K": K, "use_threshold": use_thr, "threshold": thr}

    trd = build_trades(oof, freq=freq, K=K, use_threshold=use_thr, thr=thr, max_w=max_w)
    if trd.empty:
        blank = dict(ident,
                     observations=0,
                     sharpe=np.nan,
                     annualized_excess_return=np.nan,
                     max_drawdown=np.nan,
                     hit_rate=np.nan,
                     avg_turnover=np.nan)
        return blank, pd.DataFrame()

    sim_g = PortfolioSimulator(holding_days=H, tx_cost=tx)
    pnl = sim_g.run(trd)

    met = sim_g.metrics.to_dict()
    met.update(ident)
    met["observations"] = int(len(pnl))

    mode = "thr" if use_thr else "no_thr"
    return met, pnl.assign(cfg=f"{freq}|K={K}|{mode}={thr:.2f}")

# Robustness grid: rebalance frequency x basket size, first without a score
# threshold and then with each threshold value.
grid_freq = ["W-FRI", "BME"]
grid_K    = [3, 5, 8]
grid_thr  = [0.25, 0.30, 0.35]

# (freq, K, use_thr, thr, tag) for every configuration, no-threshold runs first.
grid_specs = [(f, K, False, 0.0, "no_thr") for f, K in product(grid_freq, grid_K)]
grid_specs += [(f, K, True, th, f"thr{th:.2f}")
               for f, K, th in product(grid_freq, grid_K, grid_thr)]

results, curves = [], {}
for f, K, with_thr, th, tag in grid_specs:
    met, pnl = run_bt(f, K, use_thr=with_thr, thr=th, max_w=0.25, tx=CFG["tx_cost"])
    results.append(met)
    curves[(f, K, tag)] = pnl

robust = pd.DataFrame(results)
robust_path = f"{RPT}/bt_robustness_grid.csv"
robust.to_csv(robust_path, index=False)
print("Saved →", robust_path)

ranked = robust.sort_values("sharpe", ascending=False)
display(ranked.head(10).round(3))

# Keys of the three best configurations by Sharpe, in the same
# (freq, K, tag) form used for `curves`.
top_keys = [
    (row["freq"], row["K"],
     "no_thr" if not row["use_threshold"] else f"thr{row['threshold']:.2f}")
    for _, row in ranked.head(3).iterrows()
]

plt.figure(figsize=(7,4))
for key in top_keys:
    # First stored curve whose frequency and K match and whose tag contains
    # the wanted tag; configurations that produced no trades are not drawn.
    match = next(
        ((k, pnl) for k, pnl in curves.items()
         if k[0] == key[0] and k[1] == key[1] and key[2] in k[2]),
        None,
    )
    if match is None:
        continue
    k, pnl = match
    if not pnl.empty:
        plt.plot(pnl["decision_date"], pnl["equity"], label=f"{k[0]}, K={k[1]}, {k[2]}")
plt.title("Equity (top configs)")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f"{RPT}/bt_robust_equity.png", dpi=160)
plt.show()