Imports, config, and paths

In [1]:
# Week 5 — Regime Classifier (clean, reproducible, feedback-incorporated)

import sys, json, platform, warnings
from pathlib import Path
import numpy as np
import pandas as pd

# Suppress every warning for the remainder of the notebook run.
warnings.filterwarnings("ignore")

# Reproducibility snapshot: interpreter, OS, and the version of each package
# in a fixed probe list (recorded in probe order).
def _package_version(pkg_name):
    """Return the package's ``__version__``; "unknown" if absent, "not-installed" if the import fails."""
    try:
        return getattr(__import__(pkg_name), "__version__", "unknown")
    except Exception:
        return "not-installed"

VERS = {"python": sys.version.split()[0], "platform": platform.platform()}
VERS.update({
    pkg_name: _package_version(pkg_name)
    for pkg_name in ("numpy", "pandas", "scipy", "lightgbm", "sklearn",
                     "matplotlib", "seaborn", "cvxpy", "statsmodels")
})

# Paths: every input and artefact name below is resolved against the
# current working directory.
W5 = Path.cwd().resolve()
FN = dict(
    feat_w3="features_week3.parquet",        # Week 3 scenario-level features
    cube_npy="synthetic_paths_20d.npy",      # optional fallback for mapping
    model_pkl="week5_regime_model.pkl",      # saved classifier bundle
    metrics_json="week5_metrics.json",       # summary metrics
    env_json="week5_env.json",               # environment snapshot
    # plots
    cm_png="week5_confusion_matrix.png",
    reliab_png="week5_reliability_curve.png",
    roc_png="week5_roc_ovr.png",
    imp_png="week5_feature_importance.png",
)

def must(path_like):
    """Resolve ``path_like`` under ``W5`` and insist that it exists.

    Parameters
    ----------
    path_like : str or Path
        File name (or relative path) joined onto the working directory ``W5``.

    Returns
    -------
    Path
        The joined path, when it exists on disk.

    Raises
    ------
    FileNotFoundError
        If nothing exists at the joined path.
    """
    candidate = W5 / path_like
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Missing required file: {candidate}")

def try_path(path_like):
    """Resolve ``path_like`` under ``W5``; return the path if it exists, otherwise ``None``."""
    candidate = W5 / path_like
    if not candidate.exists():
        return None
    return candidate

# Global parameters. ALPHA belongs to the later portfolio work; it is declared
# here only so the same value is carried across weeks.
SEED = 42
ALPHA = 0.95
np.random.seed(SEED)  # seeds NumPy's legacy global RNG

# Write the environment snapshot as pretty-printed JSON and report where it went
env_snapshot_path = W5 / FN["env_json"]
env_snapshot_path.write_text(json.dumps(VERS, indent=2))
print("Saved environment snapshot →", env_snapshot_path)


Saved environment snapshot → C:\Users\Krish\OneDrive - Curtin\Desktop\University\University\2025 Sem 2\MATH3004\22155584_Week_5\week5_env.json


Load Week 3 scenario features and locate the regime/cluster label column

In [2]:
# Read the Week 3 scenario-level feature table; it must exist and be non-empty.
feat_w3 = pd.read_parquet(must(FN["feat_w3"]))
assert len(feat_w3) > 0, "features_week3.parquet is empty."

# The label column is the first candidate (in priority order) present in the table.
LABEL_CANDIDATES = ["regime", "regime_consensus", "cluster", "kmeans_label"]
for candidate_col in LABEL_CANDIDATES:
    if candidate_col in feat_w3.columns:
        label_col = candidate_col
        break
else:
    label_col = None

if label_col is None:
    raise ValueError(f"No regime/cluster label found in features_week3.parquet. "
                     f"Expected one of {LABEL_CANDIDATES}.")

# A precomputed portfolio-level mean return is the preferred ranking signal.
PORT_MEAN_COL = "port_mean"
have_port_mean = PORT_MEAN_COL in feat_w3.columns

# Fallback when 'port_mean' is absent: derive an equal-weight portfolio
# horizon return per scenario from the saved scenario cube.
r_port_from_cube = None
if not have_port_mean:
    cube_path = try_path(FN["cube_npy"])
    if cube_path is None:
        raise FileNotFoundError("features_week3.parquet lacks 'port_mean' AND synthetic_paths_20d.npy not found.")
    cube = np.load(cube_path)          # expected layout (N, H, A) of simple returns
    N, H, A = cube.shape
    w_eq = np.full(A, 1.0 / A)         # equal weights over the A assets
    # Per-step portfolio returns (N, H), compounded over the horizon -> (N,)
    port_step_returns = cube @ w_eq
    r_port_from_cube = np.prod(1.0 + port_step_returns, axis=1) - 1.0
    # If the two sources disagree in length, keep the common leading rows of both
    if len(r_port_from_cube) != len(feat_w3):
        M = min(len(r_port_from_cube), len(feat_w3))
        r_port_from_cube = r_port_from_cube[:M]
        feat_w3 = feat_w3.iloc[:M].copy()

print("Week 3 features shape:", feat_w3.shape, "| label_col:", label_col)


Week 3 features shape: (5000, 9) | label_col: regime_consensus


Map Week 3 clusters to bear / neutral / bull regimes (ranked by mean portfolio return)

In [3]:
# Build a deterministic cluster -> regime mapping by ranking clusters on their
# mean portfolio return.
labels_raw = feat_w3[label_col].copy()

# A missing label would otherwise become a pseudo-cluster (category code -1)
# or a garbage integer after the int cast, so reject it up front.
assert labels_raw.notna().all(), f"Label column '{label_col}' contains missing values."

# Normalize to integer cluster ids. pd.api.types.is_numeric_dtype is used
# because np.issubdtype raises TypeError on pandas extension dtypes
# (category, string, nullable integers).
if not pd.api.types.is_numeric_dtype(labels_raw):
    cats = labels_raw.astype("category")
    clusters = cats.cat.codes.to_numpy()  # 0..K-1
else:
    clusters = labels_raw.to_numpy().astype(int)

K = int(pd.Series(clusters).nunique())
assert K >= 2, "Need at least 2 clusters for regime mapping."

# Use port_mean if available; else the equal-weight return computed from the cube
if have_port_mean:
    r_port = feat_w3[PORT_MEAN_COL].astype(float).to_numpy()
else:
    r_port = r_port_from_cube
    assert r_port is not None, "r_port could not be computed."

# Per-cluster mean portfolio return, sorted ascending
df_tmp = pd.DataFrame({"cluster": clusters, "r": r_port})
mu_per_cluster = df_tmp.groupby("cluster")["r"].mean().sort_values()
# sort_values puts NaN last, so a cluster without a usable mean would be
# silently labelled "bull"; fail loudly instead.
assert np.isfinite(mu_per_cluster.to_numpy(dtype=float)).all(), \
    "Every cluster needs a finite mean portfolio return for ranking."
ranked_clusters = list(mu_per_cluster.index)

# Lowest mean -> bear, highest mean -> bull, everything in between -> neutral.
# With K == 2 the middle slice is empty, so no special case is needed.
reg_map = {ranked_clusters[0]: "bear", ranked_clusters[-1]: "bull"}
reg_map.update({c: "neutral" for c in ranked_clusters[1:-1]})

reg_labels = pd.Series(clusters).map(reg_map).astype("category")
assert reg_labels.notna().all(), "All scenarios must be assigned a regime."

# Data-driven regime mix (relative frequency of each regime)
REGIME_MIX = reg_labels.value_counts(normalize=True).to_dict()
# Required assertions
assert set(REGIME_MIX.keys()).issubset({"bear","neutral","bull"})
assert abs(sum(REGIME_MIX.values()) - 1.0) < 1e-8, "REGIME_MIX must sum to 1."

print("Regime mapping OK. Mix:", REGIME_MIX)


Regime mapping OK. Mix: {'bull': 0.4168, 'neutral': 0.3916, 'bear': 0.1916}


Prepare training data (X, y) from Week 3 features

In [4]:
# Integer target encoding for the classifier: position in REG_ORDER
REG_ORDER = ["bear", "neutral", "bull"]
reg_to_int = {r:i for i,r in enumerate(REG_ORDER)}
y = reg_labels.map(reg_to_int).to_numpy().astype(int)

# Feature columns: every numeric column that is not a label or an id
exclude_cols = set(LABEL_CANDIDATES) | {"scenario_id", "regime", "regime_consensus"}
num_cols = [c for c in feat_w3.columns
            if c not in exclude_cols and pd.api.types.is_numeric_dtype(feat_w3[c])]

if len(num_cols) < 3:
    raise ValueError("Not enough numeric feature columns for classification. "
                     "Expected Week 3 features like port_mean, port_std, xsec_corr, etc.")

# Turn +/-inf into NaN BEFORE any median is computed; otherwise an infinite
# value can leak into the median that is later used as the fill value.
X = feat_w3[num_cols].replace([np.inf, -np.inf], np.nan)

# Stratified 75/25 split (same seed and targets as before, so the same rows
# land in each partition).
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, random_state=SEED, stratify=y
)

# Impute with medians learned from the training rows only, so no statistic
# of the held-out rows influences the training features.
train_medians = X_tr.median(numeric_only=True)
X_tr = X_tr.fillna(train_medians)
X_te = X_te.fillna(train_medians)
X = X.fillna(train_medians)  # keep the full matrix imputed the same way

print("Train:", X_tr.shape, " Test:", X_te.shape, "  #features:", len(num_cols))


Train: (3750, 8)  Test: (1250, 8)   #features: 8


Train regime classifier (LightGBM → RF fallback), calibrate probabilities, and evaluate on the test split

In [5]:
# Train a regime classifier with class weighting, calibrate its probabilities,
# and score it on the held-out split.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, log_loss, brier_score_loss, roc_auc_score
from sklearn.preprocessing import label_binarize

# LightGBM is optional; any failure while importing it (including a broken
# native library) switches to the RandomForest fallback.
USE_LGBM = True
try:
    from lightgbm import LGBMClassifier
except Exception:
    USE_LGBM = False
from sklearn.ensemble import RandomForestClassifier

# Inverse-frequency class weights: n_samples / (n_classes * class_count).
# n_classes is taken from the training labels instead of a hard-coded 3.0.
cls_counts = pd.Series(y_tr).value_counts().to_dict()
n_train_classes = len(cls_counts)
weights = {c: (len(y_tr) / (float(n_train_classes) * cls_counts[c])) for c in cls_counts}

if USE_LGBM:
    base = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        max_depth=-1,
        num_leaves=31,
        objective="multiclass",
        num_class=len(REG_ORDER),
        class_weight=weights,
        random_state=SEED,
    )
    # Fit on the training rows only; the held-out test rows are deliberately
    # NOT passed to fit (no eval_set), so they are seen only at scoring time.
    base.fit(X_tr, y_tr)
else:
    base = RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_leaf=2,
        class_weight="balanced",
        random_state=SEED,
        n_jobs=-1
    ).fit(X_tr, y_tr)

# Calibrate probabilities with 3-fold isotonic calibration on the training rows.
# NOTE(review): with an integer cv the calibrator refits clones of `base` per
# fold rather than reusing the fit above — confirm against the installed sklearn.
clf = CalibratedClassifierCV(base, method="isotonic", cv=3)
clf.fit(X_tr, y_tr)

# Predictions/probs on the held-out rows
y_pred = clf.predict(X_te)
proba  = clf.predict_proba(X_te)

# Metrics
rep = classification_report(y_te, y_pred, target_names=REG_ORDER, output_dict=True, zero_division=0)
cm  = confusion_matrix(y_te, y_pred)
ll  = log_loss(y_te, proba, labels=list(range(len(REG_ORDER))))
# Brier (multiclass -> mean squared error over all one-vs-rest indicator cells)
Y_bin = label_binarize(y_te, classes=list(range(len(REG_ORDER))))
brier = float(np.mean((proba - Y_bin)**2))

# ROC-AUC (OvR, macro). Still best-effort, but the reason for a failure is
# reported instead of being swallowed silently.
try:
    auc = roc_auc_score(Y_bin, proba, average="macro", multi_class="ovr")
except Exception as auc_err:
    print("ROC-AUC could not be computed:", auc_err)
    auc = np.nan

metrics = {
    "macro_f1": rep["macro avg"]["f1-score"],
    "macro_precision": rep["macro avg"]["precision"],
    "macro_recall": rep["macro avg"]["recall"],
    "accuracy": rep["accuracy"],
    "log_loss": ll,
    "brier": brier,
    "roc_auc_ovr_macro": auc,
    "class_report": rep,
}

print({k: round(v, 4) if isinstance(v, (int,float)) else "..." for k,v in metrics.items() if k != "class_report"})


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001424 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 3750, number of used features: 6
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 2500, number of used features: 6
[LightGBM] [Info] Start training from score -1.098035
[LightGBM] [Info] Start training from score -1.099071
[LightGBM] [Info] Start training from score -1.098731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds

Diagnostic plots: confusion matrix, reliability curve, ROC (OvR), and feature importance

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
# Global matplotlib style: hide the top and right axis spines
plt.rcParams.update({"axes.spines.top": False, "axes.spines.right": False})

# Confusion matrix heatmap for the test split, written to disk and closed
fig_cm, ax_cm = plt.subplots(figsize=(4.5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=REG_ORDER,
    yticklabels=REG_ORDER,
    ax=ax_cm,
)
ax_cm.set_title("Confusion Matrix (test)")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("True")
fig_cm.tight_layout()
fig_cm.savefig(W5 / FN["cm_png"], dpi=150)
plt.close(fig_cm)

# Reliability curve (per class, simple binning)
def reliability_curve(y_true, proba, n_bins=10):
    """Per-class reliability data from equal-width probability bins on [0, 1].

    Parameters
    ----------
    y_true : array-like
        Integer class labels; binarized against ``range(len(REG_ORDER))``.
    proba : ndarray
        Predicted probabilities, indexed as ``proba[:, k]`` for class ``k``.
    n_bins : int, default 10
        Number of equal-width bins. Every bin is half-open ``[lo, hi)`` except
        the last, which also includes its upper edge.

    Returns
    -------
    list of tuple
        One ``(bin_centers, mean_pred, empirical_freq)`` triple per class.
        Empty bins hold NaN in both ``mean_pred`` and ``empirical_freq``.
    """
    n_classes = len(REG_ORDER)
    onehot = label_binarize(y_true, classes=list(range(n_classes)))
    bin_edges = np.linspace(0, 1, n_bins+1)
    bin_centers = 0.5*(bin_edges[1:]+bin_edges[:-1])
    last_bin = n_bins - 1

    def _bin_stats(scores, hits):
        """Mean predicted score and mean hit rate per bin for a single class."""
        pred_means, emp_freqs = [], []
        for idx in range(n_bins):
            lo, hi = bin_edges[idx], bin_edges[idx+1]
            below_hi = (scores <= hi) if idx == last_bin else (scores < hi)
            in_bin = (scores >= lo) & below_hi
            if in_bin.sum() == 0:
                pred_means.append(np.nan)
                emp_freqs.append(np.nan)
                continue
            pred_means.append(scores[in_bin].mean())
            emp_freqs.append(hits[in_bin].mean())
        return np.array(pred_means), np.array(emp_freqs)

    curves = []
    for k in range(n_classes):
        mean_pred, emp_freq = _bin_stats(proba[:, k], onehot[:, k])
        curves.append((bin_centers, mean_pred, emp_freq))
    return curves

# Reliability plot: per class, empirical frequency (markers) and mean predicted
# probability (dashed) against bin centres, plus the diagonal as reference.
rel = reliability_curve(y_te, proba, n_bins=12)
fig_rel, ax_rel = plt.subplots(figsize=(6.5, 4))
for regime_name, (xs, mp, mt) in zip(REG_ORDER, rel):
    ax_rel.plot(xs, mt, "o-", label=f"{regime_name} (emp)")
    ax_rel.plot(xs, mp, "--", alpha=0.6)
ax_rel.plot([0, 1], [0, 1], "k--", lw=1)
ax_rel.set_title("Reliability Curve (test)")
ax_rel.set_xlabel("Predicted probability (bin)")
ax_rel.set_ylabel("Empirical frequency")
ax_rel.legend()
fig_rel.tight_layout()
fig_rel.savefig(W5 / FN["reliab_png"], dpi=150)
plt.close(fig_rel)

# ROC (one-vs-rest): one curve per class, with its AUC in the legend
from sklearn.metrics import roc_curve, auc as sk_auc
Yb = label_binarize(y_te, classes=list(range(len(REG_ORDER))))
fig_roc, ax_roc = plt.subplots(figsize=(6.5, 4))
for k, regime_name in enumerate(REG_ORDER):
    fpr, tpr, _ = roc_curve(Yb[:, k], proba[:, k])
    ax_roc.plot(fpr, tpr, label=f"{regime_name} AUC={sk_auc(fpr,tpr):.3f}")
ax_roc.plot([0, 1], [0, 1], "k--", lw=1)
ax_roc.set_title("ROC (OvR) — test")
ax_roc.set_xlabel("FPR")
ax_roc.set_ylabel("TPR")
ax_roc.legend()
fig_roc.tight_layout()
fig_roc.savefig(W5 / FN["roc_png"], dpi=150)
plt.close(fig_roc)

# Feature importance (if available).
# CalibratedClassifierCV exposes neither `base_estimator_` nor
# `feature_importances_`; the fitted models live in `calibrated_classifiers_`,
# one per CV fold, each wrapping its estimator as `.estimator`
# (`.base_estimator` in older scikit-learn). Average importances over folds.
imp = None
fold_importances = []
for calibrated in getattr(clf, "calibrated_classifiers_", []):
    fold_model = getattr(calibrated, "estimator", None)
    if fold_model is None:
        fold_model = getattr(calibrated, "base_estimator", None)
    if fold_model is not None and hasattr(fold_model, "feature_importances_"):
        fold_importances.append(np.asarray(fold_model.feature_importances_, dtype=float))

if fold_importances:
    imp = pd.Series(np.mean(fold_importances, axis=0), index=num_cols).sort_values(ascending=False)
elif hasattr(clf, "feature_importances_"):
    # clf is a plain (uncalibrated) estimator that reports importances itself
    imp = pd.Series(clf.feature_importances_, index=num_cols).sort_values(ascending=False)

if imp is not None:
    fig_imp, ax_imp = plt.subplots(figsize=(6, 4))
    imp.head(15).plot(kind="barh", ax=ax_imp)
    ax_imp.invert_yaxis()  # largest importance at the top
    ax_imp.set_title("Feature importance (top 15)")
    fig_imp.tight_layout()
    fig_imp.savefig(W5 / FN["imp_png"], dpi=150)
    plt.close(fig_imp)


Save the classifier bundle and summary metrics

In [7]:
# Save classifier bundle for later weeks
# Bundle fields kept minimal & stable for Week 6/7 loaders.
pack = {
    "model": clf,
    "feature_cols": num_cols,
    "label_col": "regime_mapped",       # the derived regime
    "regime_order": REG_ORDER,
    "regime_map": reg_map,              # cluster_id -> regime string
    "regime_mix": REGIME_MIX,           # frequency-based mix
    "seed": SEED,
    "versions": VERS,
}

# Use joblib for sklearn object
import joblib
joblib.dump(pack, W5 / FN["model_pkl"])
print("Saved model bundle →", W5 / FN["model_pkl"])


def _finite_or_none(value):
    """Return ``None`` for a non-finite float (NaN/inf); return anything else unchanged.

    json.dumps would otherwise emit the bare tokens NaN/Infinity, which are
    not valid JSON and are rejected by strict parsers.
    """
    if isinstance(value, (float, np.floating)) and not np.isfinite(value):
        return None
    return value


# Save metrics JSON. The ROC-AUC entry is NaN when it could not be computed,
# so scalar metrics are passed through the sanitizer first.
MET = {k: _finite_or_none(v) for k, v in metrics.items() if k != "class_report"}
# Make class_report JSON-friendly: nested per-class dicts become plain floats
MET["class_report"] = {k: {m: _finite_or_none(float(vv)) for m, vv in d.items()} if isinstance(d, dict)
                          else _finite_or_none(d)
                       for k, d in metrics["class_report"].items()}
# allow_nan=False makes any remaining non-finite value fail loudly instead of
# silently producing an invalid file.
(W5 / FN["metrics_json"]).write_text(json.dumps(MET, indent=2, allow_nan=False))
print("Saved metrics →", W5 / FN["metrics_json"])

# Final reminder for alpha consistency (from feedback)
print("NOTE: Keep alpha =", ALPHA, "consistently across Week 4–7. Remove any stray overrides (e.g., alpha=1.0).")


Saved model bundle → C:\Users\Krish\OneDrive - Curtin\Desktop\University\University\2025 Sem 2\MATH3004\22155584_Week_5\week5_regime_model.pkl
Saved metrics → C:\Users\Krish\OneDrive - Curtin\Desktop\University\University\2025 Sem 2\MATH3004\22155584_Week_5\week5_metrics.json
NOTE: Keep alpha = 0.95 consistently across Week 4–7. Remove any stray overrides (e.g., alpha=1.0).
