In [None]:
#Imports & Setup
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# Project paths.
# If the kernel's working directory is the Notebooks/ folder, the project root
# is its parent; otherwise the working directory itself is treated as the root.
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name.lower() == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent

# Processed-data folder and the two artifacts this notebook reads from it.
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")
MODEL_TABLE_PATH = DATA_PROCESSED.joinpath("eurusd_h1_model_table_rv24h.parquet")
SPLIT_REPORT_PATH = DATA_PROCESSED.joinpath("eurusd_h1_walkforward_splits_rv24h.csv")

# Quick visibility check before anything tries to read the files.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("MODEL_TABLE_PATH exists:", MODEL_TABLE_PATH.exists())
print("SPLIT_REPORT_PATH exists:", SPLIT_REPORT_PATH.exists())


In [None]:
# Read the modeling table and put it in chronological order. The index is
# renumbered 0..n-1 because the positional split indices built later in this
# notebook are used as `.loc` labels.
model_df = pd.read_parquet(MODEL_TABLE_PATH)
model_df = model_df.sort_values("timestamp")
model_df = model_df.reset_index(drop=True)

print("Shape:", model_df.shape)
print(
    "Date range:",
    model_df["timestamp"].min(),
    "→",
    model_df["timestamp"].max(),
)
model_df.head()


In [None]:
## Walk-forward splits

We will rebuild splits from the timestamps to ensure we have real index arrays for training/testing.

(We also keep the CSV split report from Notebook 02 for reference.)


In [None]:
#walk-forward split function — tz safe
def make_walk_forward_splits(timestamps, train_months=18, test_months=3, step_months=3):
    """Build rolling walk-forward (train, test) splits aligned to calendar months.

    Each split trains on `train_months` months and tests on the `test_months`
    months that immediately follow; the window then slides forward by
    `step_months` months.

    Parameters
    ----------
    timestamps : Series, DatetimeIndex or array-like of datetimes
        Naive values are interpreted as UTC; tz-aware values are converted to UTC.
    train_months, test_months, step_months : int
        Window lengths and stride, in calendar months.

    Returns
    -------
    list of (train_idx, test_idx)
        Pairs of integer NumPy arrays holding *positions* into `timestamps`
        (they equal index labels only when the caller's frame has a default
        0..n-1 index). Windows with no train rows or no test rows are skipped.
        The last window must end on or before the first day of the final
        month present in the data, so a trailing partial month is never tested.

    Raises
    ------
    ValueError
        If `step_months` is not positive (the window would never advance).
    """
    if step_months <= 0:
        raise ValueError("step_months must be a positive number of months")

    # Wrap in a Series so the `.dt` accessor exists for any input container.
    ts = pd.Series(pd.to_datetime(timestamps))
    if ts.empty:
        return []

    # Work in UTC throughout.
    if ts.dt.tz is None:
        ts = ts.dt.tz_localize("UTC")
    else:
        ts = ts.dt.tz_convert("UTC")

    first, last = ts.min(), ts.max()
    if pd.isna(first):
        # Only missing timestamps: there is nothing to split.
        return []

    # Month-aligned boundaries: midnight UTC on the first day of the month.
    start = first.normalize().replace(day=1)
    end = last.normalize().replace(day=1)

    train_span = pd.DateOffset(months=train_months)
    test_span = pd.DateOffset(months=test_months)
    stride = pd.DateOffset(months=step_months)

    splits = []
    train_start = start
    while True:
        train_end = train_start + train_span
        test_end = train_end + test_span
        if test_end > end:
            break

        # Half-open windows: [train_start, train_end) then [train_end, test_end).
        in_train = ((ts >= train_start) & (ts < train_end)).to_numpy()
        in_test = ((ts >= train_end) & (ts < test_end)).to_numpy()

        train_idx = np.flatnonzero(in_train)
        test_idx = np.flatnonzero(in_test)
        if len(train_idx) > 0 and len(test_idx) > 0:
            splits.append((train_idx, test_idx))

        train_start = train_start + stride

    return splits

# Build the walk-forward splits used by every model in this notebook.
splits = make_walk_forward_splits(
    model_df["timestamp"],
    train_months=18,
    test_months=3,
    step_months=3,
)
print("Number of splits:", len(splits))

# Sanity check: show the time span covered by the first train/test window.
train_idx, test_idx = splits[0]
print("First split:")
for _tag, _rows in ((" Train:", train_idx), (" Test: ", test_idx)):
    _window = model_df.loc[_rows, "timestamp"]
    print(_tag, _window.min(), "→", _window.max())


In [None]:
# Target column, and every remaining column except the timestamp as a feature.
TARGET = "rv_24h"
FEATURES = [
    col
    for col in model_df.columns
    if col != "timestamp" and col != TARGET
]

# Convenience views of the design matrix and the target vector.
X = model_df[FEATURES]
y = model_df[TARGET]

print("Num features:", len(FEATURES))
print("First 10 features:", FEATURES[:10])


In [None]:
# Evaluation Helper 
def evaluate_on_splits(df, splits, model, features, target):
    """Fit `model` on each training window and score it on the matching test window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "timestamp" column plus the `features` and `target`
        columns. Rows are selected with `df.loc`, so the split indices must be
        valid index labels of `df`.
    splits : iterable of (train_idx, test_idx)
        Row selections for each walk-forward fold.
    model : estimator with `fit(X, y)` and `predict(X)`
        Refit in place on every fold; after the call it holds the last fold's fit.
    features : list of str
        Feature column names.
    target : str
        Target column name.

    Returns
    -------
    results : pandas.DataFrame
        One row per fold: split, test_start, test_end, n_test, mae, rmse.
    preds_all : pandas.DataFrame
        One row per test observation: timestamp, y_true, y_pred, split.

    Raises
    ------
    ValueError
        If `splits` yields no folds.
    """
    rows = []
    fold_preds = []

    for i, (train_idx, test_idx) in enumerate(splits):
        X_train = df.loc[train_idx, features]
        y_train = df.loc[train_idx, target]
        X_test = df.loc[test_idx, features]
        y_test = df.loc[test_idx, target]
        test_times = df.loc[test_idx, "timestamp"]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rows.append({
            "split": i,
            "test_start": test_times.min(),
            "test_end": test_times.max(),
            "n_test": len(test_idx),
            "mae": mean_absolute_error(y_test, y_pred),
            # RMSE as sqrt(MSE): the `squared=` keyword of mean_squared_error was
            # deprecated in scikit-learn 1.4 and removed in 1.6.
            "rmse": np.sqrt(mean_squared_error(y_test, y_pred)),
        })

        # `.values` gives plain arrays so the columns line up by position.
        # NOTE(review): for tz-aware timestamps this appears to yield tz-naive
        # values — confirm downstream consumers expect that.
        fold_preds.append(pd.DataFrame({
            "timestamp": test_times.values,
            "y_true": y_test.values,
            "y_pred": y_pred,
            "split": i,
        }))

    if not fold_preds:
        raise ValueError("No walk-forward splits were provided; nothing to evaluate")

    results = pd.DataFrame(rows)
    preds_all = pd.concat(fold_preds, ignore_index=True)
    return results, preds_all


In [None]:
## Ridge regression

Ridge = linear regression with L2 regularization.
We standardize features before fitting (important for regularization).


In [None]:
# Ridge baseline: standardize the features, then fit an L2-penalized linear model.
ridge_model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=1.0)),
    ]
)

# Walk-forward evaluation: per-split metrics plus the row-level predictions.
ridge_results, ridge_preds = evaluate_on_splits(
    df=model_df,
    splits=splits,
    model=ridge_model,
    features=FEATURES,
    target=TARGET,
)

ridge_results.head()


In [None]:
# Ridge: mean and standard deviation of each error metric across the splits.
print("Ridge performance (across splits)")
for _label, _metric, _stat in (
    ("MAE  mean:", "mae", "mean"),
    ("MAE   std:", "mae", "std"),
    ("RMSE mean:", "rmse", "mean"),
    ("RMSE  std:", "rmse", "std"),
):
    print(_label, getattr(ridge_results[_metric], _stat)())


In [None]:
## ElasticNet

ElasticNet = linear regression with L1 + L2 regularization.
The L1 part can shrink some coefficients to exactly zero (mild feature selection), while the L2 part keeps the fit stable when features are correlated.


In [None]:
# ElasticNet baseline: standardize the features, then fit an L1+L2-penalized linear model.
enet_model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("enet", ElasticNet(alpha=0.01, l1_ratio=0.3, max_iter=5000)),
    ]
)

# Same walk-forward protocol as the Ridge baseline.
enet_results, enet_preds = evaluate_on_splits(
    df=model_df,
    splits=splits,
    model=enet_model,
    features=FEATURES,
    target=TARGET,
)

enet_results.head()


In [None]:
# ElasticNet: mean and standard deviation of each error metric across the splits.
print("ElasticNet performance (across splits)")
for _label, _metric, _stat in (
    ("MAE  mean:", "mae", "mean"),
    ("MAE   std:", "mae", "std"),
    ("RMSE mean:", "rmse", "mean"),
    ("RMSE  std:", "rmse", "std"),
):
    print(_label, getattr(enet_results[_metric], _stat)())


In [None]:
# Side-by-side summary of the two linear baselines (one row per model).
comparison = pd.DataFrame(
    [
        {
            "model": name,
            "mae_mean": res["mae"].mean(),
            "rmse_mean": res["rmse"].mean(),
            "mae_std": res["mae"].std(),
            "rmse_std": res["rmse"].std(),
        }
        for name, res in (("Ridge", ridge_results), ("ElasticNet", enet_results))
    ]
)

comparison


In [None]:
# Coefficient inspection only: this fit uses every row, so it is in-sample and
# is not part of the walk-forward evaluation. The scaler runs first, so the
# coefficients are expressed per standardized feature.
ridge_full = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=1.0)),
    ]
)
ridge_full.fit(model_df[FEATURES], model_df[TARGET])

coef = ridge_full.named_steps["ridge"].coef_

# Largest coefficient first, so head() is the most positive and tail() the most negative.
coef_df = pd.DataFrame({"feature": FEATURES, "coef": coef})
coef_df = coef_df.sort_values("coef", ascending=False)

print("Top positive coefficients:")
display(coef_df.head(15))

print("Top negative coefficients:")
display(coef_df.tail(15))


In [None]:
# Persist the linear baselines: per-split metrics as CSV, row-level predictions as Parquet.
out_ridge_results = DATA_PROCESSED.joinpath("ridge_walkforward_results.csv")
out_enet_results = DATA_PROCESSED.joinpath("elasticnet_walkforward_results.csv")
out_ridge_preds = DATA_PROCESSED.joinpath("ridge_walkforward_predictions.parquet")
out_enet_preds = DATA_PROCESSED.joinpath("elasticnet_walkforward_predictions.parquet")

ridge_results.to_csv(out_ridge_results, index=False)
enet_results.to_csv(out_enet_results, index=False)

ridge_preds.to_parquet(out_ridge_preds, index=False)
enet_preds.to_parquet(out_enet_preds, index=False)

# Report where everything was written.
print("Saved:")
for _saved_path in (out_ridge_results, out_enet_results, out_ridge_preds, out_enet_preds):
    print(" ", _saved_path)


In [None]:
## Next: add GARCH-style benchmark (EWMA) and LightGBM

Now that we have clean linear baselines, we will add:
- EWMA/GARCH-style volatility recursion benchmark
- LightGBM gradient boosting model
- A final comparison table across all models


In [None]:
## EWMA (GARCH-style) volatility benchmark

EWMA variance recursion:
    sigma2_t = λ * sigma2_{t-1} + (1-λ) * r_{t-1}^2

This is a classic risk-model baseline (similar spirit to IGARCH/GARCH persistence).

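With hourly returns, a forward 24h realized-volatility forecast would be:
    pred_rv_24h(t) = sqrt(24 * sigma2_t)

The code below does not need an hourly return column: it applies the same exponential smoothing directly to the past 24h realized volatility (`vol_24h`) and uses the smoothed value as the forecast:
    pred_rv_24h(t) = λ * pred_rv_24h(t-1) + (1-λ) * vol_24h_t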

and evaluate it with the same walk-forward splits.


In [None]:
# --- EWMA benchmark (no ret_1h column required) ---
# The forecast for the next 24h volatility is an exponentially weighted average
# of the past 24h realized volatility (vol_24h).
# NOTE(review): this assumes vol_24h at row t only uses information available
# at t — confirm against how the column was built.

TARGET = "rv_24h"
LAMBDA = 0.94  # smoothing factor (higher = more persistence)

# Keep the frame chronological with a 0..n-1 index before running the recursion.
model_df = model_df.sort_values("timestamp")
model_df = model_df.reset_index(drop=True)

# Recursion: ewma_vol_t = λ * ewma_vol_{t-1} + (1-λ) * vol_24h_t
_ewma_window = model_df["vol_24h"].ewm(alpha=1 - LAMBDA, adjust=False)
model_df["ewma_vol_24h_pred"] = _ewma_window.mean()

print("Created EWMA prediction column: ewma_vol_24h_pred")
model_df[["timestamp", "vol_24h", TARGET, "ewma_vol_24h_pred"]].dropna().head(10)


In [None]:
# --- Evaluate EWMA benchmark on walk-forward splits ---

from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

def eval_pred_column(df, splits, y_col, pred_col):
    """Score a precomputed prediction column on the test part of each split.

    Nothing is fitted here, so the training indices of each split are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "timestamp", `y_col` and `pred_col`. Rows are selected with
        `df.loc`, so split indices must be valid index labels of `df`.
    splits : iterable of (train_idx, test_idx)
    y_col : str
        Column holding the true values.
    pred_col : str
        Column holding the predictions.

    Returns
    -------
    pandas.DataFrame
        One row per split: split, test_start, test_end, n_test, mae, rmse.
        `n_test` counts only rows where both values are finite. A split with
        no such rows gets NaN metrics instead of aborting the evaluation.
    """
    rows = []
    for i, (_train_idx, test_idx) in enumerate(splits):
        y_true = df.loc[test_idx, y_col].to_numpy()
        y_pred = df.loc[test_idx, pred_col].to_numpy()
        test_times = df.loc[test_idx, "timestamp"]

        # Score only rows where both the truth and the prediction are finite.
        mask = np.isfinite(y_true) & np.isfinite(y_pred)
        y_true = y_true[mask]
        y_pred = y_pred[mask]
        n_valid = int(mask.sum())

        if n_valid > 0:
            mae = mean_absolute_error(y_true, y_pred)
            # RMSE as sqrt(MSE): the `squared=` keyword of mean_squared_error was
            # deprecated in scikit-learn 1.4 and removed in 1.6.
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        else:
            mae = rmse = np.nan

        rows.append({
            "split": i,
            "test_start": test_times.min(),
            "test_end": test_times.max(),
            "n_test": n_valid,
            "mae": mae,
            "rmse": rmse,
        })
    return pd.DataFrame(rows)

# Score the EWMA forecast column on the same walk-forward test windows.
ewma_results = eval_pred_column(
    df=model_df,
    splits=splits,
    y_col=TARGET,
    pred_col="ewma_vol_24h_pred",
)

ewma_results.head()


In [None]:
# EWMA summary metrics: mean and standard deviation of each error across the splits.
print("EWMA(vol_24h) performance (across splits)")
for _label, _metric, _stat in (
    ("MAE  mean:", "mae", "mean"),
    ("MAE   std:", "mae", "std"),
    ("RMSE mean:", "rmse", "mean"),
    ("RMSE  std:", "rmse", "std"),
):
    print(_label, getattr(ewma_results[_metric], _stat)())


In [None]:
# Compare Ridge, ElasticNet and the EWMA benchmark; best (lowest) mean MAE first.
comparison_all = pd.DataFrame(
    [
        {
            "model": name,
            "mae_mean": res["mae"].mean(),
            "rmse_mean": res["rmse"].mean(),
            "mae_std": res["mae"].std(),
            "rmse_std": res["rmse"].std(),
        }
        for name, res in (
            ("Ridge", ridge_results),
            ("ElasticNet", enet_results),
            (f"EWMA(vol_24h, λ={LAMBDA})", ewma_results),
        )
    ]
).sort_values("mae_mean")

comparison_all


In [None]:
# Save the EWMA per-split metrics; the smoothing factor is encoded in the file
# name with the decimal point replaced by an underscore (0.94 -> "0_94").
_lambda_tag = str(LAMBDA).replace(".", "_")
out_ewma_results = DATA_PROCESSED / f"ewma_vol24h_lambda_{_lambda_tag}_results.csv"

ewma_results.to_csv(out_ewma_results, index=False)
print("Saved EWMA split results to:", out_ewma_results)


In [None]:
## LightGBM (Gradient Boosting)

Goal:
- Train a non-linear model to predict `rv_24h`
- Evaluate strictly using the same walk-forward splits
- Compare against Ridge and EWMA

Why LightGBM:
- learns non-linear relationships and interactions (regimes)
- performs very well on tabular financial features
- robust and fast


In [None]:
# Install/Import
try:
    import lightgbm as lgb
    print("LightGBM version:", lgb.__version__)
except ModuleNotFoundError:
    !pip -q install lightgbm
    import lightgbm as lgb
    print("LightGBM installed. Version:", lgb.__version__)


In [None]:
# Feature list for LightGBM: every column except the timestamp, the target,
# and the EWMA helper column that an earlier cell added to model_df.
TARGET = "rv_24h"

EXCLUDE = {"timestamp", TARGET, "ewma_vol_24h_pred"}  # add more if you created other helper columns
LGBM_FEATURES = [
    name
    for name in model_df.columns
    if name not in EXCLUDE
]

print("Num LightGBM features:", len(LGBM_FEATURES))
print("First 15 features:", LGBM_FEATURES[:15])


In [None]:
##train + evaluate LightGBM with walk-forward splits
from sklearn.metrics import mean_absolute_error, mean_squared_error

def train_eval_lightgbm(df, splits, features, target, valid_fraction=0.2):
    """Train and score LightGBM on each walk-forward split without test leakage.

    For every split the tail of the training window is held out as a validation
    set that drives early stopping. The model is then refit on the whole
    training window for the selected number of rounds and scored on the test
    window. The test rows are never used to choose the number of rounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "timestamp", `features` and `target`. Rows are selected
        with `df.loc`, so split indices must be valid index labels of `df`.
    splits : iterable of (train_idx, test_idx)
        `train_idx` is assumed to be in chronological order so that its tail is
        the most recent data — TODO confirm for splits built elsewhere.
    features : list of str
        Feature column names.
    target : str
        Target column name.
    valid_fraction : float, default 0.2
        Share of each training window (taken from its end) used only for early
        stopping. Must be strictly between 0 and 1.

    Returns
    -------
    results : pandas.DataFrame
        One row per split: split, test_start, test_end, n_test, best_iter, mae, rmse.
    preds_all : pandas.DataFrame
        One row per test observation: timestamp, y_true, y_pred, split.

    Raises
    ------
    ValueError
        If `valid_fraction` is out of range, a training window is too short to
        hold out a validation tail, or `splits` yields no folds.

    Notes
    -----
    NOTE(review): if `target` is a forward-looking label, rows at the end of a
    window have labels that reach into the next window; consider a gap between
    windows — confirm against how the label was built.
    """
    if not 0.0 < valid_fraction < 1.0:
        raise ValueError("valid_fraction must be strictly between 0 and 1")

    # Hyper-parameters are identical for every split, so build them once.
    params = {
        "objective": "regression",
        "metric": "l1",
        "learning_rate": 0.03,
        "num_leaves": 63,
        "min_data_in_leaf": 300,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "lambda_l2": 1.0,
        "verbosity": -1,
        "seed": 42,
    }

    rows = []
    fold_preds = []

    for i, (train_idx, test_idx) in enumerate(splits):
        # Hold out the end of the training window for early stopping.
        n_valid = max(1, int(round(len(train_idx) * valid_fraction)))
        fit_idx, valid_idx = train_idx[:-n_valid], train_idx[-n_valid:]
        if len(fit_idx) == 0:
            raise ValueError(
                f"Split {i}: training window is too short to hold out a validation tail"
            )

        X_train = df.loc[train_idx, features]
        y_train = df.loc[train_idx, target]
        X_test = df.loc[test_idx, features]
        y_test = df.loc[test_idx, target]
        test_times = df.loc[test_idx, "timestamp"]

        d_fit = lgb.Dataset(df.loc[fit_idx, features], label=df.loc[fit_idx, target])
        d_valid = lgb.Dataset(
            df.loc[valid_idx, features],
            label=df.loc[valid_idx, target],
            reference=d_fit,
        )

        # Step 1: choose the number of boosting rounds on the validation tail.
        probe = lgb.train(
            params,
            d_fit,
            num_boost_round=5000,
            valid_sets=[d_valid],
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
        )
        best_iter = probe.best_iteration or probe.current_iteration()

        # Step 2: refit on the full training window for that many rounds.
        booster = lgb.train(
            params,
            lgb.Dataset(X_train, label=y_train),
            num_boost_round=best_iter,
        )
        y_pred = booster.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        # RMSE as sqrt(MSE): the `squared=` keyword of mean_squared_error was
        # deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        rows.append({
            "split": i,
            "test_start": test_times.min(),
            "test_end": test_times.max(),
            "n_test": len(test_idx),
            "best_iter": best_iter,
            "mae": mae,
            "rmse": rmse,
        })

        fold_preds.append(pd.DataFrame({
            "timestamp": test_times.values,
            "y_true": y_test.values,
            "y_pred": y_pred,
            "split": i,
        }))

    if not fold_preds:
        raise ValueError("No walk-forward splits were provided; nothing to evaluate")

    return pd.DataFrame(rows), pd.concat(fold_preds, ignore_index=True)

# Run the LightGBM walk-forward evaluation on the shared splits.
lgbm_results, lgbm_preds = train_eval_lightgbm(
    df=model_df,
    splits=splits,
    features=LGBM_FEATURES,
    target=TARGET,
)

lgbm_results.head()


In [None]:
# LightGBM: mean and standard deviation of each error metric across the splits,
# plus the median number of boosting rounds selected by early stopping.
print("LightGBM performance (across splits)")
for _label, _metric, _stat in (
    ("MAE  mean:", "mae", "mean"),
    ("MAE   std:", "mae", "std"),
    ("RMSE mean:", "rmse", "mean"),
    ("RMSE  std:", "rmse", "std"),
):
    print(_label, getattr(lgbm_results[_metric], _stat)())
print("Median best_iter:", int(lgbm_results["best_iter"].median()))


In [None]:
# Final comparison table: Ridge vs EWMA vs LightGBM, best (lowest) mean MAE first.
final_comparison = pd.DataFrame(
    [
        {
            "model": name,
            "mae_mean": res["mae"].mean(),
            "rmse_mean": res["rmse"].mean(),
            "mae_std": res["mae"].std(),
            "rmse_std": res["rmse"].std(),
        }
        for name, res in (
            ("Ridge", ridge_results),
            (f"EWMA(vol_24h, λ={LAMBDA})", ewma_results),
            ("LightGBM", lgbm_results),
        )
    ]
).sort_values("mae_mean")

final_comparison


In [None]:
# Refit LightGBM on the last split's training window to inspect feature importance.
last_train_idx, last_test_idx = splits[-1]

X_train = model_df.loc[last_train_idx, LGBM_FEATURES]
y_train = model_df.loc[last_train_idx, TARGET]

dtrain = lgb.Dataset(X_train, label=y_train)

# Same hyper-parameters as the walk-forward evaluation. The learning rate must
# match the one used there (0.03): `best_iter` was chosen by early stopping at
# that rate, so reusing the round count at another rate fits a different model.
params = {
    "objective": "regression",
    "metric": "l1",
    "learning_rate": 0.03,
    "num_leaves": 63,
    "min_data_in_leaf": 300,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l2": 1.0,
    "verbosity": -1,
    "seed": 42,
}

# Median early-stopped round count across splits; always train at least one round.
best_rounds = max(1, int(lgbm_results["best_iter"].median()))
booster = lgb.train(params, dtrain, num_boost_round=best_rounds)

# "gain" and "split" are the two importance types requested from the booster;
# the table is ordered by gain, largest first.
imp = pd.DataFrame({
    "feature": LGBM_FEATURES,
    "gain": booster.feature_importance(importance_type="gain"),
    "split": booster.feature_importance(importance_type="split"),
}).sort_values("gain", ascending=False)

imp.head(20)


In [None]:
# Persist the LightGBM outputs: per-split metrics as CSV, row-level predictions as Parquet.
out_lgbm_results = DATA_PROCESSED.joinpath("lightgbm_walkforward_results.csv")
out_lgbm_preds = DATA_PROCESSED.joinpath("lightgbm_walkforward_predictions.parquet")

lgbm_results.to_csv(out_lgbm_results, index=False)
lgbm_preds.to_parquet(out_lgbm_preds, index=False)

print("Saved:")
for _saved_path in (out_lgbm_results, out_lgbm_preds):
    print(" ", _saved_path)


In [None]:
# Summary bar chart (MAE & RMSE)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Output folder for figures (relative to the kernel's working directory).
PLOTS_DIR = "plots"
os.makedirs(PLOTS_DIR, exist_ok=True)

# One summary row per model, ordered by mean MAE (best first).
summary = pd.DataFrame(
    [
        {
            "model": name,
            "mae_mean": res["mae"].mean(),
            "rmse_mean": res["rmse"].mean(),
            "mae_std": res["mae"].std(),
            "rmse_std": res["rmse"].std(),
        }
        for name, res in (
            ("Ridge", ridge_results),
            (f"EWMA (λ={LAMBDA})", ewma_results),
            ("LightGBM", lgbm_results),
        )
    ]
).sort_values("mae_mean")

# --- Grouped bar chart: MAE and RMSE side by side, error bars = std across splits ---
fig, ax = plt.subplots(figsize=(10, 5))

x = np.arange(len(summary))
width = 0.38

for _offset, _metric, _legend in (
    (-width / 2, "mae", "MAE (mean ± std)"),
    (+width / 2, "rmse", "RMSE (mean ± std)"),
):
    ax.bar(
        x + _offset,
        summary[f"{_metric}_mean"],
        width,
        yerr=summary[f"{_metric}_std"],
        capsize=4,
        label=_legend,
    )

ax.set_xticks(x)
ax.set_xticklabels(summary["model"], rotation=0)
ax.set_ylabel("Error")
ax.set_title("EURUSD 24h Volatility Forecasting — Model Comparison (Walk-forward CV)")
ax.legend()
fig.tight_layout()

# --- Save, then display ---
barplot_path = os.path.join(PLOTS_DIR, "model_comparison_mae_rmse.png")
fig.savefig(barplot_path, dpi=200, bbox_inches="tight")
plt.show()

print(f"Bar plot saved to: {barplot_path}")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os

# Output folder for figures (relative to the kernel's working directory).
PLOTS_DIR = "plots"
os.makedirs(PLOTS_DIR, exist_ok=True)

# Walk-forward split whose test window is drawn.
SPLIT_TO_PLOT = 0

_ewma_label = f"EWMA (λ={LAMBDA})"
_ridge_fold = ridge_preds[ridge_preds["split"] == SPLIT_TO_PLOT]
_lgbm_fold = lgbm_preds[lgbm_preds["split"] == SPLIT_TO_PLOT]

# One two-column frame per series, each keyed by timestamp.
true_df = _ridge_fold[["timestamp", "y_true"]].rename(columns={"y_true": "True RV_24h"})
ridge_df = _ridge_fold[["timestamp", "y_pred"]].rename(columns={"y_pred": "Ridge"})
lgbm_df = _lgbm_fold[["timestamp", "y_pred"]].rename(columns={"y_pred": "LightGBM"})
ewma_df = model_df[["timestamp", "ewma_vol_24h_pred"]].rename(
    columns={"ewma_vol_24h_pred": _ewma_label}
)

# Put every timestamp column on tz-aware UTC so the merge keys have the same dtype.
true_df["timestamp"] = pd.to_datetime(true_df["timestamp"], utc=True)
ridge_df["timestamp"] = pd.to_datetime(ridge_df["timestamp"], utc=True)
lgbm_df["timestamp"] = pd.to_datetime(lgbm_df["timestamp"], utc=True)
ewma_df["timestamp"] = pd.to_datetime(ewma_df["timestamp"], utc=True)

# Left-join onto the true values so only the chosen test window is kept.
plot_df = true_df.merge(ridge_df, on="timestamp", how="left")
plot_df = plot_df.merge(lgbm_df, on="timestamp", how="left")
plot_df = plot_df.merge(ewma_df, on="timestamp", how="left")
plot_df = plot_df.sort_values("timestamp")

fig, ax = plt.subplots(figsize=(12, 5))
for _series, _line_width in (
    ("True RV_24h", 2),
    ("Ridge", 1.5),
    ("LightGBM", 1.5),
    (_ewma_label, 1.5),
):
    ax.plot(plot_df["timestamp"], plot_df[_series], linewidth=_line_width, label=_series)

ax.set_title("EURUSD — True vs Predicted 24h Realized Volatility (Out-of-Sample)")
ax.set_xlabel("Time")
ax.set_ylabel("Volatility")
ax.legend()
fig.tight_layout()

# --- Save, then display ---
tsplot_path = os.path.join(PLOTS_DIR, "volatility_forecast_timeseries.png")
fig.savefig(tsplot_path, dpi=200, bbox_inches="tight")
plt.show()

print(f"Time-series plot saved to: {tsplot_path}")
