In [None]:
# 02 — Volatility Target + Features (EURUSD H1)

Goal:
- Load processed EURUSD hourly data
- Build forward realized volatility targets
- Build lag/rolling features
- Create walk-forward validation splits (no leakage)


In [None]:
# Imports + paths 
import numpy as np
import pandas as pd
from pathlib import Path

# Display options: keep wide feature tables readable in cell output.
for _option, _value in (("display.max_columns", 50), ("display.width", 140)):
    pd.set_option(_option, _value)

# Resolve the project root: when the kernel starts inside a "notebooks"
# directory, step up one level; otherwise the working directory is the root.
_cwd = Path.cwd()
if _cwd.name.lower() == "notebooks":
    PROJECT_ROOT = _cwd.parent
else:
    PROJECT_ROOT = _cwd
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")

# Input file read by the next cell.
in_path = DATA_PROCESSED.joinpath("eurusd_h1_kaggle.parquet")
print("Loading:", in_path)


In [None]:
# Load the processed bars from disk
df = pd.read_parquet(in_path)

# Order rows by timestamp, keep the first row for each timestamp,
# and rebuild a clean 0..n-1 index
df = df.sort_values(by="timestamp")
df = df.drop_duplicates(subset="timestamp")
df = df.reset_index(drop=True)

print("Shape:", df.shape)
_bar_times = df["timestamp"]
print("Date range:", _bar_times.min(), "→", _bar_times.max())
df.head()


In [None]:
## Define volatility forecasting target

We forecast forward realized volatility (RV) over a horizon H hours:

- returns: ret_1h (log return)
- target at time t: RV_{t,H} = std(ret_{t+1} ... ret_{t+H}) * sqrt(H)

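The window starts at t+1 and ends at t+H, so the return at time t (already known when the forecast is made) is never part of the target. Features may therefore use information up to and including time t without leakage.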


In [None]:
# Build forward realized-volatility targets for one or more horizons
HORIZONS = [24]  # 24 hours = 1 day of hourly bars; later you can add 48, 72, 168

for H in HORIZONS:
    # rolling(H).std() evaluated at row t+H spans ret_{t+1} ... ret_{t+H}.
    # Shifting that result back by H rows stores it on row t, which is exactly
    # RV_{t,H} = std(ret_{t+1} ... ret_{t+H}) * sqrt(H).
    #
    # Do NOT write shift(-1).rolling(H): rolling windows always look backwards,
    # so that form spans ret_{t-H+2} ... ret_{t+1} -- H-1 of the H returns are
    # already known at time t, and the "target" nearly duplicates the trailing
    # volatility features.
    fwd_std = df["ret_1h"].rolling(window=H).std().shift(-H)
    df[f"rv_{H}h"] = fwd_std * np.sqrt(H)

# The last H rows have no complete future window, so their targets are NaN.
df[["timestamp", "ret_1h"] + [f"rv_{H}h" for H in HORIZONS]].tail(10)


In [None]:
## Build features (past information only)

We create:
- lagged returns
- rolling realized volatility (past)
- rolling statistics of absolute returns
- time features (hour-of-day, day-of-week)


In [None]:
# Feature engineering: each column is built from the current row and earlier rows only
df_feat = df.copy()
ret_1h = df_feat["ret_1h"]
abs_ret_1h = ret_1h.abs()

# --- calendar features ---
bar_time = df_feat["timestamp"].dt
df_feat["hour"] = bar_time.hour
df_feat["dow"] = bar_time.dayofweek  # 0=Mon

# --- lagged returns: value of ret_1h `lag` rows earlier ---
LAGS = [1, 2, 3, 6, 12, 24]
for lag in LAGS:
    df_feat[f"ret_lag_{lag}"] = ret_1h.shift(lag)

# --- trailing-window statistics (window ends on the current row) ---
ROLL_WINDOWS = [6, 12, 24, 72, 168]  # 6h, 12h, 1d, 3d, 1w
for window in ROLL_WINDOWS:
    trailing_ret = ret_1h.rolling(window)
    trailing_abs = abs_ret_1h.rolling(window)
    # std scaled by sqrt(window), the same scaling the target uses
    df_feat[f"vol_{window}h"] = trailing_ret.std() * np.sqrt(window)
    df_feat[f"absret_mean_{window}h"] = trailing_abs.mean()
    df_feat[f"absret_max_{window}h"] = trailing_abs.max()

# --- intrabar range of the current bar: (high - low) / close ---
bar_span = df_feat["high"] - df_feat["low"]
df_feat["hl_range"] = bar_span / df_feat["close"]

preview_cols = ["timestamp", "ret_1h", "hour", "dow", "vol_24h", "hl_range"]
df_feat[preview_cols].head(30)


In [None]:
# Assemble the modeling table: timestamp + target + features, complete rows only
TARGET = "rv_24h"

# Column order: calendar/range, lagged returns, then each rolling statistic by window
feature_cols = ["hour", "dow", "hl_range"]
feature_cols.extend(f"ret_lag_{l}" for l in LAGS)
feature_cols.extend(f"vol_{w}h" for w in ROLL_WINDOWS)
feature_cols.extend(f"absret_mean_{w}h" for w in ROLL_WINDOWS)
feature_cols.extend(f"absret_max_{w}h" for w in ROLL_WINDOWS)

# Rows with any NaN (rolling/lag warm-up, missing target) are dropped
selected = df_feat[["timestamp", TARGET, *feature_cols]]
model_df = selected.dropna().reset_index(drop=True)

print("Modeling table shape:", model_df.shape)
model_times = model_df["timestamp"]
print("Date range:", model_times.min(), "→", model_times.max())
model_df.head()


In [None]:
## Walk-forward splits

We create time-based splits:
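- Train on a rolling window that starts on a month boundary (18 months by default)
- Validate on the months immediately after it (3 months by default), then slide both windows forward (3 months by default)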

This avoids look-ahead bias and matches real trading/risk forecasting.


In [None]:
import numpy as np
import pandas as pd

def make_walk_forward_splits(timestamps, train_months=18, test_months=3, step_months=3):
    """
    Build rolling walk-forward (train, test) splits on calendar-month boundaries.

    The first training window starts on the first day of the month of the
    earliest timestamp. Each fold trains on ``train_months`` months and tests
    on the following ``test_months`` months; the window then slides forward by
    ``step_months``. Folds stop once a test window would end after the first
    day of the month containing the latest timestamp, so the final (possibly
    partial) month is never used.

    Parameters
    ----------
    timestamps : Series, DatetimeIndex or list-like of datetimes
        Naive values are interpreted as UTC; tz-aware values are converted to UTC.
    train_months, test_months : int
        Length of the training / test window in calendar months.
    step_months : int
        Months to advance between folds. Must be positive.

    Returns
    -------
    list of (train_idx, test_idx)
        Arrays of POSITIONAL row indices into ``timestamps`` (use ``.iloc``,
        or ``.loc`` only when the frame has a default 0..n-1 index). Folds in
        which either side is empty are skipped. Empty input yields ``[]``.

    Raises
    ------
    ValueError
        If ``step_months`` is not positive (the window would never advance).

    Notes
    -----
    Train and test windows are adjacent with no gap. If the target at row t is
    computed from rows after t, the caller should drop the last rows of each
    training fold so training labels do not overlap the test window.
    """
    if step_months <= 0:
        raise ValueError("step_months must be a positive number of months")

    # Wrap in a Series so lists and DatetimeIndex inputs share the `.dt` API.
    ts = pd.Series(pd.to_datetime(timestamps))
    if ts.empty or ts.isna().all():
        return []

    # Ensure tz-aware timestamps in UTC
    if ts.dt.tz is None:
        ts = ts.dt.tz_localize("UTC")
    else:
        ts = ts.dt.tz_convert("UTC")

    # Floor to the first day of the month while staying tz-aware
    # (going through to_period() would discard the timezone and warn).
    start = ts.min().normalize().replace(day=1)
    end = ts.max().normalize().replace(day=1)

    splits = []
    current_train_start = start

    while True:
        train_end = current_train_start + pd.DateOffset(months=train_months)
        test_end = train_end + pd.DateOffset(months=test_months)

        if test_end > end:
            break

        # Half-open windows: [train_start, train_end) and [train_end, test_end)
        train_mask = (ts >= current_train_start) & (ts < train_end)
        test_mask = (ts >= train_end) & (ts < test_end)

        train_idx = np.where(train_mask.to_numpy())[0]
        test_idx = np.where(test_mask.to_numpy())[0]

        if len(train_idx) > 0 and len(test_idx) > 0:
            splits.append((train_idx, test_idx))

        current_train_start = current_train_start + pd.DateOffset(months=step_months)

    return splits

# Generate the folds for the modeling table and report the first one
splits = make_walk_forward_splits(
    model_df["timestamp"],
    train_months=18,
    test_months=3,
    step_months=3,
)
print("Number of splits:", len(splits))

train_idx, test_idx = splits[0]
print("First split:")
train_times = model_df.loc[train_idx, "timestamp"]
print(" Train:", train_times.min(), "→", train_times.max())
test_times = model_df.loc[test_idx, "timestamp"]
print(" Test: ", test_times.min(), "→", test_times.max())


In [None]:
# Sanity check: show the timestamp column's dtype and its first two values
_timestamp_col = model_df["timestamp"]
print(_timestamp_col.dtype)
print(_timestamp_col.head(2))


In [None]:
## Baseline evaluation (quick)

Before fitting any ML model, evaluate a strong naive baseline:
- Predict future volatility using the past 24h volatility (`vol_24h`)

This sets the benchmark that any model must beat.


In [None]:
#baseline evaluation on walk-forward splits
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Column scored against: the 24-bar forward realized-volatility target.
TARGET = "rv_24h"
# Column used as the naive forecast: the trailing 24-bar volatility feature.
BASELINE_COL = "vol_24h"

def eval_over_splits(df, splits, y_col, yhat_col):
    """
    Score a prediction column against a target column on each split's test fold.

    Parameters
    ----------
    df : DataFrame
        Must contain ``"timestamp"``, ``y_col`` and ``yhat_col``.
    splits : iterable of (train_idx, test_idx)
        POSITIONAL row indices into ``df``; only ``test_idx`` is used here.
    y_col, yhat_col : str
        Names of the true-value and predicted-value columns.

    Returns
    -------
    DataFrame
        One row per split with columns ``split``, ``test_start``, ``test_end``,
        ``n_test``, ``mae`` and ``rmse``. The columns are present even when
        ``splits`` is empty.

    Raises
    ------
    ValueError
        If a test fold is empty or contains NaN/inf in either column.
    """
    columns = ["split", "test_start", "test_end", "n_test", "mae", "rmse"]

    y_all = df[y_col].to_numpy(dtype=float)
    yhat_all = df[yhat_col].to_numpy(dtype=float)
    ts_all = df["timestamp"]

    rows = []
    for i, (_, test_idx) in enumerate(splits):
        test_idx = np.asarray(test_idx)
        if test_idx.size == 0:
            raise ValueError(f"split {i} has an empty test fold")

        # Index by position so the result does not depend on df's index labels.
        errors = y_all[test_idx] - yhat_all[test_idx]
        if not np.isfinite(errors).all():
            raise ValueError(f"split {i} contains NaN or infinite values")

        # Metrics are computed with numpy: mean_squared_error(..., squared=False)
        # is no longer accepted by recent scikit-learn releases.
        mae = float(np.mean(np.abs(errors)))
        rmse = float(np.sqrt(np.mean(np.square(errors))))

        test_ts = ts_all.iloc[test_idx]
        rows.append({
            "split": i,
            "test_start": test_ts.min(),
            "test_end": test_ts.max(),
            "n_test": len(test_idx),
            "mae": mae,
            "rmse": rmse
        })
    return pd.DataFrame(rows, columns=columns)

# Score the trailing-volatility baseline on the test fold of every split
baseline_results = eval_over_splits(
    df=model_df,
    splits=splits,
    y_col=TARGET,
    yhat_col=BASELINE_COL,
)

baseline_results


In [None]:
# Summarize the baseline across splits, then persist the artifacts
print("Baseline summary across splits")
mae_by_split = baseline_results["mae"]
print("MAE  mean:", mae_by_split.mean())
print("MAE   std:", mae_by_split.std())
rmse_by_split = baseline_results["rmse"]
print("RMSE mean:", rmse_by_split.mean())
print("RMSE  std:", rmse_by_split.std())

# Modeling table (timestamp + target + features), written without the index
out_model_path = DATA_PROCESSED.joinpath("eurusd_h1_model_table_rv24h.parquet")
model_df.to_parquet(out_model_path, index=False)

# Per-split report: test date range, row count and baseline metrics
# (dates and counts rather than raw row indices)
out_splits_path = DATA_PROCESSED.joinpath("eurusd_h1_walkforward_splits_rv24h.csv")
baseline_results.to_csv(out_splits_path, index=False)

print("\nSaved modeling table to:", out_model_path)
print("Saved split report to:", out_splits_path)
