In [211]:
# Yahoo Finance Data Acquisition of Required Data
#Target Period: 2006-01-01 to 2025-06-30

#Imports
#Import Yahoo Finance data library for data acquisition
import pandas as pd
import numpy as np
import yfinance as yf

In [213]:
#Define instruments and Date range
start = "2006-01-01"
end = "2025-06-30"

# S&P500 ETF (proxy)
df_spx_raw = yf.download("SPY", start=start, end=end)

# Dollar Index (Yahoo ticker: DX-Y.NYB)
df_dxy = yf.download("DX-Y.NYB", start=start, end=end)

# WTI Oil Futures (Yahoo ticker: CL=F)
df_wti = yf.download("CL=F", start=start, end=end)

print(df_spx_raw.head())
print(df_dxy.head())
print(df_wti.head())

  df_spx_raw = yf.download("SPY", start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df_dxy = yf.download("DX-Y.NYB", start=start, end=end)
[*********************100%***********************]  1 of 1 completed
  df_wti = yf.download("CL=F", start=start, end=end)
[*********************100%***********************]  1 of 1 completed

Price           Close       High        Low       Open    Volume
Ticker            SPY        SPY        SPY        SPY       SPY
Date                                                            
2006-01-03  87.721710  87.929420  86.122366  86.676254  73256700
2006-01-04  88.137123  88.268667  87.721704  87.832484  51899600
2006-01-05  88.192520  88.337915  87.846341  88.033281  47307500
2006-01-06  88.926437  89.023367  88.178690  88.635648  62885900
2006-01-09  89.154915  89.355694  88.884896  88.912585  43527400
Price           Close       High        Low       Open   Volume
Ticker       DX-Y.NYB   DX-Y.NYB   DX-Y.NYB   DX-Y.NYB DX-Y.NYB
Date                                                           
2006-01-03  89.839996  90.940002  89.779999  90.750000        0
2006-01-04  89.139999  89.860001  89.010002  89.730003        0
2006-01-05  89.330002  89.629997  89.180000  89.269997        0
2006-01-06  88.849998  89.629997  88.800003  89.349998        0
2006-01-09  89.250000  89.449997




# Data Preprocesssing & Transformation

In [216]:
#Import required libraries
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller

#Load and confirm the raw datasets
print("SPY shape:", df_spx_raw.shape)
print("DXY shape:", df_dxy.shape)
print("WTI shape:", df_wti.shape)

#Standardise column names and isolate 'Close' column
df_spx = df_spx_raw[['Close']].rename(columns={'Close': 'SPY_Close'})
df_dxy = df_dxy[['Close']].rename(columns={'Close': 'DXY_Close'})
df_wti = df_wti[['Close']].rename(columns={'Close': 'WTI_Close'})

#Merge and clean datasets
df = df_spx.join([df_dxy, df_wti], how='inner')
df = df.dropna()
print("Merged dataframe shape:", df.shape)

#Compute returns and changes
df['SPY_log_ret'] = np.log(df['SPY_Close'] / df['SPY_Close'].shift(1)) * 100
df['SPY_price_change'] = df['SPY_Close'].diff()
df['SPY_p_change'] = df['SPY_Close'].pct_change() * 100

df['DXY_ret'] = np.log(df['DXY_Close'] / df['DXY_Close'].shift(1)) * 100
df['WTI_ret'] = np.log(df['WTI_Close'] / df['WTI_Close'].shift(1)) * 100

df = df.dropna()

SPY shape: (4902, 5)
DXY shape: (4905, 5)
WTI shape: (4901, 5)
Merged dataframe shape: (4898, 3)


  result = func(self.values, **kwargs)


# Time Series Splitting & Validation

Static Chronological Split

In [220]:
# ================================================================
# 3.2A STATIC CHRONOLOGICAL SPLIT
# ================================================================

# Function for chronological splitting of time series data
def chronological_split(df, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Chronologically splits the dataset into train, validation, and test sets.
    Ensures no look-ahead bias by using the time order of observations.

    Parameters:
    - df: pandas DataFrame with DateTime index
    - train_ratio: float, proportion of training data
    - val_ratio: float, proportion of validation data
    - test_ratio: float, proportion of testing data

    Returns:
    - train_df, val_df, test_df
    """
    # Validate ratios
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "Ratios must sum to 1."

    n = len(df)
    i_train_end = int(n * train_ratio)
    i_val_end = int(n * (train_ratio + val_ratio))

    train_df = df.iloc[:i_train_end].copy()
    val_df = df.iloc[i_train_end:i_val_end].copy()
    test_df = df.iloc[i_val_end:].copy()

    print(f"Total samples: {n}")
    print(f"Train: {len(train_df)} ({len(train_df)/n:.1%}), "
          f"Validation: {len(val_df)} ({len(val_df)/n:.1%}), "
          f"Test: {len(test_df)} ({len(test_df)/n:.1%})")

    return train_df, val_df, test_df


# ---------------------------------------------------------------
# Apply chronological split to your processed dataframe
# ---------------------------------------------------------------
train_df, val_df, test_df = chronological_split(df, 0.8, 0.1, 0.1)

# Confirm the split visually
print("\nTrain range:", train_df.index.min(), "→", train_df.index.max())
print("Validation range:", val_df.index.min(), "→", val_df.index.max())
print("Test range:", test_df.index.min(), "→", test_df.index.max())

# Display sample data from each set
print("\nSample from training data:")
print(train_df.tail())

print("\nSample from validation data:")
print(val_df.head())

print("\nSample from test data:")
print(test_df.head())

Total samples: 4895
Train: 3916 (80.0%), Validation: 489 (10.0%), Test: 490 (10.0%)

Train range: 2006-01-04 00:00:00 → 2021-08-03 00:00:00
Validation range: 2021-08-04 00:00:00 → 2023-07-14 00:00:00
Test range: 2023-07-17 00:00:00 → 2025-06-27 00:00:00

Sample from training data:
Price        SPY_Close  DXY_Close  WTI_Close SPY_log_ret SPY_price_change  \
Ticker             SPY   DX-Y.NYB       CL=F                                
Date                                                                        
2021-07-28  413.541992  92.320000  72.389999   -0.041022        -0.169678   
2021-07-29  415.257141  91.860001  73.620003    0.413888         1.715149   
2021-07-30  413.240479  92.169998  73.949997   -0.486825        -2.016663   
2021-08-02  412.373474  92.050003  71.260002   -0.210027        -0.867004   
2021-08-03  415.728271  92.080002  70.559998    0.810242         3.354797   

Price      SPY_p_change   DXY_ret   WTI_ret  
Ticker                                       
Date     

Create functions: AWT and Indicators, scaling and rolling windows

In [223]:
# ================================================================
# ✅ FIXED: Walk-Forward Validation with AWT Features + Scaling
# ================================================================
import pywt
import ta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# ================================================================
# 1️⃣  Utility: Flatten MultiIndex columns
# ================================================================
def flatten_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure single-level, clean column names."""
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            '_'.join([str(c) for c in tup if c is not None]).strip()
            for tup in df.columns
        ]
    else:
        df.columns = [str(c).strip() for c in df.columns]
    # Remove duplicates if flattening causes any
    df = df.loc[:, ~pd.Index(df.columns).duplicated()]
    return df


# ================================================================
# 2️⃣  Wavelet Denoising Helper
# ================================================================
def wavelet_denoise_series(x: pd.Series,
                           wavelet: str = "db2",
                           level: int = 2,
                           mode: str = "soft") -> pd.DataFrame:
    """
    Simple wavelet decompose + denoise returning smooth & detail as DataFrame.
    """
    x = pd.Series(x).astype(float)
    coeffs = pywt.wavedec(x.values, wavelet=wavelet, level=level)
    # Universal threshold based on noise estimation
    sigma = np.median(np.abs(coeffs[-1])) / 0.6745 if len(coeffs[-1]) else 0.0
    thr = sigma * np.sqrt(2 * np.log(len(x)))
    coeffs_thr = [coeffs[0]] + [pywt.threshold(c, thr, mode=mode) for c in coeffs[1:]]
    x_s = pywt.waverec(coeffs_thr, wavelet)[:len(x)]  # smoothed component
    x_d = x.values[:len(x_s)] - x_s  # detail component
    return pd.DataFrame({"SPY_s": x_s, "SPY_d": x_d}, index=x.index)


In [225]:
# ================================================================
# 3️⃣  Compute AWT + Features + Target
# ================================================================
# ------------------------------------------------
# Robust AWT + features + target (auto-make SPY_log_ret if missing)
# ------------------------------------------------
def _first_present(colnames, df_cols):
    for c in colnames:
        if c in df_cols:
            return c
    return None

def compute_awt_features(df_slice: pd.DataFrame) -> pd.DataFrame:
    """
    Build AWT-based features + 21 indicators safely.
    Requires SPY OHLCV if available; will fall back where possible.
    Produces:
      SPY_s, SPY_d, and 21 indicators incl. MACD, RSI, ADX, OBV, MFI, WVAD, etc.
    """
    # -------- helpers --------
    def _flatten(df):
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = ['_'.join([str(c) for c in tup if c is not None]).strip()
                          for tup in df.columns]
        else:
            df.columns = [str(c).strip() for c in df.columns]
        return df.loc[:, ~pd.Index(df.columns).duplicated()]

    def _find_col(df, *must_contain):
        must = [m.lower() for m in must_contain]
        for c in df.columns:
            lc = str(c).lower()
            if all(m in lc for m in must):
                return c
        # simple fallbacks for common single-level names
        lut = {
            ('spy', 'open'):   ['SPY_Open', 'Open', 'Open_SPY'],
            ('spy', 'high'):   ['SPY_High', 'High', 'High_SPY'],
            ('spy', 'low'):    ['SPY_Low', 'Low', 'Low_SPY'],
            ('spy', 'close'):  ['SPY_Close', 'Close', 'Close_SPY'],
            ('spy', 'volume'): ['SPY_Volume', 'Volume', 'Volume_SPY'],
        }
        for k, alts in lut.items():
            if set(k) == set(must):
                for a in alts:
                    if a in df.columns:
                        return a
        return None

    def _wavelet_denoise(x, wavelet="db2", level=2, mode="soft"):
        x = pd.Series(x).astype(float)
        coeffs = pywt.wavedec(x.values, wavelet=wavelet, level=level)
        sigma = np.median(np.abs(coeffs[-1])) / 0.6745 if len(coeffs[-1]) else 0.0
        thr   = sigma * np.sqrt(2 * np.log(len(x)))
        coeffs_thr = [coeffs[0]] + [pywt.threshold(c, thr, mode=mode) for c in coeffs[1:]]
        x_s = pywt.waverec(coeffs_thr, wavelet)[:len(x)]
        x_d = x.values[:len(x_s)] - x_s
        return pd.DataFrame({"SPY_s": x_s, "SPY_d": x_d}, index=x.index)

    # -------- start --------
    df_f = _flatten(df_slice.copy())

    # Locate OHLCV columns (best effort)
    c_open   = _find_col(df_f, 'spy', 'open')
    c_high   = _find_col(df_f, 'spy', 'high')
    c_low    = _find_col(df_f, 'spy', 'low')
    c_close  = _find_col(df_f, 'spy', 'close')
    c_volume = _find_col(df_f, 'spy', 'volume')

    # Ensure log returns exist (needed for AWT)
    if 'SPY_log_ret' not in df_f.columns:
        if c_close is None:
            raise KeyError("compute_awt_features() needs SPY close prices to derive SPY_log_ret.")
        df_f['SPY_log_ret'] = np.log(df_f[c_close] / df_f[c_close].shift(1)) * 100

    # AWT on SPY_log_ret -> SPY_s (smooth), SPY_d (detail)
    s_hat = _wavelet_denoise(df_f['SPY_log_ret'])
    df_f = df_f.join(s_hat, how='left')

    # Choose price series for TA (prefer denoised)
    price = df_f['SPY_s'].fillna(df_f['SPY_log_ret'])

    # ---------------- 21 INDICATORS ----------------
    # 1–3 Trend MAs
    df_f['EMA20'] = ta.trend.EMAIndicator(close=price, window=20).ema_indicator()
    df_f['MA5']   = ta.trend.SMAIndicator(close=price, window=5).sma_indicator()
    df_f['MA10']  = ta.trend.SMAIndicator(close=price, window=10).sma_indicator()

    # 4 MACD
    df_f['MACD']  = ta.trend.MACD(close=price).macd()

    # 5 ROC(10)
    df_f['ROC']   = ta.momentum.ROCIndicator(close=price, window=10).roc()

    # 6–7 Momentum (6m, 12m)
    df_f['MTM6']  = price / price.shift(126) - 1
    df_f['MTM12'] = price / price.shift(252) - 1

    # 8 SMI (stochastic momentum index)
    if c_high and c_low:
        stoch = ta.momentum.StochasticOscillator(high=df_f[c_high], low=df_f[c_low],
                                                 close=price, window=14, smooth_window=3)
        df_f['SMI'] = stoch.stoch_signal()
    else:
        df_f['SMI'] = np.nan

    # 9 ATR(14)
    df_f['ATR'] = (ta.volatility.AverageTrueRange(
        high=df_f[c_high] if c_high else price,
        low=df_f[c_low] if c_low else price,
        close=price, window=14
    ).average_true_range())

    # 10–12 Bollinger
    boll = ta.volatility.BollingerBands(close=price, window=20, window_dev=2)
    df_f['BOLL_upper']     = boll.bollinger_hband()
    df_f['BOLL_lower']     = boll.bollinger_lband()
    df_f['BOLL_bandwidth'] = df_f['BOLL_upper'] - df_f['BOLL_lower']

    # 13 CCI(20)
    if c_high and c_low and c_close:
        df_f['CCI'] = ta.trend.CCIIndicator(
            high=df_f[c_high], low=df_f[c_low], close=df_f[c_close], window=20
        ).cci()
    else:
        df_f['CCI'] = np.nan

    # 14 WVAD (needs OHLCV)
    if c_open and c_high and c_low and c_close and c_volume:
        rng = (df_f[c_high] - df_f[c_low]).replace(0, np.nan)
        df_f['WVAD'] = ((df_f[c_close] - df_f[c_open]) / (rng + 1e-10)) * df_f[c_volume]
    else:
        df_f['WVAD'] = np.nan

    # 15 RSI(14)
    df_f['RSI14'] = ta.momentum.RSIIndicator(close=price, window=14).rsi()

    # 16–17 Stochastic %K/%D (classic)
    if c_high and c_low:
        st = ta.momentum.StochasticOscillator(high=df_f[c_high], low=df_f[c_low],
                                              close=price, window=14, smooth_window=3)
        df_f['STO_K'] = st.stoch()
        df_f['STO_D'] = st.stoch_signal()
    else:
        df_f[['STO_K', 'STO_D']] = np.nan

    # 18 ADX(14)
    if c_high and c_low and c_close:
        df_f['ADX14'] = ta.trend.ADXIndicator(
            high=df_f[c_high], low=df_f[c_low], close=df_f[c_close], window=14
        ).adx()
    else:
        df_f['ADX14'] = np.nan

    # 19 OBV
    if c_close and c_volume:
        df_f['OBV'] = ta.volume.OnBalanceVolumeIndicator(
            close=df_f[c_close], volume=df_f[c_volume]
        ).on_balance_volume()
    else:
        df_f['OBV'] = np.nan

    # 20 MFI(14)
    if c_high and c_low and c_close and c_volume:
        df_f['MFI14'] = ta.volume.MFIIndicator(
            high=df_f[c_high], low=df_f[c_low],
            close=df_f[c_close], volume=df_f[c_volume], window=14
        ).money_flow_index()
    else:
        df_f['MFI14'] = np.nan

    # 21 Williams %R(14)
    if c_high and c_low and c_close:
        df_f['WILLR14'] = ta.momentum.WilliamsRIndicator(
            high=df_f[c_high], low=df_f[c_low], close=df_f[c_close], lbp=14
        ).williams_r()
    else:
        df_f['WILLR14'] = np.nan

    # Target (example: next-day log return)
    df_f['y_1d'] = df_f['SPY_log_ret'].shift(-1)

    # Final clean (drop rows introduced by windows/shifts)
    return df_f.dropna()


In [227]:
# ================================================================
# 4️⃣  Scaling Helper
# ================================================================
def fit_transform_scale(X_train, X_val=None, X_test=None):
    """
    Fit StandardScaler on training set; transform others with same parameters.
    """
    scaler = StandardScaler()
    Xtr = scaler.fit_transform(X_train)
    Xva = scaler.transform(X_val) if X_val is not None else None
    Xte = scaler.transform(X_test) if X_test is not None else None
    return Xtr, Xva, Xte, scaler


# ================================================================
# 5️⃣  Rolling Window Generator
# ================================================================
def walk_forward_windows(df, train_span, eval_span, step):
    """
    Generator producing chronological (train_df, eval_df) pairs.
    """
    n = len(df)
    start = 0
    while True:
        tr_start = start
        tr_end   = tr_start + train_span
        ev_end   = tr_end + eval_span
        if ev_end > n:
            break
        yield df.iloc[tr_start:tr_end].copy(), df.iloc[tr_end:ev_end].copy()
        start += step


In [233]:
# ================================================================
# 6️⃣  Walk-Forward Execution Loop
# ================================================================
# --- REPLACE your walk-forward loop with this safer version ---
TRAIN_SPAN = 1500      # ~6 years
EVAL_SPAN  = 60        # ~3 months
STEP       = 20        # ~1 month shift

feature_cols = None
target_col   = 'y_1d'

MIN_TR_ROWS  = 100   # guard: minimum rows required after dropna
MIN_EV_ROWS  = 10

all_preds, all_truth = [], []

made_plots = False  # Track if we’ve already plotted

for w, (df_tr_raw, df_ev_raw) in enumerate(walk_forward_windows(df, TRAIN_SPAN, EVAL_SPAN, STEP), start=1):
    df_tr_raw = flatten_cols(df_tr_raw)
    df_ev_raw = flatten_cols(df_ev_raw)

    # Build features on combined slice
    df_comb       = pd.concat([df_tr_raw, df_ev_raw], axis=0)
    df_feat_comb  = compute_awt_features(df_comb)

    # Split back to train/eval by index, THEN drop NaNs
    tr_end_idx = df_tr_raw.index[-1]
    df_feat_tr = df_feat_comb.loc[:tr_end_idx].dropna()
    df_feat_ev = df_feat_comb.loc[df_ev_raw.index[0]:df_ev_raw.index[-1]].dropna()

    # Window too small? (happens in first windows due to long lookbacks)
    if len(df_feat_tr) < MIN_TR_ROWS or len(df_feat_ev) < MIN_EV_ROWS:
        # optional: print(f"Skipping window {w}: train={len(df_feat_tr)}, eval={len(df_feat_ev)}")
        continue

    # Choose features once (exclude raw returns & targets)
    if feature_cols is None:
        exclude = {'SPY_log_ret', 'DXY_ret', 'WTI_ret'} | {c for c in df_feat_tr.columns if c.startswith('y_')}
        feature_cols = [c for c in df_feat_tr.columns if c not in exclude]

    X_tr = df_feat_tr[feature_cols].values
    y_tr = df_feat_tr[target_col].values
    X_ev = df_feat_ev[feature_cols].values
    y_ev = df_feat_ev[target_col].values

    # Another guard (just in case)
    if X_tr.shape[0] == 0 or X_ev.shape[0] == 0:
        # optional: print(f"Skipping window {w}: empty matrix after selection")
        continue

    # Scale using train stats only
    X_tr_s, _, X_ev_s, scaler = fit_transform_scale(X_tr, None, X_ev)

    # Fit & predict here when ready:
    # model.fit(X_tr_s, y_tr)
    # y_hat = model.predict(X_ev_s)
    # all_preds.append(y_hat); all_truth.append(y_ev)

print("✅ Walk-forward loop ran with NaN-safe slicing and window guards.")


✅ Walk-forward loop ran with NaN-safe slicing and window guards.
