# SP500 / NASDAQ Rolling Forecast Notebook

Paired notebook design for side-by-side TensorFlow vs PyTorch comparison.

Key rules applied:
- Plus1 rolling notation (`N - W + 1`).
- Refit per window.
- CSV-first logging.
- Tuning selection by **MAPE** (R2 is a secondary diagnostic).

## Window Notation (Plus1)

For each `W`:
- `TEST_LEN = max(1, round(W * test_ratio))`
- `win   = df[start : start+W]`
- `train = win[:W-TEST_LEN]`
- `test  = win[W-TEST_LEN:]`

Loop:
- `for start in range(0, N - W + 1, STEP)`

In [None]:
# If needed:
# %pip install yfinance pandas numpy scikit-learn matplotlib seaborn tqdm

In [None]:
import ast
import json
import random
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# Seed the Python and NumPy RNGs so repeated runs start from the same state.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Backend tag: used in log/result paths and to choose the training function.
FRAMEWORK_NAME = 'tensorflow'

# ----------------------------
# Dates and data scope
# ----------------------------
START_DATE = '2024-01-01'
END_DATE = '2025-12-31'
# Windows whose last test date is on or before this date are flagged as tuning windows.
TUNING_END_DATE = '2024-04-30'
BUFFER_MONTHS = 6  # extra history for stable technical indicators

# ----------------------------
# Rolling settings
# ----------------------------
WINDOW_SIZES = [10, 20]
# Test fraction per window size; with these values TEST_LEN = max(1, round(W * ratio)) is 1 for both.
TEST_RATIO_BY_WINDOW = {10: 0.10, 20: 0.05}
STEP = 1  # stride between consecutive window starts

# ----------------------------
# Smoke test controls
# ----------------------------
SMOKE_TEST = True  # when True, every run is capped by the SMOKE_* limits below
SMOKE_MAX_CONFIGS = 1  # number of grid configs kept per model
SMOKE_MAX_WINDOWS = 2  # number of rolling windows processed per config
SMOKE_EPOCH = 2  # upper bound applied to the 'epoch' hyperparameter

MAX_CONFIGS_PER_MODEL = None  # None means full grid

# ----------------------------
# Logging
# ----------------------------
# Per-framework log root; created eagerly so later writes cannot fail on a missing directory.
LOG_BASE = Path('dl_logs_index') / FRAMEWORK_NAME
LOG_BASE.mkdir(parents=True, exist_ok=True)

# Raw OHLCV downloads are cached here as CSV files.
RAW_CACHE_DIR = Path('data_cache') / 'index_ohlcv'
RAW_CACHE_DIR.mkdir(parents=True, exist_ok=True)

TARGET_COL = 'y'  # name of the prediction target column
DATASETS = ['SP500', 'NASDAQ']  # keys into INDEX_META

print('Framework:', FRAMEWORK_NAME)
print('Log base: ', LOG_BASE.resolve())
print('Smoke test enabled:', SMOKE_TEST)

In [None]:
# Per-dataset metadata:
#   symbol - ticker passed to yfinance
#   prefix - lower-case prefix used in feature column names
#   alias  - label used in cache file names and cache log messages
INDEX_META = {
    'SP500': {'symbol': '^GSPC', 'prefix': 'sp500', 'alias': 'SP500'},
    'NASDAQ': {'symbol': '^IXIC', 'prefix': 'nasdaq', 'alias': 'NASDAQ'},
}


def _cache_path(alias: str, start: str, end: str) -> Path:
    """Return the CSV cache location for one index alias and date range.

    The alias is stringified, stripped and upper-cased so the file name does
    not depend on how the caller spelled it.
    """
    normalized_alias = str(alias).strip().upper()
    file_name = f'{normalized_alias}_{start}_{end}.csv'
    return RAW_CACHE_DIR / file_name


def _is_valid_ohlcv_df(df: pd.DataFrame) -> bool:
    """Tell whether *df* looks like a usable OHLCV frame.

    A frame is accepted when it is non-empty, carries the Open/High/Low/
    Close/Volume columns, is indexed by a ``DatetimeIndex`` and that index
    is not made up entirely of missing timestamps.
    """
    if df is None or df.empty:
        return False

    required = ('Open', 'High', 'Low', 'Close', 'Volume')
    has_all_columns = all(col in df.columns for col in required)
    has_datetime_index = isinstance(df.index, pd.DatetimeIndex)
    if not (has_all_columns and has_datetime_index):
        return False

    # An index consisting only of NaT carries no usable dates.
    return not df.index.isna().all()


def _read_cache(cache_path: Path) -> pd.DataFrame | None:
    """Load a cached OHLCV CSV, returning ``None`` when it is unusable.

    The first CSV column is parsed as the date index. Any failure while
    reading or parsing, as well as a frame that fails the OHLCV validity
    check, is reported as ``None`` so the caller can re-download.
    """
    try:
        cached = pd.read_csv(cache_path, index_col=0, parse_dates=True)
        cached.index.name = None
        return cached if _is_valid_ohlcv_df(cached) else None
    except Exception:
        # Best-effort cache: a broken file is simply treated as a miss.
        return None


def _download_ohlcv(symbol: str, start: str, end: str, alias: str) -> pd.DataFrame:
    """Return cleaned OHLCV data for *symbol*, using the CSV cache when possible.

    A valid cache file is loaded and returned directly. An invalid cache file
    is deleted (best effort) and the data is downloaded again through
    yfinance, validated, cleaned, written to the cache and returned.

    Raises:
        ValueError: if the downloaded frame is empty/invalid, or becomes
            empty once rows with missing values are dropped.
    """
    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    target = _cache_path(alias, start, end)

    if target.exists():
        from_cache = _read_cache(target)
        if from_cache is not None:
            print(f'[CACHE] load {alias} <- {target}')
            return from_cache[ohlcv_cols].dropna().sort_index()

        print(f'[CACHE] invalid/empty cache for {alias}; redownloading and replacing {target}')
        try:
            target.unlink()
        except Exception:
            # Removal is best effort; the file is overwritten below anyway.
            pass

    print(f'[CACHE] miss {alias}; downloading...')
    fetched = yf.download(symbol, start=start, end=end, progress=False, auto_adjust=False)
    if isinstance(fetched.columns, pd.MultiIndex):
        # Keep only the first level of each column tuple.
        fetched.columns = [col_tuple[0] for col_tuple in fetched.columns]

    if not _is_valid_ohlcv_df(fetched):
        raise ValueError(f'Downloaded data for {symbol} is empty/invalid for {start}..{end}')

    cleaned = fetched[ohlcv_cols].dropna().sort_index()
    if cleaned.empty:
        raise ValueError(f'Downloaded data for {symbol} became empty after cleanup ({start}..{end})')

    cleaned.to_csv(target)
    print(f'[CACHE] save {alias} -> {target}')
    return cleaned


def _rsi(close: pd.Series, period: int = 14) -> pd.Series:
    """Compute the Relative Strength Index of *close*.

    Gains and losses are smoothed with an exponentially weighted mean using
    ``alpha = 1 / period`` and ``adjust=False``.

    Args:
        close: Closing prices.
        period: Smoothing period.

    Returns:
        RSI values on the 0-100 scale, aligned with *close*. The first value
        is NaN (no price change is available). When the smoothed loss is zero
        while the smoothed gain is positive the RSI is 100; when both are
        zero the RSI is left as NaN because it is undefined.
    """
    d = close.diff()
    up = d.clip(lower=0)
    dn = -d.clip(upper=0)
    ma_up = up.ewm(alpha=1 / period, adjust=False).mean()
    ma_dn = dn.ewm(alpha=1 / period, adjust=False).mean()

    # Guard the division; zero-loss rows are resolved explicitly below.
    rs = ma_up / ma_dn.replace(0, np.nan)
    rsi = 100 - (100 / (1 + rs))

    # No losses but some gains: RS is infinite, so RSI saturates at 100
    # instead of being reported as missing.
    no_loss = (ma_dn == 0) & (ma_up > 0)
    return rsi.mask(no_loss, 100.0)


def build_single_index_df(index_key: str) -> pd.DataFrame:
    """Build the feature/target frame for one index.

    Prices are fetched from ``BUFFER_MONTHS`` before ``START_DATE`` so the
    rolling indicators are already populated at ``START_DATE``. Five
    indicators derived from the close price are computed, the target is the
    next row's close, and the result is trimmed to
    ``START_DATE``..``END_DATE`` with incomplete rows removed.
    """
    info = INDEX_META[index_key]
    ticker, pfx, label = info['symbol'], info['prefix'], info['alias']

    # Extra lead-in history for the technical indicators.
    lookback_start = pd.Timestamp(START_DATE) - pd.DateOffset(months=BUFFER_MONTHS)
    ohlcv = _download_ohlcv(ticker, lookback_start.strftime('%Y-%m-%d'), END_DATE, label)
    px = ohlcv['Close']

    # MACD line (EMA12 - EMA26) and its 9-span signal line.
    macd_line = px.ewm(span=12, adjust=False).mean() - px.ewm(span=26, adjust=False).mean()
    macd_signal = macd_line.ewm(span=9, adjust=False).mean()

    # 20-row rolling statistics for the Bollinger z-score.
    roll20 = px.rolling(20)
    band_std = roll20.std().replace(0, np.nan)

    frame = pd.DataFrame(index=ohlcv.index)
    frame[f'{pfx}_rsi14'] = _rsi(px, 14)
    frame[f'{pfx}_macd_hist'] = macd_line - macd_signal
    frame[f'{pfx}_bb_z20'] = (px - roll20.mean()) / band_std
    frame[f'{pfx}_roc10'] = px.pct_change(10)
    frame[f'{pfx}_vol20'] = px.pct_change().rolling(20).std()

    # Target: the close of the following row for the same index.
    frame[TARGET_COL] = px.shift(-1)

    # Restrict to the pipeline horizon only after the indicators exist.
    return frame.loc[START_DATE:END_DATE].dropna().copy()


def feature_cols_for(index_key: str) -> List[str]:
    """List the five indicator column names for the given index key."""
    prefix = INDEX_META[index_key]['prefix']
    suffixes = ('rsi14', 'macd_hist', 'bb_z20', 'roc10', 'vol20')
    return [f'{prefix}_{suffix}' for suffix in suffixes]


In [None]:
def iter_windows_plus1(df: pd.DataFrame, window_size: int, test_ratio: float, step: int = 1):
    """Yield rolling train/test splits using the Plus1 convention.

    Window starts run over ``range(0, N - W + 1, step)``. Each window of
    ``window_size`` rows is split so that its last
    ``max(1, round(window_size * test_ratio))`` rows are the test part.

    Yields:
        dict with keys ``start``, ``end_exclusive``, ``test_len``,
        ``train_df`` and ``test_df``.
    """
    n_test = max(1, int(round(window_size * test_ratio)))
    split_at = window_size - n_test
    start_stop = len(df) - window_size + 1

    for offset in range(0, start_stop, step):
        stop = offset + window_size
        window = df.iloc[offset:stop]
        yield dict(
            start=offset,
            end_exclusive=stop,
            test_len=n_test,
            train_df=window.iloc[:split_at],
            test_df=window.iloc[split_at:],
        )


def scale_xy_window(train_df: pd.DataFrame, test_df: pd.DataFrame, feature_cols: List[str], target_col: str):
    """Standardise features and target of one window using train statistics only.

    Both scalers are fitted on the training part and then applied to the
    training and the test part.

    Returns:
        ``(X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled,
        target_scaler)``; the arrays are float32 and the target arrays are 1-D.
    """

    def _features(frame: pd.DataFrame) -> np.ndarray:
        return frame[feature_cols].to_numpy(dtype=np.float32)

    def _target_column(frame: pd.DataFrame) -> np.ndarray:
        # The scaler expects a 2-D array with a single column.
        return frame[target_col].to_numpy(dtype=np.float32).reshape(-1, 1)

    feats_train, feats_test = _features(train_df), _features(test_df)
    targ_train, targ_test = _target_column(train_df), _target_column(test_df)

    feature_scaler = StandardScaler().fit(feats_train)
    target_scaler = StandardScaler().fit(targ_train)

    return (
        feature_scaler.transform(feats_train).astype(np.float32),
        feature_scaler.transform(feats_test).astype(np.float32),
        target_scaler.transform(targ_train).ravel().astype(np.float32),
        target_scaler.transform(targ_test).ravel().astype(np.float32),
        target_scaler,
    )


def parse_list_like(x: Any) -> List[float]:
    """Convert a logged list-like value into a list of floats.

    Accepted inputs:
      * a list, tuple or NumPy array of numbers (arrays are flattened);
      * ``None`` or a float NaN, which give ``[]``;
      * anything else is stringified and parsed first as a Python literal
        and then as JSON. A parsed list/tuple is converted element-wise, a
        parsed scalar becomes a one-element list.

    Returns ``[]`` when the value cannot be parsed by either parser.
    """
    if isinstance(x, np.ndarray):
        return [float(v) for v in x.ravel()]
    if isinstance(x, (list, tuple)):
        return [float(v) for v in x]
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []

    s = str(x)
    for fn in (ast.literal_eval, json.loads):
        try:
            v = fn(s)
            if isinstance(v, (list, tuple)):
                return [float(z) for z in v]
            return [float(v)]
        except Exception:
            # Fall through to the next parser; give up only after both fail.
            pass
    return []


def mape_r2(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
    """Compute MAPE (in percent) and R2 over the finite pairs of both arrays.

    Pairs where either value is not finite are ignored. With no usable pair
    both metrics are NaN; with a single pair R2 is NaN.

    Returns:
        dict with keys ``MAPE``, ``R2`` and ``n_points``.
    """
    finite = np.isfinite(y_true) & np.isfinite(y_pred)
    actual, predicted = y_true[finite], y_pred[finite]
    n_points = int(len(actual))

    if n_points == 0:
        return {'MAPE': np.nan, 'R2': np.nan, 'n_points': 0}

    result = {'MAPE': float(mean_absolute_percentage_error(actual, predicted) * 100.0)}
    result['R2'] = float(r2_score(actual, predicted)) if n_points > 1 else np.nan
    result['n_points'] = n_points
    return result

In [None]:
# Requested grid: every hyperparameter and its candidate values.
# The last two entries are model-specific (TCN / Transformer).
# NOTE: the key 'transformer_attention heads' contains a space; the same
# spelling is used by TRANS_KEYS and by build_tf_model, so keep them in sync.
BASE_GRID = {
    'hidden_dim': [32, 64],
    'num_layers': [2, 3],
    'dropout': [0.0, 0.2],
    'learning_rate': [1e-3, 1e-4],
    'batch_size': [32, 64],
    'epoch': [32, 64],
    'activation': ['relu', 'tanh'],
    'tcn_dilation_rate': [2, 4],
    'transformer_attention heads': [2, 4],
}


def expand_grid(d: Dict[str, List[Any]]) -> List[Dict[str, Any]]:
    """Expand a dict of candidate lists into every combination.

    Combinations are ordered with the first key varying slowest and the last
    key varying fastest; each result dict lists its keys in the order of *d*.
    An empty *d* yields ``[{}]``; any key with no candidates yields ``[]``.
    """
    combos: List[Dict[str, Any]] = [{}]
    for name in list(d.keys()):
        candidates = d[name]
        # Extend every partial combination with each candidate of this key.
        combos = [{**partial, name: value} for partial in combos for value in candidates]
    return combos


# Hyperparameters shared by every model; TCN and Transformer add one extra key each.
CORE_KEYS = ['hidden_dim', 'num_layers', 'dropout', 'learning_rate', 'batch_size', 'epoch', 'activation']
TCN_KEYS = CORE_KEYS + ['tcn_dilation_rate']
TRANS_KEYS = CORE_KEYS + ['transformer_attention heads']

# One fully expanded grid per model. MLP/RNN/LSTM/GRU are built from the same
# keys, so their grids hold the same configurations in the same order.
MLP_GRID = expand_grid({k: BASE_GRID[k] for k in CORE_KEYS})
RNN_GRID = expand_grid({k: BASE_GRID[k] for k in CORE_KEYS})
LSTM_GRID = expand_grid({k: BASE_GRID[k] for k in CORE_KEYS})
GRU_GRID = expand_grid({k: BASE_GRID[k] for k in CORE_KEYS})
TCN_GRID = expand_grid({k: BASE_GRID[k] for k in TCN_KEYS})
TRANSFORMER_GRID = expand_grid({k: BASE_GRID[k] for k in TRANS_KEYS})

print('Grid sizes -> MLP/RNN/LSTM/GRU:', len(MLP_GRID), 'TCN:', len(TCN_GRID), 'Transformer:', len(TRANSFORMER_GRID))

## Framework-Specific Cell (Different Between Notebooks)

The next code cell is the only implementation-level difference (training backend).

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed TensorFlow's global RNG with the notebook-wide seed.
tf.random.set_seed(SEED)

# Device selection: use the first device TensorFlow lists as 'GPU', otherwise CPU.
# NOTE(review): there is no separate MPS/Metal branch here; presumably a Metal
# device is reported as 'GPU' when the Metal plugin is installed - confirm.
_tf_gpus = tf.config.list_physical_devices('GPU')
if _tf_gpus:
    TF_DEVICE = '/GPU:0'
    TF_DEVICE_NAME = _tf_gpus[0].name
else:
    TF_DEVICE = '/CPU:0'
    TF_DEVICE_NAME = 'CPU'
print(f'[TF] Device selected: {TF_DEVICE} ({TF_DEVICE_NAME})')


def build_tf_model(model_name: str, n_features: int, hp: Dict[str, Any]):
    """Build and compile a Keras regression model for one feature vector.

    Args:
        model_name: One of 'MLP', 'RNN', 'LSTM', 'GRU', 'TCN', 'Transformer'.
        n_features: Number of input features; the model input has shape
            ``(n_features,)``.
        hp: Hyperparameters. Required keys: 'hidden_dim', 'num_layers',
            'dropout', 'activation', 'learning_rate'. Optional keys
            'tcn_dilation_rate' and 'transformer_attention heads' both
            default to 2.

    Returns:
        A ``keras.Model`` with a single linear output unit, compiled with the
        Adam optimizer and MSE loss.

    Raises:
        ValueError: if *model_name* is not one of the supported names.
    """
    h = int(hp['hidden_dim'])
    n = int(hp['num_layers'])
    d = float(hp['dropout'])
    act = hp['activation']
    tcn_d = int(hp.get('tcn_dilation_rate', 2))
    n_heads = int(hp.get('transformer_attention heads', 2))

    x_in = keras.Input(shape=(n_features,))

    if model_name == 'MLP':
        # Stack of Dense + Dropout blocks applied directly to the feature vector.
        x = x_in
        for _ in range(n):
            x = layers.Dense(h, activation=act)(x)
            x = layers.Dropout(d)(x)

    elif model_name in {'RNN', 'LSTM', 'GRU'}:
        # The feature vector is presented as a sequence of length 1.
        x = layers.Reshape((1, n_features))(x_in)
        for i in range(n):
            # Every recurrent layer except the last must return the full sequence.
            rs = i < (n - 1)
            if model_name == 'RNN':
                x = layers.SimpleRNN(h, activation=act, return_sequences=rs)(x)
            elif model_name == 'LSTM':
                x = layers.LSTM(h, activation=act, return_sequences=rs)(x)
            else:
                x = layers.GRU(h, activation=act, return_sequences=rs)(x)
            x = layers.Dropout(d)(x)

    elif model_name == 'TCN':
        # Causal dilated convolutions over a length-1 sequence, then pooled over time.
        x = layers.Reshape((1, n_features))(x_in)
        for _ in range(n):
            x = layers.Conv1D(h, kernel_size=2, padding='causal', dilation_rate=tcn_d, activation=act)(x)
            x = layers.Dropout(d)(x)
        x = layers.GlobalAveragePooling1D()(x)

    elif model_name == 'Transformer':
        x = layers.Reshape((1, n_features))(x_in)
        # Model width is at least the number of heads so key_dim stays >= 1.
        d_model = max(h, n_heads)
        x = layers.Dense(d_model)(x)
        for _ in range(n):
            # Self-attention sub-layer with residual connection and layer norm.
            a = layers.MultiHeadAttention(num_heads=n_heads, key_dim=max(1, d_model // n_heads), dropout=d)(x, x)
            x = layers.LayerNormalization()(x + a)
            # Feed-forward sub-layer, projected back to d_model for the residual add.
            ff = layers.Dense(h, activation=act)(x)
            ff = layers.Dropout(d)(ff)
            ff = layers.Dense(d_model)(ff)
            x = layers.LayerNormalization()(x + ff)
        x = layers.GlobalAveragePooling1D()(x)

    else:
        raise ValueError(model_name)

    # Single linear unit: the model predicts the (scaled) target.
    y = layers.Dense(1)(x)
    m = keras.Model(x_in, y)
    m.compile(optimizer=keras.optimizers.Adam(learning_rate=float(hp['learning_rate'])), loss='mse')
    return m


def fit_predict_window_tf(model_name: str, hp: Dict[str, Any], X_tr_s, y_tr_s, X_te_s):
    """Train a fresh model on one window and predict its test rows.

    Args:
        model_name: Architecture name understood by ``build_tf_model``.
        hp: Hyperparameters for this configuration; not modified.
        X_tr_s: Scaled training features, 2-D (rows x features).
        y_tr_s: Scaled training targets.
        X_te_s: Scaled test features.

    Returns:
        1-D array of predictions for ``X_te_s`` on the scaled target scale.
    """
    hp_run = dict(hp)
    if SMOKE_TEST:
        # Cap the number of epochs so smoke runs stay fast.
        hp_run['epoch'] = min(int(hp_run['epoch']), int(SMOKE_EPOCH))

    # A new model is built for every window and configuration. Keras keeps
    # global state for each model it creates, so that state is released
    # first; otherwise memory grows steadily over a full rolling sweep.
    keras.backend.clear_session()

    with tf.device(TF_DEVICE):
        m = build_tf_model(model_name, X_tr_s.shape[1], hp_run)
        m.fit(X_tr_s, y_tr_s, epochs=int(hp_run['epoch']), batch_size=int(hp_run['batch_size']), verbose=0)
        yp = m.predict(X_te_s, verbose=0).reshape(-1)
    return yp

print('Defined: fit_predict_window_tf')


In [None]:
# ----------------------------
# Rolling run + logging
# ----------------------------

def _log_path(dataset_tag: str, model_name: str, w: int) -> Path:
    """Return the CSV log path for a dataset/model/window size.

    The containing directory ``LOG_BASE/<dataset>/w<W>/<MODEL>`` is created
    if it does not exist yet.
    """
    model_tag = model_name.upper()
    folder = LOG_BASE / dataset_tag / f'w{int(w)}' / model_tag
    folder.mkdir(parents=True, exist_ok=True)
    return folder / f'{model_tag}_{dataset_tag}.csv'


def run_grid_and_log(
    *,
    dataset_tag: str,
    df: pd.DataFrame,
    feature_cols: List[str],
    model_name: str,
    model_grid: List[Dict[str, Any]],
    window_size: int,
    test_ratio: float,
    tuning_end_date: str,
    overwrite: bool = False,
) -> pd.DataFrame:
    """Refit a model on every rolling window for every config and log the results.

    For each hyperparameter configuration and each Plus1 window the data is
    scaled on the training part, a fresh model is trained, and predictions
    for the test part are stored (on the original target scale) as one row.

    Args:
        dataset_tag: Dataset label written to the log and used in its path.
        df: Feature/target frame with a datetime index.
        feature_cols: Names of the feature columns in *df*.
        model_name: Architecture name passed to the training function.
        model_grid: Hyperparameter configurations to evaluate.
        window_size: Number of rows per rolling window.
        test_ratio: Fraction of each window used as test rows.
        tuning_end_date: Windows whose last test date is on or before this
            date are flagged as tuning windows.
        overwrite: If True, an existing log file is deleted first; otherwise
            new rows are appended to the existing log.

    Returns:
        The frame that was written: the new rows, preceded by the previous
        log content when appending. Empty if no rows were produced.
    """
    log_path = _log_path(dataset_tag, model_name, window_size)

    # Optionally truncate the grid (global cap first, then the smoke-test cap).
    grid = model_grid
    if MAX_CONFIGS_PER_MODEL is not None:
        grid = grid[:MAX_CONFIGS_PER_MODEL]
    if SMOKE_TEST:
        grid = grid[:SMOKE_MAX_CONFIGS]

    if overwrite and log_path.exists():
        log_path.unlink()

    rows = []
    tune_cut = pd.Timestamp(tuning_end_date)

    for cfg_idx, hp in enumerate(tqdm(grid, desc=f'{dataset_tag}-{model_name}-w{window_size}', unit='cfg')):
        w_iter = iter_windows_plus1(df, window_size, test_ratio, STEP)
        for win_id, win in enumerate(w_iter, start=1):
            if SMOKE_TEST and win_id > SMOKE_MAX_WINDOWS:
                break

            tr_df = win['train_df']
            te_df = win['test_df']
            # Skip degenerate splits (e.g. test_len >= window_size leaves no training rows).
            if tr_df.empty or te_df.empty:
                continue

            # Scalers are fitted on this window's training rows only.
            X_tr_s, X_te_s, y_tr_s, y_te_s, sy = scale_xy_window(tr_df, te_df, feature_cols, TARGET_COL)

            if FRAMEWORK_NAME == 'tensorflow':
                y_pred_s = fit_predict_window_tf(model_name, hp, X_tr_s, y_tr_s, X_te_s)
            else:
                # NOTE(review): fit_predict_window_torch is not defined in this
                # notebook; presumably it comes from the paired notebook - confirm.
                y_pred_s = fit_predict_window_torch(model_name, hp, X_tr_s, y_tr_s, X_te_s)

            # Map predictions and test targets back to the original target scale.
            y_pred = sy.inverse_transform(np.asarray(y_pred_s).reshape(-1, 1)).ravel()
            y_true = sy.inverse_transform(np.asarray(y_te_s).reshape(-1, 1)).ravel()

            rows.append({
                'framework': FRAMEWORK_NAME,
                'dataset_tag': dataset_tag,
                'model_name': model_name.upper(),
                'config_index': int(cfg_idx),
                'model_hyperparameters_dict': json.dumps(hp, sort_keys=True),
                'window_id': int(win_id),
                'window_size': int(window_size),
                'test_ratio': float(test_ratio),
                'test_len': int(win['test_len']),
                'start_date': str(tr_df.index[0].date()),
                'end_date': str(te_df.index[-1].date()),
                'is_tuning_window': bool(pd.Timestamp(te_df.index[-1]) <= tune_cut),
                'test_data_values_list': json.dumps([float(v) for v in y_true.tolist()]),
                'test_data_model_predictions_list': json.dumps([float(v) for v in y_pred.tolist()]),
            })

    out = pd.DataFrame(rows)
    if not out.empty:
        # Append mode: keep the rows already in the log and add the new ones.
        if log_path.exists() and not overwrite:
            old = pd.read_csv(log_path)
            out = pd.concat([old, out], ignore_index=True)
        out.to_csv(log_path, index=False)

    print(f'[INFO] {dataset_tag} {model_name} w{window_size} rows={len(rows)} -> {log_path}')
    return out

In [None]:
# Build datasets independently (SP500-only and NASDAQ-only)
DATA_BY_INDEX = {k: build_single_index_df(k) for k in DATASETS}
FEATURES_BY_INDEX = {k: feature_cols_for(k) for k in DATASETS}

# Print a per-dataset summary: row count, feature names, date range and the
# number of Plus1 windows each window size will produce.
for k in DATASETS:
    d = DATA_BY_INDEX[k]
    f = FEATURES_BY_INDEX[k]

    if d.empty or not isinstance(d.index, pd.DatetimeIndex):
        print(f'{k}: rows=0, features={f}, range=<empty>')
        continue

    idx_min = d.index.min()
    idx_max = d.index.max()
    left = idx_min.date() if pd.notna(idx_min) else '<na>'
    right = idx_max.date() if pd.notna(idx_max) else '<na>'
    print(f'{k}: rows={len(d)}, features={f}, range={left} -> {right}')

    for W in WINDOW_SIZES:
        ratio = TEST_RATIO_BY_WINDOW[W]
        test_len = max(1, int(round(W * ratio)))
        # Number of starts in range(0, N - W + 1, STEP), clamped at 0 when N < W.
        n_windows = max(0, (len(d) - W) // STEP + 1)
        print(f'  [PLUS1] W={W}, ratio={ratio:.4f}, TEST_LEN={test_len}, windows={n_windows}, loop=range(0, N-{W}+1, {STEP})')


## Optional Smoke Test Cell

This quick check runs a single model (MLP on SP500, `W=10`), limited to `SMOKE_MAX_CONFIGS` configs and `SMOKE_MAX_WINDOWS` windows, so you can verify end-to-end logging before full sweeps.

In [None]:
# Smoke run: MLP on SP500 with W=10. The config/window/epoch caps are applied
# inside the run and training functions while SMOKE_TEST is True.
if SMOKE_TEST:
    _ = run_grid_and_log(
        dataset_tag='SP500',
        df=DATA_BY_INDEX['SP500'],
        feature_cols=FEATURES_BY_INDEX['SP500'],
        model_name='MLP',
        model_grid=MLP_GRID,
        window_size=10,
        test_ratio=TEST_RATIO_BY_WINDOW[10],
        tuning_end_date=TUNING_END_DATE,
        overwrite=True,  # replace any existing log for this dataset/model/window
    )
else:
    print('SMOKE_TEST is False; skipping smoke run.')

## Model Runs (Separated)

No outer `for model in MODELS` loop; each model has its own cell.

In [None]:
# MLP
# Sweep the MLP grid over every dataset and window size; overwrite=True
# replaces the existing log file of each (dataset, window) pair.
for ds in DATASETS:
    for W in WINDOW_SIZES:
        run_grid_and_log(
            dataset_tag=ds,
            df=DATA_BY_INDEX[ds],
            feature_cols=FEATURES_BY_INDEX[ds],
            model_name='MLP',
            model_grid=MLP_GRID,
            window_size=W,
            test_ratio=TEST_RATIO_BY_WINDOW[W],
            tuning_end_date=TUNING_END_DATE,
            overwrite=True,
        )

In [None]:
# RNN
# Sweep the RNN grid over every dataset and window size; overwrite=True
# replaces the existing log file of each (dataset, window) pair.
for ds in DATASETS:
    for W in WINDOW_SIZES:
        run_grid_and_log(
            dataset_tag=ds,
            df=DATA_BY_INDEX[ds],
            feature_cols=FEATURES_BY_INDEX[ds],
            model_name='RNN',
            model_grid=RNN_GRID,
            window_size=W,
            test_ratio=TEST_RATIO_BY_WINDOW[W],
            tuning_end_date=TUNING_END_DATE,
            overwrite=True,
        )

In [None]:
# LSTM
# Sweep the LSTM grid over every dataset and window size; overwrite=True
# replaces the existing log file of each (dataset, window) pair.
for ds in DATASETS:
    for W in WINDOW_SIZES:
        run_grid_and_log(
            dataset_tag=ds,
            df=DATA_BY_INDEX[ds],
            feature_cols=FEATURES_BY_INDEX[ds],
            model_name='LSTM',
            model_grid=LSTM_GRID,
            window_size=W,
            test_ratio=TEST_RATIO_BY_WINDOW[W],
            tuning_end_date=TUNING_END_DATE,
            overwrite=True,
        )

In [None]:
# GRU
# Sweep the GRU grid over every dataset and window size; overwrite=True
# replaces the existing log file of each (dataset, window) pair.
for ds in DATASETS:
    for W in WINDOW_SIZES:
        run_grid_and_log(
            dataset_tag=ds,
            df=DATA_BY_INDEX[ds],
            feature_cols=FEATURES_BY_INDEX[ds],
            model_name='GRU',
            model_grid=GRU_GRID,
            window_size=W,
            test_ratio=TEST_RATIO_BY_WINDOW[W],
            tuning_end_date=TUNING_END_DATE,
            overwrite=True,
        )

In [None]:
# TCN
# Sweep the TCN grid (core keys plus 'tcn_dilation_rate') over every dataset
# and window size; overwrite=True replaces the existing log file of each pair.
for ds in DATASETS:
    for W in WINDOW_SIZES:
        run_grid_and_log(
            dataset_tag=ds,
            df=DATA_BY_INDEX[ds],
            feature_cols=FEATURES_BY_INDEX[ds],
            model_name='TCN',
            model_grid=TCN_GRID,
            window_size=W,
            test_ratio=TEST_RATIO_BY_WINDOW[W],
            tuning_end_date=TUNING_END_DATE,
            overwrite=True,
        )

In [None]:
# Transformer
# Sweep the Transformer grid (core keys plus the attention-heads key) over
# every dataset and window size; overwrite=True replaces each existing log.
for ds in DATASETS:
    for W in WINDOW_SIZES:
        run_grid_and_log(
            dataset_tag=ds,
            df=DATA_BY_INDEX[ds],
            feature_cols=FEATURES_BY_INDEX[ds],
            model_name='Transformer',
            model_grid=TRANSFORMER_GRID,
            window_size=W,
            test_ratio=TEST_RATIO_BY_WINDOW[W],
            tuning_end_date=TUNING_END_DATE,
            overwrite=True,
        )

In [None]:
# Score logs by (dataset_tag, model, window_size, config_index)

def collect_log_files(base: Path) -> List[Path]:
    """Return, sorted, every CSV file located exactly three directories below *base*."""
    matches = [candidate for candidate in base.glob('*/*/*/*.csv') if candidate.is_file()]
    matches.sort()
    return matches


def _coerce_bool(value: Any) -> bool:
    """Interpret a logged flag as a boolean; missing values count as False."""
    if isinstance(value, str):
        # A text column would otherwise make 'False' truthy.
        return value.strip().lower() in ('true', '1', 'yes')
    if value is None or pd.isna(value):
        return False
    return bool(value)


def score_one_log(path: Path) -> pd.DataFrame:
    """Score one rolling-run log file per (dataset, model, window size, config).

    The test values and predictions of all windows in a group are pooled
    separately for tuning windows and evaluation windows, and MAPE / R2 are
    computed on each pool.

    Args:
        path: CSV log written by the rolling run.

    Returns:
        One row per group with the tuning and evaluation metrics, or an empty
        frame when the log has no rows.
    """
    d = pd.read_csv(path)
    if d.empty:
        return pd.DataFrame()

    out = []
    gobj = d.groupby(['dataset_tag', 'model_name', 'window_size', 'config_index'], dropna=True)
    for (dataset_tag, model_name, w, cfg_idx), g in gobj:
        g = g.sort_values('window_id', kind='stable')

        yt_tune, yp_tune = [], []
        yt_eval, yp_eval = [], []

        # First non-missing hyperparameter string of the group; '{}' when the
        # column is absent or holds only missing values.
        hp = '{}'
        if 'model_hyperparameters_dict' in g.columns:
            hp_values = g['model_hyperparameters_dict'].dropna()
            if not hp_values.empty:
                hp = hp_values.iloc[0]

        for _, r in g.iterrows():
            yt = parse_list_like(r.get('test_data_values_list'))
            yp = parse_list_like(r.get('test_data_model_predictions_list'))
            if not yt or not yp:
                continue
            # Plain bool() would treat NaN and the string 'False' as True and
            # leak evaluation windows into the tuning pool.
            if _coerce_bool(r.get('is_tuning_window')):
                yt_tune.extend(yt)
                yp_tune.extend(yp)
            else:
                yt_eval.extend(yt)
                yp_eval.extend(yp)

        m_tune = mape_r2(np.asarray(yt_tune, dtype=float), np.asarray(yp_tune, dtype=float))
        m_eval = mape_r2(np.asarray(yt_eval, dtype=float), np.asarray(yp_eval, dtype=float))

        out.append({
            'framework': FRAMEWORK_NAME,
            'dataset_tag': dataset_tag,
            'model': str(model_name).upper(),
            'window_size': int(w),
            'config_index': int(cfg_idx),
            'hyperparams_json': hp,
            'tune_MAPE': m_tune['MAPE'],
            'tune_R2': m_tune['R2'],
            'tune_n': m_tune['n_points'],
            'eval_MAPE': m_eval['MAPE'],
            'eval_R2': m_eval['R2'],
            'eval_n': m_eval['n_points'],
        })

    return pd.DataFrame(out)

# Score every log file found under LOG_BASE and stack the per-file results.
parts = []
for fp in collect_log_files(LOG_BASE):
    s = score_one_log(fp)
    if not s.empty:
        parts.append(s)

# An empty frame (rather than an error) is used when no log produced scores.
scores_df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
print('Score rows:', len(scores_df))
if not scores_df.empty:
    display(scores_df.head(20))

In [None]:
# Select best config by MAPE (default criterion)
# Ranking per (dataset, model, window size): lowest tune_MAPE first, then
# highest tune_R2, then lowest config_index as the final tie-breaker.
# NOTE(review): rows with a missing tune_MAPE are not filtered out here,
# unlike in the reporting cell - confirm this is intended.
if scores_df.empty:
    print('No scores found yet.')
else:
    best_rows = []
    for (ds, model, w), g in scores_df.groupby(['dataset_tag', 'model', 'window_size'], as_index=False):
        pick = g.sort_values(['tune_MAPE', 'tune_R2', 'config_index'], ascending=[True, False, True], kind='stable').iloc[0]
        best_rows.append(pick.to_dict())

    best_df = pd.DataFrame(best_rows).sort_values(['dataset_tag', 'window_size', 'model'], kind='stable')
    display(best_df[['framework', 'dataset_tag', 'model', 'window_size', 'config_index', 'tune_MAPE', 'tune_R2', 'eval_MAPE', 'eval_R2', 'hyperparams_json']])

    # Persist both the full score table and the per-group winners.
    out_scores = Path('results') / f'index_config_scores_{FRAMEWORK_NAME}.csv'
    out_best = Path('results') / f'index_best_configs_{FRAMEWORK_NAME}.csv'
    out_scores.parent.mkdir(parents=True, exist_ok=True)
    scores_df.to_csv(out_scores, index=False)
    best_df.to_csv(out_best, index=False)
    print('[INFO] wrote', out_scores)
    print('[INFO] wrote', out_best)

In [None]:
# Quick visual summary
# Two horizontal bar charts over all scored configs: eval MAPE and eval R2.
if scores_df.empty:
    print('No scores available.')
else:
    sns.set_theme(style='whitegrid')

    fig, axes = plt.subplots(1, 2, figsize=(14, 5), constrained_layout=True)
    bdf = scores_df.copy()
    # NOTE(review): the label omits config_index, so several configs of the
    # same dataset/model/window share one label - confirm this is intended.
    bdf['label'] = bdf['dataset_tag'] + '-' + bdf['model'] + '-w' + bdf['window_size'].astype(str)

    m1 = bdf.sort_values('eval_MAPE', kind='stable')
    axes[0].barh(m1['label'], m1['eval_MAPE'])
    axes[0].set_title(f'{FRAMEWORK_NAME.upper()} Eval MAPE')
    axes[0].set_xlabel('MAPE (%)')

    m2 = bdf.sort_values('eval_R2', kind='stable')
    axes[1].barh(m2['label'], m2['eval_R2'])
    axes[1].set_title(f'{FRAMEWORK_NAME.upper()} Eval R2')
    axes[1].set_xlabel('R2')

    plt.show()

## Visuals

This section generates paper-ready figures and saves them under `results/figures_<framework>/`.


In [None]:
# Comprehensive visuals for reporting
# Produces four figure types plus two CSV tables under results/figures_<framework>/.
if scores_df.empty:
    print('No scores available.')
else:
    import matplotlib.ticker as mtick

    fig_dir = Path('results') / f'figures_{FRAMEWORK_NAME}'
    fig_dir.mkdir(parents=True, exist_ok=True)

    # Work on a copy and force the metric/id columns to numeric (bad values become NaN).
    plot_df = scores_df.copy()
    for c in ['tune_MAPE', 'tune_R2', 'eval_MAPE', 'eval_R2', 'window_size', 'config_index']:
        plot_df[c] = pd.to_numeric(plot_df[c], errors='coerce')

    # Best-by-tuning-MAPE table (default selection criterion)
    # Groups without any valid tune_MAPE are skipped entirely.
    best_rows = []
    for (ds, model, w), g in plot_df.groupby(['dataset_tag', 'model', 'window_size'], as_index=False):
        g = g.dropna(subset=['tune_MAPE'])
        if g.empty:
            continue
        pick = g.sort_values(['tune_MAPE', 'tune_R2', 'config_index'], ascending=[True, False, True], kind='stable').iloc[0]
        best_rows.append(pick.to_dict())
    best_plot_df = pd.DataFrame(best_rows)

    if best_plot_df.empty:
        print('No valid rows after filtering for visuals.')
    else:
        sns.set_theme(style='whitegrid', context='talk')

        # 1) Heatmap: eval MAPE for best tuned config per model/dataset/window
        for ds, g in best_plot_df.groupby('dataset_tag'):
            pv = g.pivot_table(index='model', columns='window_size', values='eval_MAPE', aggfunc='mean')
            plt.figure(figsize=(7, 5))
            ax = sns.heatmap(pv, annot=True, fmt='.3f', cmap='YlGnBu', cbar_kws={'label': 'Eval MAPE (%)'})
            ax.set_title(f'{FRAMEWORK_NAME.upper()} {ds}: Eval MAPE Heatmap (best by tune MAPE)')
            ax.set_xlabel('Window Size')
            ax.set_ylabel('Model')
            plt.tight_layout()
            out = fig_dir / f'heatmap_eval_mape_{ds.lower()}.png'
            plt.savefig(out, dpi=300)
            plt.show()

        # 2) Grouped bars: eval MAPE by model/window for each dataset
        for ds, g in best_plot_df.groupby('dataset_tag'):
            plt.figure(figsize=(11, 5))
            ax = sns.barplot(data=g, x='model', y='eval_MAPE', hue='window_size', palette='Set2')
            ax.set_title(f'{FRAMEWORK_NAME.upper()} {ds}: Eval MAPE by Model and Window')
            ax.set_xlabel('Model')
            ax.set_ylabel('Eval MAPE (%)')
            ax.legend(title='Window')
            plt.xticks(rotation=20)
            plt.tight_layout()
            out = fig_dir / f'bar_eval_mape_{ds.lower()}.png'
            plt.savefig(out, dpi=300)
            plt.show()

        # 3) Tuning-vs-eval generalization scatter
        # Uses every scored config (plot_df), not only the selected best ones.
        plt.figure(figsize=(9, 6))
        ax = sns.scatterplot(
            data=plot_df,
            x='tune_MAPE',
            y='eval_MAPE',
            hue='model',
            style='dataset_tag',
            size='window_size',
            sizes=(40, 180),
            alpha=0.75,
        )
        ax.set_title(f'{FRAMEWORK_NAME.upper()}: Tune vs Eval MAPE (all configs)')
        ax.set_xlabel('Tune MAPE (%)')
        ax.set_ylabel('Eval MAPE (%)')
        ax.grid(alpha=0.3)
        plt.tight_layout()
        out = fig_dir / 'scatter_tune_vs_eval_mape.png'
        plt.savefig(out, dpi=300)
        plt.show()

        # 4) Frontier: Eval MAPE vs Eval R2 on best configs
        plt.figure(figsize=(9, 6))
        ax = sns.scatterplot(
            data=best_plot_df,
            x='eval_MAPE',
            y='eval_R2',
            hue='model',
            style='dataset_tag',
            size='window_size',
            sizes=(80, 220),
            alpha=0.85,
        )
        # Annotate each point with its dataset and window size.
        for _, r in best_plot_df.iterrows():
            ax.text(r['eval_MAPE'], r['eval_R2'], f"{r['dataset_tag']}-w{int(r['window_size'])}", fontsize=8, alpha=0.75)
        ax.set_title(f'{FRAMEWORK_NAME.upper()}: Eval MAPE vs Eval R2 Frontier (best tuned configs)')
        ax.set_xlabel('Eval MAPE (%)')
        ax.set_ylabel('Eval R2')
        ax.grid(alpha=0.3)
        plt.tight_layout()
        out = fig_dir / 'frontier_eval_mape_vs_r2_best.png'
        plt.savefig(out, dpi=300)
        plt.show()

        # Save tables used for paper
        best_out = fig_dir / 'best_plot_table.csv'
        full_out = fig_dir / 'all_scores_table.csv'
        best_plot_df.to_csv(best_out, index=False)
        plot_df.to_csv(full_out, index=False)
        print('[INFO] Saved figures and tables to', fig_dir)
