# 📈 Overnight Equities Volume Forecasting (European Symbols)
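This notebook demonstrates how to build a LightGBM model that forecasts the full-day trading volume of European equity symbols from information available overnight (before the market opens). All inputs are randomly generated mock data, so the results illustrate the workflow rather than real predictive performance.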

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import random
from datetime import datetime, timedelta


## 🔧 Step 1: Generate Mock Data

In [None]:
# Build a synthetic panel: one row per (symbol, business day) holding the
# previous session's close/volume/return, random "overnight" market moves,
# a month-end flag, and the volume realised on `date` (the forecast target).
#
# Both RNGs are seeded so the random draws are repeatable. The calendar is
# anchored to today's date, so `date` and `is_month_end` still depend on the
# day the notebook is run.
np.random.seed(42)
random.seed(42)

# Floor for the simulated volume path. An additive random walk that starts at
# 1e6 with 1e5-sized steps frequently crosses zero within 200 days, and
# values below -1 turn the downstream log1p transform into NaN targets.
MIN_VOLUME = 1e4

symbols = [f"SYM{i}" for i in range(10)]
dates = pd.bdate_range(end=datetime.today(), periods=200)

data = []
for symbol in symbols:
    volume = 1e6 + np.cumsum(np.random.normal(0, 1e5, len(dates)))
    volume = np.maximum(volume, MIN_VOLUME)
    close = 100 + np.cumsum(np.random.normal(0, 1, len(dates)))
    # Start at 2 so the previous session's return (close[i-2] -> close[i-1])
    # exists. Using close[i] here would leak the close of the very day whose
    # volume is being forecast into the feature set.
    for i in range(2, len(dates)):
        data.append({
            "date": dates[i],
            "symbol": symbol,
            "prev_close": close[i-1],
            "prev_volume": volume[i-1],
            "return": (close[i-1] - close[i-2]) / close[i-2],
            "vix_change": np.random.normal(0, 0.02),
            "stoxx_fut_chg": np.random.normal(0, 0.01),
            "dow_chg": np.random.normal(0, 0.01),
            "fx_eurusd_chg": np.random.normal(0, 0.005),
            "dow_vol_adr": np.random.uniform(0.5, 2.0),
            "is_month_end": int(dates[i].is_month_end),
            # Volume realised on `date`; the key name is kept because the
            # later cells reference it.
            "next_day_volume": volume[i],
        })

df = pd.DataFrame(data)


## 🧪 Step 2: Feature Engineering

In [None]:
# Add a log1p-transformed copy of each raw volume column
# (`log_prev_volume`, `log_next_day_volume`).
for volume_col in ('prev_volume', 'next_day_volume'):
    df[f'log_{volume_col}'] = np.log1p(df[volume_col])

# Encode each symbol as the integer code of its pandas category.
symbol_as_category = df['symbol'].astype('category')
df['symbol_cat'] = symbol_as_category.cat.codes

# Order rows chronologically (ties broken by symbol), modifying df in place.
df.sort_values(['date', 'symbol'], inplace=True)


## 🎯 Step 3: Train LightGBM Model with TimeSeries Split

In [None]:
# Walk-forward cross-validation of a LightGBM regressor on the log-volume
# target. Each fold trains on earlier dates and validates on the following
# block of dates; the per-fold RMSE (in log1p-volume units) is collected in
# `results_df`. `model` is left bound to the booster of the final fold.
features = [
    'symbol_cat', 'prev_close', 'log_prev_volume', 'return',
    'vix_change', 'stoxx_fut_chg', 'dow_chg',
    'fx_eurusd_chg', 'dow_vol_adr', 'is_month_end'
]
target = 'log_next_day_volume'

X = df[features]
y = df[target]

# Split on unique trading dates rather than on rows. A row-based split can
# cut through the middle of a day, putting some symbols of that day in the
# training set and the rest in validation. Splitting on dates also does not
# depend on the row order of df.
unique_dates = pd.Index(df['date'].unique()).sort_values()

tscv = TimeSeriesSplit(n_splits=5)
results = []

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1,
    'seed': 42,
}

for fold, (train_pos, val_pos) in enumerate(tscv.split(unique_dates)):
    train_mask = df['date'].isin(unique_dates[train_pos])
    val_mask = df['date'].isin(unique_dates[val_pos])

    X_train, X_val = X[train_mask], X[val_mask]
    y_train, y_val = y[train_mask], y[val_mask]

    # symbol_cat holds arbitrary integer codes, so declare it categorical
    # instead of letting the trees treat the codes as an ordered quantity.
    train_data = lgb.Dataset(
        X_train, label=y_train, categorical_feature=['symbol_cat']
    )
    # Tie the validation set to the training set so both share the same
    # feature binning and categorical handling.
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Early stopping is configured through a callback: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of
    # lgb.train() were removed in LightGBM 4.0.
    model = lgb.train(
        params=lgb_params,
        train_set=train_data,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
    )

    y_pred = model.predict(X_val)
    # Take the square root explicitly; the `squared=False` argument of
    # mean_squared_error was deprecated and later removed from scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
    results.append((fold + 1, rmse))

results_df = pd.DataFrame(results, columns=["Fold", "RMSE"])
results_df


## 📊 Step 4: Feature Importance

In [None]:
# Draw the feature importances of `model` (the booster bound by the last
# iteration of the training loop) and title the resulting axes.
importance_ax = lgb.plot_importance(model)
importance_ax.set_title("Feature Importance")
importance_ax.figure.tight_layout()
plt.show()
