# 08 - Hyperparameter Tuning with Optuna

This notebook uses Optuna to optimize XGBoost hyperparameters for the volume forecasting task.

## Optimization Strategy

- **Method**: Bayesian optimization with TPE (Tree-structured Parzen Estimator)
- **Objective**: Minimize mean MAE across walk-forward validation folds
- **Trials**: 50 trials with median pruning
- **Parameters**: `n_estimators`, `max_depth`, `learning_rate`, `min_child_weight`

## 1. Setup and Imports

In [None]:
import sys
import json
import warnings
from pathlib import Path
from datetime import date

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# Add src to path for imports.
# NOTE(review): this assumes the kernel's working directory sits one level below
# the project root (e.g. a notebooks/ folder) — confirm before running elsewhere.
project_root = Path.cwd().parent
# Prepend so the project's own `src` packages are found by the imports below.
sys.path.insert(0, str(project_root / "src"))

# Import evaluation tools
from volume_forecast.evaluation import WalkForwardValidator

# Import models
from volume_forecast.models import XGBoostModel

# Import feature pipeline
from volume_forecast.features import FeaturePipeline

# Suppress warnings for cleaner output.
# Note: this filter is global — it hides every Python warning raised after this
# cell runs, not only those coming from this notebook's own code.
warnings.filterwarnings('ignore')
# Raise Optuna's log threshold to WARNING so lower-level messages are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

print(f"Optuna version: {optuna.__version__}")
print("Setup complete!")

## 2. Load and Prepare Data

In [None]:
# Read the raw volume data and put it in chronological order
data_path = project_root / "data" / "raw" / "synthetic_volumes.csv"
df = (
    pd.read_csv(data_path, parse_dates=["date"])
    .sort_values('date')
    .reset_index(drop=True)
)

first_day = df['date'].min().date()
last_day = df['date'].max().date()
print(f"Loaded data: {len(df)} rows")
print(f"Date range: {first_day} to {last_day}")

In [None]:
# Build the engineered feature frame from the raw data
pipeline = FeaturePipeline(
    date_column='date',
    target_columns=['daily_logins'],
    include_events=True,
    include_football=True,
)
df_features = pipeline.fit_transform(df)

# Report how many features the fitted pipeline exposes and the resulting shape
n_generated = len(pipeline.get_feature_names())
print(f"Features generated: {n_generated} columns")
print(f"Dataset shape: {df_features.shape}")

In [None]:
# Column names used throughout the notebook
TARGET = 'daily_logins'
DATE_COLUMN = 'date'

# External features for the enhanced model (same as notebook 07), grouped by
# kind and concatenated below in the same order as the original flat list.
_TEMPORAL_FEATURES = [
    'day_of_week', 'is_weekend', 'day_of_week_sin', 'day_of_week_cos',
    'month', 'month_sin', 'month_cos',
]
_ROLLING_FEATURES = [
    'daily_logins_rolling_mean_7', 'daily_logins_rolling_mean_14', 'daily_logins_rolling_mean_30',
    'daily_logins_rolling_std_7', 'daily_logins_rolling_std_14', 'daily_logins_rolling_std_30',
]
# Event flags (same-day)
_EVENT_FEATURES = [
    'is_bank_holiday', 'is_racing_event', 'is_tennis_event',
    'is_boxing_event', 'is_football_match', 'event_importance',
]
# Lead indicators (upcoming events)
_LEAD_FEATURES = [
    'any_event_tomorrow', 'any_event_in_2_days', 'any_event_in_3_days',
    'bank_holiday_tomorrow', 'bank_holiday_in_2_days', 'bank_holiday_in_3_days',
    'football_tomorrow', 'football_in_2_days', 'football_in_3_days',
]
# Lag indicators (past events)
_LAG_FEATURES = [
    'any_event_yesterday', 'any_event_2_days_ago',
    'bank_holiday_yesterday', 'bank_holiday_2_days_ago',
    'football_yesterday', 'football_2_days_ago',
]

EXTERNAL_FEATURES = [
    *_TEMPORAL_FEATURES,
    *_ROLLING_FEATURES,
    *_EVENT_FEATURES,
    *_LEAD_FEATURES,
    *_LAG_FEATURES,
]

print(f"External features: {len(EXTERNAL_FEATURES)}")

## 3. Define Objective Function

In [None]:
# Walk-forward validator, configured with the same settings as notebook 07
validator = WalkForwardValidator(min_train_size=365, test_size=7, step_size=7)

# Number of folds the validator produces for the feature frame
n_folds = validator.get_n_splits(df_features, date_column=DATE_COLUMN)
print(f"Walk-forward validation: {n_folds} folds")

In [None]:
def objective(trial: optuna.Trial) -> float:
    """Score one Optuna trial for the XGBoost volume-forecast model.

    Samples a hyperparameter set from ``trial``, builds an ``XGBoostModel``
    with it, runs the notebook-level ``validator`` over ``df_features`` and
    returns the mean of the per-fold MAE values (lower is better).

    NOTE(review): ``min_child_weight`` is sampled but never handed to
    ``XGBoostModel``, so it cannot influence the score — confirm whether the
    wrapper accepts it and forward it, or drop it from the search space.
    NOTE(review): no intermediate values are reported on ``trial``, so a
    pruner attached to the study presumably never triggers — confirm.
    """
    # Search space; the suggest_* calls are kept in their original order.
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 3, 12)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    trial.suggest_int('min_child_weight', 1, 10)  # recorded on the trial only

    candidate = XGBoostModel(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        lags=[1, 7, 14],
        external_features=EXTERNAL_FEATURES,
        name='XGBoost_Tuning'
    )

    # Evaluate the candidate on every walk-forward fold
    folds = validator.validate(
        model=candidate,
        df=df_features,
        target=TARGET,
        date_column=DATE_COLUMN,
        feature_columns=EXTERNAL_FEATURES + [DATE_COLUMN]
    )

    # Average the per-fold MAE into the single value Optuna minimizes
    return np.mean([fold['metrics']['mae'] for fold in folds])

## 4. Run Optimization

In [None]:
# Create the Optuna study: minimize the objective using a seeded TPE sampler
# and a median pruner.
# NOTE(review): the objective in this notebook never reports intermediate
# values, so the pruner presumably has no effect — confirm.
tpe_sampler = TPESampler(seed=42)
median_pruner = MedianPruner(n_startup_trials=10)
study = optuna.create_study(
    direction='minimize',
    sampler=tpe_sampler,
    pruner=median_pruner,
)

banner = "=" * 60
print("Starting hyperparameter optimization...")
print("Trials: 50")
print(f"Folds per trial: {n_folds}")
print(banner)

In [None]:
# Run optimization with progress callback
def callback(study: "optuna.Study", trial: "optuna.trial.FrozenTrial") -> None:
    """Print a progress line for every fifth trial (trial 0, 5, 10, ...).

    Args:
        study: The running study; ``best_value`` is read for the summary.
        trial: The trial that has just finished.

    A trial that did not complete (e.g. failed or pruned) is reported by its
    state name instead of a score: its ``value`` may be ``None``, which cannot
    be formatted with ``:.2f``, and ``study.best_value`` is only defined once
    at least one trial has completed.
    """
    if trial.number % 5 != 0:
        return

    if trial.value is None or trial.state.name != "COMPLETE":
        print(f"Trial {trial.number:3d}: {trial.state.name} (no final MAE)")
        return

    print(f"Trial {trial.number:3d}: MAE = {trial.value:.2f} | Best so far: {study.best_value:.2f}")

# Launch the search; `callback` prints periodic progress lines
study.optimize(objective, n_trials=50, callbacks=[callback], show_progress_bar=True)

closing_rule = "=" * 60
print(closing_rule)
print("Optimization complete!")

## 5. Analyze Results

In [None]:
# Report the best score and the hyperparameters that produced it
rule = "=" * 60
print(rule)
print("BEST HYPERPARAMETERS")
print(rule)
print(f"\nBest MAE: {study.best_value:.2f}")
print("\nBest Parameters:")
for name, chosen in study.best_params.items():
    print(f"  {name}: {chosen}")

In [None]:
# Plot optimization history (left) and parameter importance (right)
fig, (ax_history, ax_importance) = plt.subplots(1, 2, figsize=(14, 5))

# --- Optimization history ---
# Keep only trials that completed: a failed trial has value None, which would
# make the running minimum below raise a TypeError.
completed_trials = study.get_trials(
    deepcopy=False,
    states=(optuna.trial.TrialState.COMPLETE,),
)
trial_numbers = [t.number for t in completed_trials]
trial_maes = np.asarray([t.value for t in completed_trials], dtype=float)
# Best MAE seen up to and including each trial, computed in one pass
best_so_far = np.minimum.accumulate(trial_maes)

ax_history.scatter(trial_numbers, trial_maes, alpha=0.5, label='Trial MAE')
ax_history.plot(trial_numbers, best_so_far, 'r-', linewidth=2, label='Best MAE')
ax_history.set_xlabel('Trial')
ax_history.set_ylabel('MAE')
ax_history.set_title('Optimization History', fontsize=12, fontweight='bold')
ax_history.legend()
ax_history.grid(True, alpha=0.3)

# --- Parameter importance ---
# Separate names from the history data above so neither overwrites the other
importances = optuna.importance.get_param_importances(study)
param_names = list(importances.keys())
importance_scores = list(importances.values())

ax_importance.barh(param_names, importance_scores, color='steelblue')
ax_importance.set_xlabel('Importance')
ax_importance.set_title('Parameter Importance', fontsize=12, fontweight='bold')
ax_importance.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

In [None]:
# Table of the ten trials with the lowest objective value
print("\nTop 10 Trials:")
print("-"*80)

display_cols = [
    'number',
    'value',
    'params_n_estimators',
    'params_max_depth',
    'params_learning_rate',
    'params_min_child_weight',
]
trials_df = study.trials_dataframe().sort_values('value').head(10)
trials_df[display_cols].round(4)

## 6. Validate Best Model

In [None]:
# Build the tuned model from the study's best parameters, plus a default
# model as the baseline for comparison.
best_params = study.best_params

# NOTE(review): best_params also holds 'min_child_weight', which is not
# forwarded to XGBoostModel here — confirm whether the wrapper accepts it.
tuned_hparams = {
    key: best_params[key]
    for key in ('n_estimators', 'max_depth', 'learning_rate')
}
default_hparams = {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}

tuned_model = XGBoostModel(
    **tuned_hparams,
    lags=[1, 7, 14],
    external_features=EXTERNAL_FEATURES,
    name='XGBoost_Tuned',
)

default_model = XGBoostModel(
    **default_hparams,
    lags=[1, 7, 14],
    external_features=EXTERNAL_FEATURES,
    name='XGBoost_Default',
)

print("Models created for comparison")

In [None]:
# Evaluate the default and tuned models on the same walk-forward folds
print("Running full validation...")

results = {}
for candidate in (default_model, tuned_model):
    folds = validator.validate(
        model=candidate,
        df=df_features,
        target=TARGET,
        date_column=DATE_COLUMN,
        feature_columns=EXTERNAL_FEATURES + [DATE_COLUMN]
    )

    # Per-fold metric series for this model
    fold_mae = [fold['metrics']['mae'] for fold in folds]
    fold_mape = [fold['metrics']['mape'] for fold in folds]

    # Mean and spread of each metric, keyed by the model's name
    results[candidate.name] = dict(
        mae_mean=np.mean(fold_mae),
        mae_std=np.std(fold_mae),
        mape_mean=np.mean(fold_mape),
        mape_std=np.std(fold_mape),
    )

print("Validation complete!")

In [None]:
# Print each model's metrics, then the tuned model's relative MAE reduction
heavy_rule = "="*70
print(heavy_rule)
print("MODEL COMPARISON: Default vs Tuned")
print(heavy_rule)

for model_name, metrics in results.items():
    mae_line = f"  MAE:  {metrics['mae_mean']:.2f} (+/- {metrics['mae_std']:.2f})"
    mape_line = f"  MAPE: {metrics['mape_mean']:.2f}% (+/- {metrics['mape_std']:.2f}%)"
    print(f"\n{model_name}:")
    print(mae_line)
    print(mape_line)

# MAE reduction of the tuned model relative to the default model, in percent
default_mae = results['XGBoost_Default']['mae_mean']
tuned_mae = results['XGBoost_Tuned']['mae_mean']
improvement = (default_mae - tuned_mae) / default_mae * 100

print(f"\n{heavy_rule}")
print(f"IMPROVEMENT: {improvement:.1f}% reduction in MAE")
print(heavy_rule)

## 7. Save Results

In [None]:
# Persist the best parameters and the default-vs-tuned comparison as JSON
output_path = project_root / "data" / "processed" / "best_xgboost_params.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

default_vs_tuned = {
    'default_mae': results['XGBoost_Default']['mae_mean'],
    'tuned_mae': results['XGBoost_Tuned']['mae_mean'],
    'improvement_pct': improvement,
}
params_to_save = {
    'best_params': study.best_params,
    'best_mae': study.best_value,
    'n_trials': len(study.trials),
    'default_comparison': default_vs_tuned,
}

# Serialize first, then write the text in one call
output_path.write_text(json.dumps(params_to_save, indent=2))

print(f"Best parameters saved to: {output_path}")

In [None]:
# Final recap: best parameters followed by default-vs-tuned metrics
summary_rule = "="*70
print(f"\n{summary_rule}")
print("FINAL SUMMARY")
print(summary_rule)

print("\nBest Parameters:")
for param, value in study.best_params.items():
    # Floats are shown with four decimals; everything else is printed as-is
    shown = f"{value:.4f}" if isinstance(value, float) else f"{value}"
    print(f"  {param}: {shown}")

default_metrics = results['XGBoost_Default']
tuned_metrics = results['XGBoost_Tuned']

print("\nPerformance:")
print(f"  Default MAE:  {default_metrics['mae_mean']:.2f}")
print(f"  Tuned MAE:    {tuned_metrics['mae_mean']:.2f}")
print(f"  Improvement:  {improvement:.1f}%")

print(f"\nDefault MAPE:  {default_metrics['mape_mean']:.2f}%")
print(f"Tuned MAPE:    {tuned_metrics['mape_mean']:.2f}%")

---

**End of Notebook 08 - Hyperparameter Tuning**

Use the best parameters (saved to `data/processed/best_xgboost_params.json`) in notebook 07 by updating the `XGBoost_Enhanced` model configuration.