# Machine Learning Models for Time Series Forecasting
## XGBoost, LightGBM, Random Forest

**Objectives:**
- Train gradient boosting models (XGBoost, LightGBM)
- Train Random Forest
- Analyze feature importance
- Compare with baseline models

**Expected:** MAPE 8-12% with properly tuned models

In [None]:
# Import libraries
import sys
sys.path.insert(0, '../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from models.ml_models import MLForecaster, compare_ml_models
from models.train_test_split import get_X_y_split

# Apply the seaborn dark-grid theme to every figure drawn after this point
plt.style.use('seaborn-v0_8-darkgrid')
# Confirmation that the imports and style setup ran without error
print('✓ Libraries imported')

## 1. Load Feature-Engineered Data

In [None]:
# Load X (features) and y (target); both CSVs are indexed by their parsed
# 'date' column.
X = pd.read_csv('../data/processed/X.csv', index_col='date', parse_dates=True)
# squeeze("columns") turns a single-column frame into a Series. A bare
# squeeze() would additionally collapse a one-row file into a scalar, which
# breaks the .shape / .index / .values accesses made further down.
y = pd.read_csv(
    '../data/processed/y.csv', index_col='date', parse_dates=True
).squeeze("columns")

# Quick sanity report on what was loaded
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"Date range: {X.index.min()} to {X.index.max()}")
print(f"\nFeature columns: {X.shape[1]} features")
print(f"Sample features: {X.columns[:10].tolist()}")

## 2. Train/Val/Test Split

In [None]:
# 80/10/10 temporal split; the three ratios are passed to get_X_y_split
# by keyword, and the six returned pieces are unpacked in its return order.
split_ratios = {'train_ratio': 0.8, 'val_ratio': 0.1, 'test_ratio': 0.1}
X_train, X_val, X_test, y_train, y_val, y_test = get_X_y_split(X, y, **split_ratios)

In [None]:
# Visualize split: draw the three target segments on one axis, then mark
# the first date of the validation and test segments with dashed lines.
plt.figure(figsize=(15, 5))

split_segments = [
    (y_train, 'Train', 'blue'),
    (y_val, 'Validation', 'orange'),
    (y_test, 'Test', 'red'),
]
for segment, segment_label, segment_color in split_segments:
    plt.plot(segment.index, segment.values, label=segment_label,
             linewidth=2, alpha=0.7, color=segment_color)

# Boundary markers only for the segments that follow the training data
for segment, _, segment_color in split_segments[1:]:
    plt.axvline(x=segment.index[0], color=segment_color, linestyle='--', alpha=0.5)

plt.title('Train/Validation/Test Split for ML Models', fontsize=13, fontweight='bold')
plt.xlabel('Date', fontsize=11)
plt.ylabel('Revenue ($)', fontsize=11)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Train XGBoost

In [None]:
# Build the XGBoost forecaster and fit it; the validation split is handed
# to train() together with the training data.
xgb_forecaster = MLForecaster('xgboost')
xgb_forecaster.train(X_train, y_train, X_val, y_val)

# Score the fitted model on each split, in train -> validation -> test order
train_metrics_xgb, val_metrics_xgb, test_metrics_xgb = [
    xgb_forecaster.evaluate(features, target, split_name)
    for features, target, split_name in (
        (X_train, y_train, 'Train'),
        (X_val, y_val, 'Validation'),
        (X_test, y_test, 'Test'),
    )
]

In [None]:
# Feature importance (XGBoost): print a header, ask the trained forecaster
# for its top 20 features, then print whatever it returns.
# NOTE(review): the return type of get_feature_importance is not visible
# here; the comparison cell later reads .index and .values from it, so it is
# presumably a pandas Series — confirm against MLForecaster.
print("\nTop 20 Features (XGBoost):")
xgb_top_features = xgb_forecaster.get_feature_importance(top_n=20)
print(xgb_top_features)

## 4. Train LightGBM

In [None]:
# Build the LightGBM forecaster and fit it; the validation split is handed
# to train() together with the training data.
lgb_forecaster = MLForecaster('lightgbm')
lgb_forecaster.train(X_train, y_train, X_val, y_val)

# Score the fitted model on each split, in train -> validation -> test order
train_metrics_lgb, val_metrics_lgb, test_metrics_lgb = [
    lgb_forecaster.evaluate(features, target, split_name)
    for features, target, split_name in (
        (X_train, y_train, 'Train'),
        (X_val, y_val, 'Validation'),
        (X_test, y_test, 'Test'),
    )
]

In [None]:
# Feature importance (LightGBM): print a header, ask the trained forecaster
# for its top 20 features, then print whatever it returns.
# NOTE(review): the return type of get_feature_importance is not visible
# here; the comparison cell later reads .index and .values from it, so it is
# presumably a pandas Series — confirm against MLForecaster.
print("\nTop 20 Features (LightGBM):")
lgb_top_features = lgb_forecaster.get_feature_importance(top_n=20)
print(lgb_top_features)

## 5. Train Random Forest

In [None]:
# Build the Random Forest forecaster and fit it on the training split only
# (no validation data is passed to train() for this model).
rf_forecaster = MLForecaster('random_forest')
rf_forecaster.train(X_train, y_train)

# Score the fitted model on each split, in train -> validation -> test order
train_metrics_rf, val_metrics_rf, test_metrics_rf = [
    rf_forecaster.evaluate(features, target, split_name)
    for features, target, split_name in (
        (X_train, y_train, 'Train'),
        (X_val, y_val, 'Validation'),
        (X_test, y_test, 'Test'),
    )
]

In [None]:
# Feature importance (Random Forest): print a header, ask the trained
# forecaster for its top 20 features, then print whatever it returns.
# NOTE(review): the return type of get_feature_importance is not visible
# here; the comparison cell later reads .index and .values from it, so it is
# presumably a pandas Series — confirm against MLForecaster.
print("\nTop 20 Features (Random Forest):")
rf_top_features = rf_forecaster.get_feature_importance(top_n=20)
print(rf_top_features)

## 6. Model Comparison

In [None]:
# Compare all models: gather each model's test-set metrics, transpose so
# every model is one row, and order the rows by ascending MAPE.
metrics_by_model = {
    'xgboost': test_metrics_xgb,
    'lightgbm': test_metrics_lgb,
    'random_forest': test_metrics_rf,
}
ml_results = pd.DataFrame(metrics_by_model).T.sort_values('MAPE')

# Framed report
banner = "=" * 70
print("\n" + banner)
print(" ML MODELS COMPARISON")
print(banner)
print(ml_results)
print(banner)

In [None]:
# Metrics visualization: a 2x2 grid of horizontal bar charts, one per metric.
# Each panel sorts the models by its own metric and, where a target is
# defined, draws a dashed red reference line with a legend entry.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (metric column, x-axis label, title, bar colour, target value, target label)
# A target of None means the panel gets no reference line and no legend.
metric_panels = [
    ('MAPE', 'MAPE (%)', 'Mean Absolute Percentage Error', 'steelblue', 15, 'Target < 15%'),
    ('RMSE', 'RMSE ($)', 'Root Mean Squared Error', 'coral', 500, 'Target < $500'),
    ('MAE', 'MAE ($)', 'Mean Absolute Error', 'seagreen', None, None),
    ('R2', 'R² Score', 'R-Squared Score', 'mediumpurple', 0.85, 'Target > 0.85'),
]

# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1]
for ax, (metric, x_label, title, bar_color, target, target_label) in zip(axes.flat, metric_panels):
    ml_results_sorted = ml_results.sort_values(metric)
    ax.barh(ml_results_sorted.index, ml_results_sorted[metric], alpha=0.7, color=bar_color)
    ax.set_xlabel(x_label, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    if target is not None:
        ax.axvline(x=target, color='red', linestyle='--', alpha=0.5, label=target_label)
        ax.legend()
    ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 7. Forecast Visualization

In [None]:
# Make predictions on the test features with each trained forecaster
y_pred_xgb = xgb_forecaster.predict(X_test)
y_pred_lgb = lgb_forecaster.predict(X_test)
y_pred_rf = rf_forecaster.predict(X_test)

# (legend label, predictions, line colour) for each model
model_forecasts = [
    ('XGBoost', y_pred_xgb, 'red'),
    ('LightGBM', y_pred_lgb, 'green'),
    ('Random Forest', y_pred_rf, 'orange'),
]

# Plot: full history on top, test period only underneath
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Full timeline. The validation segment is drawn as well so the history is
# continuous; without it the chart shows an unexplained gap between the end
# of the training data and the start of the test data.
axes[0].plot(y_train.index, y_train.values, label='Train', linewidth=2, color='blue', alpha=0.7)
axes[0].plot(y_val.index, y_val.values, label='Validation', linewidth=2, color='gray', alpha=0.7)
axes[0].plot(y_test.index, y_test.values, label='Test (Actual)', linewidth=3, color='black',
             marker='o', markersize=4)
for forecast_label, forecast, forecast_color in model_forecasts:
    axes[0].plot(y_test.index, forecast, label=forecast_label, linewidth=2,
                 marker='s', markersize=3, alpha=0.7, color=forecast_color)
# Dashed marker at the first test date
axes[0].axvline(x=y_test.index[0], color='red', linestyle='--', alpha=0.5)
axes[0].set_xlabel('Date', fontsize=11)
axes[0].set_ylabel('Revenue ($)', fontsize=11)
axes[0].set_title('ML Model Forecasts', fontsize=13, fontweight='bold')
axes[0].legend(loc='upper left')
axes[0].grid(alpha=0.3)

# Test period zoomed: actuals against every model's predictions
axes[1].plot(y_test.index, y_test.values, label='Actual', linewidth=3, color='black',
             marker='o', markersize=5)
for forecast_label, forecast, forecast_color in model_forecasts:
    axes[1].plot(y_test.index, forecast, label=forecast_label, linewidth=2,
                 marker='s', markersize=4, alpha=0.7, color=forecast_color)
axes[1].set_xlabel('Date', fontsize=11)
axes[1].set_ylabel('Revenue ($)', fontsize=11)
axes[1].set_title('Test Period Forecasts (Zoomed)', fontsize=13, fontweight='bold')
axes[1].legend(loc='upper left')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Feature Importance Comparison

In [None]:
# Compare top 15 features for each model: one horizontal bar chart per
# forecaster, side by side, with the first-listed feature at the top.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

models = [
    ('XGBoost', xgb_forecaster),
    ('LightGBM', lgb_forecaster),
    ('Random Forest', rf_forecaster),
]

for ax, (name, model) in zip(axes, models):
    top_features = model.get_feature_importance(top_n=15)
    bar_positions = range(len(top_features))

    ax.barh(bar_positions, top_features.values, alpha=0.7)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features.index, fontsize=9)
    # barh draws position 0 at the bottom; flip so it ends up at the top
    ax.invert_yaxis()
    ax.set_title(f'{name} - Top 15 Features', fontsize=12, fontweight='bold')
    ax.set_xlabel('Importance', fontsize=11)
    ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 9. Save Best Model

In [None]:
import os

# Find best model: the row of ml_results with the lowest MAPE.
# NOTE(review): ml_results is built from the test-set metrics, so the winner
# is chosen on the same data used to report its score, which makes that score
# optimistic. Consider selecting on the validation metrics instead.
best_model_name = ml_results['MAPE'].idxmin()
print(f"Best ML Model: {best_model_name.upper()}")
print(f"MAPE: {ml_results.loc[best_model_name, 'MAPE']:.2f}%")

# Save models: all three are persisted, not only the best one. The output
# directory is created first so a fresh checkout without ../models still works.
os.makedirs('../models', exist_ok=True)
xgb_forecaster.save_model('../models/xgboost_model.pkl')
lgb_forecaster.save_model('../models/lightgbm_model.pkl')
rf_forecaster.save_model('../models/random_forest_model.pkl')

print("\n✓ All models saved!")

## 10. Summary

In [None]:
# Final report: dataset sizes, every model's MAPE/RMSE, and the full metric
# set of the best model.
print("="*70)
print(" ML MODELS SUMMARY")
print("="*70)
print(f"\nFeatures used: {X.shape[1]}")
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")

# Rows are listed in ml_results' existing order; the table was sorted by
# MAPE when it was built, which is what makes this a ranking.
print("\nModel Rankings (by MAPE):")
for rank, (model_name, row) in enumerate(ml_results.iterrows(), 1):
    print(f"  {rank}. {model_name.upper():15s} - MAPE: {row['MAPE']:5.2f}%, RMSE: ${row['RMSE']:6.2f}")

print(f"\n🏆 Best Model: {best_model_name.upper()}")
best_metrics = ml_results.loc[best_model_name]
print(f"   MAPE:  {best_metrics['MAPE']:.2f}%")
print(f"   RMSE:  ${best_metrics['RMSE']:.2f}")
print(f"   MAE:   ${best_metrics['MAE']:.2f}")
print(f"   R²:    {best_metrics['R2']:.4f}")

print("\n" + "="*70)