# Baseline Time Series Models
## Training Simple to Classical Forecasting Models

**Models to Train:**
1. Naive (Persistence)
2. Moving Average (3, 7, 14, 28 days)
3. Seasonal Naive (weekly)
4. ARIMA
5. SARIMA (with weekly seasonality)

**Target Metrics:**
- MAPE < 15%
- RMSE < $500
- R² > 0.85

In [None]:
# Import libraries
import sys
sys.path.insert(0, '../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from models.baseline_models import BaselineModels
from models.train_test_split import create_time_series_split

# Apply the seaborn-style dark grid theme to every figure drawn in this
# notebook ('seaborn-v0_8-*' is the style name used by recent matplotlib).
plt.style.use('seaborn-v0_8-darkgrid')
# The check mark was stored as mis-decoded UTF-8; emit the intended glyph.
print('✓ Libraries imported')

## 1. Load Daily Revenue Data

In [None]:
# Load the processed daily revenue table, parsing the 'date' column and using
# it as the index.
df = pd.read_csv('../data/processed/daily_revenue.csv',
                 index_col='date', parse_dates=True)

# The temporal train/validation/test split performed later is only meaningful
# when rows are in chronological order, and the CSV does not guarantee that.
# Sorting is a no-op for an already ordered file.
df = df.sort_index()

# The single series that every baseline model forecasts.
revenue = df['revenue']

# Quick sanity report: size, covered period and distribution of the target.
print(f"Dataset: {len(revenue)} days")
print(f"Date range: {revenue.index.min()} to {revenue.index.max()}")
print("\nRevenue statistics:")
print(revenue.describe())

In [None]:
# Draw the complete daily revenue history on one wide axis.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(revenue.index, revenue.values, linewidth=2, alpha=0.7)

# Axis labelling and a light grid for readability.
ax.set_xlabel('Date', fontsize=11)
ax.set_ylabel('Revenue ($)', fontsize=11)
ax.set_title('Daily Revenue Time Series', fontsize=13, fontweight='bold')
ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

## 2. Train/Test Split (Temporal)

**CRITICAL:** Use temporal split only - NO SHUFFLING!

In [None]:
# Partition the frame 80/10/10 into train / validation / test using the
# project's time-series split helper.
train_df, val_df, test_df = create_time_series_split(
    df, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
)

# Only the revenue column is modelled, so pull it out of each partition.
train, val, test = (part['revenue'] for part in (train_df, val_df, test_df))

# The final baselines are trained on train and validation together.
train_combined = pd.concat([train, val])

In [None]:
# Show the three partitions on a single timeline, colour-coded.
plt.figure(figsize=(15, 5))

# (series, legend label, colour) for each partition, in drawing order.
segments = [
    (train, 'Train', 'blue'),
    (val, 'Validation', 'orange'),
    (test, 'Test', 'red'),
]
for series, label, color in segments:
    plt.plot(series.index, series.values, label=label, linewidth=2, alpha=0.7, color=color)

# Dashed vertical markers at the first validation and first test timestamps.
for series, _, color in segments[1:]:
    plt.axvline(x=series.index[0], color=color, linestyle='--', alpha=0.5)

plt.xlabel('Date', fontsize=11)
plt.ylabel('Revenue ($)', fontsize=11)
plt.title('Train/Validation/Test Split', fontsize=13, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Train All Baseline Models

In [None]:
# Create the project's baseline-model container (imported from
# models.baseline_models; its internals are not visible in this notebook).
baselines = BaselineModels()

# Hand the combined train+validation series and the held-out test series to
# the trainer. NOTE(review): presumably this fits each baseline on the first
# argument and scores it against the second, returning a DataFrame of metrics
# indexed by model name -- later cells read its 'MAPE', 'RMSE', 'MAE', 'R2'
# and 'MBD' columns. Confirm against train_all_baselines.
results = baselines.train_all_baselines(train_combined, test)

## 4. Model Comparison

In [None]:
# Print a banner, then show the metrics table with upper-cased model names.
print("\nBaseline Model Results:", "=" * 80, sep="\n")

# Work on a copy so the original `results` index stays untouched.
upper_names = results.index.str.upper()
results_display = results.copy()
results_display.index = upper_names

# Last expression of the cell: rendered as the cell's output.
results_display

In [None]:
# Compare every baseline on each metric: one horizontal bar chart per metric,
# arranged on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One row per panel:
# (axis, metric column, x-axis label, title, bar colour, target, target label)
# A target of None means no goal line is drawn for that metric.
panels = [
    (axes[0, 0], 'MAPE', 'MAPE (%)', 'Mean Absolute Percentage Error',
     'steelblue', 15, 'Target < 15%'),
    (axes[0, 1], 'RMSE', 'RMSE ($)', 'Root Mean Squared Error',
     'coral', 500, 'Target < $500'),
    (axes[1, 0], 'MAE', 'MAE ($)', 'Mean Absolute Error',
     'seagreen', None, None),
    # The R² label was stored as mis-decoded UTF-8; use the real superscript.
    (axes[1, 1], 'R2', 'R² Score', 'R-Squared Score',
     'mediumpurple', 0.85, 'Target > 0.85'),
]

for ax, metric, xlabel, title, color, target, target_label in panels:
    # Ascending sort, so bars are ordered by the metric being shown.
    results_sorted = results.sort_values(metric)
    ax.barh(results_sorted.index, results_sorted[metric], alpha=0.7, color=color)
    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    if target is not None:
        # Dashed red line marking the project's target for this metric.
        ax.axvline(x=target, color='red', linestyle='--', alpha=0.5, label=target_label)
        ax.legend()
    ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 5. Forecast Visualization

In [None]:
# Two stacked panels: the whole history with a few forecasts overlaid, then a
# close-up of the test window with all five headline models.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
full_ax, zoom_ax = axes

# --- Panel 1: full timeline -------------------------------------------------
full_ax.plot(train_combined.index, train_combined.values,
             label='Train', linewidth=2, color='blue', alpha=0.7)
full_ax.plot(test.index, test.values,
             label='Test (Actual)', linewidth=3, color='black', marker='o', markersize=4)

# Overlay a subset of the forecasts; a model with no stored forecast
# (get_forecast returns None) is simply skipped.
for model_name in ('naive', 'ma_7', 'seasonal_naive', 'sarima'):
    forecast = baselines.get_forecast(model_name)
    if forecast is None:
        continue
    full_ax.plot(forecast.index, forecast.values,
                 label=model_name.upper(), linewidth=2, alpha=0.7)

# Dashed marker at the first test timestamp.
full_ax.axvline(x=test.index[0], color='red', linestyle='--', alpha=0.5)
full_ax.set_xlabel('Date', fontsize=11)
full_ax.set_ylabel('Revenue ($)', fontsize=11)
full_ax.set_title('Baseline Model Forecasts', fontsize=13, fontweight='bold')
full_ax.legend(loc='upper left')
full_ax.grid(alpha=0.3)

# --- Panel 2: test period only ----------------------------------------------
zoom_ax.plot(test.index, test.values,
             label='Actual', linewidth=3, color='black', marker='o', markersize=5)

# Each model is paired with a fixed colour so the lines are easy to tell apart.
zoom_models = ['naive', 'ma_7', 'seasonal_naive', 'arima', 'sarima']
zoom_colors = ['blue', 'green', 'orange', 'red', 'purple']
for model_name, line_color in zip(zoom_models, zoom_colors):
    forecast = baselines.get_forecast(model_name)
    if forecast is None:
        continue
    zoom_ax.plot(forecast.index, forecast.values,
                 label=model_name.upper(), linewidth=2,
                 marker='s', markersize=3, alpha=0.7, color=line_color)

zoom_ax.set_xlabel('Date', fontsize=11)
zoom_ax.set_ylabel('Revenue ($)', fontsize=11)
zoom_ax.set_title('Test Period Forecasts (Zoomed)', fontsize=13, fontweight='bold')
zoom_ax.legend(loc='upper left')
zoom_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Best Model Analysis

In [None]:
# The best baseline is the row of `results` with the lowest MAPE.
best_model = results['MAPE'].idxmin()
best_metrics = results.loc[best_model]

# Headline banner. The trophy, check/cross marks and the R² superscript were
# stored as mis-decoded UTF-8 in the printed strings; the intended glyphs are
# restored here.
print("="*70)
print(f" 🏆 BEST BASELINE MODEL: {best_model.upper()}")
print("="*70)

# Every metric recorded for the winning model.
print("\nMetrics:")
print(f"  MAPE:  {best_metrics['MAPE']:.2f}%")
print(f"  RMSE:  ${best_metrics['RMSE']:.2f}")
print(f"  MAE:   ${best_metrics['MAE']:.2f}")
print(f"  R²:    {best_metrics['R2']:.4f}")
print(f"  MBD:   ${best_metrics['MBD']:.2f}")

# Compare against the notebook's targets: MAPE < 15%, RMSE < $500, R² > 0.85.
passed, failed = '✓ PASS', '✗ FAIL'
print("\nTarget Achievement:")
print(f"  MAPE < 15%:  {passed if best_metrics['MAPE'] < 15 else failed}")
print(f"  RMSE < $500: {passed if best_metrics['RMSE'] < 500 else failed}")
print(f"  R² > 0.85:   {passed if best_metrics['R2'] > 0.85 else failed}")

In [None]:
# Residual analysis for the best baseline over the test window.
best_forecast = baselines.get_forecast(best_model)

# get_forecast can return None (the forecast plots guard for it); fail with a
# clear message instead of an AttributeError on `.values`.
if best_forecast is None:
    raise ValueError(f"No forecast available for best model '{best_model}'")
# The subtraction below is positional, so both series must have equal length;
# otherwise a length-1 forecast would silently broadcast.
if len(best_forecast) != len(test):
    raise ValueError(
        f"Forecast length ({len(best_forecast)}) does not match "
        f"test length ({len(test)})"
    )

# Forecast minus actual: positive values are over-predictions.
errors = best_forecast.values - test.values
mean_error = errors.mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: error at each test timestamp, with a zero reference line.
axes[0].plot(test.index, errors, marker='o', linewidth=2, color='red')
axes[0].axhline(y=0, color='black', linestyle='--', alpha=0.5)
axes[0].set_xlabel('Date', fontsize=11)
axes[0].set_ylabel('Prediction Error ($)', fontsize=11)
axes[0].set_title(f'{best_model.upper()} - Prediction Errors Over Time', fontsize=12, fontweight='bold')
axes[0].grid(alpha=0.3)

# Right panel: histogram of the errors with zero and the mean error marked.
axes[1].hist(errors, bins=15, alpha=0.7, color='steelblue', edgecolor='black')
axes[1].axvline(x=0, color='red', linestyle='--', linewidth=2, label='Zero Error')
axes[1].axvline(x=mean_error, color='orange', linestyle='--', linewidth=2, label=f'Mean: ${mean_error:.2f}')
axes[1].set_xlabel('Prediction Error ($)', fontsize=11)
axes[1].set_ylabel('Frequency', fontsize=11)
axes[1].set_title('Error Distribution', fontsize=12, fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 7. Summary

In [None]:
# Final recap of the baseline experiment.
print("="*70)
print(" BASELINE MODELS SUMMARY")
print("="*70)
print(f"\nTraining samples: {len(train_combined)}")
print(f"Test samples: {len(test)}")
print(f"Models trained: {len(results)}")

# The three models with the lowest MAPE, best first.
print("\nTop 3 Models:")
for i, (model_name, row) in enumerate(results.nsmallest(3, 'MAPE').iterrows(), 1):
    print(f"  {i}. {model_name.upper():15s} - MAPE: {row['MAPE']:5.2f}%, RMSE: ${row['RMSE']:6.2f}")

# The trophy glyph was stored as mis-decoded UTF-8; print the intended emoji.
print(f"\n🏆 Best Model: {best_model.upper()}")

# Only claim a target is beaten when it actually is. When both targets are
# met, the two lines below read exactly as they did before.
best_mape = best_metrics['MAPE']
best_rmse = best_metrics['RMSE']
mape_ok = best_mape < 15
rmse_ok = best_rmse < 500
print(f"   {'Beats' if mape_ok else 'Misses'} MAPE target: "
      f"{best_mape:.2f}% {'<' if mape_ok else '>='} 15%")
print(f"   {'Beats' if rmse_ok else 'Misses'} RMSE target: "
      f"${best_rmse:.2f} {'<' if rmse_ok else '>='} $500")

print("\n" + "="*70)