<a href="https://colab.research.google.com/github/john-d-noble/callcenter/blob/main/CB_Step_3_Classical_Time_Series_Models_(Tuned)_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Absolute path of the merged call-volume CSV (a "/content/..." location).
# NOTE(review): main() assigns its own local `file_path` (a relative path)
# before calling read_data(), so this module-level value is not the one
# main() passes along -- confirm which path is intended.
file_path = "/content/updated_final_merged_data.csv"

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from scipy import stats
import math
import warnings
# Silence every warning category process-wide for the rest of the run.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so the synthetic fallback noise drawn in
# read_data() (np.random.normal) is reproducible between runs.
np.random.seed(42)

def read_data(file_path):
    """Return the call-volume frame, falling back to synthetic demo data.

    The CSV at ``file_path`` is read with its ``date`` column parsed as the
    index and the rows are returned in ascending index order. When the file
    does not exist, a 978-day synthetic daily series (linear trend + weekly
    sine wave + Gaussian noise, floored at 2000) is generated instead and
    returned as a single-column ``calls`` frame.
    """
    try:
        loaded = pd.read_csv(file_path, index_col='date', parse_dates=True)
    except FileNotFoundError:
        print("File not found. Generating synthetic sample data for demo.")
        n_days = 978
        day_number = np.arange(n_days)
        calendar = pd.date_range('2023-01-01', periods=n_days, freq='D')
        trend = 8000 + np.linspace(0, 2000, n_days)
        weekly_cycle = 2000 * np.sin(2 * np.pi * day_number / 7)
        noise = np.random.normal(0, 1500, n_days)
        raw_calls = pd.Series(trend + weekly_cycle + noise, index=calendar, name='calls')
        # Floor the series so noise can never produce implausibly low volumes.
        synthetic = raw_calls.clip(lower=2000).to_frame()
        print(f"Generated synthetic data: {len(synthetic)} observations, mean={synthetic['calls'].mean():.0f}")
        return synthetic

    print(f"Loaded real data: {len(loaded)} observations")
    return loaded.sort_index()

def check_stationarity(y, name='series'):
    """Print ADF and KPSS stationarity test p-values for a series.

    The two tests have opposite null hypotheses: ADF's null is a unit root
    (non-stationary), while KPSS's null is level stationarity. The KPSS
    figure is computed with ``statsmodels.tsa.stattools.kpss``; running
    ``adfuller`` a second time would only repeat the ADF p-value.

    Args:
        y: Array-like series to test; passed unchanged to both tests.
        name: Label used in the printed header.

    Returns:
        None. Results are printed. The final verdict line is driven by the
        ADF p-value alone.
    """
    # Imported locally so this function does not depend on an edit to the
    # file-level import block.
    from statsmodels.tsa.stattools import kpss

    adf_pvalue = adfuller(y)[1]
    # kpss() looks its p-value up in a table and warns when the statistic
    # falls outside it (the reported p-value is then a bound); silence that
    # warning locally so the printed report stays clean.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        kpss_pvalue = kpss(y, regression='c', nlags='auto')[1]

    print(f"\n--- Stationarity Tests for {name} ---")
    print(f"ADF Test p-value: {adf_pvalue:.4f} (stationary if < 0.05)")
    print(f"KPSS Test p-value: {kpss_pvalue:.4f} (stationary if > 0.05)")
    if adf_pvalue > 0.05:
        print("Series is likely non-stationary (trend present)")
    else:
        print("Series appears stationary")

def rolling_cv_splits(y, n_splits=5, test_size=7):
    """Materialise the (train_idx, test_idx) folds of a TimeSeriesSplit.

    ``n_splits`` folds are produced, each with a test window of
    ``test_size`` observations; the folds are returned as a list so they
    can be reused across several models.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
    fold_iterator = splitter.split(y)
    return list(fold_iterator)

def evaluate_model_cv(y, model_func, model_params, cv_splits, include_mape=True):
    """Score a forecasting function over pre-computed CV folds.

    For every ``(train_idx, test_idx)`` pair the model is fitted on the
    training slice and asked for ``len(test)`` steps; MAE, RMSE and
    (optionally) MAPE in percent are computed against the held-out slice.

    Args:
        y: Series indexed positionally via ``.iloc`` with the fold indices.
        model_func: Callable ``model_func(y_train, horizon, **model_params)``
            returning ``horizon`` forecasts.
        model_params: Keyword arguments forwarded to ``model_func``.
        cv_splits: Iterable of ``(train_idx, test_idx)`` index arrays.
        include_mape: When False, MAPE is not computed and its mean/std are
            returned as NaN.

    Returns:
        Tuple ``(mae_mean, mae_std, rmse_mean, rmse_std, mape_mean,
        mape_std)``. A fold whose model raises is reported on stdout and
        scored as ``inf`` so that the failure shows up in the aggregates.
    """
    maes, rmses, mapes = [], [], []
    for train_idx, test_idx in cv_splits:
        y_train = y.iloc[train_idx]
        y_test = y.iloc[test_idx]
        try:
            forecast = model_func(y_train, len(y_test), **model_params)
            mae = mean_absolute_error(y_test, forecast)
            rmse = math.sqrt(mean_squared_error(y_test, forecast))
            if include_mape:
                # MAPE divides by the actuals, so nudging the forecast by an
                # epsilon only perturbs the prediction without protecting
                # against zero actuals; score the raw forecast instead.
                mape = mean_absolute_percentage_error(y_test, forecast) * 100
            else:
                mape = float('nan')
        except Exception as e:
            # Deliberate best-effort: a failing fold is penalised, not fatal.
            print(f"Model evaluation error: {e}")
            mae = rmse = float('inf')
            mape = float('inf') if include_mape else float('nan')
        maes.append(mae)
        rmses.append(rmse)
        mapes.append(mape)
    return np.mean(maes), np.std(maes), np.mean(rmses), np.std(rmses), np.mean(mapes), np.std(mapes)

def sarimax_forecast(y_train, horizon, order, seasonal_order):
    """Fit a SARIMAX model on ``y_train`` and return ``horizon`` forecasts.

    ``order`` and ``seasonal_order`` are forwarded to the SARIMAX
    constructor unchanged; fitting runs with optimiser output suppressed.
    """
    specification = SARIMAX(y_train, order=order, seasonal_order=seasonal_order)
    fitted = specification.fit(disp=False)
    return fitted.forecast(steps=horizon)

def ets_forecast(y_train, horizon, ets_config):
    """Fit an exponential-smoothing model and return ``horizon`` forecasts.

    ``ets_config`` is a dict of keyword arguments expanded into the
    ``ExponentialSmoothing`` constructor; parameters are then optimised
    during fitting.
    """
    specification = ExponentialSmoothing(y_train, **ets_config)
    fitted = specification.fit(optimized=True)
    return fitted.forecast(steps=horizon)

def seasonal_naive_forecast(y_train, horizon, period=7):
    """Seasonal-naive forecast with a day-of-week mean fallback.

    Step ``i`` (0-based) of the forecast copies the training observation
    ``period`` steps before it. Steps that have no such observation (for
    example ``i >= period``, or a NaN in the copied slot) are filled with
    the training mean for the matching day of week, or with the overall
    training mean if that weekday never occurs. No detrending is applied.

    Args:
        y_train: Series of training values. A DatetimeIndex is needed only
            when the day-of-week fallback is triggered.
        horizon: Number of steps to forecast; must be >= 0.
        period: Seasonal lag in observations.

    Returns:
        1-D float ``numpy.ndarray`` of length ``horizon``.

    Raises:
        ValueError: If ``horizon`` is negative.
    """
    if horizon < 0:
        raise ValueError(f"horizon must be non-negative, got {horizon}")
    if horizon == 0:
        # Slicing with [-0:] would select the whole history, not nothing.
        return np.empty(0, dtype=float)

    history = np.asarray(y_train.values, dtype=float)
    n_obs = len(history)

    # Position in the training data that each forecast step copies from.
    source_pos = n_obs + np.arange(horizon) - period
    has_source = (source_pos >= 0) & (source_pos < n_obs)
    forecast = np.full(horizon, np.nan)
    forecast[has_source] = history[source_pos[has_source]]

    missing = np.isnan(forecast)
    if missing.any():
        dow_means = y_train.groupby(y_train.index.dayofweek).mean()
        overall_mean = y_train.mean()
        last_dow = y_train.index[-1].dayofweek
        for step in np.flatnonzero(missing):
            # Assumes one observation per calendar day, so step i falls
            # i + 1 days after the last training date.
            dow = int((last_dow + step + 1) % 7)
            forecast[step] = dow_means.get(dow, overall_mean)
    return forecast

def mean_forecast(y_train, horizon):
    """Baseline forecast: repeat the training mean for every future step.

    Returns a 1-D array of length ``horizon`` whose entries all equal
    ``y_train.mean()``.
    """
    level = y_train.mean()
    return np.full(shape=horizon, fill_value=level)

def diagnose_residuals(residuals, model_name):
    """Print autocorrelation, normality and stationarity checks for residuals.

    NaNs are dropped first; with fewer than 10 remaining values a notice is
    printed and nothing else is run. Otherwise the report shows the
    Ljung-Box p-value at the largest tested lag (``min(10, n // 4)``), the
    Shapiro-Wilk p-value on the first 5000 residuals, and the output of
    ``check_stationarity``.
    """
    print(f"\n--- Residual Diagnostics: {model_name} ---")
    clean = residuals.dropna()
    n_clean = len(clean)
    if n_clean < 10:
        print("Insufficient residuals for diagnostics.")
        return

    max_lag = min(10, n_clean // 4)
    ljung_box_table = acorr_ljungbox(clean, lags=max_lag, return_df=True)
    # Only the row for the largest lag is reported.
    lb_pval = ljung_box_table.iloc[-1]['lb_pvalue']
    print(f"Ljung-Box test p-value: {lb_pval:.4f} (no autocorr if > 0.05)")

    shapiro_result = stats.shapiro(clean[:5000])
    shapiro_pval = shapiro_result[1]
    print(f"Shapiro-Wilk test p-value: {shapiro_pval:.4f} (normal if > 0.05)")

    check_stationarity(clean, f'{model_name} residuals')

def analyze_errors_by_dow(y, predictions, model_name):
    """Print the mean absolute error per day of week for a model.

    ``y`` and ``predictions`` are combined into one frame; the frame's
    index must expose ``dayofweek``. Nothing is returned: the per-weekday
    means are printed under a header naming the model.
    """
    absolute_error = np.abs(y - predictions)
    error_frame = pd.DataFrame({'actual': y, 'pred': predictions, 'error': absolute_error})
    error_frame['dow'] = error_frame.index.dayofweek
    error_by_weekday = error_frame.groupby('dow')['error'].mean()
    print(f"\n--- {model_name}: Mean Absolute Errors by Day of Week ---")
    print(error_by_weekday)

def main():
    """Run the Step 3 classical-model comparison end to end.

    Steps, in order:
      1. Load the ``calls`` series via ``read_data`` and print its span.
      2. Print stationarity tests for the full series.
      3. Score SARIMAX, ETS, seasonal-naive and mean forecasts over five
         rolling folds with a 7-observation test window.
      4. Print a results table sorted by mean MAE (best first).
      5. Refit SARIMAX and ETS on the last 100 observations and print
         residual diagnostics and day-of-week error breakdowns.
      6. Save the metrics to ``step_3_metrics.csv``.

    Any exception is reported with a traceback instead of propagating, so
    the function always returns None.
    """
    try:
        # Relative path: resolved against the current working directory.
        file_path = "updated_final_merged_data.csv"
        df = read_data(file_path)
        y = df['calls']

        print(f"Successfully loaded data with {len(y)} observations")
        print(f"Date range: {y.index[0]} to {y.index[-1]}")

        check_stationarity(y)

        cv_splits = rolling_cv_splits(y, n_splits=5, test_size=7)
        print(f"\nUsing {len(cv_splits)} CV splits with 7-day horizon on full data")

        # Use stable parameters (adjusted from GridSearchCV for robustness)
        best_sarimax_order = (1, 1, 1)  # More stable than (1,2,2)
        best_seasonal_order = (1, 0, 1, 7)
        best_ets_config = {'trend': 'mul', 'seasonal': 'mul', 'seasonal_periods': 7}

        print("\n" + "="*60)
        print("CROSS-VALIDATION EVALUATION (using stable parameters)")
        print("="*60)

        models_to_test = {
            'SARIMAX': {'func': sarimax_forecast, 'params': {'order': best_sarimax_order, 'seasonal_order': best_seasonal_order}},
            'ETS': {'func': ets_forecast, 'params': {'ets_config': best_ets_config}},
            'Seasonal Naive': {'func': seasonal_naive_forecast, 'params': {}},
            'Mean Forecast': {'func': mean_forecast, 'params': {}}
        }

        results = {}
        for name, model_info in models_to_test.items():
            print(f"\nEvaluating {name}...")
            mae_mean, mae_std, rmse_mean, rmse_std, mape_mean, mape_std = evaluate_model_cv(
                y, model_info['func'], model_info['params'], cv_splits, include_mape=True
            )
            results[name] = {
                'MAE_mean': mae_mean, 'MAE_std': mae_std,
                'RMSE_mean': rmse_mean, 'RMSE_std': rmse_std,
                'MAPE_mean': mape_mean, 'MAPE_std': mape_std
            }

        # Display results
        print("\n" + "="*60)
        print("CROSS-VALIDATION RESULTS")
        print("="*60)

        # Metric cells such as "1520.57 (±461.57)" are wider than 15
        # characters, so give the metric columns enough room to stay aligned
        # and size the rule line from the header itself.
        name_w, metric_w = 15, 20
        header = f"{'Model':<{name_w}} | {'MAE (±std)':<{metric_w}} | {'RMSE (±std)':<{metric_w}} | {'MAPE (±std)':<{metric_w}}"
        print(header)
        print("-" * len(header))

        # Lower mean MAE ranks first; the first entry is reported as best.
        sorted_models = sorted(results.items(), key=lambda x: x[1]['MAE_mean'])
        for name, metrics in sorted_models:
            mae_str = f"{metrics['MAE_mean']:.2f} (±{metrics['MAE_std']:.2f})"
            rmse_str = f"{metrics['RMSE_mean']:.2f} (±{metrics['RMSE_std']:.2f})"
            mape_str = f"{metrics['MAPE_mean']:.2f}% (±{metrics['MAPE_std']:.2f}%)"
            print(f"{name:<{name_w}} | {mae_str:<{metric_w}} | {rmse_str:<{metric_w}} | {mape_str:<{metric_w}}")

        # Final diagnostics
        print("\n" + "="*60)
        print("FINAL MODEL DIAGNOSTICS (on last 100 days)")
        print("="*60)

        recent_data = y.iloc[-100:]
        # NOTE(review): in-sample fitted values at the very start of a
        # differenced model are presumably start-up artifacts and may
        # inflate the error for that weekday -- confirm whether a burn-in
        # should be dropped before the day-of-week breakdown.
        if best_sarimax_order:
            sarimax_model = SARIMAX(recent_data, order=best_sarimax_order, seasonal_order=best_seasonal_order).fit(disp=False)
            diagnose_residuals(sarimax_model.resid, 'SARIMAX')
            analyze_errors_by_dow(recent_data, sarimax_model.fittedvalues, 'SARIMAX')
        if best_ets_config:
            ets_model = ExponentialSmoothing(recent_data, **best_ets_config).fit(optimized=True)
            diagnose_residuals(ets_model.resid, 'ETS')
            analyze_errors_by_dow(recent_data, ets_model.fittedvalues, 'ETS')

        print("\n=== Revised Validation Complete ===")
        print(f"Best performing model: {sorted_models[0][0]} (MAE: {sorted_models[0][1]['MAE_mean']:.0f})")

        pd.DataFrame(results).T.to_csv('step_3_metrics.csv')
        print("\nResults saved to 'step_3_metrics.csv'")

    except Exception as e:
        # Top-level boundary: report the failure with full context.
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


Loaded real data: 978 observations
Successfully loaded data with 978 observations
Date range: 2023-01-01 00:00:00 to 2025-09-04 00:00:00

--- Stationarity Tests for series ---
ADF Test p-value: 0.1477 (stationary if < 0.05)
KPSS Test p-value: 0.1477 (stationary if > 0.05)
Series is likely non-stationary (trend present)

Using 5 CV splits with 7-day horizon on full data

CROSS-VALIDATION EVALUATION (using stable parameters)

Evaluating SARIMAX...

Evaluating ETS...

Evaluating Seasonal Naive...

Evaluating Mean Forecast...

CROSS-VALIDATION RESULTS
Model           | MAE (±std)      | RMSE (±std)     | MAPE (±std) 
----------------------------------------------------------------------
Seasonal Naive  | 808.80 (±360.19) | 1186.84 (±673.14) | 16.14% (±16.97%)
ETS             | 879.32 (±450.60) | 1191.35 (±757.23) | 17.48% (±16.66%)
SARIMAX         | 925.28 (±380.40) | 1206.60 (±686.18) | 18.58% (±16.20%)
Mean Forecast   | 1520.57 (±461.57) | 1952.39 (±539.05) | 29.92% (±18.30%)

FINAL MODE