<a href="https://colab.research.google.com/github/john-d-noble/callcenter/blob/main/CB_Step_3_Classical_Time_Series_%26_Machine_Learning_Models_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Step 1: Install and Import Necessary Libraries ---
# Uninstall and reinstall numpy and pmdarima to resolve potential compatibility issues
!pip uninstall -y numpy
!pip install numpy==1.26.4
!pip install pmdarima




Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 



In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import pmdarima as pm
from math import sqrt

# Metrics and CV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

# Classical Models
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from prophet import Prophet

# Machine Learning Models
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet

In [2]:
# --- Step 2: Helper Functions & Data Prep ---

# Helper functions for metrics
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

def mape(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100.0 if mask.sum() > 0 else np.nan

# Helper functions for classical models
def infer_seasonal_period(y: pd.Series):
    if y.index.freqstr:
        freq = y.index.freqstr.upper()
        if freq in ("D", "B"): return 7
        if "W" in freq: return 52
        if "M" in freq: return 12
    return 7 # Default

def fit_predict_arima(y_train, fh, seasonal_period):
    model = pm.auto_arima(y_train, seasonal=True, m=seasonal_period, suppress_warnings=True, error_action='ignore', stepwise=True)
    return model.predict(n_periods=fh)

def fit_predict_ets(y_train, fh, seasonal_period):
    model = ExponentialSmoothing(y_train, trend='add', seasonal='add', seasonal_periods=seasonal_period, initialization_method='estimated').fit()
    return model.forecast(fh)

def fit_predict_prophet(y_train, fh):
    train_df = y_train.reset_index().rename(columns={'Date': 'ds', 'calls': 'y'})
    model = Prophet()
    model.fit(train_df)
    future = model.make_future_dataframe(periods=len(y_test), freq='D')
    forecast = model.predict(future).tail(len(y_test))
    return forecast['yhat'].values

# Load and prepare data
df = pd.read_csv('enhanced_eda_data.csv', index_col='Date', parse_dates=True)
target = 'calls' # Corrected column name

# Feature Engineering for ML models
df['Lag1'] = df[target].shift(1)
df['Lag7'] = df[target].shift(7)
df['Rolling_Mean_7'] = df[target].rolling(window=7).mean()
df['Rolling_Std_7'] = df[target].rolling(window=7).std()
df = pd.get_dummies(df, columns=['DayOfWeek'], drop_first=True)
df = df.dropna()

features = [col for col in df.columns if col != target and df[col].dtype in ['float64', 'int64', 'bool', 'uint8']]
X = df[features]
y = df[target]

print("Data and functions are ready.")

Data and functions are ready.


In [3]:
# --- Step 2: Helper Functions & Data Prep ---

# Helper functions for metrics
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

def mape(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100.0 if mask.sum() > 0 else np.nan

# Helper functions for classical models
def infer_seasonal_period(y: pd.Series):
    if y.index.freqstr:
        freq = y.index.freqstr.upper()
        if freq in ("D", "B"): return 7
        if "W" in freq: return 52
        if "M" in freq: return 12
    return 7 # Default

def fit_predict_arima(y_train, fh, seasonal_period):
    model = pm.auto_arima(y_train, seasonal=True, m=seasonal_period, suppress_warnings=True, error_action='ignore', stepwise=True)
    return model.predict(n_periods=fh)

def fit_predict_ets(y_train, fh, seasonal_period):
    model = ExponentialSmoothing(y_train, trend='add', seasonal='add', seasonal_periods=seasonal_period, initialization_method='estimated').fit()
    return model.forecast(fh)

def fit_predict_prophet(y_train, fh):
    train_df = y_train.reset_index().rename(columns={'Date': 'ds', 'calls': 'y'})
    model = Prophet()
    model.fit(train_df)
    future = model.make_future_dataframe(periods=fh, freq='D')
    forecast = model.predict(future).tail(fh)
    return forecast['yhat'].values

# Load and prepare data
df = pd.read_csv('enhanced_eda_data.csv', index_col='Date', parse_dates=True)
target = 'calls' # Corrected column name

# Feature Engineering for ML models
df['Lag1'] = df[target].shift(1)
df['Lag7'] = df[target].shift(7)
df['Rolling_Mean_7'] = df[target].rolling(window=7).mean()
df['Rolling_Std_7'] = df[target].rolling(window=7).std()
df = pd.get_dummies(df, columns=['DayOfWeek'], drop_first=True)
df = df.dropna()

features = [col for col in df.columns if col != target and df[col].dtype in ['float64', 'int64', 'bool', 'uint8']]
X = df[features]
y = df[target]

print("Data and functions are ready.")

Data and functions are ready.


In [7]:
# --- Step 3: Run Cross-Validation and Produce Leaderboard ---

# Helper function for MASE
# def mase(y_true, y_pred, y_train):
#     numerator = np.mean(np.abs(y_true - y_pred))
#     denominator = np.mean(np.abs(y_train[1:] - y_train[:-1]))
#     return numerator / denominator if denominator != 0 else np.nan

# Enhanced: Function to calculate metrics + MASE (scaled to Seasonal Naive)
from sklearn.metrics import mean_absolute_percentage_error # Import MAPE

def calculate_metrics(y_true, y_pred, naive_seasonal_mae=858):  # From Step 2 baseline
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    mase = mae / naive_seasonal_mae  # Relative to seasonal naive benchmark
    return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'MASE': mase}


tscv = TimeSeriesSplit(n_splits=5)
all_metrics = {}
seasonal_period = infer_seasonal_period(y)

# --- PART 1: Classical Models CV ---
print("--- Running Classical Models (ARIMA, ETS, Prophet) ---")
classical_models = {
    'ARIMA': fit_predict_arima,
    'ETS': fit_predict_ets,
    'Prophet': fit_predict_prophet
}

for name, func in classical_models.items():
    maes = []
    rmses = []
    mapes = []
    mases = []
    print(f"Cross-validating {name}...")
    for train_idx, test_idx in tscv.split(y):
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        fh = len(y_test)

        # ETS and ARIMA need the seasonal period passed in
        if name in ['ETS', 'ARIMA']:
            forecast = func(y_train, fh, seasonal_period=seasonal_period)
        else: # Prophet does not
            forecast = func(y_train, fh)

        metrics = calculate_metrics(y_test.values, forecast) # Convert y_test to numpy array
        maes.append(metrics['MAE'])
        rmses.append(metrics['RMSE'])
        mapes.append(metrics['MAPE'])
        mases.append(metrics['MASE'])

    all_metrics[name] = {'MAE': np.mean(maes), 'RMSE': np.mean(rmses), 'MAPE': np.mean(mapes), 'MASE': np.mean(mases)}
    print(f"{name} Average CV MAE: {np.mean(maes):.2f}")

# --- PART 2: Machine Learning Models CV ---
print("\n--- Running Machine Learning Models (XGBoost, RandomForest, etc.) ---")
ml_models = {
    'XGBoost': XGBRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42)
}
param_grids = {
    'XGBoost': {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]},
    'RandomForest': {'n_estimators': [100, 200], 'max_depth': [10, 20]}
}

for name, model in ml_models.items():
    maes = []
    rmses = []
    mapes = []
    mases = []
    print(f"\nRunning GridSearchCV for {name}...")
    grid_search = GridSearchCV(model, param_grids[name], cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X, y)

    # Get best model and make predictions for CV folds to calculate other metrics
    best_model = grid_search.best_estimator_
    for train_idx, test_idx in tscv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        forecast = best_model.predict(X_test)

        metrics = calculate_metrics(y_test.values, forecast) # Convert y_test to numpy array
        maes.append(metrics['MAE'])
        rmses.append(metrics['RMSE'])
        mapes.append(metrics['MAPE'])
        mases.append(metrics['MASE'])

    all_metrics[name] = {'MAE': np.mean(maes), 'RMSE': np.mean(rmses), 'MAPE': np.mean(mapes), 'MASE': np.mean(mases)}

    best_mae = -grid_search.best_score_
    print(f"Best CV MAE for {name}: {best_mae:.2f}")
    print(f"Best parameters: {grid_search.best_params_}")


# --- PART 3: Unified Leaderboard ---
print("\n" + "="*50 + "\nUNIFIED MODEL LEADERBOARD\n" + "="*50)
leaderboard = pd.DataFrame.from_dict(all_metrics, orient='index').sort_values('MAE')
champion_model = leaderboard.index[0]

print(leaderboard)
print(f"\n🏆 Champion Model: {champion_model} with an average MAE of {leaderboard.iloc[0]['MAE']:.2f}")

--- Running Classical Models (ARIMA, ETS, Prophet) ---
Cross-validating ARIMA...
ARIMA Average CV MAE: 2638.25
Cross-validating ETS...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpfd5mmghi/t5l9cebk.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpfd5mmghi/blqzk008.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=51384', 'data', 'file=/tmp/tmpfd5mmghi/t5l9cebk.json', 'init=/tmp/tmpfd5mmghi/blqzk008.json', 'output', 'file=/tmp/tmpfd5mmghi/prophet_modelb0n_e_z4/prophet_model-20250916184348.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:43:48 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chai

ETS Average CV MAE: 1608.69
Cross-validating Prophet...


DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=64208', 'data', 'file=/tmp/tmpfd5mmghi/yk_avj8s.json', 'init=/tmp/tmpfd5mmghi/_i88h6al.json', 'output', 'file=/tmp/tmpfd5mmghi/prophet_model_hfk57g7/prophet_model-20250916184348.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:43:48 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
18:43:48 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpfd5mmghi/9iy72itb.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpfd5mmghi/q5m2y0k4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:ru

Prophet Average CV MAE: 3983.69

--- Running Machine Learning Models (XGBoost, RandomForest, etc.) ---

Running GridSearchCV for XGBoost...
Best CV MAE for XGBoost: 1151.87
Best parameters: {'learning_rate': 0.1, 'n_estimators': 100}

Running GridSearchCV for RandomForest...
Best CV MAE for RandomForest: 1017.66
Best parameters: {'max_depth': 20, 'n_estimators': 100}

UNIFIED MODEL LEADERBOARD
                      MAE         RMSE       MAPE      MASE
XGBoost        123.097348   168.181477   1.526670  0.143470
RandomForest   187.407200   295.129169   2.349396  0.218423
ETS           1608.689314  2033.914031  19.316007  1.874929
ARIMA         2638.247377  3229.977527  31.565572  3.074880
Prophet       3983.689730  4424.119265  53.967103  4.642995

🏆 Champion Model: XGBoost with an average MAE of 123.10
