<a href="https://colab.research.google.com/github/john-d-noble/callcenter/blob/main/CB_Step_5_Deep_Learning_and_Hybrid_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas numpy scikit-learn tensorflow prophet xgboost neuralprophet

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from prophet import Prophet
from xgboost import XGBRegressor
from neuralprophet import NeuralProphet
import torch  # Added for safe_globals

# Fix for UnpicklingError: import the actual ConfigSeasonality class and register it
# with torch's serialization allowlist ("safe globals") before any model is used.
# NOTE(review): presumably needed because torch's restricted unpickler rejects
# classes it does not know — confirm against the installed torch/neuralprophet versions.
from neuralprophet.configure import ConfigSeasonality
torch.serialization.add_safe_globals([ConfigSeasonality])

# Step 1: Load the data from CSV file with explicit date format to suppress parsing warning
df = pd.read_csv('updated_final_merged_data.csv', parse_dates=['Date'], date_format='%m/%d/%y', index_col='Date')

# Assume 'Calls' is the target column
target = 'Calls'

# Prepare data: Sort by date if not already
df = df.sort_index()

# Feature Engineering (similar to ML: lags, rollings, dummies)
# Every target-derived feature for day t must only use values up to day t-1.
# The rolling statistics are therefore computed on the series shifted by one
# day; rolling over the unshifted series would include the value being
# predicted and leak the target into the feature matrix.
previous_calls = df[target].shift(1)
df['Lag1'] = previous_calls
df['Lag7'] = df[target].shift(7)
df['Rolling_Mean_7'] = previous_calls.rolling(window=7).mean()
df['Rolling_Std_7'] = previous_calls.rolling(window=7).std()
df = pd.get_dummies(df, columns=['DayOfWeek'], drop_first=True)

# Select features (numeric or boolean, except target).
# Dtype-family checks are used instead of matching exact dtypes so that columns
# stored as e.g. int32/float32/uint8 are not silently dropped.
features = [
    col for col in df.columns
    if col != target
    and (pd.api.types.is_numeric_dtype(df[col]) or pd.api.types.is_bool_dtype(df[col]))
]

# Drop NaNs (the leading rows that have no complete lag/rolling history)
df = df.dropna()

# For DL models, scale data (target is the last column of the scaled frame)
# NOTE(review): this scaler is fitted on the full series, so value ranges from
# later (test) periods influence the scaling of earlier (training) periods.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[features + [target]]), index=df.index, columns=features + [target])

# Time series cross-validation: 5 splits
tscv = TimeSeriesSplit(n_splits=5)

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Score predictions against actuals.

    Returns a dict with the keys 'MAE', 'RMSE' and 'MAPE'; MAPE is expressed
    as a percentage (the fractional value multiplied by 100).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(squared_error),
        'MAPE': mean_absolute_percentage_error(y_true, y_pred) * 100,  # As percentage
    }

# Registry of metrics per model name, filled in by each model section below
model_metrics = dict()

# Helper to create sequences for LSTM (timesteps=7)
def create_sequences(data, timesteps=7, target_col=None):
    """Build sliding-window samples for a sequence model.

    Each sample is a window of ``timesteps`` consecutive rows of ``data``
    (all columns); its label is the value of the target column in the row
    immediately after the window.

    Args:
        data: DataFrame whose rows are ordered in time.
        timesteps: Number of consecutive rows per input window.
        target_col: Name of the column to predict. When omitted, the
            module-level ``target`` variable is used.

    Returns:
        Tuple ``(X_seq, y_seq)`` where ``X_seq`` has shape
        ``(n_samples, timesteps, n_columns)`` and ``y_seq`` has shape
        ``(n_samples,)`` with ``n_samples = max(len(data) - timesteps, 0)``.
        When there are too few rows for a single window the arrays are empty
        but keep these dimensions, so downstream shape checks still work.
    """
    if target_col is None:
        target_col = target  # fall back to the script-level target column

    # Convert once up front; slicing NumPy arrays avoids per-row .iloc overhead.
    values = data.to_numpy()
    target_values = data[target_col].to_numpy()

    n_samples = len(data) - timesteps
    if n_samples <= 0:
        empty_X = np.empty((0, timesteps, values.shape[1]), dtype=values.dtype)
        empty_y = np.empty((0,), dtype=target_values.dtype)
        return empty_X, empty_y

    X_seq = np.stack([values[start:start + timesteps] for start in range(n_samples)])
    y_seq = target_values[timesteps:]  # Predict next value after each window
    return X_seq, y_seq

# 1. LSTM Network
# Each fold fits its own MinMaxScaler on the training rows only, so the value
# range of the held-out period never influences training (no look-ahead).
# The last `lstm_timesteps` training rows are prepended to the test rows as
# context, so every test row receives a prediction and the LSTM is scored on
# the same rows as the other models.
lstm_timesteps = 7
lstm_columns = features + [target]  # target kept last; its scaler stats are at index -1
lstm_preds = []
lstm_trues = []
for train_idx, test_idx in tscv.split(df):
    train_raw = df.iloc[train_idx][lstm_columns]
    test_raw = df.iloc[test_idx][lstm_columns]

    # Scale with statistics learned from the training fold only
    fold_scaler = MinMaxScaler()
    train = pd.DataFrame(fold_scaler.fit_transform(train_raw), index=train_raw.index, columns=lstm_columns)
    test = pd.DataFrame(fold_scaler.transform(test_raw), index=test_raw.index, columns=lstm_columns)

    # Create sequences; the test windows start inside the training tail
    X_train_seq, y_train_seq = create_sequences(train, timesteps=lstm_timesteps)
    test_with_context = pd.concat([train.iloc[-lstm_timesteps:], test])
    X_test_seq, _ = create_sequences(test_with_context, timesteps=lstm_timesteps)

    # Build LSTM model with Input layer to suppress warning
    model = Sequential()
    model.add(Input(shape=(lstm_timesteps, len(lstm_columns))))
    model.add(LSTM(50, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')

    # Fit (set verbose=1 for debugging if needed; 0 is silent)
    model.fit(X_train_seq, y_train_seq, epochs=50, batch_size=32, verbose=0)

    # Predict in scaled space, one value per test row
    pred_scaled = model.predict(X_test_seq, verbose=0).flatten()

    # Undo the scaling of the target column: MinMaxScaler maps
    # x -> x * scale_ + min_, so the inverse is (x_scaled - min_) / scale_.
    pred = (pred_scaled - fold_scaler.min_[-1]) / fold_scaler.scale_[-1]

    lstm_preds.extend(pred)
    lstm_trues.extend(test_raw[target].to_numpy())  # actuals taken unscaled, no round trip

# Predictions and actuals are already in original units
lstm_preds_inv = np.asarray(lstm_preds)
lstm_trues_inv = np.asarray(lstm_trues)

lstm_metrics = calculate_metrics(lstm_trues_inv, lstm_preds_inv)
model_metrics['LSTM'] = lstm_metrics

# 2. Neural Prophet (with matplotlib backend to avoid Plotly error)
# Univariate model: only the date ('ds') and the target ('y') are passed in.
np_preds = []
np_trues = []
for train_idx, test_idx in tscv.split(df):
    # Rename via the `target` variable so this section follows the configured target column
    train_df = df.iloc[train_idx].reset_index().rename(columns={'Date': 'ds', target: 'y'})
    test_df = df.iloc[test_idx].reset_index().rename(columns={'Date': 'ds', target: 'y'})

    # Fit NeuralProphet with matplotlib and explicit seasonalities
    model = NeuralProphet(epochs=50, batch_size=32, learning_rate=0.01,  # Manual LR to avoid finder warning
                          yearly_seasonality=False, daily_seasonality=False)  # Explicit to suppress auto warnings
    model.set_plotting_backend('matplotlib')
    model.fit(train_df[['ds', 'y']], freq='D')

    # Make future dataframe covering as many periods as the test fold has rows
    future = model.make_future_dataframe(train_df[['ds', 'y']], periods=len(test_df))

    # Predict
    forecast = model.predict(future)
    pred = forecast['yhat1'].tail(len(test_df)).values

    # Fail at the offending fold rather than at the final metric computation
    if len(pred) != len(test_df):
        raise ValueError(
            f"NeuralProphet returned {len(pred)} predictions for a test fold of {len(test_df)} rows"
        )

    np_preds.extend(pred)
    np_trues.extend(test_df['y'])

np_metrics = calculate_metrics(np_trues, np_preds)
model_metrics['Neural Prophet'] = np_metrics

# 3. Hybrid: Prophet + XGBoost (Prophet for trend/seasonal, XGBoost on residuals)
hybrid_preds = []
hybrid_trues = []
for train_idx, test_idx in tscv.split(df):
    # Rename via the `target` variable so this section follows the configured target column
    train_df = df.iloc[train_idx].reset_index().rename(columns={'Date': 'ds', target: 'y'})
    test_df = df.iloc[test_idx].reset_index().rename(columns={'Date': 'ds', target: 'y'})

    # Step 1: Fit Prophet
    prophet_model = Prophet(weekly_seasonality=True)
    prophet_model.fit(train_df[['ds', 'y']])

    # Predict on the actual train and test dates. Passing the dates explicitly
    # (instead of make_future_dataframe + tail) keeps each forecast aligned with
    # its row even if the calendar has gaps, and avoids re-predicting the whole
    # history just to obtain the test forecast.
    train_forecast = prophet_model.predict(train_df[['ds']])
    # Plain arrays: the subtraction is positional, independent of index labels
    train_residuals = train_df['y'].to_numpy() - train_forecast['yhat'].to_numpy()

    test_forecast = prophet_model.predict(test_df[['ds']])
    prophet_test_pred = test_forecast['yhat'].to_numpy()

    # Step 2: Fit XGBoost on residuals using features
    X_train = train_df[features]
    y_res_train = train_residuals

    xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_res_train)

    # Predict residuals on test
    X_test = test_df[features]
    res_pred = xgb_model.predict(X_test)

    # Combine: Prophet pred + residual pred
    pred = prophet_test_pred + res_pred

    hybrid_preds.extend(pred)
    hybrid_trues.extend(test_df['y'])

hybrid_metrics = calculate_metrics(hybrid_trues, hybrid_preds)
model_metrics['Prophet + XGBoost Hybrid'] = hybrid_metrics

# Report: one row per model, one column per metric
print("\nModel Performance Summary:")
metrics_df = pd.DataFrame(model_metrics).transpose()
print(metrics_df)

# The champion is the model with the smallest MAE (primary metric)
winner = metrics_df.loc[:, 'MAE'].idxmin()
champion_metrics = metrics_df.loc[winner].to_dict()
print(f"\nChampion DL/Hybrid Model: {winner}")
print(f"Metrics: {champion_metrics}")

### Combined Performance Summary: Baseline, Classical, Machine Learning, and Deep Learning/Hybrid Models

To offer a complete evaluation of the forecasting models for call center volume, below are the performance tables for all tiers tested: baselines (simple benchmarks), classical time series (univariate with trend and seasonality), machine learning (multivariate with feature engineering like lags, rollings, day-of-week dummies, and market indicators), and deep learning/hybrids (sequence-based and combined approaches for complex patterns). All models were assessed using time-series cross-validation on the filled dataset, with consistent metrics: Mean Absolute Error (MAE, in call counts), Root Mean Squared Error (RMSE, penalizing larger errors), and Mean Absolute Percentage Error (MAPE, for relative accuracy). This cumulative view tracks progress across increasing complexity, building on EDA insights such as strong weekly seasonality, non-stationarity, outliers, and market correlations (e.g., VIX and CVOL).

#### Baseline Models Performance
| Model          | MAE       | RMSE      | MAPE     |
|----------------|-----------|-----------|----------|
| Naive          | 2351.46  | 2942.38  | 24.84%  |
| Mean           | 1634.56  | 2154.49  | 18.23%  |
| Median         | 1613.91  | 2177.89  | 17.38%  |
| Seasonal Naive | 907.70   | 1359.05  | 9.67%   |

**Baseline Champion**: Seasonal Naive (strong due to weekly patterns).

#### Classical Models Performance
| Model   | MAE       | RMSE      | MAPE     |
|---------|-----------|-----------|----------|
| ARIMA   | 2268.08  | 2860.61  | 24.43%  |
| SARIMA  | 2560.83  | 3163.07  | 28.56%  |
| ETS     | 2233.64  | 2882.92  | 22.57%  |

**Classical Champion**: ETS (modest handling of trends but underperforms baselines).

#### Machine Learning Models Performance
| Model         | MAE       | RMSE      | MAPE     |
|---------------|-----------|-----------|----------|
| Ridge         | 1011.39  | 1392.16  | 10.89%  |
| Random Forest | 1080.18  | 1700.63  | 11.13%  |
| XGBoost       | 1338.68  | 1955.16  | 14.52%  |
| SVR           | 1925.28  | 2607.63  | 20.17%  |

**ML Champion**: Ridge (benefits from regularization and features, nearing baseline levels).

#### Deep Learning/Hybrid Models Performance
| Model                     | MAE       | RMSE      | MAPE     |
|---------------------------|-----------|-----------|----------|
| LSTM                      | 2143.27  | 2893.43  | 24.44%  |
| Neural Prophet            | 3626.81  | 4798.90  | 43.38%  |
| Prophet + XGBoost Hybrid  | 3867.73  | 4738.37  | 44.67%  |

**DL/Hybrid Champion**: LSTM (best in tier for sequence learning, but higher errors overall).

### Full Narrative Analysis
The baseline models provide an essential foundation, demonstrating that even simple methods can effectively capture key patterns in the call volume data. The Naive approach, which persists the last value, yields high errors (MAE ~2,351, MAPE 25%) amid daily fluctuations, while Mean and Median improve modestly (MAEs ~1,614-1,635, MAPEs 17-18%) by focusing on central tendencies, aligning with the EDA's slightly skewed distribution. The Seasonal Naive excels (MAE 908, MAPE under 10%), directly leveraging the EDA's decomposed weekly seasonality and day-of-week variations for reliable periodic forecasts, even after imputing non-business days.

Classical models, emphasizing univariate trends and seasonality, show inconsistent gains. ARIMA addresses non-stationarity (per EDA's ADF test) with an MAE of 2,268 and MAPE of 24%, but overlooks cycles. SARIMA, incorporating weekly terms, performs worst (MAE 2,561, MAPE 29%), possibly overfitting to outliers or filled data noise. ETS offers the best classical results (MAE 2,234, MAPE 23%) through smoothing of trends and additive seasonality, consistent with EDA rolling stats, but remains outperformed by baselines, suggesting limited added value from parameterization.

Machine learning models advance by integrating multivariate features (e.g., lags for autocorrelation, rollings for volatility, dummies for days, and market vars like VIX/CVOL per EDA correlations >0.2), yielding clearer improvements. Ridge leads (MAE 1,011, MAPE 11%) with regularization handling collinearity and outliers. Random Forest (MAE 1,080, MAPE 11%) captures non-linear interactions via ensembles, while XGBoost (MAE 1,339, MAPE 15%) trails both, possibly because boosting overfits on this dataset. SVR comes last (MAE 1,925, MAPE 20%), being less suited to this tabular time-series data. Overall, ML reduces classical errors by ~50% (e.g., Ridge vs. ETS), approaching the Seasonal Naive baseline through feature-driven predictions.

The deep learning/hybrid tier, designed for complex sequences and combinations, introduces neural architectures but yields mixed outcomes. LSTM, processing scaled sequences with lags, achieves the tier's best (MAE 2,143, MAPE 24%), capturing long dependencies from EDA autocorrelation but struggling with the dataset's scale or noise, performing similarly to classical ARIMA. Neural Prophet, extending Prophet with nets, performs poorly (MAE 3,627, MAPE 43%), potentially due to insufficient data for deep learning or sensitivity to imputation. The Prophet + XGBoost hybrid has the highest MAE and MAPE in the tier (MAE 3,868, MAPE 45%): it combines trend/seasonality with residual boosting on features, but the high errors suggest the univariate base limits multivariate gains.

Cumulatively, errors decrease from the classical tier (MAEs >2,200) to ML (best MAE ~1,000), but DL/hybrids regress (MAEs >2,100), indicating overfitting, data scarcity for deep models, or suboptimal tuning amid EDA-noted outliers and volatility. The overall champion remains the Seasonal Naive baseline (MAE 908, MAPE <10%), as none of the more complex tiers has surpassed it—highlighting that the dominant weekly rhythm (per decomposition) favors simplicity. ML's Ridge comes closest, validating feature engineering's value for market ties. To progress, consider hyperparameter tuning, ensembles (e.g., stacking Ridge + Seasonal Naive), or more data; otherwise, deploy the baseline for efficient, interpretable forecasting in call center planning.