<a href="https://colab.research.google.com/github/john-d-noble/callcenter/blob/main/CB_Step_4_Machine_Learning_Models_(With_Feature_Engineering).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install pandas numpy scikit-learn xgboost



In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

# Load the updated dataset
df = pd.read_csv('enhanced_eda_data.csv', index_col='date', parse_dates=True)

# Assume 'Calls' is the target column
target = 'calls'

# Prepare data: Sort by date if not already
df = df.sort_index()

# Feature Engineering
# Lags: previous day (lag1) and previous week (lag7)
df['Lag1'] = df[target].shift(1)
df['Lag7'] = df[target].shift(7)

# Rolling statistics: 7-day mean and std
df['Rolling_Mean_7'] = df[target].rolling(window=7).mean()
df['Rolling_Std_7'] = df[target].rolling(window=7).std()

# Day-of-week dummies (from EDA)
df = pd.get_dummies(df, columns=['DayOfWeek'], drop_first=True)

# Select market features with |corr| > 0.2 from EDA (adjust based on actual high_corr)
# Assuming from previous: e.g., '^VIX_Close_^VIX', 'CVOL-USD_Close_CVOL-USD', etc.
# For code, select all numeric except target and engineered
features = [col for col in df.columns if col != target and df[col].dtype in [np.float64, np.int64, bool]]

# Drop NaNs from shifting/rolling
df = df.dropna()

# X and y
X = df[features]
y = df[target]

# Time series cross-validation: 5 splits
tscv = TimeSeriesSplit(n_splits=5)

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100  # As percentage
    return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape}

# Dictionary to store average metrics for each model
model_metrics = {}

# Scaler for models that need it (Ridge, SVR)
scaler = StandardScaler()

# 1. Ridge Regression (linear with L2 regularization)
ridge_preds = []
ridge_trues = []
for train_idx, test_idx in tscv.split(df):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Scale
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit
    model = Ridge(alpha=1.0)
    model.fit(X_train_scaled, y_train)

    # Predict
    pred = model.predict(X_test_scaled)

    ridge_preds.extend(pred)
    ridge_trues.extend(y_test)

ridge_metrics = calculate_metrics(ridge_trues, ridge_preds)
model_metrics['Ridge'] = ridge_metrics

# 2. Random Forest Regressor
rf_preds = []
rf_trues = []
for train_idx, test_idx in tscv.split(df):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fit (no scaling needed)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict
    pred = model.predict(X_test)

    rf_preds.extend(pred)
    rf_trues.extend(y_test)

rf_metrics = calculate_metrics(rf_trues, rf_preds)
model_metrics['Random Forest'] = rf_metrics

# 3. XGBoost Regressor
xgb_preds = []
xgb_trues = []
for train_idx, test_idx in tscv.split(df):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fit (no scaling needed)
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    model.fit(X_train, y_train)

    # Predict
    pred = model.predict(X_test)

    xgb_preds.extend(pred)
    xgb_trues.extend(y_test)

xgb_metrics = calculate_metrics(xgb_trues, xgb_preds)
model_metrics['XGBoost'] = xgb_metrics

# 4. Support Vector Regression (SVR)
svr_preds = []
svr_trues = []
for train_idx, test_idx in tscv.split(df):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Scale
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit
    model = SVR(kernel='rbf', C=100, epsilon=0.1)
    model.fit(X_train_scaled, y_train)

    # Predict
    pred = model.predict(X_test_scaled)

    svr_preds.extend(pred)
    svr_trues.extend(y_test)

svr_metrics = calculate_metrics(svr_trues, svr_preds)
model_metrics['SVR'] = svr_metrics

# Summarize performance
print("\nModel Performance Summary:")
metrics_df = pd.DataFrame(model_metrics).T
print(metrics_df)

# Pick winner: Lowest MAE (primary metric)
winner = metrics_df['MAE'].idxmin()
print(f"\nChampion ML Model: {winner}")
print(f"Metrics: {metrics_df.loc[winner].to_dict()}")



Model Performance Summary:
                       MAE         RMSE       MAPE
Ridge           493.337919   624.324098   7.162782
Random Forest   396.581019   948.543361   5.098744
XGBoost         374.221868   825.921138   5.347607
SVR            1997.180295  2693.427873  25.689883

Champion ML Model: XGBoost
Metrics: {'MAE': 374.22186772127327, 'RMSE': 825.9211378157587, 'MAPE': 5.347607421028823}
