<a href="https://colab.research.google.com/github/john-d-noble/callcenter/blob/main/Latest_Model_Playoff_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =============================================================================
# STEP 0: ENVIRONMENT SETUP
# This block handles installation in the correct order to prevent conflicts.
# =============================================================================
!pip install numpy==1.26.4 # Pinning numpy version for stability
#!pip uninstall pmdarima statsmodels -y
!pip install pmdarima tensorflow xgboost yfinance -q




In [5]:


# =============================================================================
# MASTER SCRIPT: A 4-Way Model Showdown (Version 8.1 - Unified)
# =============================================================================
import pandas as pd
import numpy as np
import yfinance as yf
import xgboost as xgb
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from datetime import date
import warnings

# Suppress irrelevant warnings
warnings.filterwarnings("ignore")

# --- [SETUP] Data Preparation ---
print("--- [SETUP] Preparing all necessary data... ---")
# 1. Load Client Data
client_data_path = "agent_contact_volume_wgsd2.csv"
df_calls = pd.read_csv(client_data_path)
df_calls.columns = ['Date', 'adjusted_call_volume']
df_calls['Date'] = pd.to_datetime(df_calls['Date'])

# 2. Fetch Market Data
TICKERS = ['BTC-USD', 'ETH-USD', 'SOL-USD', '^VIX']
START_DATE = '2021-01-01'
END_DATE = date.today().strftime('%Y-%m-%d')
full_date_range = pd.date_range(start=START_DATE, end=END_DATE, freq='D')
market_data_df = pd.DataFrame(index=full_date_range)
market_data_df.index.name = 'Date'
TICKER_MAP = {'BTC-USD': 'btc', 'ETH-USD': 'eth', 'SOL-USD': 'sol', '^VIX': 'vix'}
for ticker, name in TICKER_MAP.items():
    asset_raw_data = yf.download(ticker, start=START_DATE, end=END_DATE, progress=False)
    if asset_raw_data.empty:
        raise ValueError(f"No data returned from API for ticker: {ticker}")
    asset_df = asset_raw_data[['Close', 'Volume']]
    asset_df.columns = [f'{name}_price', f'{name}_volume']
    asset_df = asset_df.reindex(full_date_range).ffill().bfill()
    market_data_df = market_data_df.join(asset_df)

# 3. Create Final Training DataFrame with Features
combined_df = pd.merge(market_data_df.reset_index(), df_calls, on='Date', how='inner')
for crypto in ['btc', 'eth', 'sol']:
    combined_df[f'{crypto}_volatility_index'] = combined_df[f'{crypto}_price'].pct_change().rolling(window=14).std()
combined_df.fillna(0, inplace=True)
feature_cols = [col for col in combined_df.columns if col not in ['Date', 'adjusted_call_volume']]
combined_df[feature_cols] = combined_df[feature_cols].shift(1)
combined_df.dropna(inplace=True)
print("✅ Data preparation complete.\n")

# --- Hold out the last 90 days for testing across all models ---
test_period = 90
train_df = combined_df[:-test_period]
test_df = combined_df[-test_period:]
results = {}

# ==============================================================================
# MODEL 1: ARIMA - The Simplest Baseline
# ==============================================================================
print("--- [1/4] Building Model 1: ARIMA (Simple Baseline)... ---")
arima_model = auto_arima(train_df['adjusted_call_volume'], seasonal=False, trace=False, error_action='ignore', suppress_warnings=True, stepwise=True)
arima_predictions = arima_model.predict(n_periods=test_period)
results['ARIMA'] = mean_absolute_error(test_df['adjusted_call_volume'], arima_predictions)
print(f"✅ ARIMA Complete. MAE: {results['ARIMA']:.2f}\n")

# ==============================================================================
# MODEL 2: SARIMA - The Seasonal Baseline
# ==============================================================================
print("--- [2/4] Building Model 2: SARIMA (Seasonal Baseline)... ---")
sarima_model = auto_arima(train_df['adjusted_call_volume'], seasonal=True, m=7, trace=False, error_action='ignore', suppress_warnings=True, stepwise=True)
sarima_predictions = sarima_model.predict(n_periods=test_period)
results['SARIMA'] = mean_absolute_error(test_df['adjusted_call_volume'], sarima_predictions)
print(f"✅ SARIMA Complete. MAE: {results['SARIMA']:.2f}\n")

# ==============================================================================
# MODEL 3: LSTM - The Deep Learning Challenger
# ==============================================================================
print("--- [3/4] Building Model 3: LSTM (Deep Learning Challenger)... ---")
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_volume = scaler.fit_transform(combined_df['adjusted_call_volume'].values.reshape(-1,1))

def create_dataset(dataset, look_back=30):
    X, Y = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        X.append(a)
        Y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(Y)

look_back = 30
X, y = create_dataset(scaled_volume, look_back)
X = np.reshape(X, (X.shape[0], X.shape[1], 1))
X_train, X_test = X[:-test_period], X[-test_period:]
y_train, y_test = y[:-test_period], y[-test_period:]

lstm_model = Sequential()
lstm_model.add(LSTM(50, input_shape=(look_back, 1)))
lstm_model.add(Dense(1))
lstm_model.compile(loss='mean_squared_error', optimizer='adam')
lstm_model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=0)

lstm_predictions_scaled = lstm_model.predict(X_test, verbose=0)
lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
results['LSTM'] = mean_absolute_error(test_df['adjusted_call_volume'], lstm_predictions)
print(f"✅ LSTM Complete. MAE: {results['LSTM']:.2f}\n")


# ==============================================================================
# MODEL 4: XGBOOST - The High-Performance External Feature Model
# ==============================================================================
print("--- [4/4] Building Model 4: XGBoost (High-Performance Model)... ---")
features = [col for col in combined_df.columns if col not in ['Date', 'adjusted_call_volume']]
X_train, y_train = train_df[features], train_df['adjusted_call_volume']
X_test, y_test = test_df[features], test_df['adjusted_call_volume']
best_params = {'subsample': 0.9, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.01, 'colsample_bytree': 0.8}
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', **best_params, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
results['XGBoost'] = mean_absolute_error(y_test, xgb_predictions)
print(f"✅ XGBoost Complete. MAE: {results['XGBoost']:.2f}\n")


# ==============================================================================
# FINAL COMPARISON & CONCLUSION
# ==============================================================================
print("--- FINAL RESULTS: 4-Way Model Showdown ---")
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Mean Absolute Error (MAE)'])
results_df = results_df.sort_values('Mean Absolute Error (MAE)').reset_index(drop=True)
print(results_df)

best_baseline_mae = results_df[results_df['Model'] != 'XGBoost']['Mean Absolute Error (MAE)'].min()
xgb_mae = results_df[results_df['Model'] == 'XGBoost']['Mean Absolute Error (MAE)'].iloc[0]
improvement = ((best_baseline_mae - xgb_mae) / best_baseline_mae) * 100

print("-" * 45)
print(f"Improvement of XGBoost over best alternative: {improvement:.2f}%")

--- [SETUP] Preparing all necessary data... ---
✅ Data preparation complete.

--- [1/4] Building Model 1: ARIMA (Simple Baseline)... ---
✅ ARIMA Complete. MAE: 1208.48

--- [2/4] Building Model 2: SARIMA (Seasonal Baseline)... ---
✅ SARIMA Complete. MAE: 2299.66

--- [3/4] Building Model 3: LSTM (Deep Learning Challenger)... ---
✅ LSTM Complete. MAE: 1773.67

--- [4/4] Building Model 4: XGBoost (High-Performance Model)... ---
✅ XGBoost Complete. MAE: 1657.04

--- FINAL RESULTS: 4-Way Model Showdown ---
     Model  Mean Absolute Error (MAE)
0    ARIMA                1208.481397
1  XGBoost                1657.040771
2     LSTM                1773.672607
3   SARIMA                2299.655239
---------------------------------------------
Improvement of XGBoost over best alternative: -37.12%
