<a href="https://colab.research.google.com/github/john-d-noble/callcenter/blob/main/Gemini_enhanced_EDA_xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# course. I've ap B plied all the expert corrections to your script.
#his updated version incorporates fixes for data leakage, improves API robustness, significantly boosts performance by removing the slow .apply() call, and ensures reproducibility.
# you requested, I've enhanced the hard stop after the data collection step. It now performs an explicit validation check—if any data is missing after the API pull, it will print a failure message and exit. If the data is complete, it will confirm success and then pause for your manual review before proceeding.
## Master Script: Final Version with All Corrections (v5.0)
#ere is the complete, production-ready script with all improvements integrated.
# =============================================================================
# STEP 0: INSTALL REQUIRED LIBRARIES
# =============================================================================
!pip install yfinance -q
!pip install xgboost -q
!pip install ydata-profiling -q

# =============================================================================
# MASTER SCRIPT: From Data Collection to Live Forecasting (Version 5.0 - Production Ready)
# This script incorporates robustness checks, data leakage prevention, and performance enhancements.
# =============================================================================

import pandas as pd
import numpy as np
import yfinance as yf
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from datetime import date, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

# Set a random seed for reproducible results in simulation and modeling
np.random.seed(42)

# --- Integrated EDA Function ---
def run_comprehensive_eda(df, target_variable=None):
    """Performs a comprehensive EDA on a pandas DataFrame and generates a report."""
    print("🚀 Starting Comprehensive Exploratory Data Analysis...")
    profile = ProfileReport(df, title="Comprehensive EDA Report", explorative=True)
    profile.to_file("training_data_eda_report.html")
    print("\n✅ Success! Detailed report saved as 'training_data_eda_report.html'")

# --- MASTER CONFIGURATION ---
# Data Collection & Simulation
TICKERS = ['BTC-USD', 'ETH-USD', 'SOL-USD', '^VIX']
START_DATE = '2021-01-01'
FORECAST_LOOKBACK_DAYS = 45 # Safely larger than the longest feature window (30)

# Feature Engineering
VOLATILITY_WINDOW = 14
SPIKE_WINDOW = 30
SPIKE_THRESHOLD = 2.0

# Model Tuning
TUNING_ITERATIONS = 50

# Final Output Files
MARKET_DATA_FILE = 'crypto_price_and_volume_data_2021_present.csv'
SIMULATED_CALLS_FILE = 'final_simulated_data_with_volatility_2021_present.csv'
ADVANCED_TRAINING_FILE = 'advanced_feature_training_data_2021_present.csv'
FINAL_MODEL_FILENAME = 'final_advanced_xgboost_model_2021_present.json'


# ==============================================================================
# STEP 1: DATA COLLECTION - Fetch all required price and volume data
# ==============================================================================
print("--- [STEP 1/8] Starting: Data Collection ---")
try:
    END_DATE = date.today().strftime('%Y-%m-%d')
    raw_data = yf.download(TICKERS, start=START_DATE, end=END_DATE, progress=False)

    full_date_range = pd.date_range(start=START_DATE, end=END_DATE, freq='D')
    market_data_df = pd.DataFrame(index=full_date_range)

    TICKER_MAP = {'BTC-USD': 'btc', 'ETH-USD': 'eth', 'SOL-USD': 'sol', '^VIX': 'vix'}

    for ticker, name in TICKER_MAP.items():
        price_series = raw_data[('Close', ticker)]
        volume_series = raw_data[('Volume', ticker)]
        asset_df = pd.DataFrame({f'{name}_price': price_series, f'{name}_volume': volume_series})
        # CORRECTION: Use only ffill() to prevent using future data to fill past NaNs
        asset_df = asset_df.reindex(full_date_range).ffill()
        market_data_df = market_data_df.join(asset_df)

    market_data_df.index.name = 'Date'

    # --- ENHANCED HARD STOP & VALIDATION ---
    print("\n--- [DATA REVIEW & VALIDATION] ---")
    print(f"Date Range: {market_data_df.index.min().strftime('%Y-%m-%d')} to {market_data_df.index.max().strftime('%Y-%m-%d')}")
    print(f"Total Days (Rows) Collected: {market_data_df.shape[0]}")
    # CORRECTION: Explicitly check for any nulls after the API call.
    if market_data_df.isnull().sum().sum() > 0:
        print("\n❌ CRITICAL FAILURE: Missing data detected after API call and forward-fill.")
        print("This indicates a problem with the source data. The script cannot proceed.")
        print(market_data_df.isnull().sum())
        exit()
    else:
        print("\n✅ Data Integrity Check Passed: No missing values found.")
        market_data_df.to_csv(MARKET_DATA_FILE)
        print(f"✅ Success! Market data saved to '{MARKET_DATA_FILE}'")
        input("\n--- PAUSED --- \nData is complete. Press Enter to proceed to Step 2 or Ctrl+C to exit.\n")

except Exception as e:
    print(f"❌ FAILED: {e}")
    exit()

# ==============================================================================
# STEP 2: SIMULATION - Create the intelligent, volatility-driven call volume
# ==============================================================================
print(f"--- [STEP 2/8] Starting: Call Volume Simulation ---")
try:
    df_sim = pd.read_csv(MARKET_DATA_FILE, parse_dates=['Date'])

    NUMBER_OF_AGENTS = 3000
    AVG_WEEKDAY_CALLS_PER_AGENT = 110
    AVG_WEEKEND_CALLS_PER_AGENT = 50
    RANDOMNESS_FACTOR = 15000

    def simulate_base_volume(d):
        mean_vol = NUMBER_OF_AGENTS * (AVG_WEEKDAY_CALLS_PER_AGENT if d.dayofweek < 5 else AVG_WEEKEND_CALLS_PER_AGENT)
        return abs(int(np.random.normal(loc=mean_vol, scale=RANDOMNESS_FACTOR)))
    df_sim['base_call_volume'] = df_sim['Date'].apply(simulate_base_volume)

    VOLATILITY_THRESHOLD = 0.05
    VOLATILITY_MULTIPLIER = 1.8
    VIX_FEAR_THRESHOLD = 30
    VIX_MULTIPLIER = 1.4

    for crypto in ['btc', 'eth', 'sol']:
        df_sim[f'{crypto}_price_pct_change'] = df_sim[f'{crypto}_price'].pct_change()
    df_sim.fillna(0, inplace=True)

    # CORRECTION: Replaced slow .apply() with fast, vectorized operations
    final_volume = df_sim['base_call_volume'].copy()
    is_crypto_volatile = (df_sim['btc_price_pct_change'].abs() > VOLATILITY_THRESHOLD) | \
                         (df_sim['eth_price_pct_change'].abs() > VOLATILITY_THRESHOLD) | \
                         (df_sim['sol_price_pct_change'].abs() > VOLATILITY_THRESHOLD)
    is_vix_high = df_sim['vix_price'] > VIX_FEAR_THRESHOLD
    final_volume[is_crypto_volatile] *= VOLATILITY_MULTIPLIER
    final_volume[is_vix_high] *= VIX_MULTIPLIER
    df_sim['adjusted_call_volume'] = final_volume.astype(int)

    simulated_df = df_sim[['Date', 'btc_price', 'eth_price', 'sol_price', 'vix_price', 'adjusted_call_volume']]
    simulated_df.to_csv(SIMULATED_CALLS_FILE, index=False)
    print(f"✅ Success! Simulated call volume data saved to '{SIMULATED_CALLS_FILE}'\n")
except Exception as e:
    print(f"❌ FAILED: {e}")
    exit()

# ==============================================================================
# STEP 3: ADVANCED FEATURE ENGINEERING - Create the final training data
# ==============================================================================
print(f"--- [STEP 3/8] Starting: Advanced Feature Engineering ---")
try:
    df_market = pd.read_csv(MARKET_DATA_FILE, parse_dates=['Date'])
    df_calls = pd.read_csv(SIMULATED_CALLS_FILE, parse_dates=['Date'])

    for crypto in ['btc', 'eth', 'sol']:
        df_market[f'{crypto}_price_pct_change'] = df_market[f'{crypto}_price'].pct_change()
        df_market[f'{crypto}_volatility_index'] = df_market[f'{crypto}_price_pct_change'].rolling(window=VOLATILITY_WINDOW).std()
        rolling_vol_mean = df_market[f'{crypto}_volume'].rolling(window=SPIKE_WINDOW).mean()
        rolling_vol_std = df_market[f'{crypto}_volume'].rolling(window=SPIKE_WINDOW).std()
        df_market[f'{crypto}_volume_spike'] = (df_market[f'{crypto}_volume'] > (rolling_vol_mean + SPIKE_THRESHOLD * rolling_vol_std)).astype(int)
        rolling_price_mean = df_market[f'{crypto}_price_pct_change'].rolling(window=SPIKE_WINDOW).mean()
        rolling_price_std = df_market[f'{crypto}_price_pct_change'].rolling(window=SPIKE_WINDOW).std()
        df_market[f'{crypto}_price_shock'] = (abs(df_market[f'{crypto}_price_pct_change']) > (rolling_price_mean + SPIKE_THRESHOLD * rolling_price_std)).astype(int)

    df_calls_subset = df_calls[['Date', 'adjusted_call_volume']]
    combined_df = pd.merge(df_market, df_calls_subset, on='Date', how='inner')
    combined_df.drop(columns=[col for col in combined_df.columns if 'pct_change' in col], inplace=True)
    combined_df.fillna(0, inplace=True)

    # CORRECTION: Lag all features by 1 day to prevent data leakage.
    # This ensures we only use yesterday's data to predict today's volume.
    feature_cols = [col for col in combined_df.columns if col not in ['Date', 'adjusted_call_volume']]
    combined_df[feature_cols] = combined_df[feature_cols].shift(1)
    combined_df.dropna(inplace=True) # Drop the first row which is now NaN

    combined_df.to_csv(ADVANCED_TRAINING_FILE, index=False)
    print(f"✅ Success! Advanced training file saved as '{ADVANCED_TRAINING_FILE}'\n")
except Exception as e:
    print(f"❌ FAILED: {e}")
    exit()

# ==============================================================================
# STEP 4: EXPLORATORY DATA ANALYSIS - Analyze the training data
# ==============================================================================
print(f"--- [STEP 4/8] Starting: Exploratory Data Analysis ---")
try:
    df_eda = pd.read_csv(ADVANCED_TRAINING_FILE, parse_dates=['Date'])
    run_comprehensive_eda(df_eda, target_variable='adjusted_call_volume')
    input("\n--- PAUSED --- \nReview the EDA report ('training_data_eda_report.html'). Press Enter to proceed.\n")
except Exception as e:
    print(f"❌ FAILED: {e}")
    exit()


# ==============================================================================
# STEP 5: MODEL TUNING - Find the best hyperparameters
# ==============================================================================
print(f"--- [STEP 5/8] Starting: Automated Model Tuning (this may take several minutes) ---")
try:
    df_tune = pd.read_csv(ADVANCED_TRAINING_FILE, parse_dates=['Date'])
    df_tune.set_index('Date', inplace=True)
    df_tune.fillna(0, inplace=True)
    X = df_tune.drop('adjusted_call_volume', axis=1)
    y = df_tune['adjusted_call_volume']

    param_grid = {
        'n_estimators': [100, 300, 500, 700], 'max_depth': [3, 4, 5, 6, 7],
        'learning_rate': [0.01, 0.05, 0.1], 'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
    }
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='mae')
    tscv = TimeSeriesSplit(n_splits=5)
    random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, n_iter=TUNING_ITERATIONS,
                                   scoring='neg_mean_absolute_error', cv=tscv, n_jobs=-1, verbose=1, random_state=42)
    random_search.fit(X, y)

    best_params = random_search.best_params_
    best_score = -random_search.best_score_

    print("\n--- Tuning Complete ---")
    print(f"✅ Best MAE Score found: {best_score:.2f}")
    print(f"✅ Best Hyperparameters found: {best_params}\n")
except Exception as e:
    print(f"❌ FAILED: {e}")
    exit()

# ==============================================================================
# STEP 6: FINAL MODEL TRAINING - Build the model with the best settings
# ==============================================================================
print(f"--- [STEP 6/8] Starting: Final Model Training ---")
try:
    final_model = xgb.XGBRegressor(
        objective='reg:squarederror',
        eval_metric='mae',
        **best_params,
        random_state=42
    )
    final_model.fit(X, y)
    print("✅ Success! Final model trained on all available data.\n")
except Exception as e:
    print(f"❌ FAILED: {e}")
    exit()

# ==============================================================================
# STEP 7: SAVE THE MODEL - Create the final, predictive asset
# ==============================================================================
print(f"--- [STEP 7/8] Starting: Saving the Final Model ---")
try:
    final_model.save_model(FINAL_MODEL_FILENAME)
    print(f"✅ Success! Final model has been saved as '{FINAL_MODEL_FILENAME}'.\n")
except Exception as e:
    print(f"❌ FAILED: {e}")
    exit()

# ==============================================================================
# STEP 8: MAKE A PREDICTION - Use the model for a live forecast
# ==============================================================================
print(f"--- [STEP 8/8] Starting: Making a Live Forecast ---")
try:
    # CORRECTION: Use config variable for lookback instead of a "magic number"
    forecast_start_date = date.today() - timedelta(days=FORECAST_LOOKBACK_DAYS)
    forecast_end_date = date.today()
    raw_forecast_data = yf.download(TICKERS, start=forecast_start_date, end=forecast_end_date, progress=False)

    forecast_full_range = pd.date_range(start=forecast_start_date, end=forecast_end_date, freq='D')
    market_df = pd.DataFrame(index=forecast_full_range)

    for ticker, name in TICKER_MAP.items():
        price_series = raw_forecast_data[('Close', ticker)]
        volume_series = raw_forecast_data[('Volume', ticker)]
        asset_df = pd.DataFrame({f'{name}_price': price_series, f'{name}_volume': volume_series})
        asset_df = asset_df.reindex(forecast_full_range).ffill() # Use ffill only
        market_df = market_df.join(asset_df)

    market_df.index.name = 'Date'

    # Re-create features for the prediction data
    for crypto in ['btc', 'eth', 'sol']:
        market_df[f'{crypto}_price_pct_change'] = market_df[f'{crypto}_price'].pct_change()
        market_df[f'{crypto}_volatility_index'] = market_df[f'{crypto}_price_pct_change'].rolling(window=VOLATILITY_WINDOW).std()
        rolling_vol_mean = market_df[f'{crypto}_volume'].rolling(window=SPIKE_WINDOW).mean()
        rolling_vol_std = market_df[f'{crypto}_volume'].rolling(window=SPIKE_WINDOW).std()
        market_df[f'{crypto}_volume_spike'] = (market_df[f'{crypto}_volume'] > (rolling_vol_mean + SPIKE_THRESHOLD * rolling_vol_std)).astype(int)
        rolling_price_mean = market_df[f'{crypto}_price_pct_change'].rolling(window=SPIKE_WINDOW).mean()
        rolling_price_std = market_df[f'{crypto}_price_pct_change'].rolling(window=SPIKE_WINDOW).std()
        market_df[f'{crypto}_price_shock'] = (abs(market_df[f'{crypto}_price_pct_change']) > (rolling_price_mean + SPIKE_THRESHOLD * rolling_price_std)).astype(int)

    # Note: We do NOT lag the prediction input, as we are predicting for the *next* day.
    # The final row of features is complete based on today's closing data.
    prediction_input = market_df.tail(1)
    training_columns = [col for col in X.columns]
    prediction_input = prediction_input[training_columns]

    loaded_model = xgb.XGBRegressor()
    loaded_model.load_model(FINAL_MODEL_FILENAME)

    prediction = loaded_model.predict(prediction_input)
    predicted_volume = int(prediction[0])
    latest_day = prediction_input.index[0]

    print("\n--- Forecast Complete ---")
    print(f"✅ Based on market data from {latest_day.strftime('%Y-%m-%d')}, the model forecasts a call volume for the next day of: {predicted_volume:,}")
except Exception as e:
    print(f"❌ FAILED: {e}")



--- [STEP 1/8] Starting: Data Collection ---


  raw_data = yf.download(TICKERS, start=START_DATE, end=END_DATE, progress=False)



--- [DATA REVIEW & VALIDATION] ---
Date Range: 2021-01-01 to 2025-09-04
Total Days (Rows) Collected: 1708

❌ CRITICAL FAILURE: Missing data detected after API call and forward-fill.
This indicates a problem with the source data. The script cannot proceed.
btc_price     0
btc_volume    0
eth_price     0
eth_volume    0
sol_price     0
sol_volume    0
vix_price     3
vix_volume    3
dtype: int64
--- [STEP 2/8] Starting: Call Volume Simulation ---
❌ FAILED: [Errno 2] No such file or directory: 'crypto_price_and_volume_data_2021_present.csv'
--- [STEP 3/8] Starting: Advanced Feature Engineering ---
❌ FAILED: [Errno 2] No such file or directory: 'crypto_price_and_volume_data_2021_present.csv'
--- [STEP 4/8] Starting: Exploratory Data Analysis ---
❌ FAILED: [Errno 2] No such file or directory: 'advanced_feature_training_data_2021_present.csv'
--- [STEP 5/8] Starting: Automated Model Tuning (this may take several minutes) ---
❌ FAILED: [Errno 2] No such file or directory: 'advanced_feature_t

  raw_forecast_data = yf.download(TICKERS, start=forecast_start_date, end=forecast_end_date, progress=False)


❌ FAILED: name 'X' is not defined
