# Save Trained Models for Streamlit App

This notebook trains and saves the ML models for use in the Streamlit web application.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import pickle
import os

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


## Load Data

In [2]:
print("="*70)
print("LOADING DATA")
print("="*70)

# Load training data
X_train = pd.read_csv('../data/model_input/X_train.csv')
y_train = pd.read_csv('../data/model_input/y_train.csv').iloc[:, 0]

print(f"\nTraining set: {len(X_train):,} samples")
print(f"Features: {list(X_train.columns)}")
print("\n✅ Data loaded!")

LOADING DATA

Training set: 1,982,775 samples
Features: ['T_years', 'moneyness', 'risk_free_rate']

✅ Data loaded!


## Train Random Forest

In [3]:
print("="*70)
print("TRAINING RANDOM FOREST")
print("="*70)

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

print("\nTraining...")
rf_model.fit(X_train, y_train)
print("\n✅ Random Forest trained!")

TRAINING RANDOM FOREST

Training...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.0s



✅ Random Forest trained!


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   55.1s finished


## Train XGBoost

In [4]:
print("="*70)
print("TRAINING XGBOOST")
print("="*70)

xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=1
)

print("\nTraining...")
xgb_model.fit(X_train, y_train)
print("\n✅ XGBoost trained!")

TRAINING XGBOOST

Training...

✅ XGBoost trained!


## Save Models

In [5]:
print("="*70)
print("SAVING MODELS")
print("="*70)

# Create models directory
model_dir = '../models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    print(f"\nCreated directory: {model_dir}")

# Save Random Forest
rf_path = os.path.join(model_dir, 'rf_model.pkl')
with open(rf_path, 'wb') as f:
    pickle.dump(rf_model, f)
print(f"\n✅ Random Forest saved to: {rf_path}")

# Save XGBoost
xgb_path = os.path.join(model_dir, 'xgb_model.pkl')
with open(xgb_path, 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"✅ XGBoost saved to: {xgb_path}")

# Check file sizes
rf_size = os.path.getsize(rf_path) / (1024 * 1024)  # MB
xgb_size = os.path.getsize(xgb_path) / (1024 * 1024)  # MB

print(f"\nModel file sizes:")
print(f"  Random Forest: {rf_size:.2f} MB")
print(f"  XGBoost: {xgb_size:.2f} MB")

print("\n" + "="*70)
print("✅ MODELS SAVED SUCCESSFULLY!")
print("="*70)
print("\nYou can now run the Streamlit app with:")
print("  streamlit run app.py")

SAVING MODELS

Created directory: ../models

✅ Random Forest saved to: ../models/rf_model.pkl
✅ XGBoost saved to: ../models/xgb_model.pkl

Model file sizes:
  Random Forest: 724.99 MB
  XGBoost: 2.55 MB

✅ MODELS SAVED SUCCESSFULLY!

You can now run the Streamlit app with:
  streamlit run app.py


## Verify Models Load Correctly

In [6]:
print("="*70)
print("VERIFYING MODEL LOADING")
print("="*70)

# Try loading the models
try:
    with open(rf_path, 'rb') as f:
        rf_loaded = pickle.load(f)
    print("\n✅ Random Forest loaded successfully")
    
    with open(xgb_path, 'rb') as f:
        xgb_loaded = pickle.load(f)
    print("✅ XGBoost loaded successfully")
    
    # Test predictions
    test_input = pd.DataFrame({
        'T_years': [0.0833],  # 30 days
        'moneyness': [0.975],  # Slightly OTM
        'risk_free_rate': [0.02]
    })
    
    rf_pred = rf_loaded.predict(test_input)[0]
    xgb_pred = xgb_loaded.predict(test_input)[0]
    
    print(f"\nTest prediction (30 days, 97.5% moneyness, 2% rate):")
    print(f"  Random Forest IV: {rf_pred*100:.2f}%")
    print(f"  XGBoost IV: {xgb_pred*100:.2f}%")
    
    print("\n" + "="*70)
    print("✅ ALL TESTS PASSED!")
    print("="*70)
    
except Exception as e:
    print(f"\n❌ Error loading models: {e}")

VERIFYING MODEL LOADING

✅ Random Forest loaded successfully
✅ XGBoost loaded successfully

Test prediction (30 days, 97.5% moneyness, 2% rate):
  Random Forest IV: -55.32%
  XGBoost IV: -69.47%

✅ ALL TESTS PASSED!


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
