In [1]:
import numpy as np
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
import statistics
from sklearn.model_selection import KFold
import time
from utils import calculate_metric
import matplotlib.pyplot as plt
from pathlib import Path
import gpytorch
import os
import shap

# Import our Advanced GP implementation

from advanced_gaussian_process import AdvancedGPWithEnsemble

In [2]:
# Pin the NumPy and PyTorch global RNGs to one shared seed
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x11fdaf050>

In [3]:
# Configuration
# --- Input spreadsheets (features / labels for the train and test splits) ---
DATA_FOLDER = "../data"
TRAIN_FEATURES = f"{DATA_FOLDER}/train_features2.xlsx"
TRAIN_LABELS = f"{DATA_FOLDER}/train_labels2.xlsx"
TEST_FEATURES = f"{DATA_FOLDER}/test_features2.xlsx"
TEST_LABELS = f"{DATA_FOLDER}/test_labels2.xlsx"

# --- Output locations and names for this model ---
MODEL_PATH = '../output/AdvancedGP'
PROJECT_NAME = "run/AdvancedGP"
TRAINING_OUTPUT_FILE = '../output/train_predictions.xlsx'
TEST_OUTPUT_FILE = '../output/test_predictions.xlsx'
SHEET_NAME = "AdvancedGP"
OUTPUT_FILE = f"{MODEL_PATH}/AdvancedGP.pkl"
FEATURE_IMPORTANCE_PATH = f"{MODEL_PATH}/feature_importance"

# --- Compute device: CUDA when PyTorch reports it as available, otherwise CPU ---
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Make sure the model output directory (and any missing parents) exists;
# an already-existing directory is not an error.
output_dir = Path(MODEL_PATH)
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Read the four Excel workbooks in a fixed order: train features, train labels,
# test features, test labels.
train_features, train_labels, test_features, test_labels = (
    pd.read_excel(workbook_path)
    for workbook_path in (TRAIN_FEATURES, TRAIN_LABELS, TEST_FEATURES, TEST_LABELS)
)

# Preview the first rows of the training features
train_features.head()

Unnamed: 0,coupon rate,SP500 MD,Average daily 1-year SP500 return,Ratio to MA,US Corporate Bond Yield Spread,US Corporate Bond Yield Spread(3-5 year),US Corporate Bond Yield Spread(5-7 year),US Corporate Bond Yield Spread(7-10 year),US Corporate Bond Yield Spread(10+ year),US Generic Govt 3 Month Yield,...,event_type_subcategory_sum_Missing Coupon payment only,event_type_subcategory_sum_Missing Interest payment,event_type_subcategory_sum_Missing Loan payment,event_type_subcategory_sum_Missing Principal payment,event_type_subcategory_sum_Others,event_type_subcategory_sum_Pre-Negotiated Chapter 11,event_type_subcategory_sum_Protection,event_type_subcategory_sum_Receivership,event_type_subcategory_sum_Rehabilitation,event_type_subcategory_sum_Restructuring
0,7.5,-117.4602,-0.000189,125.407139,177.213028,134.012054,198.8153,191.364395,223.346344,0.1983,...,True,False,False,False,False,False,False,False,False,False
1,6.0,166.38276,0.000768,-4.603446,101.613617,77.032829,123.3998,105.932022,139.111115,0.0355,...,False,False,False,False,False,False,False,False,False,False
2,11.0,119.85752,0.000678,-11.95038,104.545959,77.416649,129.4317,111.818001,139.717407,0.0101,...,False,False,False,False,False,False,False,False,False,False
3,9.125,653.51208,0.001638,-2.494861,90.736633,64.654129,95.3731,92.141212,121.666237,0.0152,...,False,False,False,False,False,False,False,False,False,False
4,9.25,231.89472,0.000664,4.823413,98.533821,68.759308,93.4174,107.424469,139.741165,1.2865,...,False,False,False,False,True,False,False,False,False,False


In [22]:
# Build the AdvancedGPWithEnsemble regressor.
# The arguments below request the ensemble path: use_ensemble=True with n_models=100,
# each configured for num_epochs=500 at learning_rate=0.01.

# Columns stored as int64/float64 are passed to the model as its numerical features
numeric_frame = train_features.select_dtypes(include=['int64', 'float64'])
numerical_features = list(numeric_frame.columns)

# Keep every hyper-parameter in one mapping so the settings are easy to scan and tweak
model_config = dict(
    numerical_features=numerical_features,
    num_epochs=500,
    learning_rate=0.01,
    poly_degree=1,
    n_best_features=50,
    use_ensemble=True,
    using_deep_feature=False,
    using_feature_processor=False,
    n_models=100,
)
model = AdvancedGPWithEnsemble(**model_config)

# The following cells call fit/predict on `model` directly; no underlying
# single-GP object is extracted here.

In [11]:
# Show the first five rows of the training features
train_features.head(n=5)

Unnamed: 0,coupon rate,SP500 MD,Average daily 1-year SP500 return,Ratio to MA,US Corporate Bond Yield Spread,US Corporate Bond Yield Spread(3-5 year),US Corporate Bond Yield Spread(5-7 year),US Corporate Bond Yield Spread(7-10 year),US Corporate Bond Yield Spread(10+ year),US Generic Govt 3 Month Yield,...,event_type_subcategory_sum_Missing Coupon payment only,event_type_subcategory_sum_Missing Interest payment,event_type_subcategory_sum_Missing Loan payment,event_type_subcategory_sum_Missing Principal payment,event_type_subcategory_sum_Others,event_type_subcategory_sum_Pre-Negotiated Chapter 11,event_type_subcategory_sum_Protection,event_type_subcategory_sum_Receivership,event_type_subcategory_sum_Rehabilitation,event_type_subcategory_sum_Restructuring
0,7.5,-117.4602,-0.000189,125.407139,177.213028,134.012054,198.8153,191.364395,223.346344,0.1983,...,True,False,False,False,False,False,False,False,False,False
1,6.0,166.38276,0.000768,-4.603446,101.613617,77.032829,123.3998,105.932022,139.111115,0.0355,...,False,False,False,False,False,False,False,False,False,False
2,11.0,119.85752,0.000678,-11.95038,104.545959,77.416649,129.4317,111.818001,139.717407,0.0101,...,False,False,False,False,False,False,False,False,False,False
3,9.125,653.51208,0.001638,-2.494861,90.736633,64.654129,95.3731,92.141212,121.666237,0.0152,...,False,False,False,False,False,False,False,False,False,False
4,9.25,231.89472,0.000664,4.823413,98.533821,68.759308,93.4174,107.424469,139.741165,1.2865,...,False,False,False,False,True,False,False,False,False,False


In [23]:
# Train the model and report how long fitting took.
print("Training model...")

# perf_counter() is a monotonic, high-resolution clock, so the elapsed time cannot
# be skewed by system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()

# Fit on the 'rr1_30' label column
model.fit(train_features, train_labels['rr1_30'].values)

training_time = time.perf_counter() - start_time
print(f"Training completed in {training_time:.2f} seconds")

Training model...

Training model 1/100
Epoch 1/500 - Loss: 1.510
Epoch 51/500 - Loss: 1.160
Epoch 101/500 - Loss: 0.983
Epoch 151/500 - Loss: 0.882
Epoch 201/500 - Loss: 0.862
Epoch 251/500 - Loss: 0.752
Epoch 301/500 - Loss: 0.777
Epoch 351/500 - Loss: 0.581
Epoch 401/500 - Loss: 0.657
Epoch 451/500 - Loss: 0.638

Training model 2/100
Epoch 1/500 - Loss: 1.510
Epoch 51/500 - Loss: 1.157
Epoch 101/500 - Loss: 0.930
Epoch 151/500 - Loss: 0.867
Epoch 201/500 - Loss: 0.741
Epoch 251/500 - Loss: 0.675
Epoch 301/500 - Loss: 0.610
Epoch 351/500 - Loss: 0.551
Epoch 401/500 - Loss: 0.515
Epoch 451/500 - Loss: 0.639

Training model 3/100
Epoch 1/500 - Loss: 1.508
Epoch 51/500 - Loss: 1.115
Epoch 101/500 - Loss: 0.904
Epoch 151/500 - Loss: 0.904
Epoch 201/500 - Loss: 0.783
Epoch 251/500 - Loss: 0.782
Epoch 301/500 - Loss: 0.779
Epoch 351/500 - Loss: 0.686
Epoch 401/500 - Loss: 0.628
Epoch 451/500 - Loss: 0.687

Training model 4/100
Epoch 1/500 - Loss: 1.512
Epoch 51/500 - Loss: 1.157
Epoch 101/

In [25]:
# Make predictions (mean and standard deviation) for both splits
print("Making predictions...")
train_pred, train_std = model.predict(train_features, return_std=True)
test_pred, test_std = model.predict(test_features, return_std=True)

# Score against 'rr1_30', the same column that was passed to model.fit().
# Selecting the column explicitly, rather than flattening the whole label frame,
# keeps targets and predictions aligned even if a label workbook has extra columns.
train_actual = train_labels['rr1_30'].values
test_actual = test_labels['rr1_30'].values

# Calculate metrics
print("Training Metrics:")

mae, mape, rmse, rsqr = calculate_metric(train_actual, train_pred.ravel())
print(f"Train average mean absolute error: {mae}")
print(f"Train average mean absolute percentage error: {mape}")
print(f"Train average root mean squared error: {rmse}")
print(f"Train average R2: {rsqr}")

# NOTE(review): the "Val" prefix below is printed for the held-out test split.
# After this section mae/mape/rmse/rsqr hold the test-split values.
print("\nTest Metrics:")
mae, mape, rmse, rsqr = calculate_metric(test_actual, test_pred.ravel())
print(f"Val average mean absolute error: {mae}")
print(f"Val average mean absolute percentage error: {mape}")
print(f"Val average root mean squared error: {rmse}")
print(f"Val average R2: {rsqr}")

Making predictions...
Training Metrics:
Train average mean absolute error: 0.07167906134169158
Train average mean absolute percentage error: 19.009193745176045
Train average root mean squared error: 0.10661564330970903
Train average R2: 0.8514694491621588

Test Metrics:
Val average mean absolute error: 0.15039265648806513
Val average mean absolute percentage error: 39.94953426288143
Val average root mean squared error: 0.22211521208058846
Val average R2: 0.23387584643692105


In [None]:
# Save predictions
# Each DataFrame column must be one-dimensional. The metrics cell already flattens
# the predictions with .ravel(), so do the same here for predictions and standard
# deviations. Actual values are taken positionally as a plain array.
# Note: to_excel() given a file path writes a fresh workbook, replacing an existing
# file at that path.
train_df = pd.DataFrame({
    'Actual': train_labels['rr1_30'].to_numpy(),
    'Predicted': np.ravel(train_pred),
    'Std': np.ravel(train_std)
})
train_df.to_excel(TRAINING_OUTPUT_FILE, sheet_name=SHEET_NAME, index=False)

test_df = pd.DataFrame({
    'Actual': test_labels['rr1_30'].to_numpy(),
    'Predicted': np.ravel(test_pred),
    'Std': np.ravel(test_std)
})
test_df.to_excel(TEST_OUTPUT_FILE, sheet_name=SHEET_NAME, index=False)

In [None]:
# Ask the model to plot feature importances for the 20 top features,
# labelling them with the training frame's column names.
feature_names = list(train_features.columns)
importance_df = model.plot_feature_importances(
    feature_names=feature_names,
    top_k=20,
    save_path=FEATURE_IMPORTANCE_PATH,
)

# Show the first ten rows of the returned importance table
print("\nTop 10 Most Important Features:")
top_ten = importance_df[['Feature', 'Combined_Importance']].head(10)
print(top_ten)

In [None]:
# Actual vs. predicted recovery rate with +/- 2*std error bars,
# training split on the left panel and test split on the right.
plt.figure(figsize=(15, 5))

prediction_panels = (
    ('Training Set Predictions', train_labels['rr1_30'], train_pred, train_std),
    ('Test Set Predictions', test_labels['rr1_30'], test_pred, test_std),
)

for panel_no, (panel_title, actual, predicted, spread) in enumerate(prediction_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.errorbar(
        actual,
        predicted,
        yerr=2*spread,
        fmt='o',
        alpha=0.5,
        markersize=2,
        label='95% CI',
    )
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'r--', label='Perfect Prediction')
    plt.xlabel('Actual Recovery Rate')
    plt.ylabel('Predicted Recovery Rate')
    plt.title(panel_title)
    plt.legend()

plt.tight_layout()
plt.savefig(f"{MODEL_PATH}/predictions_with_uncertainty.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Histograms of the predicted standard deviations,
# training split on the left panel and test split on the right.
plt.figure(figsize=(15, 5))

uncertainty_panels = (
    ('Training Set Uncertainty Distribution', train_std),
    ('Test Set Uncertainty Distribution', test_std),
)

for panel_no, (panel_title, std_values) in enumerate(uncertainty_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.hist(std_values, bins=50, alpha=0.7)
    plt.xlabel('Standard Deviation')
    plt.ylabel('Count')
    plt.title(panel_title)

plt.tight_layout()
plt.savefig(f"{MODEL_PATH}/uncertainty_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

# Summarise each split's uncertainty with its mean and median standard deviation
train_mean_std, train_median_std = np.mean(train_std), np.median(train_std)
test_mean_std, test_median_std = np.mean(test_std), np.median(test_std)

print("\nUncertainty Statistics:")
print(f"Training Set - Mean Std: {train_mean_std:.4f}, Median Std: {train_median_std:.4f}")
print(f"Test Set - Mean Std: {test_mean_std:.4f}, Median Std: {test_median_std:.4f}")