# 07 - XGBoost Comprehensive Comparison: Baseline vs OCEAN Methods

**Purpose**: Compare XGBoost performance with different OCEAN feature extraction methods

## Models to Compare:

1. **Baseline**: XGBoost without OCEAN features (36 base features)
2. **Ridge-Weighted OCEAN**: Ridge regression mapping from the base features to OCEAN scores (R² = 0.15–0.20)
3. **BGE + ElasticNet OCEAN**: BGE embeddings + ElasticNet regression (R² = 0.127)

## Evaluation Metrics:

- **Primary**: ROC-AUC (main metric for loan default prediction)
- **Secondary**: Precision, Recall, F1-Score
- **Feature Analysis**: OCEAN feature importance (XGBoost gain, SHAP values)
- **Statistical**: McNemar's test, Bootstrap confidence intervals

## Expected Outcomes:

Based on the methodology:
- BGE ElasticNet R² of 0.127 → expected AUC improvement of +0.010 to +0.025
- If the AUC improvement is < 0.01: OCEAN features are not useful for loan default prediction
- If the AUC improvement is > 0.02: BGE ElasticNet OCEAN is recommended for production

**Estimated Time**: 30-60 minutes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve
)

# Statistical tests
from statsmodels.stats.contingency_tables import mcnemar
from scipy import stats

# Fix the global NumPy seed so repeated runs of the notebook are reproducible.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Display preferences: show every dataframe column, four decimals, dark-grid plots.
pd.options.display.max_columns = None
pd.options.display.precision = 4
plt.style.use('seaborn-v0_8-darkgrid')

print("Libraries loaded successfully", f"Timestamp: {datetime.now()}", sep="\n")

## Step 1: Configuration

In [None]:
# Notebook-wide settings: file locations, OCEAN column naming,
# XGBoost hyper-parameters and the train/test split definition.
CONFIG = dict(
    # Input tables
    baseline_data='../data/loan_clean_for_modeling.csv',
    bge_ocean_data='../loan_with_bge_elasticnet_ocean.csv',

    # Results file of the previous Ridge-Weighted OCEAN run
    ridge_results='../results/xgboost_comparison_results.json',

    # Output locations
    output_comparison='../xgboost_comprehensive_comparison.csv',
    output_visualization='../xgboost_comprehensive_evaluation.png',
    output_report='../xgboost_comparison_report.json',

    # OCEAN dimensions and the column prefix of the BGE ElasticNet scores
    ocean_dims=['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism'],
    bge_ocean_prefix='bge_elasticnet_',

    # Hyper-parameters handed to the XGBoost classifier
    xgboost_params=dict(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        eval_metric='logloss',
        early_stopping_rounds=10,
    ),

    # Train/test split
    test_size=0.2,
    random_state=RANDOM_STATE,

    # High-cardinality columns excluded from modelling
    remove_features=['emp_title', 'title', 'earliest_cr_line', 'desc'],
)

# Print the scalar settings; the list/dict entries are skipped here and the
# XGBoost parameters get their own line below.
summary_skip = ('xgboost_params', 'ocean_dims', 'remove_features')
print("Configuration:")
for key, value in CONFIG.items():
    if key in summary_skip:
        continue
    print(f"  {key}: {value}")

print(f"\nXGBoost parameters: {CONFIG['xgboost_params']}")

## Step 2: Load Data and Prepare Features

We need to align the two datasets:
1. Baseline data (all samples)
2. BGE OCEAN data (the subset of samples whose `desc` text is at least 50 characters long)

Strategy: restrict the comparison to the samples that have BGE OCEAN features.

In [None]:
# Read the baseline table and report its size.
print("Loading baseline data...")
df_baseline = pd.read_csv(CONFIG['baseline_data'], low_memory=False)
baseline_rows, baseline_cols = df_baseline.shape
print(f"  Rows: {baseline_rows:,}")
print(f"  Columns: {baseline_cols}")

# Read the table that carries the BGE + ElasticNet OCEAN scores.
print("\nLoading BGE + ElasticNet OCEAN data...")
df_bge_ocean = pd.read_csv(CONFIG['bge_ocean_data'], low_memory=False)
bge_rows, bge_cols = df_bge_ocean.shape
print(f"  Rows: {bge_rows:,}")
print(f"  Columns: {bge_cols}")

# Expected OCEAN column names are "<prefix><dimension>"; fail fast if any is absent.
bge_ocean_cols = [CONFIG['bge_ocean_prefix'] + dim for dim in CONFIG['ocean_dims']]
available_columns = df_bge_ocean.columns
missing_cols = [name for name in bge_ocean_cols if name not in available_columns]

if missing_cols:
    raise ValueError(f"Missing BGE OCEAN columns: {missing_cols}")

print(f"\nBGE OCEAN columns found: {bge_ocean_cols}")

# The label column is required as well.
if 'target' not in df_bge_ocean.columns:
    raise ValueError("Missing target column!")

# Class balance of the label in the BGE table.
bge_target = df_bge_ocean['target']
print("\nTarget distribution (BGE data):")
print(bge_target.value_counts())
print(f"Default rate: {bge_target.mean()*100:.2f}%")

## Step 3: Prepare Three Feature Sets

1. **Baseline Features**: Original features only (no OCEAN)
2. **Ridge OCEAN Features**: Original features + Ridge-Weighted OCEAN (if available in data)
3. **BGE OCEAN Features**: Original features + BGE ElasticNet OCEAN

In [None]:
# Work on a copy of the BGE table, which is used as the base for all feature sets.
df = df_bge_ocean.copy()

# Label vector, and a feature frame without the label or the
# high-cardinality columns listed in the configuration.
y = df['target']
X_all = df.drop(columns=['target'], errors='ignore').drop(
    columns=CONFIG['remove_features'], errors='ignore'
)

print(f"Full feature set shape: {X_all.shape}")

# --- 1. Baseline: drop every column whose name mentions an OCEAN dimension ---
ocean_related_cols = []
for col in X_all.columns:
    lowered = col.lower()
    if any(dim in lowered for dim in CONFIG['ocean_dims']):
        ocean_related_cols.append(col)
X_baseline = X_all.drop(columns=ocean_related_cols, errors='ignore')

print(f"\n1. Baseline features: {X_baseline.shape[1]} features")
print(f"   Removed {len(ocean_related_cols)} OCEAN-related columns")

# --- 2. Ridge OCEAN: baseline plus un-prefixed OCEAN columns, when present ---
ridge_ocean_cols = [col for col in X_all.columns if col in CONFIG['ocean_dims']]
has_ridge = bool(ridge_ocean_cols)

if has_ridge:
    X_ridge = X_baseline.assign(**{col: X_all[col] for col in ridge_ocean_cols})
    print(f"\n2. Ridge OCEAN features: {X_ridge.shape[1]} features")
    print(f"   Added {len(ridge_ocean_cols)} Ridge OCEAN columns: {ridge_ocean_cols}")
else:
    X_ridge = None
    print("\n2. Ridge OCEAN features: NOT FOUND in dataset")
    print(f"   Will use results from previous run: {CONFIG['ridge_results']}")

# --- 3. BGE OCEAN: baseline plus the prefixed BGE ElasticNet columns ---
X_bge = X_baseline.assign(**{col: X_all[col] for col in bge_ocean_cols})

print(f"\n3. BGE ElasticNet OCEAN features: {X_bge.shape[1]} features")
print(f"   Added {len(bge_ocean_cols)} BGE OCEAN columns: {bge_ocean_cols}")

# Sanity check: every feature set must have as many rows as the target.
print("\nSample counts:")
print(f"  Baseline: {len(X_baseline)}")
if has_ridge:
    print(f"  Ridge OCEAN: {len(X_ridge)}")
print(f"  BGE OCEAN: {len(X_bge)}")
print(f"  Target: {len(y)}")

## Step 4: Train/Test Split (Consistent Across All Models)

In [None]:
# One stratified split of row positions, reused for every feature set so that
# all models are trained and scored on exactly the same samples.
print("Performing train/test split (80/20)...\n")

indices = np.arange(len(y))
train_idx, test_idx = train_test_split(
    indices,
    test_size=CONFIG['test_size'],
    random_state=CONFIG['random_state'],
    stratify=y
)

# Apply the same positional indices to each feature frame and to the target.
X_baseline_train, X_baseline_test = X_baseline.iloc[train_idx], X_baseline.iloc[test_idx]

if has_ridge:
    X_ridge_train, X_ridge_test = X_ridge.iloc[train_idx], X_ridge.iloc[test_idx]

X_bge_train, X_bge_test = X_bge.iloc[train_idx], X_bge.iloc[test_idx]

y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Report partition sizes, default rates and class counts.
total_samples = len(y)
for label, part in (("Training set", y_train), ("Test set", y_test)):
    print(f"{label}: {len(part):,} samples ({len(part)/total_samples*100:.1f}%)")

print(f"\nTrain default rate: {y_train.mean()*100:.2f}%")
print(f"Test default rate: {y_test.mean()*100:.2f}%")

for label, part in (("Train", y_train), ("Test", y_test)):
    print(f"\n{label} class distribution:")
    print(part.value_counts())

## Step 5: Create Preprocessing Pipelines

In [None]:
def create_preprocessor(X):
    """
    Build an unfitted preprocessing transformer for the given feature set.

    Numeric columns are median-imputed and standardised. Categorical columns
    are imputed with the constant 'missing' and one-hot encoded; categories
    unseen at fit time are ignored at transform time.

    Args:
        X: Feature dataframe; its column dtypes decide how columns are routed.

    Returns:
        tuple: (preprocessor, numeric_features, categorical_features), where
        preprocessor is the unfitted ColumnTransformer and the two lists hold
        the column names sent to the numeric and categorical branches.
    """
    # Select every numeric dtype (not only int64/float64) so that columns such
    # as int32 or float32 are not silently left out of the model.
    numeric_features = X.select_dtypes(include='number').columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Columns routed to neither branch are discarded by the ColumnTransformer
    # (its default remainder is 'drop'). Report them with print() because the
    # notebook globally silences warnings.
    routed = set(numeric_features) | set(categorical_features)
    unrouted = [col for col in X.columns if col not in routed]
    if unrouted:
        print(f"  Note: {len(unrouted)} column(s) with unsupported dtype will be dropped: {unrouted}")

    # Numeric preprocessing
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical preprocessing
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combined preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    return preprocessor, numeric_features, categorical_features


# Build one unfitted preprocessor per feature set and report how many
# columns each one routes to the numeric and categorical branches.
print("Creating preprocessing pipelines...\n")

# Baseline
preprocessor_baseline, num_feats_baseline, cat_feats_baseline = create_preprocessor(X_baseline_train)
print(
    "Baseline preprocessor:\n"
    f"  Numeric features: {len(num_feats_baseline)}\n"
    f"  Categorical features: {len(cat_feats_baseline)}"
)

# Ridge OCEAN (only when those columns were found in the data)
if has_ridge:
    preprocessor_ridge, num_feats_ridge, cat_feats_ridge = create_preprocessor(X_ridge_train)
    print(
        "\nRidge OCEAN preprocessor:\n"
        f"  Numeric features: {len(num_feats_ridge)}\n"
        f"  Categorical features: {len(cat_feats_ridge)}"
    )

# BGE OCEAN
preprocessor_bge, num_feats_bge, cat_feats_bge = create_preprocessor(X_bge_train)
print(
    "\nBGE OCEAN preprocessor:\n"
    f"  Numeric features: {len(num_feats_bge)}\n"
    f"  Categorical features: {len(cat_feats_bge)}"
)

print("\nPreprocessing pipelines created!")

## Step 6: Preprocess Data

In [None]:
# Fit each preprocessor on its training partition only, then apply the fitted
# transformer to the matching test partition.
print("Preprocessing data...\n")

# Baseline
print("Preprocessing Baseline...")
X_baseline_train_processed = preprocessor_baseline.fit_transform(X_baseline_train)
X_baseline_test_processed = preprocessor_baseline.transform(X_baseline_test)
print(
    f"  Train shape: {X_baseline_train_processed.shape}\n"
    f"  Test shape: {X_baseline_test_processed.shape}"
)

# Ridge OCEAN (only when those columns were found in the data)
if has_ridge:
    print("\nPreprocessing Ridge OCEAN...")
    X_ridge_train_processed = preprocessor_ridge.fit_transform(X_ridge_train)
    X_ridge_test_processed = preprocessor_ridge.transform(X_ridge_test)
    print(
        f"  Train shape: {X_ridge_train_processed.shape}\n"
        f"  Test shape: {X_ridge_test_processed.shape}"
    )

# BGE OCEAN
print("\nPreprocessing BGE OCEAN...")
X_bge_train_processed = preprocessor_bge.fit_transform(X_bge_train)
X_bge_test_processed = preprocessor_bge.transform(X_bge_test)
print(
    f"  Train shape: {X_bge_train_processed.shape}\n"
    f"  Test shape: {X_bge_test_processed.shape}"
)

print("\nData preprocessing complete!")

## Step 7: Train XGBoost Models

In [None]:
# Class weight for the imbalanced target: ratio of negative to positive
# training samples, passed to XGBoost as scale_pos_weight.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    # Warnings are silenced globally, so a division by zero would otherwise
    # pass unnoticed and hand an infinite weight to XGBoost.
    raise ValueError("Training set contains no positive samples; cannot compute scale_pos_weight")
scale_pos_weight = n_negative / n_positive
print(f"Class weight (scale_pos_weight): {scale_pos_weight:.2f}\n")

# Copy the configured XGBoost parameters (CONFIG stays untouched) and add the class weight
xgb_params = {**CONFIG['xgboost_params'], 'scale_pos_weight': scale_pos_weight}

# Storage for models and results
models = {}
results = {}

def train_and_evaluate(name, X_train, X_test, y_train, y_test, params, val_size=0.1):
    """
    Train an XGBoost classifier and evaluate it on the test set.

    When early stopping is configured, a stratified validation split is held
    out from the training data and used as the early-stopping eval set. The
    test set is never shown to the model during fitting, so the reported test
    metrics are not biased by the choice of the stopping round.

    Args:
        name: Label printed in the progress output.
        X_train: Preprocessed training features.
        X_test: Preprocessed test features.
        y_train: Training labels.
        y_test: Test labels.
        params: Keyword arguments for xgb.XGBClassifier.
        val_size: Fraction of the training data held out for early stopping;
            only used when params contains a truthy 'early_stopping_rounds'.

    Returns:
        dict: Keys 'model', 'y_pred', 'y_pred_proba', 'metrics' (accuracy,
        precision, recall, f1, auc) and 'confusion_matrix'.
    """
    print(f"Training {name}...")

    # Create model
    model = xgb.XGBClassifier(**params)

    # Train
    if params.get('early_stopping_rounds'):
        # Early stopping selects the number of trees from eval_set, so it must
        # be a validation split taken from the training data, not the test set.
        # With a fixed random_state and identical y_train, every model compared
        # in the notebook gets the same validation rows.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train,
            test_size=val_size,
            random_state=params.get('random_state'),
            stratify=y_train
        )
        model.fit(
            X_fit, y_fit,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
    else:
        # No early stopping: nothing needs monitoring, train on everything.
        model.fit(X_train, y_train, verbose=False)

    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_pred_proba)
    cm = confusion_matrix(y_test, y_pred)

    print(f"  AUC: {auc:.4f}")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1: {f1:.4f}")

    return {
        'model': model,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'metrics': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc
        },
        'confusion_matrix': cm
    }


# Train one XGBoost model per feature set with identical parameters and
# collect the outcomes in `results` under 'baseline', 'ridge' and 'bge'.
banner = "=" * 80
print(banner)
print("Training XGBoost Models")
print(banner + "\n")

# Train Baseline
results['baseline'] = train_and_evaluate(
    'Baseline (no OCEAN)',
    X_baseline_train_processed,
    X_baseline_test_processed,
    y_train,
    y_test,
    xgb_params
)

# Train Ridge (if available)
if has_ridge:
    print()
    results['ridge'] = train_and_evaluate(
        'Ridge-Weighted OCEAN',
        X_ridge_train_processed,
        X_ridge_test_processed,
        y_train,
        y_test,
        xgb_params
    )
else:
    # Ridge columns are absent from the data: fall back to the metrics stored
    # by a previous run. No model or predictions exist in that case.
    # NOTE(review): the stored metrics presumably come from a different
    # sample/split than this notebook uses — confirm before comparing them.
    print("\nLoading previous Ridge OCEAN results...")
    try:
        with open(CONFIG['ridge_results'], 'r', encoding='utf-8') as f:
            ridge_data = json.load(f)

        results['ridge'] = {
            'model': None,
            'y_pred': None,
            'y_pred_proba': None,
            'metrics': {
                'accuracy': ridge_data['ocean']['test_acc'],
                'precision': ridge_data['ocean']['precision'],
                'recall': ridge_data['ocean']['recall'],
                'f1': ridge_data['ocean']['f1'],
                'auc': ridge_data['ocean']['auc']
            },
            'confusion_matrix': None
        }

        print(f"  AUC (from previous run): {results['ridge']['metrics']['auc']:.4f}")
        print(f"  Note: Using metrics from previous experiment")

    except (OSError, ValueError, KeyError, TypeError) as e:
        # Only the expected failures are tolerated: missing/unreadable file,
        # malformed JSON (JSONDecodeError is a ValueError) or an unexpected
        # schema. Anything else is a real bug and should surface.
        print(f"  Warning: Could not load Ridge results: {e}")
        results['ridge'] = None

# Train BGE
print()
results['bge'] = train_and_evaluate(
    'BGE + ElasticNet OCEAN',
    X_bge_train_processed,
    X_bge_test_processed,
    y_train,
    y_test,
    xgb_params
)

print("\n" + banner)
print("Model Training Complete")
print(banner)