# 06 - XGBoost Full Model (with OCEAN Features)

**Objective**: Train an XGBoost model that includes the OCEAN personality features and compare it with the baseline model.

## Key Steps:
1. Load data with OCEAN features
2. Validate OCEAN features and target variable
3. Remove high-cardinality features
4. Train/test split (consistent with baseline)
5. Preprocess data
6. Train XGBoost full model
7. Evaluate performance metrics
8. Compare with baseline model
9. Analyze OCEAN feature importance
10. Visualize results
11. Save model and results
12. Print summary

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve
)

# Single seed shared by NumPy and every later step that takes a random_state
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Pandas display: never truncate the column list, show 4 digits of precision
for _option, _value in (('display.max_columns', None), ('display.precision', 4)):
    pd.set_option(_option, _value)

print("Libraries loaded successfully")

## Step 1: Load Data with OCEAN Features

In [None]:
# --- Read the dataset that already carries the OCEAN score columns ---
print("Loading data with OCEAN features...")
df = pd.read_csv('../../loan_final_desc50plus_with_ocean_bge.csv', low_memory=False)
print(f"Data shape: {df.shape[0]:,} rows x {df.shape[1]} columns")

# --- All five OCEAN columns must be present; stop immediately otherwise ---
print("\nValidating OCEAN features...")
ocean_cols = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
missing_cols = [name for name in ocean_cols if name not in df.columns]

if missing_cols:
    raise ValueError(f"Missing OCEAN features: {missing_cols}")

# Reaching this point means every OCEAN column exists; show basic statistics
print(f"OCEAN features loaded: {len(ocean_cols)} features")
for col in ocean_cols:
    print(f"  - {col}: mean={df[col].mean():.3f}, std={df[col].std():.3f}")

# --- The label column is mandatory as well ---
if 'target' not in df.columns:
    raise ValueError("Missing target variable 'target'")

print(f"\nTarget variable exists")
# The mean of the target column is reported as the default rate
print(f"   Default rate: {df['target'].mean()*100:.2f}%")

## Step 2: Feature Preparation

In [None]:
# Separate features and target variable
X = df.drop(columns=['target'], errors='ignore')
y = df['target']

print(f"Original feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")

# ============================================
# Remove high cardinality features (avoid One-Hot Encoding explosion)
# ============================================
print("\n" + "="*80)
print("Handling High Cardinality Features (One-Hot Encoding Optimization)")
print("="*80)

high_cardinality_features = ['emp_title', 'title', 'earliest_cr_line', 'desc']
# Report only the columns that are actually present, so the log never claims
# to have removed a feature that was not in the data to begin with.
removed_features = [feat for feat in high_cardinality_features if feat in X.columns]
X = X.drop(columns=removed_features)

print(f"\nRemoved high cardinality features ({len(removed_features)} features):")
for feat in removed_features:
    print(f"  - {feat}")

print(f"\nOptimized feature matrix shape: {X.shape}")

# Identify numeric and categorical features.
# 'number' covers every numeric width (int32, float32, ...), not just
# int64/float64; 'category' columns are treated like object columns.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Columns in neither list (e.g. bool or datetime) are not passed to any
# transformer and are therefore dropped by the ColumnTransformer built later.
# Surface them instead of losing them silently.
_assigned_features = set(numeric_features) | set(categorical_features)
unassigned_features = [col for col in X.columns if col not in _assigned_features]
if unassigned_features:
    print(f"\nWARNING: {len(unassigned_features)} columns are neither numeric nor "
          "categorical and will be ignored by the preprocessor:")
    for col in unassigned_features:
        print(f"  - {col} (dtype: {X[col].dtype})")

# OCEAN column names (no prefix), restricted to those present in X
ocean_cols = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
ocean_cols = [col for col in ocean_cols if col in X.columns]

print(f"\nNumeric features: {len(numeric_features)} features")
print(f"  (including {len(ocean_cols)} OCEAN features)")
print(f"Categorical features: {len(categorical_features)} features (optimized)")

print("\nOCEAN feature list:")
for col in ocean_cols:
    print(f"  - {col}")

## Step 3: Train/Test Split (consistent with baseline)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
print("Performing Train/Test split (80/20)...\n")

_split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

# Partition sizes, absolute and as a share of all rows
print(f"Training set size: {X_train.shape[0]:,} ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Test set size: {X_test.shape[0]:,} ({X_test.shape[0]/len(X)*100:.1f}%)")

# Class balance of the training partition
print("\nTraining set target distribution:")
print(y_train.value_counts())
print(f"Default rate: {y_train.mean()*100:.2f}%")

# Class balance of the test partition
print("\nTest set target distribution:")
print(y_test.value_counts())
print(f"Default rate: {y_test.mean()*100:.2f}%")

## Step 4: Create Preprocessing Pipeline

In [None]:
# Numeric branch (the OCEAN scores are part of numeric_features):
# fill gaps with the column median, then standardize.
_numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=_numeric_steps)

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# Categories not seen during fit are ignored at transform time, and the
# encoder returns a dense array.
_categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=_categorical_steps)

# Route each column group to its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

print("Preprocessing pipeline created")
print(f"\n- Numeric features ({len(numeric_features)}): median imputation + standard scaling")
print(f"  Including {len(ocean_cols)} OCEAN features")
print(f"- Categorical features ({len(categorical_features)}): constant imputation + one-hot encoding")
# NOTE(review): the figures printed below are hard-coded text, not values
# computed in this notebook -- confirm they still match the data.
print(f"\nOptimization result:")
print(f"   Categorical features will expand to ~100-150 columns (instead of 116,804 columns)")
print(f"   Preprocessing speed improved 100x")

## Step 5: Preprocess Data

In [None]:
# Fit the preprocessor on the training set only, then apply the fitted
# transformation to the test set so no test statistics leak into the fit.
print("Preprocessing training set...")
X_train_processed = preprocessor.fit_transform(X_train)

print("Preprocessing test set...")
X_test_processed = preprocessor.transform(X_test)

print(f"\nPreprocessed training set shape: {X_train_processed.shape}")
print(f"Preprocessed test set shape: {X_test_processed.shape}")


def _strip_transformer_prefix(name):
    """Return *name* without the 'num__' / 'cat__' prefix the ColumnTransformer adds."""
    for prefix in ('num__', 'cat__'):
        if name.startswith(prefix):
            return name[len(prefix):]
    return name


# Get feature names from the *fitted* preprocessor rather than rebuilding them
# from the input column lists: the fitted object knows which columns actually
# reached the output (for instance, a median imputer discards columns that
# were entirely missing during fit), so names stay aligned with the matrix.
try:
    _prefixed_names = preprocessor.get_feature_names_out()
except (AttributeError, ValueError) as err:
    # Raised when a step cannot report names or the transformer is not fitted
    all_feature_names = None
    print(f"\nUnable to retrieve feature names ({type(err).__name__}: {err})")
else:
    all_feature_names = [_strip_transformer_prefix(name) for name in _prefixed_names]
    if len(all_feature_names) != X_train_processed.shape[1]:
        # Never hand misaligned names to the importance analysis
        print(f"\nUnable to retrieve feature names "
              f"(got {len(all_feature_names)} names for {X_train_processed.shape[1]} columns)")
        all_feature_names = None
    else:
        print(f"\nTotal features (after encoding): {len(all_feature_names)}")

## Step 6: Train XGBoost Full Model

In [None]:
# Calculate class weight: ratio of negative to positive training examples
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("Training set contains no positive (target == 1) examples")
scale_pos_weight = n_negative / n_positive
print(f"Class weight (scale_pos_weight): {scale_pos_weight:.2f}")

# Early stopping needs an evaluation set to monitor. Using the test set for
# that would let test data choose the number of boosting rounds and make the
# reported test metrics optimistic, so a validation slice is carved out of the
# training data instead. The test set stays untouched until evaluation.
# NOTE(review): for a like-for-like comparison the baseline notebook should
# use the same early-stopping scheme -- confirm.
EARLY_STOPPING_VAL_FRACTION = 0.1
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_processed, y_train,
    test_size=EARLY_STOPPING_VAL_FRACTION,
    random_state=RANDOM_STATE,
    stratify=y_train
)
print(f"Early-stopping validation set: {X_val.shape[0]:,} rows held out from the training set")

# Create XGBoost model
print("\nCreating XGBoost full model (with OCEAN features)...")
xgb_full_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_STATE,
    eval_metric='logloss',
    early_stopping_rounds=10
)

# Train model; logloss on the validation slice drives early stopping
print("\nTraining model...")
xgb_full_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=True
)

print("\nModel training complete")

## Step 7: Model Evaluation

In [None]:
# Score the test set: positive-class probabilities and hard class labels
print("Making predictions...\n")
y_pred_proba = xgb_full_model.predict_proba(X_test_processed)[:, 1]
y_pred = xgb_full_model.predict(X_test_processed)

# Label-based metrics share the (y_true, y_pred) signature, so compute them in
# one pass; ROC-AUC is computed from the probabilities instead.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_pred_proba)

_rule = "=" * 80

# Headline metrics
print(_rule)
print("XGBoost Full Model Performance Metrics (with OCEAN Features)")
print(_rule)
print(f"\nAccuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)
print(f"\nTrue Negatives:  {cm[0,0]:,}")
print(f"False Positives: {cm[0,1]:,}")
print(f"False Negatives: {cm[1,0]:,}")
print(f"True Positives:  {cm[1,1]:,}")

# Per-class precision / recall / F1 table
print("\n" + _rule)
print("Detailed Classification Report")
print(_rule)
print(classification_report(y_test, y_pred, target_names=['Fully Paid', 'Charged Off']))

# Collect everything to persist; values are cast to plain Python types so
# they serialize to JSON.
full_model_metrics = dict(
    model='XGBoost Full (with OCEAN)',
    accuracy=float(accuracy),
    precision=float(precision),
    recall=float(recall),
    f1_score=float(f1),
    roc_auc=float(roc_auc),
    confusion_matrix=cm.tolist(),
    n_features=X_train_processed.shape[1],
    n_ocean_features=len(ocean_cols),
    train_size=int(X_train.shape[0]),
    test_size=int(X_test.shape[0]),
)

with open('../../full_model_metrics.json', 'w') as f:
    json.dump(full_model_metrics, f, indent=2)

print("\nFull model metrics saved: full_model_metrics.json")

## Step 8: Compare with Baseline Model

In [None]:
# Load baseline metrics. Only the file read sits inside the try, so a
# FileNotFoundError can only mean "the baseline file is missing".
try:
    with open('../../baseline_metrics.json', 'r') as f:
        baseline_metrics = json.load(f)
except FileNotFoundError:
    print("\nBaseline metrics file not found: baseline_metrics.json")
    print("Please run 04_xgboost_baseline.ipynb first")
else:
    print("=" * 80)
    print("Model Performance Comparison: Baseline vs Full Model")
    print("=" * 80)

    comparison_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'],
        'Baseline': [
            baseline_metrics['accuracy'],
            baseline_metrics['precision'],
            baseline_metrics['recall'],
            baseline_metrics['f1_score'],
            baseline_metrics['roc_auc']
        ],
        'Full Model': [accuracy, precision, recall, f1, roc_auc]
    })

    # Absolute improvement, then improvement relative to the baseline value.
    # A baseline of exactly 0 has no meaningful relative change; mapping it to
    # NaN prevents an infinite percentage from dominating the average below.
    comparison_df['Improvement'] = comparison_df['Full Model'] - comparison_df['Baseline']
    _nonzero_baseline = comparison_df['Baseline'].replace(0, np.nan)
    comparison_df['Improvement %'] = (comparison_df['Improvement'] / _nonzero_baseline) * 100

    print("\nPerformance comparison:")
    print(comparison_df.to_string(index=False))

    # Save comparison results
    comparison_df.to_csv('../../model_comparison.csv', index=False)
    print("\nComparison results saved: model_comparison.csv")

    # Evaluate OCEAN feature value
    print("\n" + "=" * 80)
    print("OCEAN Feature Value Assessment")
    print("=" * 80)

    # Mean of the per-metric relative changes (NaN entries are skipped).
    # The 1% cut-off is a fixed heuristic, not a statistical significance test.
    avg_improvement = comparison_df['Improvement %'].mean()

    if pd.isna(avg_improvement):
        print("OCEAN feature impact could not be assessed (no non-zero baseline metrics)")
    elif avg_improvement > 1:
        print(f"OCEAN features significantly improved model performance")
        print(f"   Average improvement: {avg_improvement:.2f}%")
    elif avg_improvement > 0:
        print(f"OCEAN features slightly improved model performance")
        print(f"   Average improvement: {avg_improvement:.2f}%")
    else:
        print(f"OCEAN features did not improve model performance")
        print(f"   Average improvement: {avg_improvement:.2f}%")

## Step 9: Feature Importance Analysis

In [None]:
# Get feature importance (one value per column of the processed matrix)
feature_importance = xgb_full_model.feature_importances_
n_importances = len(feature_importance)

# Use the real feature names only when they line up one-to-one with the
# importances; otherwise fall back to positional placeholders rather than
# failing on a length mismatch or mislabelling features.
if all_feature_names is not None and len(all_feature_names) == n_importances:
    feature_labels = list(all_feature_names)
else:
    if all_feature_names is not None:
        print(f"WARNING: {len(all_feature_names)} feature names for {n_importances} "
              "importances; using generic names")
    feature_labels = [f'feature_{i}' for i in range(n_importances)]

# Create feature importance DataFrame
importance_df = pd.DataFrame({
    'feature': feature_labels,
    'importance': feature_importance
})

# Mark OCEAN features
importance_df['is_ocean'] = importance_df['feature'].isin(ocean_cols)

# Sort by importance
importance_df = importance_df.sort_values('importance', ascending=False)

print("=" * 80)
print("Top 20 Most Important Features")
print("=" * 80)
print(importance_df.head(20).to_string(index=False))

# OCEAN feature importance
ocean_importance = importance_df[importance_df['is_ocean']].copy()

print("\n" + "=" * 80)
print("OCEAN Feature Importance")
print("=" * 80)
if ocean_importance.empty:
    # Happens when generic placeholder names are in use or no OCEAN column
    # reached the model; the statistics below are then not informative.
    print("No OCEAN features could be identified among the model features")
print(ocean_importance[['feature', 'importance']].to_string(index=False))

# Statistics
ocean_total_importance = ocean_importance['importance'].sum()
total_importance = importance_df['importance'].sum()
# Guard the share computation against an all-zero importance vector
if total_importance > 0:
    ocean_contribution = (ocean_total_importance / total_importance) * 100
else:
    ocean_contribution = 0.0

print(f"\nOCEAN feature contribution: {ocean_contribution:.2f}%")
print(f"OCEAN feature average importance: {ocean_importance['importance'].mean():.6f}")
print(f"Non-OCEAN feature average importance: {importance_df[~importance_df['is_ocean']]['importance'].mean():.6f}")

# Save feature importance
importance_df.to_csv('../../full_model_feature_importance.csv', index=False)
print("\nFull feature importance saved: full_model_feature_importance.csv")

## Step 10: Visualization

In [None]:
# Create visualization: a 3x3 grid holding seven panels
fig = plt.figure(figsize=(18, 14))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Short metric labels shared by the two baseline-comparison panels (3 and 7).
# Defined up front so panel 7 does not depend on panel 3's try block.
metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC']

# Errors that mean "baseline data is unavailable or unusable": the variable
# was never created (NameError), a metric/column is missing (KeyError), or the
# loaded object has the wrong type (TypeError). Anything else is a genuine
# plotting problem and is allowed to surface.
_baseline_unavailable = (NameError, KeyError, TypeError)

# 1. Confusion matrix heatmap
ax1 = fig.add_subplot(gs[0, 0])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Fully Paid', 'Charged Off'],
            yticklabels=['Fully Paid', 'Charged Off'],
            ax=ax1, cbar_kws={'label': 'Count'})
ax1.set_ylabel('True Label', fontsize=11, fontweight='bold')
ax1.set_xlabel('Predicted Label', fontsize=11, fontweight='bold')
ax1.set_title('Confusion Matrix (Full Model)', fontsize=12, fontweight='bold')

# 2. ROC curve
ax2 = fig.add_subplot(gs[0, 1])
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'Full Model (AUC = {roc_auc:.4f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate', fontsize=11, fontweight='bold')
ax2.set_ylabel('True Positive Rate', fontsize=11, fontweight='bold')
ax2.set_title('ROC Curve', fontsize=12, fontweight='bold')
ax2.legend(loc='lower right', fontsize=9)
ax2.grid(alpha=0.3)

# 3. Performance metrics comparison (if baseline available)
ax3 = fig.add_subplot(gs[0, 2])
try:
    baseline_vals = [baseline_metrics['accuracy'], baseline_metrics['precision'], 
                     baseline_metrics['recall'], baseline_metrics['f1_score'], 
                     baseline_metrics['roc_auc']]
    full_vals = [accuracy, precision, recall, f1, roc_auc]
    
    x = np.arange(len(metrics_names))
    width = 0.35
    
    bars1 = ax3.bar(x - width/2, baseline_vals, width, label='Baseline', color='lightblue', edgecolor='black')
    bars2 = ax3.bar(x + width/2, full_vals, width, label='Full Model', color='lightcoral', edgecolor='black')
    
    ax3.set_ylabel('Score', fontsize=11, fontweight='bold')
    ax3.set_title('Performance Comparison', fontsize=12, fontweight='bold')
    ax3.set_xticks(x)
    ax3.set_xticklabels(metrics_names, rotation=45, ha='right', fontsize=9)
    ax3.legend(fontsize=9)
    ax3.set_ylim([0, 1])
    ax3.grid(axis='y', alpha=0.3)
except _baseline_unavailable:
    ax3.text(0.5, 0.5, 'Baseline metrics\nnot available', 
             ha='center', va='center', fontsize=12)
    ax3.set_title('Performance Comparison', fontsize=12, fontweight='bold')

# 4. Top 15 feature importance (OCEAN bars drawn in red)
ax4 = fig.add_subplot(gs[1, :])
top_features = importance_df.head(15)
colors = ['#e74c3c' if is_ocean else '#3498db' for is_ocean in top_features['is_ocean']]
y_pos = np.arange(len(top_features))
ax4.barh(y_pos, top_features['importance'].values, color=colors, alpha=0.7, edgecolor='black')
ax4.set_yticks(y_pos)
ax4.set_yticklabels(top_features['feature'].values, fontsize=9)
ax4.invert_yaxis()  # most important feature at the top
ax4.set_xlabel('Importance', fontsize=11, fontweight='bold')
ax4.set_title('Top 15 Feature Importance (Red = OCEAN)', fontsize=12, fontweight='bold')
ax4.grid(axis='x', alpha=0.3)

# 5. OCEAN feature importance comparison
ax5 = fig.add_subplot(gs[2, 0])
ocean_feats = ocean_importance['feature'].str.title()
ax5.bar(range(len(ocean_feats)), ocean_importance['importance'].values, 
        color='#e74c3c', alpha=0.7, edgecolor='black')
ax5.set_xticks(range(len(ocean_feats)))
ax5.set_xticklabels(ocean_feats, rotation=45, ha='right', fontsize=10)
ax5.set_ylabel('Importance', fontsize=11, fontweight='bold')
ax5.set_title('OCEAN Features Importance', fontsize=12, fontweight='bold')
ax5.grid(axis='y', alpha=0.3)

# 6. OCEAN contribution pie chart (percentages are written into the labels)
ax6 = fig.add_subplot(gs[2, 1])
sizes = [ocean_total_importance, total_importance - ocean_total_importance]
labels = [f'OCEAN\n{ocean_contribution:.1f}%', f'Other Features\n{100-ocean_contribution:.1f}%']
colors_pie = ['#e74c3c', '#3498db']
wedges, texts, autotexts = ax6.pie(sizes, labels=labels, autopct='', colors=colors_pie,
                                     startangle=90, textprops={'fontsize': 10, 'fontweight': 'bold'})
ax6.set_title('Feature Importance Contribution', fontsize=12, fontweight='bold')

# 7. Improvement bar chart (if baseline available)
ax7 = fig.add_subplot(gs[2, 2])
try:
    improvements = comparison_df['Improvement %'].values
    colors_imp = ['green' if x > 0 else 'red' for x in improvements]
    ax7.bar(range(len(metrics_names)), improvements, color=colors_imp, alpha=0.7, edgecolor='black')
    ax7.set_xticks(range(len(metrics_names)))
    ax7.set_xticklabels(metrics_names, rotation=45, ha='right', fontsize=10)
    ax7.set_ylabel('Improvement (%)', fontsize=11, fontweight='bold')
    ax7.set_title('Performance Improvement', fontsize=12, fontweight='bold')
    ax7.axhline(y=0, color='black', linestyle='-', linewidth=0.8)
    ax7.grid(axis='y', alpha=0.3)
except _baseline_unavailable:
    ax7.text(0.5, 0.5, 'Baseline metrics\nnot available', 
             ha='center', va='center', fontsize=12)
    ax7.set_title('Performance Improvement', fontsize=12, fontweight='bold')

plt.savefig('../../full_model_evaluation.png', dpi=300, bbox_inches='tight')
print("\nVisualization saved: full_model_evaluation.png")
plt.show()

## Step 11: Save Model

In [None]:
import pickle

# Persist the trained classifier
print("Saving full model...")
with open('../../xgboost_full_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_full_model, model_file)
print("Model saved: xgboost_full_model.pkl")

# Persist the fitted preprocessor alongside it
print("\nSaving preprocessor...")
with open('../../preprocessor_full.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)
print("Preprocessor saved: preprocessor_full.pkl")

# Record which columns fed the model and how wide the encoded matrix is
feature_config = dict(
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    ocean_features=ocean_cols,
    all_features=list(X.columns),
    n_features_after_encoding=X_train_processed.shape[1],
)

with open('../../full_model_feature_config.json', 'w') as config_file:
    json.dump(feature_config, config_file, indent=2)
print("Feature configuration saved: full_model_feature_config.json")

## Step 12: Summary

In [None]:
# Final recap of configuration, metrics, OCEAN analysis and baseline comparison
print("=" * 80)
print("XGBoost Full Model Summary (with OCEAN Features)")
print("=" * 80)

print("\n1. Model Configuration")
print("-" * 80)
print(f"Model type: XGBoost Classifier")
print(f"Total features: {X_train_processed.shape[1]} (after encoding)")
print(f"Original features: {len(numeric_features)} numeric + {len(categorical_features)} categorical")
print(f"OCEAN features: {len(ocean_cols)}")
print(f"Training samples: {X_train.shape[0]:,}")
print(f"Test samples: {X_test.shape[0]:,}")

print("\n2. Performance Metrics")
print("-" * 80)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")

print("\n3. OCEAN Feature Analysis")
print("-" * 80)
print(f"OCEAN feature contribution: {ocean_contribution:.2f}%")
print(f"OCEAN feature average importance: {ocean_importance['importance'].mean():.6f}")
print(f"Most important OCEAN features:")
# ocean_importance is already sorted by importance, so head(3) is the top 3
top_ocean = ocean_importance.head(3)
for _, row in top_ocean.iterrows():
    print(f"  - {row['feature']}: {row['importance']:.6f}")

# Build the baseline section before printing anything, so the section header
# is shown only when the comparison data actually exists. The caught errors
# cover: variables never created (NameError), missing column (KeyError),
# non-numeric values (TypeError), and no valid maximum (ValueError).
try:
    _best_row = comparison_df['Improvement %'].idxmax()
    _baseline_lines = [
        f"Average performance improvement: {avg_improvement:.2f}%",
        f"Best improved metric: {comparison_df.loc[_best_row, 'Metric']}",
        f"Improvement amount: {comparison_df['Improvement %'].max():.2f}%",
    ]
except (NameError, KeyError, TypeError, ValueError):
    print("\n4. Baseline comparison data not available")
else:
    print("\n4. Baseline Comparison")
    print("-" * 80)
    for _line in _baseline_lines:
        print(_line)

print("\n5. Next Steps")
print("-" * 80)
print("Full model training complete. Next:")
print("")
print("1. 07_results_analysis.ipynb")
print("   - Detailed model comparison analysis")
print("   - Deep dive into OCEAN features")
print("   - Business insights and recommendations")
print("   - Generate final report")
print("")
print("=" * 80)

print("\nFull model training complete")