# Big Five (OCEAN) Personality Features Integration

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Import project modules
from utils.seed import set_seed, get_seed
from utils.metrics import compute_all_metrics, delong_test, bootstrap_ci, compute_lift
from utils.io import load_lending_club_data, prepare_binary_target
from text_features.personality import OceanScorer, OCEAN_DIMS

# Fix the random seed through the project helper so reruns are comparable
set_seed(42)

# Default look for every figure produced below: grid background, 10x6 canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Data Loading & Text Coverage Analysis

In [None]:
# Fetch the Lending Club dataset through kagglehub and point at the loan file
path = kagglehub.dataset_download("ethon0426/lending-club-20072020q1")
file_path = f"{path}/Loan_status_2007-2020Q3.gzip"

# Row cap for quick iteration (10k to start; raise as needed)
ROW_LIMIT = 10000

# Read the raw loans, then let the project helper derive the binary target
# from the loan_status column
df = prepare_binary_target(
    load_lending_club_data(file_path, row_limit=ROW_LIMIT),
    target_col="loan_status",
)

first_columns = df.columns[:10].tolist()
print(f"\nDataset shape: {df.shape}")
print(f"Columns: {first_columns}...")

In [None]:
# Check which of the candidate text fields exist in the loaded frame
text_fields = ['desc', 'title', 'purpose', 'emp_title']
available_fields = [f for f in text_fields if f in df.columns]

print(f"Available text fields: {available_fields}\n")

# Coverage analysis: non-null count, coverage percentage and mean character
# length for each available text field
total_rows = len(df)
coverage_stats = {}
for field in available_fields:
    present = df[field].dropna()
    non_null = len(present)
    # Guard the percentage against an empty frame
    coverage = non_null / total_rows * 100 if total_rows else 0.0
    # Cast to str before using .str: a column that is entirely null is
    # typically loaded as float64, and the .str accessor raises
    # AttributeError on non-string dtypes. An empty selection yields NaN.
    avg_len = present.astype(str).str.len().mean()
    coverage_stats[field] = {
        'non_null': non_null,
        'coverage_pct': coverage,
        'avg_length': avg_len
    }
    print(f"{field}: {non_null}/{total_rows} ({coverage:.1f}%) | Avg length: {avg_len:.1f} chars")

# One row per field; the bare expression renders the table in the notebook
coverage_df = pd.DataFrame(coverage_stats).T
coverage_df

In [None]:
# Build the text inputs for OCEAN scoring from the loan title and the
# employment title: missing values become empty strings, then whitespace
# is trimmed
for raw_col in ('title', 'emp_title'):
    df[f'{raw_col}_clean'] = df[raw_col].fillna('').astype(str).str.strip()

# Combined string per loan; its character length is summarized below
merged_text = df['title_clean'] + ' | ' + df['emp_title_clean']
df['text_merged'] = merged_text
df['text_length'] = merged_text.str.len()

print("Text length statistics:")
print(df['text_length'].describe())

In [None]:
# Visualize text length distribution
import os

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of the merged-text length
axes[0].hist(df['text_length'], bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Text Length (characters)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Text Length')

# Right panel: box plot by loan grade (only when the column is present)
if 'grade' in df.columns:
    grade_order = sorted(df['grade'].dropna().unique())
    sns.boxplot(data=df, x='grade', y='text_length', order=grade_order, ax=axes[1])
    axes[1].set_xlabel('Loan Grade')
    axes[1].set_ylabel('Text Length')
    axes[1].set_title('Text Length by Loan Grade')

plt.tight_layout()
# savefig does not create missing parent directories, and this is the first
# cell that writes into the results folder, so make sure it exists
os.makedirs('../artifacts/results', exist_ok=True)
plt.savefig('../artifacts/results/text_coverage_analysis.png', dpi=150)
plt.show()

## 2. OCEAN Personality Scoring

We'll start in **offline mode** (deterministic fallback) to build the pipeline, then optionally enable API mode.

In [None]:
# Initialize OCEAN scorer
# Flip OFFLINE_MODE to False and provide OPENAI_API_KEY to use real LLM scoring
OFFLINE_MODE = True

scorer = OceanScorer(
    cache_dir="../artifacts/persona_cache",
    offline_mode=OFFLINE_MODE,
    max_chars=800
)

# Report the mode that was actually configured rather than a fixed label,
# so the message stays truthful once API mode is enabled
print(f"OCEAN Scorer initialized ({'offline mode' if OFFLINE_MODE else 'API mode'})")
print(f"Dimensions: {OCEAN_DIMS}")

In [None]:
# Smoke-test the scorer on the first five rows
print("Testing OCEAN scorer on sample data:\n")

sample_df = df.head(5)[['title_clean', 'emp_title_clean']]

sample_rows = zip(sample_df.index, sample_df['title_clean'], sample_df['emp_title_clean'])
for idx, title_text, emp_text in sample_rows:
    scores = scorer.score(title_text, emp_text)
    # Text is truncated to 50 characters for display only; the scorer
    # receives the full strings
    print(
        f"Sample {idx}:",
        f"  Title: {title_text[:50]}",
        f"  Emp: {emp_text[:50]}",
        f"  Scores: {scores}",
        "",
        sep="\n",
    )

In [None]:
# Score every row of the dataset in one batch call
print(f"Scoring {len(df)} samples...\n")

titles, emp_titles = (
    df[col].tolist() for col in ('title_clean', 'emp_title_clean')
)

ocean_scores = scorer.score_batch(titles, emp_titles, rate_limit_delay=0.5)

# Tabulate the batch result for inspection and later joining
ocean_df = pd.DataFrame(ocean_scores)

print("\nScoring complete!")
print(f"Stats: {scorer.get_stats()}")
print("\nOCEAN scores preview:")
print(ocean_df.head())

In [None]:
# Add OCEAN features to main dataframe
# ocean_df was built from plain lists, so its index is a fresh 0..n-1 range
# while df keeps whatever index the loading/filtering steps left behind.
# Assigning a Series aligns on index labels, which would silently produce
# NaN or mismatched rows if the two indexes differ, so copy by position.
if len(ocean_df) != len(df):
    raise ValueError(
        f"OCEAN score count ({len(ocean_df)}) does not match dataset rows ({len(df)})"
    )

for dim in OCEAN_DIMS:
    df[dim] = ocean_df[dim].to_numpy()

print("OCEAN features added to dataset")
print(f"New columns: {OCEAN_DIMS}")
print(f"\nOCEAN descriptive statistics:")
print(df[OCEAN_DIMS].describe())

In [None]:
# Visualize OCEAN distributions
import os

# Size the grid from the number of dimensions instead of assuming exactly
# five: three panels per row, as many rows as needed (2x3 for five dims)
grid_cols = 3
grid_rows = max(1, -(-len(OCEAN_DIMS) // grid_cols))  # ceiling division
fig, axes = plt.subplots(
    grid_rows, grid_cols, figsize=(15, 5 * grid_rows), squeeze=False
)
axes = axes.flatten()

for ax, dim in zip(axes, OCEAN_DIMS):
    values = df[dim]
    mean_val = values.mean()
    ax.hist(values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax.set_xlabel(dim.capitalize())
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of {dim.capitalize()}')
    # Dashed red line marks the mean of this dimension
    ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
    ax.legend()

# Remove whichever grid cells were left without a histogram
for unused_ax in axes[len(OCEAN_DIMS):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
# savefig does not create missing parent directories
os.makedirs('../artifacts/results', exist_ok=True)
plt.savefig('../artifacts/results/ocean_distributions.png', dpi=150)
plt.show()

In [None]:
# Pairwise correlations between the OCEAN score columns
ocean_corr = df[OCEAN_DIMS].corr()

# Heatmap styling: annotated cells, diverging palette centred on zero
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(8, 6))
sns.heatmap(ocean_corr, **heatmap_style)
plt.title('OCEAN Features Correlation Matrix')
plt.tight_layout()
plt.savefig('../artifacts/results/ocean_correlation.png', dpi=150)
plt.show()

## 3. Feature Engineering & Preprocessing

In [None]:
# Define baseline features (from your original notebooks)
numeric_features = [
    "loan_amnt", "int_rate", "installment", "annual_inc", "dti",
    "inq_last_6mths", "open_acc", "pub_rec", "revol_bal", "revol_util",
    "total_acc"
]

categorical_features = [
    "term", "grade", "sub_grade", "emp_length", "home_ownership",
    "verification_status", "purpose", "application_type"
]

# Keep only the columns that are actually present in the loaded data
numeric_features = [c for c in numeric_features if c in df.columns]
categorical_features = [c for c in categorical_features if c in df.columns]

# Clean percentage columns stored as text such as "13.56%".
# Test for "not numeric" rather than "dtype == object" so that pandas
# string/category dtypes are converted too; otherwise the text values would
# reach the numeric preprocessing branch unparsed.
for col in ["int_rate", "revol_util"]:
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        # Trim padding first so a trailing space cannot hide the "%" sign;
        # anything unparseable (including missing values) becomes NaN
        as_text = df[col].astype(str).str.strip().str.rstrip("%")
        df[col] = pd.to_numeric(as_text, errors="coerce")

print(f"Baseline numeric features ({len(numeric_features)}): {numeric_features}")
print(f"Baseline categorical features ({len(categorical_features)}): {categorical_features}")

In [None]:
# Two feature sets for the A/B comparison:
#   A = baseline numeric + categorical columns
#   B = the same columns with the OCEAN scores placed after the numeric ones
features_baseline = [*numeric_features, *categorical_features]
features_with_ocean = [*numeric_features, *OCEAN_DIMS, *categorical_features]

n_features_a = len(features_baseline)
n_features_b = len(features_with_ocean)
print(f"Feature Set A (Baseline): {n_features_a} features")
print(f"Feature Set B (Baseline + OCEAN): {n_features_b} features")
print(f"\nAdded OCEAN features: {OCEAN_DIMS}")

In [None]:
# Stratified 80/20 split with a fixed seed. The split is made once on the
# widest feature set (B); model A later selects its columns from the same
# rows, so both models are evaluated on identical test samples.
X = df[features_with_ocean].copy()
y = df["target"].to_numpy()

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
# The mean of the target array is the share of rows with target == 1,
# assuming a 0/1 encoding
print(f"Default rate (train): {y_train.mean():.3f}")
print(f"Default rate (test): {y_test.mean():.3f}")

## 4. Model Training: A/B Comparison

### 4.1 Logistic Regression

In [None]:
def build_logreg_pipeline(numeric_cols, categorical_cols):
    """Assemble the preprocessing + logistic-regression pipeline.

    Numeric columns are median-imputed and standardized. Categorical columns
    are imputed with the most frequent value and one-hot encoded into a dense
    array, ignoring categories that were not seen at fit time. Columns that
    appear in neither list are dropped.

    Args:
        numeric_cols: Column names routed through the numeric branch.
        categorical_cols: Column names routed through the categorical branch.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"preprocess"`` and ``"clf"``.
    """
    num_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    cat_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    column_router = ColumnTransformer(
        [
            ("num", num_branch, numeric_cols),
            ("cat", cat_branch, categorical_cols),
        ],
        remainder="drop",
    )

    # Balanced class weights and a fixed seed, matching the rest of the notebook
    classifier = LogisticRegression(
        solver="lbfgs",
        max_iter=500,
        class_weight="balanced",
        random_state=42,
    )

    return Pipeline([("preprocess", column_router), ("clf", classifier)])

In [None]:
# Model A: logistic regression on the baseline feature set only
print("Training Model A: LogReg Baseline\n")

X_train_a, X_test_a = X_train[features_baseline], X_test[features_baseline]

# Column lists for the two preprocessing branches, limited to set A
numeric_a = [col for col in numeric_features if col in features_baseline]
categorical_a = [col for col in categorical_features if col in features_baseline]

model_logreg_a = build_logreg_pipeline(numeric_a, categorical_a)
model_logreg_a.fit(X_train_a, y_train)

# Second predict_proba column = score for the class labelled 1
proba_matrix_a = model_logreg_a.predict_proba(X_test_a)
y_proba_logreg_a = proba_matrix_a[:, 1]
metrics_logreg_a = compute_all_metrics(y_test, y_proba_logreg_a)

print("Model A Results:")
for metric_name, metric_value in metrics_logreg_a.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Model B: logistic regression on baseline + OCEAN features
print("Training Model B: LogReg + OCEAN\n")

X_train_b, X_test_b = X_train[features_with_ocean], X_test[features_with_ocean]

# OCEAN scores go through the numeric branch together with the baseline
# numeric columns
numeric_candidates_b = numeric_features + OCEAN_DIMS
numeric_b = [col for col in numeric_candidates_b if col in features_with_ocean]
categorical_b = [col for col in categorical_features if col in features_with_ocean]

model_logreg_b = build_logreg_pipeline(numeric_b, categorical_b)
model_logreg_b.fit(X_train_b, y_train)

# Second predict_proba column = score for the class labelled 1
proba_matrix_b = model_logreg_b.predict_proba(X_test_b)
y_proba_logreg_b = proba_matrix_b[:, 1]
metrics_logreg_b = compute_all_metrics(y_test, y_proba_logreg_b)

print("Model B Results:")
for metric_name, metric_value in metrics_logreg_b.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Side-by-side LogReg metrics: baseline (A) vs baseline + OCEAN (B)
print("\n=== LogReg: Baseline vs Baseline+OCEAN ===")
print(f"{'Metric':<15} {'A (Baseline)':<15} {'B (+OCEAN)':<15} {'Delta':<15}")
print("-" * 60)

compared_keys = ('roc_auc', 'pr_auc', 'ks', 'brier', 'ece')
for key in compared_keys:
    a_val, b_val = metrics_logreg_a[key], metrics_logreg_b[key]
    # Delta is B minus A, printed with an explicit sign
    print(f"{key:<15} {a_val:<15.4f} {b_val:<15.4f} {b_val - a_val:+.4f}")

# DeLong test on the two score vectors for the same test labels
z_stat, p_val = delong_test(y_test, y_proba_logreg_a, y_proba_logreg_b)
print(f"\nDeLong test: z={z_stat:.3f}, p={p_val:.4f}")
print(f"Statistically significant at α=0.05: {p_val < 0.05}")

### 4.2 XGBoost

In [None]:
def build_xgb_pipeline(numeric_cols, categorical_cols, labels=None):
    """Build the preprocessing + XGBoost pipeline.

    Numeric columns are median-imputed (no scaling). Categorical columns are
    imputed with the most frequent value and one-hot encoded as sparse
    output, ignoring categories unseen at fit time. Columns in neither list
    are dropped.

    The classifier's ``scale_pos_weight`` is set to the negative/positive
    ratio of ``labels`` to compensate for class imbalance.

    Args:
        numeric_cols: Column names routed through the numeric branch.
        categorical_cols: Column names routed through the categorical branch.
        labels: Optional array-like of 0/1 training labels used to compute
            ``scale_pos_weight``. When omitted, the notebook-level ``y_train``
            is used, which is what this function always did before the
            parameter existed.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"preprocess"`` and ``"clf"``.
    """
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median"))
    ])

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
    ])

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_cols),
            ("cat", categorical_transformer, categorical_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3
    )

    # Make the label dependency explicit; fall back to the notebook global so
    # existing two-argument calls behave exactly as before
    if labels is None:
        labels = y_train
    labels = np.asarray(labels)

    # Ratio of negatives to positives; max(1, pos) avoids division by zero
    # when the labels contain no positive example
    pos = int((labels == 1).sum())
    neg = int((labels == 0).sum())
    scale_pos_weight = neg / max(1, pos)

    model = Pipeline(steps=[
        ("preprocess", preprocess),
        ("clf", XGBClassifier(
            objective="binary:logistic",
            tree_method="hist",
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            scale_pos_weight=scale_pos_weight,
            random_state=42,
            eval_metric="auc"
        ))
    ])

    return model

In [None]:
# Model A: XGBoost on the baseline feature set, reusing the column lists and
# train/test frames prepared for the LogReg baseline
print("Training Model A: XGBoost Baseline\n")

model_xgb_a = build_xgb_pipeline(numeric_a, categorical_a)
model_xgb_a.fit(X_train_a, y_train)

# Second predict_proba column = score for the class labelled 1
xgb_proba_matrix_a = model_xgb_a.predict_proba(X_test_a)
y_proba_xgb_a = xgb_proba_matrix_a[:, 1]
metrics_xgb_a = compute_all_metrics(y_test, y_proba_xgb_a)

print("Model A Results:")
for metric_name, metric_value in metrics_xgb_a.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Model B: XGBoost on baseline + OCEAN features, reusing the column lists and
# train/test frames prepared for the LogReg + OCEAN model
print("Training Model B: XGBoost + OCEAN\n")

model_xgb_b = build_xgb_pipeline(numeric_b, categorical_b)
model_xgb_b.fit(X_train_b, y_train)

# Second predict_proba column = score for the class labelled 1
xgb_proba_matrix_b = model_xgb_b.predict_proba(X_test_b)
y_proba_xgb_b = xgb_proba_matrix_b[:, 1]
metrics_xgb_b = compute_all_metrics(y_test, y_proba_xgb_b)

print("Model B Results:")
for metric_name, metric_value in metrics_xgb_b.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Side-by-side XGBoost metrics: baseline (A) vs baseline + OCEAN (B)
print("\n=== XGBoost: Baseline vs Baseline+OCEAN ===")
print(f"{'Metric':<15} {'A (Baseline)':<15} {'B (+OCEAN)':<15} {'Delta':<15}")
print("-" * 60)

compared_keys = ('roc_auc', 'pr_auc', 'ks', 'brier', 'ece')
for key in compared_keys:
    a_val, b_val = metrics_xgb_a[key], metrics_xgb_b[key]
    # Delta is B minus A, printed with an explicit sign
    print(f"{key:<15} {a_val:<15.4f} {b_val:<15.4f} {b_val - a_val:+.4f}")

# DeLong test on the two score vectors for the same test labels
z_stat, p_val = delong_test(y_test, y_proba_xgb_a, y_proba_xgb_b)
print(f"\nDeLong test: z={z_stat:.3f}, p={p_val:.4f}")
print(f"Statistically significant at α=0.05: {p_val < 0.05}")

## 5. Cross-Validation Analysis

In [None]:
# 5-fold stratified cross-validation of both XGBoost pipelines on the whole
# frame (train and test rows together), scored by ROC-AUC
from scipy import stats

print("Running 5-Fold Cross-Validation...\n")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_options = dict(cv=cv, scoring='roc_auc', n_jobs=-1)

# Baseline feature set
X_baseline = df[features_baseline].copy()
cv_scores_a = cross_val_score(model_xgb_a, X_baseline, df['target'], **cv_options)

# Baseline + OCEAN feature set
X_ocean = df[features_with_ocean].copy()
cv_scores_b = cross_val_score(model_xgb_b, X_ocean, df['target'], **cv_options)

mean_auc_a, mean_auc_b = cv_scores_a.mean(), cv_scores_b.mean()
print(f"Baseline ROC-AUC: {mean_auc_a:.4f} ± {cv_scores_a.std():.4f}")
print(f"With OCEAN ROC-AUC: {mean_auc_b:.4f} ± {cv_scores_b.std():.4f}")
print(f"Mean improvement: {(mean_auc_b - mean_auc_a):.4f}")

# Paired t-test across folds: both runs use the same seeded splitter and the
# same target, so scores are compared fold by fold
t_stat, t_pval = stats.ttest_rel(cv_scores_b, cv_scores_a)
print(f"\nPaired t-test: t={t_stat:.3f}, p={t_pval:.4f}")
print(f"Improvement is significant at α=0.05: {t_pval < 0.05}")

## 6. Visualizations

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, auc

# 2x2 dashboard comparing the two XGBoost models on the held-out test set
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
ax_roc, ax_pr, ax_lift, ax_bar = axes.ravel()

# --- Top-left: ROC curves with the chance diagonal ---
fpr_a, tpr_a, _ = roc_curve(y_test, y_proba_xgb_a)
fpr_b, tpr_b, _ = roc_curve(y_test, y_proba_xgb_b)
ax_roc.plot(fpr_a, tpr_a, label=f"Baseline (AUC={auc(fpr_a, tpr_a):.3f})", linewidth=2)
ax_roc.plot(fpr_b, tpr_b, label=f"+ OCEAN (AUC={auc(fpr_b, tpr_b):.3f})", linewidth=2)
ax_roc.plot([0, 1], [0, 1], 'k--', linewidth=1)
ax_roc.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve Comparison',
)
ax_roc.legend()
ax_roc.grid(True)

# --- Top-right: precision-recall curves; legend uses the stored PR-AUC ---
prec_a, rec_a, _ = precision_recall_curve(y_test, y_proba_xgb_a)
prec_b, rec_b, _ = precision_recall_curve(y_test, y_proba_xgb_b)
ax_pr.plot(rec_a, prec_a, label=f"Baseline (PR-AUC={metrics_xgb_a['pr_auc']:.3f})", linewidth=2)
ax_pr.plot(rec_b, prec_b, label=f"+ OCEAN (PR-AUC={metrics_xgb_b['pr_auc']:.3f})", linewidth=2)
ax_pr.set(
    xlabel='Recall',
    ylabel='Precision',
    title='Precision-Recall Curve Comparison',
)
ax_pr.legend()
ax_pr.grid(True)

# --- Bottom-left: lift per decile, with the lift = 1 reference line ---
lift_a = compute_lift(y_test, y_proba_xgb_a, n_deciles=10)
lift_b = compute_lift(y_test, y_proba_xgb_b, n_deciles=10)
deciles = np.arange(1, 11)
ax_lift.plot(deciles, lift_a, 'o-', label='Baseline', linewidth=2)
ax_lift.plot(deciles, lift_b, 's-', label='+ OCEAN', linewidth=2)
ax_lift.axhline(1.0, color='k', linestyle='--', linewidth=1)
ax_lift.set(xlabel='Decile', ylabel='Lift', title='Lift Curve by Decile')
ax_lift.legend()
ax_lift.grid(True)

# --- Bottom-right: grouped bars for the headline metrics ---
# KS is divided by 100 so it shares the axis with the two AUC values
metrics_names = ['ROC-AUC', 'PR-AUC', 'KS']
baseline_vals = [
    metrics_xgb_a['roc_auc'],
    metrics_xgb_a['pr_auc'],
    metrics_xgb_a['ks']/100,
]
ocean_vals = [
    metrics_xgb_b['roc_auc'],
    metrics_xgb_b['pr_auc'],
    metrics_xgb_b['ks']/100,
]

x_pos = np.arange(len(metrics_names))
width = 0.35
half_width = width/2
ax_bar.bar(x_pos - half_width, baseline_vals, width, label='Baseline', alpha=0.8)
ax_bar.bar(x_pos + half_width, ocean_vals, width, label='+ OCEAN', alpha=0.8)
ax_bar.set_xticks(x_pos)
ax_bar.set_xticklabels(metrics_names)
ax_bar.set(ylabel='Score', title='Metric Comparison')
ax_bar.legend()
ax_bar.grid(True, axis='y')

plt.tight_layout()
plt.savefig('../artifacts/results/model_comparison_plots.png', dpi=150)
plt.show()

## 7. Save Results & Models

In [None]:
# Save metrics to CSV
import os
from datetime import datetime

# One row per trained model; metric names become the columns
results_summary = pd.DataFrame([
    {'model': 'LogReg_Baseline', **metrics_logreg_a},
    {'model': 'LogReg_OCEAN', **metrics_logreg_b},
    {'model': 'XGB_Baseline', **metrics_xgb_a},
    {'model': 'XGB_OCEAN', **metrics_xgb_b}
])

# to_csv does not create missing parent directories, so ensure the results
# folder exists (same approach as the model-saving cell)
results_dir = '../artifacts/results'
os.makedirs(results_dir, exist_ok=True)

# Timestamped file name so repeated runs do not overwrite each other
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_path = f'{results_dir}/metrics_{timestamp}.csv'
results_summary.to_csv(results_path, index=False)
print(f"Results saved to {results_path}")

results_summary

In [None]:
# Persist the fitted XGBoost + OCEAN pipeline with joblib
import os

model_path = '../artifacts/xgb_ocean_model.joblib'

# Create the target folder (the directory part of model_path) if needed
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(model_xgb_b, model_path)
print(f"Best model saved to {model_path}")

## 8. Summary & Conclusions

### Acceptance Criteria Check

**Target**: Meet at least one of:
- ROC-AUC improvement ≥ +0.010
- PR-AUC improvement ≥ +0.008
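- KS improvement ≥ +1.0 (KS appears to be reported on a 0–100 scale, since the comparison chart divides it by 100; confirm against `compute_all_metrics`)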

**Results**: (Fill in after running)
- ROC-AUC delta: _____
- PR-AUC delta: _____
- KS delta: _____

### Next Steps
1. If metrics improve: Move to [04_explain_shap.ipynb](04_explain_shap.ipynb) for interpretability analysis
2. If using offline mode: Enable API mode (`offline_mode=False`) and re-run with real LLM scores
3. Scale to full dataset (100k+ samples) for production validation
4. Consider alternative text sources or enrichment strategies

### Data Limitations
- The dataset lacks borrower descriptions (the original Yu et al. 2023 paper used self-written text)
- Using loan title + employment title as weak personality proxies
- Results should be interpreted as **proof-of-concept** for the technical framework
- For production use, richer text data would be needed