# OCEAN Features: Simple Demo (Using Existing Categorical Features)

This notebook demonstrates OCEAN scoring using **only existing categorical features** - no free text required.

**Input Features**:
- `term`, `grade`, `sub_grade`, `emp_length`, `home_ownership`
- `verification_status`, `purpose`, `application_type`

**Output**: 5 OCEAN dimensions (0-1 scale)

In [None]:
import os
import sys
import warnings

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence warnings and extend the module search path before the project-local imports below.
warnings.filterwarnings('ignore')
sys.path.append('..')

from text_features.personality_simple import SimplifiedOceanScorer, build_borrower_profile, OCEAN_DIMS
from utils.seed import set_seed

# Call the project seeding helper with a fixed seed, then apply seaborn's whitegrid style.
SEED = 42
set_seed(SEED)
sns.set_style('whitegrid')

## 1. Load Data

In [None]:
# Load data
# kagglehub returns the local directory that holds the dataset files.
path = kagglehub.dataset_download("ethon0426/lending-club-20072020q1")
file_path = os.path.join(path, "Loan_status_2007-2020Q3.gzip")

# Read only the first 5k rows for a quick demo; the status filter below keeps a subset of them.
ROW_LIMIT = 5000

df = pd.read_csv(file_path, nrows=ROW_LIMIT, low_memory=False, compression="infer")

# Prepare target
# Keep only loans with a final outcome. The filter leaves gaps in the row labels, so the
# index is rebuilt as 0..n-1: frames created later from per-row results get a fresh
# RangeIndex, and pandas aligns on index labels when one frame's column is assigned to another.
has_final_status = df['loan_status'].isin(["Fully Paid", "Charged Off"])
df = df[has_final_status].reset_index(drop=True)

# Binary target: 1 = Charged Off (default), 0 = Fully Paid.
df['target'] = (df['loan_status'] == "Charged Off").astype(int)

print(f"Dataset: {len(df)} rows")
print(f"Default rate: {df['target'].mean():.2%}")
print(f"\nColumns: {df.columns.tolist()[:20]}...")

## 2. Check Available Categorical Features

In [None]:
# Candidate categorical inputs for OCEAN scoring
text_features = [
    "term",
    "grade",
    "sub_grade",
    "emp_length",
    "home_ownership",
    "verification_status",
    "purpose",
    "application_type",
]

# Keep only the candidates present in the loaded frame, in list order.
available = [name for name in text_features if name in df.columns]
print(f"Available features: {available}\n")

# Per feature: percentage of non-null rows and number of distinct values.
total_rows = len(df)
for feat in available:
    column = df[feat]
    non_null = column.notna().sum()
    coverage = non_null / total_rows * 100
    n_unique = column.nunique()
    print(f"{feat:25s}: {coverage:5.1f}% coverage, {n_unique:3d} unique values")

## 3. Build Borrower Profiles (Test Function)

In [None]:
# Preview the profile built for each of the first five rows, next to its target label.
print("Sample Borrower Profiles:\n")

for sample_no in range(1, 6):
    row = df.iloc[sample_no - 1]
    profile = build_borrower_profile(row)
    # One print call per sample; the trailing empty item yields the blank separator line.
    print(
        f"Sample {sample_no}:",
        f"  {profile}",
        f"  Default: {row['target']}",
        "",
        sep="\n",
    )

## 4. Initialize OCEAN Scorer (Offline Mode)

In [None]:
# Offline mode: deterministic, no API calls.
# Pass offline_mode=False instead to enable the OpenAI API.
scorer = SimplifiedOceanScorer(
    offline_mode=True,
    cache_dir="../artifacts/persona_cache_simple",
)

print(
    "SimplifiedOceanScorer initialized (offline mode)",
    f"Cache directory: {scorer.cache_dir}",
    sep="\n",
)

## 5. Score Single Borrower (Test)

In [None]:
# Smoke test: build the profile for the first row and score that single row.
test_row = df.iloc[0]
profile = build_borrower_profile(test_row)

# Header, profile text, then a blank line.
print("Test Borrower Profile:", profile, "", sep="\n")

scores = scorer.score_row(test_row)

print("OCEAN Scores:")
for trait, value in scores.items():
    print(f"  {trait:20s}: {value:.3f}")

## 6. Batch Scoring (Full Dataset)

In [None]:
# Score every row of df in one batch call.
n_borrowers = len(df)
print(f"Scoring {n_borrowers} borrowers...\n")

ocean_scores = scorer.score_batch(df, rate_limit_delay=0.5)

# Tabulate the batch result so each OCEAN dimension becomes a column.
ocean_df = pd.DataFrame(ocean_scores)

print("\nScoring complete!")
print(f"Stats: {scorer.get_stats()}")
print("\nOCEAN scores preview:")
print(ocean_df.head(10))

In [None]:
# Add to main dataframe
# Assigning a Series to a DataFrame column aligns on index labels. ocean_df is built
# without an explicit index, while df may still carry the row labels it had before the
# loan_status filter, so a label-based assignment can attach scores to the wrong borrowers
# and leave NaN in rows whose label has no match. Copy the values by position instead.
# NOTE(review): assumes scorer.score_batch returns one result per row of df, in row
# order — confirm against the scorer implementation.
if len(ocean_df) != len(df):
    raise ValueError(
        f"Expected one OCEAN score row per borrower, got {len(ocean_df)} scores for {len(df)} rows"
    )

for dim in OCEAN_DIMS:
    df[dim] = ocean_df[dim].to_numpy()

print("OCEAN features added to dataset")
print(f"\nDescriptive statistics:")
print(df[OCEAN_DIMS].describe())

## 7. Visualize OCEAN Distributions

In [None]:
# Distribution plots
# One histogram per OCEAN dimension on a 2x3 grid, with the mean marked by a dashed line.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, dim in enumerate(OCEAN_DIMS):
    dim_label = dim.capitalize()
    dim_mean = df[dim].mean()  # computed once; used for both the line and its legend entry
    axes[i].hist(df[dim], bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    axes[i].set_xlabel(dim_label)
    axes[i].set_ylabel('Frequency')
    axes[i].set_title(f'{dim_label} Distribution')
    axes[i].axvline(dim_mean, color='red', linestyle='--', linewidth=2,
                    label=f'Mean: {dim_mean:.2f}')
    axes[i].legend()

# Remove every panel that did not receive a histogram (the sixth one for five dimensions).
for unused_ax in axes[len(OCEAN_DIMS):]:
    fig.delaxes(unused_ax)

plt.tight_layout()

# savefig does not create missing directories; make sure the results folder exists first.
distributions_path = '../artifacts/results/ocean_distributions_simple.png'
os.makedirs(os.path.dirname(distributions_path), exist_ok=True)
plt.savefig(distributions_path, dpi=150)
plt.show()

## 8. OCEAN vs Default (Quick Analysis)

In [None]:
# Mean of each OCEAN dimension per target class (0 = Fully Paid, 1 = Charged Off).
ocean_by_target = df.groupby('target')[OCEAN_DIMS].mean()

print("OCEAN Scores by Default Status:\n")
print(ocean_by_target)

# Per-dimension gap between the two class means.
fully_paid_means = ocean_by_target.loc[0]
charged_off_means = ocean_by_target.loc[1]

print("\nDifference (Charged Off - Fully Paid):")
print(charged_off_means - fully_paid_means)

In [None]:
# Grouped bar chart: mean OCEAN score per dimension, one bar per outcome class.
fig, ax = plt.subplots(figsize=(10, 6))

width = 0.35
x = np.arange(len(OCEAN_DIMS))

# (target value, legend label, horizontal shift), drawn in this order.
bar_groups = [
    (0, 'Fully Paid', -width/2),
    (1, 'Charged Off', width/2),
]
for target_value, legend_label, shift in bar_groups:
    ax.bar(x + shift, ocean_by_target.loc[target_value], width, label=legend_label, alpha=0.8)

ax.set_xlabel('OCEAN Dimension')
ax.set_ylabel('Mean Score')
ax.set_title('OCEAN Scores: Fully Paid vs Charged Off')

tick_labels = [d.capitalize() for d in OCEAN_DIMS]
ax.set_xticks(x)
ax.set_xticklabels(tick_labels, rotation=45)

ax.legend()
ax.grid(True, axis='y')

plt.tight_layout()
plt.savefig('../artifacts/results/ocean_by_default_simple.png', dpi=150)
plt.show()

## 9. OCEAN Scores by Credit Grade

In [None]:
# Mean OCEAN score per credit grade, shown as a table and as a heatmap.
# Skipped entirely when the frame has no 'grade' column.
if 'grade' in df.columns:
    ocean_by_grade = df.groupby('grade')[OCEAN_DIMS].mean()

    print("OCEAN Scores by Credit Grade:\n")
    print(ocean_by_grade)

    # Transposed so dimensions are rows and grades are columns; colours centred on 0.5.
    grade_fig, grade_ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        ocean_by_grade.T,
        annot=True,
        fmt='.2f',
        cmap='RdYlGn',
        center=0.5,
        ax=grade_ax,
    )
    grade_ax.set_title('OCEAN Scores by Credit Grade')
    grade_ax.set_xlabel('Credit Grade')
    grade_ax.set_ylabel('OCEAN Dimension')
    grade_fig.tight_layout()
    grade_fig.savefig('../artifacts/results/ocean_by_grade_simple.png', dpi=150)
    plt.show()

## 10. Save Enhanced Dataset

In [None]:
# Write the frame, including the OCEAN columns, to CSV without the index column.
output_path = '../artifacts/results/data_with_ocean_simple.csv'
df.to_csv(output_path, index=False)

print(
    f"Enhanced dataset saved to {output_path}",
    f"Shape: {df.shape}",
    f"New OCEAN columns: {OCEAN_DIMS}",
    sep="\n",
)

## Summary

**What We Did**:
1. ✅ Used **existing categorical features** (no free text needed)
2. ✅ Built structured borrower profiles (term, grade, purpose, etc.)
3. ✅ Generated OCEAN scores for the completed loans (Fully Paid / Charged Off) among the first 5k rows (with caching)
4. ✅ Visualized distributions and relationships

**Key Advantages**:
- No dependency on missing `desc` field
- Uses data already in baseline models
- Can run offline (deterministic fallback)
- Easy to extend with more categorical features

**Next Steps**:
1. Run A/B comparison (Baseline vs Baseline+OCEAN)
2. Check if OCEAN improves ROC-AUC/PR-AUC
3. If promising, enable API mode for real LLM scoring