In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report

In [2]:
# ───────────────
# 1) Load the loan data
# ───────────────
# The relative path is resolved against the current working directory,
# so 'Loan_Data.csv' must sit next to wherever the kernel was started.
df = pd.read_csv(filepath_or_buffer='Loan_Data.csv')

In [3]:
# ───────────────
# 2) Prepare features & target
# ───────────────
# Predictor columns, in the order they are handed to the models.
feature_cols = [
    'credit_lines_outstanding',
    'loan_amt_outstanding',
    'total_debt_outstanding',
    'income',
    'years_employed',
    'fico_score',
]

# Feature matrix: every row, only the predictor columns.
X = df.loc[:, feature_cols]

# Target vector: 0 = no default, 1 = default.
y = df.loc[:, 'default']

In [4]:
# ───────────────
# 3) Split into train/test (stratified)
# ───────────────
# 70/30 split with a fixed seed; stratifying on y keeps the share of
# defaults the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

In [7]:
# ───────────────
# 4a) Train Logistic Regression
# ───────────────
# fit() returns the estimator itself, so construction and training chain.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# predict_proba columns follow lr.classes_; with 0/1 labels, column 1 is
# the predicted probability of default.
y_prob_lr = lr.predict_proba(X_test)[:, 1]
auc_lr = roc_auc_score(y_true=y_test, y_score=y_prob_lr)

In [8]:
# ───────────────
# 4b) Train Random Forest
# ───────────────
# Seeded so the fitted forest is reproducible from run to run.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# predict_proba columns follow rf.classes_; with 0/1 labels, column 1 is
# the predicted probability of default.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_true=y_test, y_score=y_prob_rf)

In [9]:
# ───────────────
# 5) Compare Model Performance
# ───────────────
# Headline metric first: test-set ROC AUC for each model.
print("ROC AUC scores:")
print(f"  Logistic Regression: {auc_lr:.3f}")
print(f"  Random Forest:       {auc_rf:.3f}\n")

# Then the per-class precision/recall/F1 tables, computed from each
# model's hard predictions on the same test set.
report_targets = (
    ("Logistic Regression Classification Report:", lr),
    ("Random Forest Classification Report:", rf),
)
for report_heading, fitted_model in report_targets:
    print(report_heading)
    print(classification_report(y_test, fitted_model.predict(X_test)))

ROC AUC scores:
  Logistic Regression: 1.000
  Random Forest:       1.000

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2445
           1       1.00      0.99      1.00       555

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2445
           1       0.99      0.99      0.99       555

    accuracy                           1.00      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       1.00      1.00      1.00      3000



In [10]:
# ───────────────
# 6) Define Expected-Loss Function
# ───────────────
def predict_expected_loss(
    borrower_features: dict,
    loan_amount: float,
    recovery_rate: float = 0.1,
    model=None
) -> float:
    """
    Estimate the expected loss on a loan for a single borrower.

    Parameters
    ----------
    borrower_features: dict mapping feature names to values. Key order does
        not matter: when the model exposes ``feature_names_in_`` the values
        are re-ordered to match the columns it was fitted on.
    loan_amount: exposure at default (EAD); must be non-negative.
    recovery_rate: fraction recovered on default; must lie in [0, 1].
    model: a fitted classifier exposing ``predict_proba`` (e.g. lr or rf).
        When omitted, the notebook-level ``lr`` is looked up at call time,
        so a re-trained ``lr`` is picked up without re-running this cell.

    Returns
    -------
    expected loss = PD * EAD * (1 - recovery_rate), as a plain Python float.

    Raises
    ------
    ValueError: if ``recovery_rate`` is outside [0, 1], ``loan_amount`` is
        negative, or the keys of ``borrower_features`` do not match the
        feature names the model was fitted with.
    """
    if model is None:
        # Late binding: a default of `model=lr` in the signature would
        # freeze whichever object `lr` pointed to when this cell last ran.
        model = lr

    if not 0.0 <= recovery_rate <= 1.0:
        raise ValueError(
            f"recovery_rate must be between 0 and 1, got {recovery_rate!r}"
        )
    if loan_amount < 0:
        raise ValueError(
            f"loan_amount must be non-negative, got {loan_amount!r}"
        )

    # One-row frame; its column order initially follows the dict's key order.
    X_new = pd.DataFrame([borrower_features])

    # Estimators fitted on a DataFrame record the training column names and
    # reject input whose columns are missing, extra, or in a different order.
    # Align explicitly so callers need not care about dict ordering.
    fitted_names = getattr(model, "feature_names_in_", None)
    if fitted_names is not None:
        fitted_names = [str(name) for name in fitted_names]
        missing = [name for name in fitted_names if name not in X_new.columns]
        unexpected = [name for name in X_new.columns if name not in fitted_names]
        if missing or unexpected:
            raise ValueError(
                "borrower_features does not match the model's features "
                f"(missing: {missing}, unexpected: {unexpected})"
            )
        X_new = X_new[fitted_names]

    # Row 0 is the single borrower; column 1 is the second class in
    # model.classes_, i.e. P(default) for 0/1 labels.
    pd_prob = model.predict_proba(X_new)[0, 1]
    return float(pd_prob * loan_amount * (1 - recovery_rate))

In [11]:
# ───────────────
# 7) Example Usage
# ───────────────
sample_loan_amount = 10_000  # e.g. $10,000 new loan

# Use the first row of the test partition as the sample borrower and turn
# it into the {feature name: value} mapping the loss function expects.
sample_idx = X_test.index[0]
sample_features = X_test.loc[sample_idx].to_dict()

# Score the same borrower with both fitted models (lr first, then rf).
loss_lr, loss_rf = (
    predict_expected_loss(sample_features, sample_loan_amount, model=fitted_model)
    for fitted_model in (lr, rf)
)

print(f"\nSample borrower expected loss on a ${sample_loan_amount:,} loan:")
print(f"  Logistic Regression model: ${loss_lr:,.2f}")
print(f"  Random Forest model:       ${loss_rf:,.2f}")


Sample borrower expected loss on a $10,000 loan:
  Logistic Regression model: $0.00
  Random Forest model:       $0.00
