# Credit Risk Analytics - LGD, EAD and Expected Loss Modeling

This notebook covers:
- Loss Given Default (LGD) modeling using a two-stage approach: classification of total losses, then regression on partial recoveries
- Exposure at Default (EAD) modeling using linear regression
- Expected Loss (EL) calculation: $EL = PD \times LGD \times EAD$
- Portfolio-level capital requirement estimation (Basel II)
- Stress testing under adverse economic scenarios


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import pickle
import warnings

# Notebook-wide session settings.
# Every warning category is silenced from here on, so deprecation and
# data-quality warnings raised by later cells will not be shown.
warnings.simplefilter('ignore')

# Render floats with four decimal places in all pandas output.
pd.options.display.float_format = '{:.4f}'.format

print("Libraries loaded successfully")


In [None]:
# Load the preprocessed train/test splits; both are expected to carry a 'target' column.
train = pd.read_csv('../data/train_preprocessed.csv')
test  = pd.read_csv('../data/test_preprocessed.csv')

# Drop extra index column if present. Reassigning instead of mutating in place
# keeps the cell idempotent when it is re-run.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')
test  = test.drop(columns=['Unnamed: 0'], errors='ignore')

# Load saved PD model and scaler.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load artefacts that were produced by a trusted run of this project.
with open('../data/pd_model.pkl', 'rb') as f:
    pd_model = pickle.load(f)

with open('../data/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# Every column except the label is passed to the scaler / PD model.
# NOTE(review): presumably this matches the column set and order the scaler was
# fitted on in the upstream notebook — confirm.
feature_cols = [c for c in train.columns if c != 'target']

# Impute BOTH splits with medians learned on the training split only, so no
# test-set statistics leak into the features (same convention as the LGD and
# EAD cells further down, which fill their test sets with training medians).
train_medians = train[feature_cols].median()

X_train = train[feature_cols].fillna(train_medians)
y_train = train['target']
X_test  = test[feature_cols].fillna(train_medians)
y_test  = test['target']

X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# Get PD predictions from saved model (column 1 = probability of the positive class)
train_pd = pd_model.predict_proba(X_train_scaled)[:, 1]
test_pd  = pd_model.predict_proba(X_test_scaled)[:, 1]

print(f"Training set: {train.shape}")
print(f"Test set:     {test.shape}")
print(f"PD loaded - Training mean PD: {train_pd.mean():.4f}")


In [None]:
# Read the loan-level columns needed for LGD/EAD from the raw extract.
# Only the first 500,000 rows of the file are loaded (nrows).
raw = pd.read_csv(
    '../data/accepted_2007_to_2018Q4.csv.gz',
    compression='gzip',
    low_memory=False,
    nrows=500000,
    usecols=['loan_status', 'loan_amnt', 'funded_amnt', 'total_pymnt',
             'recoveries', 'funded_amnt_inv', 'total_rec_prncp',
             'out_prncp', 'issue_d', 'grade', 'term', 'int_rate']
)

# Status values mapped to non-default (0) and default (1). Rows with any other
# status are discarded.
good_statuses = ['Fully Paid', 'Current', 'In Grace Period']
bad_statuses  = ['Charged Off', 'Default', 'Late (31-120 days)',
                 'Late (16-30 days)',
                 'Does not meet the credit policy. Status:Charged Off']

raw = raw[raw['loan_status'].isin(good_statuses + bad_statuses)].copy()
raw['target'] = np.where(raw['loan_status'].isin(bad_statuses), 1, 0)

# Clean numeric columns.
# The check is "not already numeric" rather than "dtype == object" so text
# columns are still cleaned when pandas stores them with a dedicated string dtype.
if not pd.api.types.is_numeric_dtype(raw['int_rate']):
    raw['int_rate'] = raw['int_rate'].str.replace('%', '', regex=False).astype(float)
if not pd.api.types.is_numeric_dtype(raw['term']):
    # expand=False returns a Series (not a one-column DataFrame); to_numeric with
    # errors='coerce' turns unparsable or missing terms into NaN instead of raising.
    raw['term'] = pd.to_numeric(
        raw['term'].str.extract(r'(\d+)', expand=False), errors='coerce')

# Unparsable issue dates become NaT (and therefore a NaN issue_year).
raw['issue_d'] = pd.to_datetime(raw['issue_d'], format='%b-%Y', errors='coerce')
raw['issue_year'] = raw['issue_d'].dt.year

print(f"Raw data shape: {raw.shape}")
print(f"Defaulted loans: {raw['target'].sum():,}")


In [None]:
# Work on the defaulted sub-population only (target == 1).
# NOTE(review): the 'bad' status list also contains the "Late" statuses, so some
# of these loans may still be receiving payments — confirm that measuring LGD
# on them is intended.
defaulted = raw[raw['target'] == 1].copy()

# Calculate LGD as 1 - (total payments / funded amount).
# A non-positive funded amount is masked to NaN first: dividing by zero would
# give inf, which the clip would turn into a recovery rate of 1 (LGD = 0).
funded_amnt_valid = defaulted['funded_amnt'].where(defaulted['funded_amnt'] > 0)
defaulted['recovery_rate'] = (defaulted['total_pymnt'] / funded_amnt_valid).clip(0, 1)

# recovery_rate is already bounded to [0, 1], so LGD is too; no second clip needed.
defaulted['lgd'] = 1 - defaulted['recovery_rate']

# Remove NaN LGD values (missing payments/funded amount, or funded amount <= 0)
defaulted = defaulted.dropna(subset=['lgd'])

print(f"Defaulted loans with LGD: {len(defaulted):,}")
print("\n=== LGD Distribution ===")
print(defaulted['lgd'].describe())


In [None]:
# Two-panel LGD figure: overall distribution (left) and mean LGD per grade (right).
label_font = {'fontsize': 12}
title_font = {'fontsize': 13, 'fontweight': 'bold'}

lgd_mean = defaulted['lgd'].mean()
lgd_by_grade = defaulted.groupby('grade')['lgd'].mean().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_dist, ax_grade = axes

# Left panel: LGD histogram with a dashed marker at the mean
ax_dist.hist(defaulted['lgd'], bins=50, color='tomato',
             edgecolor='black', linewidth=0.5, alpha=0.8)
ax_dist.set_xlabel('Loss Given Default (LGD)', **label_font)
ax_dist.set_ylabel('Count', **label_font)
ax_dist.set_title('LGD Distribution - Defaulted Loans', **title_font)
ax_dist.axvline(lgd_mean, color='black', linestyle='--',
                linewidth=2, label=f"Mean LGD: {lgd_mean:.3f}")
ax_dist.legend()

# Right panel: average LGD per loan grade, y-axis fixed to the full [0, 1] range
ax_grade.bar(lgd_by_grade.index, lgd_by_grade.values,
             color='steelblue', edgecolor='black', linewidth=0.8)
ax_grade.set_xlabel('Loan Grade', **label_font)
ax_grade.set_ylabel('Average LGD', **label_font)
ax_grade.set_title('Average LGD by Loan Grade', **title_font)
ax_grade.set_ylim(0, 1)

fig.suptitle('Loss Given Default Analysis', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('lgd_analysis.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Prepare LGD features: keep only the candidate drivers present in the frame.
lgd_features = ['loan_amnt', 'int_rate', 'term']
lgd_features = [f for f in lgd_features if f in defaulted.columns]

# Train/test split for LGD (temporal): loans issued up to 2015 train the
# models, later vintages test them. Rows with a missing issue year satisfy
# neither comparison and so fall into neither split.
defaulted['issue_year'] = defaulted['issue_d'].dt.year
lgd_train_mask = defaulted['issue_year'] <= 2015
lgd_test_mask  = defaulted['issue_year'] > 2015

X_lgd_train = defaulted.loc[lgd_train_mask, lgd_features]
X_lgd_test  = defaulted.loc[lgd_test_mask, lgd_features]
y_lgd_train = defaulted.loc[lgd_train_mask, 'lgd']
y_lgd_test  = defaulted.loc[lgd_test_mask, 'lgd']

# Fill NaN using medians learned on the training split only. This replaces a
# chained `defaulted[col].fillna(..., inplace=True)` that (a) imputed with
# medians computed across train and test periods together and (b) silently
# does nothing when pandas copy-on-write is active.
lgd_feature_medians = X_lgd_train.median()
X_lgd_train = X_lgd_train.fillna(lgd_feature_medians)
X_lgd_test  = X_lgd_test.fillna(lgd_feature_medians)

print(f"LGD Training samples: {len(X_lgd_train):,}")
print(f"LGD Test samples:     {len(X_lgd_test):,}")

# Stage 1: Classify total loss (LGD = 1) vs partial recovery
y_lgd_stage1_train = (y_lgd_train == 1).astype(int)
y_lgd_stage1_test  = (y_lgd_test == 1).astype(int)

# The scaler is fitted on the training features only and reused for the test set.
lgd_scaler = StandardScaler()
X_lgd_train_scaled = lgd_scaler.fit_transform(X_lgd_train)
X_lgd_test_scaled  = lgd_scaler.transform(X_lgd_test)

lgd_stage1 = LogisticRegression(class_weight='balanced', max_iter=500, random_state=42)
lgd_stage1.fit(X_lgd_train_scaled, y_lgd_stage1_train)

stage1_test_proba = lgd_stage1.predict_proba(X_lgd_test_scaled)[:, 1]
stage1_auc = roc_auc_score(y_lgd_stage1_test, stage1_test_proba)
print(f"\nStage 1 (Total Loss Classification) AUC: {stage1_auc:.4f}")

# Stage 2: Regression on partial recovery cases (LGD strictly below 1)
partial_train = y_lgd_train < 1
partial_test  = y_lgd_test < 1

# The scaled matrices are plain ndarrays, so index them with positional boolean
# arrays rather than with index-labelled pandas Series.
partial_train_rows = partial_train.to_numpy()
partial_test_rows  = partial_test.to_numpy()

lgd_stage2 = LinearRegression()
lgd_stage2.fit(X_lgd_train_scaled[partial_train_rows], y_lgd_train[partial_train_rows])

# Predict once and reuse for both metrics.
# Note: the linear model's output is not constrained to the [0, 1] LGD range.
stage2_test_pred = lgd_stage2.predict(X_lgd_test_scaled[partial_test_rows])
stage2_r2  = r2_score(y_lgd_test[partial_test_rows], stage2_test_pred)
stage2_mae = mean_absolute_error(y_lgd_test[partial_test_rows], stage2_test_pred)

print(f"Stage 2 (Recovery Regression) R2:  {stage2_r2:.4f}")
print(f"Stage 2 (Recovery Regression) MAE: {stage2_mae:.4f}")


In [None]:
# Exposure at default: funded amount minus principal already repaid (a missing
# repaid-principal value counts as 0), floored at zero.
principal_outstanding = defaulted['funded_amnt'] - defaulted['total_rec_prncp'].fillna(0)
defaulted['ead'] = principal_outstanding.clip(lower=0)

# EAD expressed as a share of the funded amount, bounded to [0, 1].
ead_share = defaulted['ead'] / defaulted['funded_amnt']
defaulted['ead_rate'] = ead_share.clip(0, 1)

print("=== EAD Distribution ===")
print(defaulted['ead'].describe())
print(f"\nMean EAD Rate: {defaulted['ead_rate'].mean():.4f}")

# EAD model: predict EAD rate using loan features that exist in the frame
ead_features = [name for name in ('loan_amnt', 'int_rate', 'term')
                if name in defaulted.columns]

# Reuse the temporal masks from the LGD split; missing feature values in both
# splits are filled from the training split's medians.
ead_train_features = defaulted.loc[lgd_train_mask, ead_features]
X_ead_train = ead_train_features.fillna(ead_train_features.median())
X_ead_test  = defaulted.loc[lgd_test_mask, ead_features].fillna(X_ead_train.median())
y_ead_train = defaulted.loc[lgd_train_mask, 'ead_rate']
y_ead_test  = defaulted.loc[lgd_test_mask, 'ead_rate']

# Standardise with statistics fitted on the training split only.
ead_scaler = StandardScaler()
X_ead_train_scaled = ead_scaler.fit_transform(X_ead_train)
X_ead_test_scaled  = ead_scaler.transform(X_ead_test)

ead_model = LinearRegression()
ead_model.fit(X_ead_train_scaled, y_ead_train)

# Out-of-time evaluation on the test split.
ead_test_pred = ead_model.predict(X_ead_test_scaled)
ead_r2  = r2_score(y_ead_test, ead_test_pred)
ead_mae = mean_absolute_error(y_ead_test, ead_test_pred)

print(f"\nEAD Model R2:  {ead_r2:.4f}")
print(f"EAD Model MAE: {ead_mae:.4f}")

In [None]:
# Portfolio-average LGD and EAD rate, both measured on the defaulted loans.
mean_lgd = defaulted['lgd'].mean()
mean_ead_rate = defaulted['ead_rate'].mean()

# Loan amount per training loan, with gaps filled by the median amount.
# NOTE(review): assumes 'loan_amnt' in the preprocessed training file is still
# in currency units (not scaled or binned) — confirm against the upstream notebook.
train_exposure = train['loan_amnt'].fillna(train['loan_amnt'].median())

# Calculate EL for each loan in training set: PD x mean LGD x mean EAD rate x amount.
# NOTE(review): 'lgd' is measured against the full funded amount while
# 'ead_rate' is the outstanding share of that same amount, so multiplying both
# may discount the loss twice — confirm the intended convention.
train_el = train_pd * mean_lgd * mean_ead_rate * train_exposure

# Portfolio EL rate = total expected loss / total exposure. The previous
# expression averaged (total EL / each loan's amount), which is not a rate and
# yields inf for loans whose amount was filled with 0.
total_exposure = train_exposure.sum()
portfolio_el_rate = train_el.sum() / total_exposure if total_exposure else float('nan')

print("=== Expected Loss (EL) Analysis ===")
print(f"Mean PD:       {train_pd.mean():.4f} ({train_pd.mean()*100:.2f}%)")
print(f"Mean LGD:      {mean_lgd:.4f} ({mean_lgd*100:.2f}%)")
print(f"Mean EAD Rate: {mean_ead_rate:.4f} ({mean_ead_rate*100:.2f}%)")
print(f"\nMean EL per loan: ${train_el.mean():,.2f}")
print(f"Total Portfolio EL: ${train_el.sum():,.0f}")
print(f"Portfolio EL Rate: {portfolio_el_rate*100:.4f}%")

In [None]:
# NOTE(review): 'scorecard' is loaded but not referenced anywhere else in the
# visible notebook — candidate for removal once confirmed unused.
scorecard = pd.read_csv('../data/scorecard_output.csv')

# One row per training loan: predicted PD, expected loss and loan amount.
el_df = pd.DataFrame({
    'pd_score': train_pd,
    'expected_loss': train_el.values,
    'loan_amnt': train['loan_amnt'].fillna(train['loan_amnt'].median()).values
})

# Bucket loans into PD deciles (0 = lowest PD). duplicates='drop' merges bins
# with identical edges, so fewer than ten buckets can result.
el_df['pd_decile'] = pd.qcut(el_df['pd_score'], 10, labels=False, duplicates='drop')

# Round per column: PDs keep four decimals (two decimals would collapse the
# low-risk deciles to the same value), dollar amounts keep two.
decile_el = el_df.groupby('pd_decile').agg(
    avg_pd=('pd_score', 'mean'),
    avg_el=('expected_loss', 'mean'),
    total_el=('expected_loss', 'sum')
).round({'avg_pd': 4, 'avg_el': 2, 'total_el': 2})

print("=== Expected Loss by PD Decile ===")
print(decile_el)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Average EL by decile
axes[0].bar(decile_el.index, decile_el['avg_el'],
            color='tomato', edgecolor='black', linewidth=0.8)
axes[0].set_xlabel('PD Decile (0=Lowest Risk, 9=Highest Risk)', fontsize=11)
axes[0].set_ylabel('Average Expected Loss ($)', fontsize=11)
axes[0].set_title('Average Expected Loss by PD Decile', fontsize=12, fontweight='bold')

# EL distribution, with a dashed marker at the mean per-loan EL
mean_train_el = train_el.mean()
axes[1].hist(train_el, bins=50, color='steelblue',
             edgecolor='black', linewidth=0.5, alpha=0.8)
axes[1].set_xlabel('Expected Loss per Loan ($)', fontsize=11)
axes[1].set_ylabel('Count', fontsize=11)
axes[1].set_title('Expected Loss Distribution', fontsize=12, fontweight='bold')
axes[1].axvline(mean_train_el, color='red', linestyle='--',
                linewidth=2, label=f'Mean: ${mean_train_el:.0f}')
axes[1].legend()

plt.suptitle('Expected Loss Analysis - EL = PD x LGD x EAD',
             fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('expected_loss.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Stress scenarios: each one scales the loan-level PDs and the portfolio mean LGD.
scenarios = {
    label: {'pd_multiplier': pd_mult, 'lgd_multiplier': lgd_mult}
    for label, pd_mult, lgd_mult in [
        ('Base Case',      1.0, 1.0),
        ('Mild Stress',    1.5, 1.2),
        ('Severe Stress',  2.0, 1.5),
        ('Extreme Stress', 3.0, 2.0),
    ]
}

# Unstressed total EL is the reference point for the percentage increase.
base_el = train_el.sum()
# Loan amounts (median-filled) are the same in every scenario.
loan_exposure = train['loan_amnt'].fillna(train['loan_amnt'].median())

stress_results = []
for scenario, params in scenarios.items():
    pd_mult = params['pd_multiplier']
    lgd_mult = params['lgd_multiplier']

    # Stressed PD and LGD are both capped at 1.
    stressed_pd  = np.clip(train_pd * pd_mult, 0, 1)
    stressed_lgd = min(mean_lgd * lgd_mult, 1)
    stressed_el  = stressed_pd * stressed_lgd * mean_ead_rate * loan_exposure
    stressed_total = stressed_el.sum()

    stress_results.append({
        'Scenario': scenario,
        'PD Multiplier': pd_mult,
        'LGD Multiplier': lgd_mult,
        'Total EL ($M)': stressed_total / 1e6,
        'EL Increase (%)': (stressed_total / base_el - 1) * 100
    })

stress_df = pd.DataFrame(stress_results)
print("=== Stress Test Results ===")
print(stress_df.to_string(index=False))

# Bar chart of total EL per scenario, each bar annotated with its value.
fig, ax = plt.subplots(figsize=(10, 5))
colors = ['steelblue', 'orange', 'tomato', 'darkred']
total_el_millions = stress_df['Total EL ($M)']
bars = ax.bar(stress_df['Scenario'], total_el_millions,
              color=colors, edgecolor='black', linewidth=0.8)
ax.set_ylabel('Total Expected Loss ($M)', fontsize=12)
ax.set_title('Stress Test - Expected Loss Under Different Scenarios',
             fontsize=13, fontweight='bold')
for bar, val in zip(bars, total_el_millions):
    x_center = bar.get_x() + bar.get_width()/2
    y_above = bar.get_height() + 0.1
    ax.text(x_center, y_above, f'${val:.1f}M', ha='center', fontsize=10)
fig.tight_layout()
fig.savefig('stress_test.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Persist the fitted models together with the scalers they were trained with.
# The LGD and EAD models were fitted on standardised features, so without
# lgd_scaler / ead_scaler the pickled models cannot be applied to new data.
# The scalers get their own file names so the existing PD scaler
# ('../data/scaler.pkl') is not overwritten.
artifacts = {
    '../data/lgd_stage1_model.pkl': lgd_stage1,
    '../data/lgd_stage2_model.pkl': lgd_stage2,
    '../data/ead_model.pkl': ead_model,
    '../data/lgd_scaler.pkl': lgd_scaler,
    '../data/ead_scaler.pkl': ead_scaler,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Saved:")
for artifact_path in artifacts:
    print(f"  {artifact_path}")

## LGD, EAD and Expected Loss Summary

Complete Basel-style EL pipeline implemented.
