# Credit Portfolio Risk Analysis

This notebook performs comprehensive risk analysis including:
- Delinquency roll-rate analysis
- Expected Loss calculation (PD × LGD × EAD)
- Predictive modeling for delinquency
- Early warning indicators
- Risk trend visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation and convergence warnings raised by later cells
# will not be shown.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's 'default' style sheet with seaborn's 'husl' palette.
plt.style.use('default')
sns.set_palette('husl')

In [None]:
# Read the full portfolio extract from disk.
portfolio_df = pd.read_csv('../data/credit_portfolio.csv')
print(f"Loaded portfolio with {len(portfolio_df):,} customers")

# Risk analysis only covers applications that were approved; copy so later
# cells never write through to portfolio_df.
approved_df = portfolio_df.loc[portfolio_df['acceptance_decision'].eq('Approved')].copy()
print(f"Approved customers for analysis: {len(approved_df):,}")

# Headline portfolio figures, one per output line.
print(
    f"\n=== PORTFOLIO OVERVIEW ===",
    f"Total portfolio balance: £{approved_df['balance'].sum()/1e6:.1f}M",
    f"Average credit limit: £{approved_df['credit_limit'].mean():,.0f}",
    f"Average utilization: {approved_df['utilization_rate'].mean():.1f}%",
    f"Delinquency rate: {(approved_df['delinquency_status'] > 0).mean()*100:.1f}%",
    sep="\n",
)

## 1. Delinquency Roll-Rate Analysis

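Roll rates show the probability of accounts moving between delinquency buckets (Current → 30 → 60 → 90+ days past due). Because the dataset is a single snapshot, the previous month's statuses are simulated below, so the resulting matrix is illustrative rather than observed.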

In [None]:
def calculate_roll_rates(df):
    """
    Build a delinquency roll-rate matrix (in percent) from a single snapshot.

    The data holds only the current delinquency status, so a previous-month
    status is synthesised for every account with fixed-seed random draws.
    This is a simplification; real roll rates need time-series data.

    Synthesis rules (current status -> previous status):
      * 0  -> 0
      * 30 -> 0
      * 60 -> 0 or 30 (uniform) with probability 0.4, otherwise 30
      * 90 -> 30 or 60 (uniform) with probability 0.2, otherwise 60
      * any other value -> max(0, status - 30)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``delinquency_status`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Rows are the previous-month status, columns the current status;
        every row sums to 100. Empty input yields an empty frame.
    """
    # A private generator seeded with 42 produces exactly the draws that
    # np.random.seed(42) followed by global np.random calls would, but it
    # leaves the global NumPy random state untouched for the caller.
    rng = np.random.RandomState(42)

    df_prev = df.copy()

    # Walk accounts in row order. Each branch consumes the same number of
    # random draws as the original per-row logic, so results are reproducible.
    prev_statuses = []
    for current_status in df_prev['delinquency_status'].tolist():
        if current_status == 30 and rng.random_sample() < 0.3:
            # Account was current last month and rolled into 30 DPD.
            prev_status = 0
        elif current_status == 60 and rng.random_sample() < 0.4:
            prev_status = rng.choice([0, 30])
        elif current_status == 90 and rng.random_sample() < 0.2:
            prev_status = rng.choice([30, 60])
        else:
            # Default: the account was exactly one bucket better last month
            # (current accounts stay current).
            prev_status = max(0, current_status - 30)
        prev_statuses.append(prev_status)

    # Single vectorised assignment instead of one .loc write per row; float
    # dtype keeps the bucket index identical to the cell-by-cell construction.
    df_prev['prev_delinquency_status'] = np.asarray(prev_statuses, dtype=float)

    # Count accounts per (previous, current) pair and pivot to a matrix.
    roll_rate_df = (
        df_prev.groupby(['prev_delinquency_status', 'delinquency_status'])
        .size()
        .unstack(fill_value=0)
    )

    # Normalise each row so it expresses transition percentages.
    roll_rate_pct = roll_rate_df.div(roll_rate_df.sum(axis=1), axis=0) * 100

    return roll_rate_pct

# Roll-rate matrix for the approved book, shown to one decimal place.
roll_rates = calculate_roll_rates(approved_df)
print(
    "=== DELINQUENCY ROLL RATES ===",
    "Rows = Previous Month | Columns = Current Month",
    sep="\n",
)
print(roll_rates.round(1))

In [None]:
# Visualize roll rate matrix
# Tick labels are derived from the buckets actually present in roll_rates.
# A fixed four-item list is applied positionally, so it mislabels rows or
# columns whenever a bucket is missing from the matrix.
bucket_labels = {0: 'Current', 30: '30 DPD', 60: '60 DPD', 90: '90+ DPD'}
col_labels = [bucket_labels.get(bucket, f'{bucket:g} DPD') for bucket in roll_rates.columns]
row_labels = [bucket_labels.get(bucket, f'{bucket:g} DPD') for bucket in roll_rates.index]

plt.figure(figsize=(10, 8))
sns.heatmap(roll_rates, annot=True, fmt='.1f', cmap='RdYlGn_r',
            cbar_kws={'label': 'Roll Rate (%)'},
            xticklabels=col_labels,
            yticklabels=row_labels)
plt.title('Delinquency Roll Rate Matrix', fontsize=14, fontweight='bold')
plt.xlabel('Current Month Delinquency Status')
plt.ylabel('Previous Month Delinquency Status')
plt.tight_layout()
plt.show()

# Key roll rate insights
# Each lookup falls back to 0 when the bucket pair is absent from the matrix.
print("\n=== KEY ROLL RATE INSIGHTS ===")
current_to_30 = roll_rates.loc[0, 30] if 30 in roll_rates.columns and 0 in roll_rates.index else 0
dpd30_to_60 = roll_rates.loc[30, 60] if 30 in roll_rates.index and 60 in roll_rates.columns else 0
dpd60_to_90 = roll_rates.loc[60, 90] if 60 in roll_rates.index and 90 in roll_rates.columns else 0

print(f"Current → 30 DPD: {current_to_30:.1f}%")
print(f"30 DPD → 60 DPD: {dpd30_to_60:.1f}%")
print(f"60 DPD → 90+ DPD: {dpd60_to_90:.1f}%")

## 2. Expected Loss Calculation (PD × LGD × EAD)

**Components:**
- **PD (Probability of Default)**: Likelihood of default within 12 months
- **LGD (Loss Given Default)**: % of exposure lost if default occurs
- **EAD (Exposure at Default)**: Amount exposed when default occurs

In [None]:
def calculate_pd_lgd_ead(df, base_lgd=0.45, drawdown_rate=0.5):
    """
    Add PD, LGD, EAD and expected-loss columns to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``application_score``, ``delinquency_status``,
        ``utilization_rate`` (in percent), ``credit_limit`` and ``balance``.
        Not modified.
    base_lgd : float, default 0.45
        LGD before the utilisation add-on.
    drawdown_rate : float, default 0.5
        Share of the undrawn limit assumed to be drawn before default.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``pd_score_based``, ``delinq_multiplier``,
        ``pd_12m``, ``lgd``, ``ead`` and ``expected_loss`` added (or
        overwritten if already present).

    Notes
    -----
    Delinquency statuses other than 0/30/60/90 have no multiplier, so their
    ``pd_12m`` and ``expected_loss`` are NaN.
    """
    df = df.copy()

    # 1. Probability of Default: logistic curve in the score, centred on 500
    #    with a scale of 100 (higher score -> lower PD).
    df['pd_score_based'] = 1 / (1 + np.exp((df['application_score'] - 500) / 100))

    # Scale PD up with the current delinquency bucket, then bound to [0.1%, 50%].
    # NOTE(review): the logistic output is already in [0, 1]; the extra /100
    # looks like a calibration choice - confirm with the model owner.
    pd_adjustments = {0: 1.0, 30: 3.0, 60: 8.0, 90: 25.0}
    df['delinq_multiplier'] = df['delinquency_status'].map(pd_adjustments)
    df['pd_12m'] = np.clip(df['pd_score_based'] * df['delinq_multiplier'] / 100, 0.001, 0.5)

    # 2. Loss Given Default: base rate plus up to 30 points for utilisation,
    #    bounded to [20%, 80%].
    utilization_impact = df['utilization_rate'] / 100 * 0.3
    df['lgd'] = np.clip(base_lgd + utilization_impact, 0.2, 0.8)

    # 3. Exposure at Default: current balance plus part of the undrawn limit.
    #    Over-limit accounts have no headroom; flooring at 0 stops a negative
    #    "available credit" from pushing EAD below the balance already drawn.
    available_credit = (df['credit_limit'] - df['balance']).clip(lower=0)
    df['ead'] = df['balance'] + (available_credit * drawdown_rate)

    # 4. Expected Loss = PD x LGD x EAD
    df['expected_loss'] = df['pd_12m'] * df['lgd'] * df['ead']

    return df

# Attach PD / LGD / EAD / expected loss to every approved account.
risk_df = calculate_pd_lgd_ead(approved_df)

# Portfolio totals; the PD and LGD averages are exposure-weighted and in percent.
total_ead = risk_df['ead'].sum()
total_expected_loss = risk_df['expected_loss'].sum()
portfolio_loss_rate = total_expected_loss / total_ead * 100
weighted_avg_pd = risk_df['pd_12m'].mul(risk_df['ead']).sum() / total_ead * 100
weighted_avg_lgd = risk_df['lgd'].mul(risk_df['ead']).sum() / total_ead * 100

print(
    "=== PORTFOLIO RISK METRICS ===",
    f"Total EAD: £{total_ead/1e6:.1f}M",
    f"Total Expected Loss: £{total_expected_loss/1e6:.2f}M",
    f"Portfolio Loss Rate: {portfolio_loss_rate:.2f}%",
    f"Weighted Average PD: {weighted_avg_pd:.2f}%",
    f"Weighted Average LGD: {weighted_avg_lgd:.1f}%",
    sep="\n",
)

In [None]:
# Risk segmentation analysis
def analyze_risk_segments(df):
    """
    Summarise risk metrics by income band, delinquency status and region.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``income_band``, ``delinquency_status``, ``region``,
        ``pd_12m``, ``lgd``, ``expected_loss``, ``ead`` and ``customer_id``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(income_risk, delinq_risk, region_risk)``.

        * ``income_risk`` / ``delinq_risk`` have the columns ``Avg_PD_%``,
          ``Avg_LGD_%``, ``Total_EL_£``, ``Avg_EL_£``, ``Total_EAD_£``,
          ``Customer_Count`` and ``Loss_Rate_%``. The two ``Avg_*_%``
          columns are expressed in percent, as their names state.
        * ``region_risk`` keeps the raw aggregate column names (``pd_12m``
          is a fraction, ``customer_id`` holds the row count), adds
          ``Loss_Rate_%`` and is sorted by it, highest first.
    """
    summary_columns = ['Avg_PD_%', 'Avg_LGD_%', 'Total_EL_£', 'Avg_EL_£', 'Total_EAD_£', 'Customer_Count']

    def _segment_summary(group_col):
        """Aggregate PD, LGD, EL and EAD for every value of ``group_col``."""
        summary = df.groupby(group_col).agg({
            'pd_12m': 'mean',
            'lgd': 'mean',
            'expected_loss': ['sum', 'mean'],
            'ead': 'sum',
            'customer_id': 'count'
        })
        summary.columns = summary_columns
        # pd_12m and lgd are stored as fractions; convert so the values match
        # the '%' suffix in the column names.
        summary[['Avg_PD_%', 'Avg_LGD_%']] = summary[['Avg_PD_%', 'Avg_LGD_%']] * 100
        summary = summary.round(4)
        summary['Loss_Rate_%'] = (summary['Total_EL_£'] / summary['Total_EAD_£'] * 100).round(2)
        return summary

    # By Income Band and by Delinquency Status share one layout.
    income_risk = _segment_summary('income_band')
    delinq_risk = _segment_summary('delinquency_status')

    # By Region: a slimmer table ranked by loss rate.
    region_risk = df.groupby('region').agg({
        'pd_12m': 'mean',
        'expected_loss': 'sum',
        'ead': 'sum',
        'customer_id': 'count'
    }).round(4)
    region_risk['Loss_Rate_%'] = (region_risk['expected_loss'] / region_risk['ead'] * 100).round(2)
    region_risk = region_risk.sort_values('Loss_Rate_%', ascending=False)

    return income_risk, delinq_risk, region_risk

income_risk, delinq_risk, region_risk = analyze_risk_segments(risk_df)

# Print each segment table under its own banner; regions show the top five only.
for heading, table in (
    ("\n=== RISK BY INCOME BAND ===", income_risk),
    ("\n=== RISK BY DELINQUENCY STATUS ===", delinq_risk),
    ("\n=== TOP 5 HIGHEST RISK REGIONS ===", region_risk.head()),
):
    print(heading)
    print(table)

In [None]:
# Four-panel overview: PD and LGD histograms, expected loss by income band,
# and the regions with the highest loss rate.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Portfolio Risk Analysis', fontsize=16, fontweight='bold')
(ax_pd, ax_lgd), (ax_income, ax_region) = axes

# Top-left: PD histogram in percent, with the exposure-weighted average marked.
ax_pd.hist(risk_df['pd_12m'] * 100, bins=50, alpha=0.7, color='red', edgecolor='black')
ax_pd.set(title='Probability of Default Distribution', xlabel='12-Month PD (%)', ylabel='Frequency')
ax_pd.axvline(weighted_avg_pd, color='black', linestyle='--', label=f'Portfolio Avg: {weighted_avg_pd:.2f}%')
ax_pd.legend()

# Top-right: LGD histogram in percent, with the exposure-weighted average marked.
ax_lgd.hist(risk_df['lgd'] * 100, bins=30, alpha=0.7, color='orange', edgecolor='black')
ax_lgd.set(title='Loss Given Default Distribution', xlabel='LGD (%)', ylabel='Frequency')
ax_lgd.axvline(weighted_avg_lgd, color='black', linestyle='--', label=f'Portfolio Avg: {weighted_avg_lgd:.1f}%')
ax_lgd.legend()

# Bottom-left: total expected loss per income band, in thousands, with a
# value label just above each bar.
income_el = income_risk['Total_EL_£'] / 1000
ax_income.bar(income_el.index, income_el.values, alpha=0.7, color='purple')
ax_income.set(title='Expected Loss by Income Band', xlabel='Income Band', ylabel='Total Expected Loss (£000s)')
label_offset = max(income_el.values) * 0.01
for i, v in enumerate(income_el.values):
    ax_income.text(i, v + label_offset, f'£{v:.0f}k', ha='center')

# Bottom-right: the eight regions with the highest loss rate.
top_regions = region_risk.head(8)
region_positions = range(len(top_regions))
ax_region.barh(region_positions, top_regions['Loss_Rate_%'], alpha=0.7, color='teal')
ax_region.set(title='Top Risk Regions (Loss Rate %)', xlabel='Loss Rate (%)')
ax_region.set_yticks(region_positions)
ax_region.set_yticklabels(top_regions.index)

plt.tight_layout()
plt.show()

## 3. Predictive Modeling for Delinquency Risk

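Build machine learning models that classify whether an account is currently delinquent (`delinquency_status > 0`); the predicted probability is then used as a risk score. Because the target is the current status rather than a later outcome, treat the score as a proxy for future delinquency risk.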

In [None]:
# Prepare data for modeling
def prepare_modeling_data(df):
    """
    Derive the feature matrix and binary target used by the delinquency models.

    Works on a copy of ``df``. Adds the ``is_delinquent`` target (1 when
    ``delinquency_status`` is above 0), two ratio features, and
    label-encoded versions of the three categorical columns.

    Returns
    -------
    tuple
        ``(X, y, feature_cols, model_df)``: the feature frame, the target
        series, the ordered list of feature names, and the enriched copy.
    """
    model_df = df.copy()

    # Target: any days past due counts as delinquent.
    model_df['is_delinquent'] = model_df['delinquency_status'].gt(0).astype(int)

    # Ratio features; the +1 in each denominator avoids division by zero.
    model_df['limit_to_income_proxy'] = model_df['credit_limit'].div(model_df['application_score'] + 1)
    model_df['balance_to_limit'] = model_df['balance'].div(model_df['credit_limit'] + 1)

    # Integer-encode each categorical column with its own encoder.
    for source_col in ('income_band', 'repayment_history', 'region'):
        model_df[f'{source_col}_encoded'] = LabelEncoder().fit_transform(model_df[source_col])

    feature_cols = [
        'application_score', 'credit_limit', 'balance', 'utilization_rate',
        'income_band_encoded', 'repayment_history_encoded', 'region_encoded',
        'limit_to_income_proxy', 'balance_to_limit'
    ]

    return model_df[feature_cols], model_df['is_delinquent'], feature_cols, model_df

X, y, feature_cols, model_df = prepare_modeling_data(risk_df)

# Quick sanity summary of the modelling frame.
print(
    f"Modeling dataset shape: {X.shape}",
    f"Delinquency rate: {y.mean():.3f}",
    f"Features: {feature_cols}",
    sep="\n",
)

In [None]:
# Stratified 70/30 split so both sides keep the portfolio's delinquency mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Candidate classifiers, all seeded for repeatable fits.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss')
}

# Fit each model, score the hold-out set and keep everything needed later
# (fitted model, AUC, hard predictions, positive-class probabilities).
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    model_results[name] = dict(
        model=model,
        auc=auc,
        predictions=y_pred,
        probabilities=y_pred_proba,
    )

    print(f"{name} AUC: {auc:.3f}")
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
# Overlay the hold-out ROC curve of every fitted model.
plt.figure(figsize=(12, 8))

for name, results in model_results.items():
    fpr, tpr, _ = roc_curve(y_test, results['probabilities'])
    plt.plot(fpr, tpr, label=f"{name} (AUC = {results['auc']:.3f})")

# Diagonal reference line for a classifier with no skill.
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Delinquency Prediction Models')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# The model with the highest hold-out AUC is carried into the later cells.
best_model_name = max(model_results, key=lambda k: model_results[k]['auc'])
best_model = model_results[best_model_name]['model']
print(f"\nBest performing model: {best_model_name} (AUC: {model_results[best_model_name]['auc']:.3f})")

In [None]:
# Explain the winning model: importances when the estimator exposes them,
# otherwise signed coefficients.
if hasattr(best_model, 'feature_importances_'):
    feature_importance = (
        pd.DataFrame({'feature': feature_cols, 'importance': best_model.feature_importances_})
        .sort_values('importance', ascending=False)
    )
    bar_positions = range(len(feature_importance))

    plt.figure(figsize=(10, 6))
    plt.barh(bar_positions, feature_importance['importance'])
    plt.yticks(bar_positions, feature_importance['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Feature Importance - {best_model_name}')
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.show()

    print("Top 5 Most Important Features:")
    print(feature_importance.head())

elif hasattr(best_model, 'coef_'):
    feature_importance = (
        pd.DataFrame({'feature': feature_cols, 'coefficient': best_model.coef_[0]})
        .sort_values('coefficient', key=abs, ascending=False)
    )
    bar_positions = range(len(feature_importance))

    # Red bars mark negative coefficients, blue bars non-negative ones.
    colors = np.where(feature_importance['coefficient'] < 0, 'red', 'blue').tolist()

    plt.figure(figsize=(10, 6))
    plt.barh(bar_positions, feature_importance['coefficient'], color=colors, alpha=0.7)
    plt.yticks(bar_positions, feature_importance['feature'])
    plt.xlabel('Coefficient Value')
    plt.title(f'Feature Coefficients - {best_model_name}')
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.gca().invert_yaxis()  # largest absolute coefficient at the top
    plt.tight_layout()
    plt.show()

    print("Feature Coefficients (sorted by absolute value):")
    print(feature_importance)

## 4. Early Warning Indicators

Identify high-risk customers before they become delinquent.

In [None]:
# Score every account with the best model (positive-class probability).
risk_scores = best_model.predict_proba(X)[:, 1]
model_df['risk_score'] = risk_scores

# Segment cut-offs: each value is the highest score that still belongs to
# that segment (75th / 90th / 95th percentile, then 1.0 for the top segment).
risk_thresholds = dict(zip(
    ['Low Risk', 'Medium Risk', 'High Risk'],
    np.percentile(risk_scores, [75, 90, 95]),
))
risk_thresholds['Very High Risk'] = 1.0

def assign_risk_segment(score):
    """Return the first segment whose upper bound in risk_thresholds covers ``score``.

    Scores above the 'High Risk' bound fall through to 'Very High Risk'.
    """
    for segment in ('Low Risk', 'Medium Risk', 'High Risk'):
        if score <= risk_thresholds[segment]:
            return segment
    return 'Very High Risk'

model_df['risk_segment'] = model_df['risk_score'].map(assign_risk_segment)

# Per-segment size, delinquency, balance, expected loss and average profile.
early_warning = (
    model_df.groupby('risk_segment')
    .agg({
        'customer_id': 'count',
        'is_delinquent': ['sum', 'mean'],
        'balance': 'sum',
        'expected_loss': 'sum',
        'utilization_rate': 'mean',
        'application_score': 'mean'
    })
    .round(3)
)
early_warning.columns = [
    'Customer_Count', 'Delinquent_Count', 'Delinquency_Rate',
    'Total_Balance_£', 'Total_Expected_Loss_£', 'Avg_Utilization_%', 'Avg_Score',
]

# Each segment's share of customers, balance and expected loss, in percent.
early_warning['Portfolio_%'] = early_warning['Customer_Count'].div(len(model_df)).mul(100).round(1)
early_warning['Balance_%'] = (
    early_warning['Total_Balance_£'].div(early_warning['Total_Balance_£'].sum()).mul(100).round(1)
)
early_warning['Loss_%'] = (
    early_warning['Total_Expected_Loss_£'].div(early_warning['Total_Expected_Loss_£'].sum()).mul(100).round(1)
)

print("=== EARLY WARNING RISK SEGMENTS ===")
print(early_warning)

# Headline alert: size and balance of the two riskiest segments combined.
print(f"\n=== HIGH RISK ALERTS ===")
high_risk_mask = model_df['risk_segment'].isin(['High Risk', 'Very High Risk'])
high_risk_customers = len(model_df[high_risk_mask])
high_risk_balance = model_df.loc[high_risk_mask, 'balance'].sum()

print(f"High Risk Customers: {high_risk_customers:,} ({high_risk_customers/len(model_df)*100:.1f}% of portfolio)")
print(f"High Risk Balance: £{high_risk_balance/1e6:.1f}M ({high_risk_balance/model_df['balance'].sum()*100:.1f}% of portfolio)")

In [None]:
# Top 10 highest risk customers for immediate attention
high_risk_customers_detail = model_df[model_df['risk_segment'] == 'Very High Risk'].nlargest(10, 'risk_score')
alert_customers = high_risk_customers_detail[[
    'customer_id', 'risk_score', 'application_score', 'credit_limit', 'balance',
    'utilization_rate', 'delinquency_status', 'expected_loss'
]].copy()

print("\n=== TOP 10 CUSTOMERS REQUIRING IMMEDIATE ATTENTION ===")
print(alert_customers.round(3))

# Segments in ascending severity, each tied to a fixed colour. groupby()
# returns segments in alphabetical order, so a positional colour list would
# paint 'High Risk' green and 'Low Risk' yellow; reindexing by this order
# keeps bars and colours aligned. Segments absent from the data are skipped.
segment_colors = {'Low Risk': 'green', 'Medium Risk': 'yellow', 'High Risk': 'orange', 'Very High Risk': 'red'}
segments_present = set(model_df['risk_segment'])
segment_order = [segment for segment in segment_colors if segment in segments_present]
colors = [segment_colors[segment] for segment in segment_order]

# Risk score distribution visualization
plt.figure(figsize=(14, 10))

plt.subplot(2, 2, 1)
plt.hist(model_df['risk_score'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
# risk_thresholds stores the UPPER bound of each segment (see
# assign_risk_segment), so a segment starts at the bound of the segment
# below it. The 'Very High Risk' entry (1.0) is not a boundary and is not drawn.
plt.axvline(risk_thresholds['Low Risk'], color='yellow', linestyle='--', label='Medium Risk Threshold')
plt.axvline(risk_thresholds['Medium Risk'], color='orange', linestyle='--', label='High Risk Threshold')
plt.axvline(risk_thresholds['High Risk'], color='red', linestyle='--', label='Very High Risk Threshold')
plt.xlabel('Risk Score')
plt.ylabel('Frequency')
plt.title('Risk Score Distribution')
plt.legend()

plt.subplot(2, 2, 2)
segment_counts = model_df['risk_segment'].value_counts()
plt.pie(segment_counts.values, labels=segment_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Portfolio Risk Segmentation')

plt.subplot(2, 2, 3)
risk_vs_delinq = (model_df.groupby('risk_segment')['is_delinquent'].mean() * 100).reindex(segment_order)
plt.bar(risk_vs_delinq.index, risk_vs_delinq.values, color=colors, alpha=0.7)
plt.xlabel('Risk Segment')
plt.ylabel('Delinquency Rate (%)')
plt.title('Delinquency Rate by Risk Segment')
plt.xticks(rotation=45)

plt.subplot(2, 2, 4)
loss_by_segment = (model_df.groupby('risk_segment')['expected_loss'].sum() / 1000).reindex(segment_order)
plt.bar(loss_by_segment.index, loss_by_segment.values, color=colors, alpha=0.7)
plt.xlabel('Risk Segment')
plt.ylabel('Expected Loss (£000s)')
plt.title('Expected Loss by Risk Segment')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 5. Risk Monitoring & Trends

Simulate portfolio trends and stress scenarios.

In [None]:
# Simulate portfolio evolution over 12 months
def simulate_portfolio_trends(df, months=12, recession_months=(6, 9), recession_stress=1.5):
    """
    Simulate month-by-month delinquency migration and record portfolio stats.

    Each month the current state is recorded first, then accounts may roll
    one bucket worse (0 -> 30 -> 60 -> 90) with a probability driven by
    their risk score, and PD/LGD/EAD/expected loss are recalculated.
    Accounts never cure, and balances, utilisation and risk segments are
    not changed by the simulation, so ``high_risk_customers`` stays
    constant across months.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``balance``, ``delinquency_status``,
        ``utilization_rate``, ``expected_loss``, ``risk_score`` and
        ``risk_segment``, plus the columns ``calculate_pd_lgd_ead`` needs.
        Not modified.
    months : int, default 12
        Number of months to record.
    recession_months : tuple of (int, int), default (6, 9)
        Inclusive first and last stressed month, numbered like the ``month``
        column of the result (1-based).
    recession_stress : float, default 1.5
        Multiplier applied to risk scores during the recession window.

    Returns
    -------
    pandas.DataFrame
        One row per month with ``month``, ``total_customers``,
        ``total_balance``, ``delinquency_rate``, ``avg_utilization``,
        ``expected_loss`` and ``high_risk_customers``.
    """
    trends = []
    current_portfolio = df.copy()
    recession_start, recession_end = recession_months

    for month in range(months):
        report_month = month + 1

        # Record the state at the start of the month, before any transitions.
        high_risk_mask = current_portfolio['risk_segment'].isin(['High Risk', 'Very High Risk'])
        trends.append({
            'month': report_month,
            'total_customers': len(current_portfolio),
            'total_balance': current_portfolio['balance'].sum(),
            'delinquency_rate': (current_portfolio['delinquency_status'] > 0).mean() * 100,
            'avg_utilization': current_portfolio['utilization_rate'].mean(),
            'expected_loss': current_portfolio['expected_loss'].sum(),
            'high_risk_customers': int(high_risk_mask.sum()),
        })

        # Month-specific private generator: same draws as reseeding the global
        # RNG with 42 + month, without touching the caller's random state.
        rng = np.random.RandomState(42 + month)

        # The stress window is tested on the 1-based reported month so it
        # lines up with the 'month' column (the 0-based loop index would
        # shift the window one month late).
        in_recession = recession_start <= report_month <= recession_end
        stress_factor = recession_stress if in_recession else 1.0

        status = current_portfolio['delinquency_status'].to_numpy()
        risk = current_portfolio['risk_score'].to_numpy() * stress_factor

        # Who can roll this month: current accounts only when stressed risk
        # exceeds 0.3; accounts at 30 or 60 DPD always.
        from_current = (status == 0) & (risk > 0.3)
        from_30 = status == 30
        from_60 = status == 60
        eligible = from_current | from_30 | from_60

        # One uniform draw per eligible account, in row order.
        draws = np.ones(len(status))
        draws[eligible] = rng.random_sample(int(eligible.sum()))

        # Roll probability grows with the bucket: risk/5, risk/3, risk/2.
        roll_prob = np.select([from_current, from_30, from_60], [risk / 5, risk / 3, risk / 2], default=0.0)
        next_status = np.select([from_current, from_30, from_60], [30, 60, 90], default=status)
        rolls = eligible & (draws < roll_prob)
        current_portfolio['delinquency_status'] = np.where(rolls, next_status, status)

        # Recalculate expected loss with new delinquency status
        current_portfolio = calculate_pd_lgd_ead(current_portfolio)

    return pd.DataFrame(trends)

# Run the default 12-month simulation on the scored portfolio.
portfolio_trends = simulate_portfolio_trends(model_df)

print("=== SIMULATED PORTFOLIO TRENDS (12 MONTHS) ===")
print(portfolio_trends.round(2))

In [None]:
# Four trend panels drawn from one specification table.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Portfolio Risk Trends - 12 Month Simulation', fontsize=16, fontweight='bold')

# (axis, series to plot, line colour, title, y-label, shade months 6-9?)
trend_panels = [
    (axes[0, 0], portfolio_trends['delinquency_rate'], 'red',
     'Delinquency Rate Trend', 'Delinquency Rate (%)', True),
    (axes[0, 1], portfolio_trends['expected_loss'] / 1000, 'orange',
     'Expected Loss Trend', 'Expected Loss (£000s)', True),
    (axes[1, 0], portfolio_trends['total_balance'] / 1e6, 'blue',
     'Portfolio Balance Trend', 'Total Balance (£M)', False),
    (axes[1, 1], portfolio_trends['high_risk_customers'], 'purple',
     'High Risk Customers Trend', 'High Risk Customer Count', True),
]

for ax, series, line_color, panel_title, y_label, shade_recession in trend_panels:
    ax.plot(portfolio_trends['month'], series, marker='o', linewidth=2, color=line_color)
    if shade_recession:
        # Grey band from zero to 10% above the series peak.
        ax.fill_between([6, 9], 0, series.max() * 1.1,
                        alpha=0.2, color='gray', label='Recession Period')
    ax.set_title(panel_title)
    ax.set_xlabel('Month')
    ax.set_ylabel(y_label)
    if shade_recession:
        ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Stress impact: peak value over the simulation relative to the first month.
baseline_delinq = portfolio_trends['delinquency_rate'].iloc[0]
peak_delinq = portfolio_trends['delinquency_rate'].max()
stress_increase = ((peak_delinq - baseline_delinq) / baseline_delinq * 100)

baseline_loss = portfolio_trends['expected_loss'].iloc[0]
peak_loss = portfolio_trends['expected_loss'].max()
loss_increase = ((peak_loss - baseline_loss) / baseline_loss * 100)

print(
    f"\n=== STRESS TEST INSIGHTS ===",
    f"Baseline Delinquency Rate: {baseline_delinq:.2f}%",
    f"Peak Stress Delinquency Rate: {peak_delinq:.2f}%",
    f"Stress Impact on Delinquency: +{stress_increase:.1f}%",
    f"Stress Impact on Expected Loss: +{loss_increase:.1f}%",
    sep="\n",
)

In [None]:
# Persist the three outputs of the analysis next to the input data.
risk_export_columns = [
    'customer_id', 'application_score', 'credit_limit', 'balance', 'utilization_rate',
    'income_band', 'region', 'delinquency_status', 'repayment_history',
    'pd_12m', 'lgd', 'ead', 'expected_loss', 'risk_score', 'risk_segment'
]

# Account-level dataset enriched with the risk metrics.
model_df[risk_export_columns].to_csv('../data/portfolio_with_risk_metrics.csv', index=False)

# Segment summary keeps its index (the segment name) as the first column.
early_warning.to_csv('../data/risk_segment_summary.csv')

# Month-by-month simulation results.
portfolio_trends.to_csv('../data/portfolio_trends_simulation.csv', index=False)

# Completion checklist.
print(
    "=== RISK ANALYSIS COMPLETE ===",
    f"✓ Delinquency roll rates calculated",
    f"✓ PD/LGD/EAD metrics computed for {len(model_df):,} customers",
    f"✓ ML models trained - Best: {best_model_name} (AUC: {model_results[best_model_name]['auc']:.3f})",
    f"✓ {high_risk_customers:,} high-risk customers identified",
    f"✓ 12-month portfolio trends simulated",
    f"✓ Risk analysis results saved to ../data/",
    sep="\n",
)

# Headline figures carried over from the earlier cells.
print(
    f"\n=== KEY RISK METRICS ===",
    f"Portfolio Loss Rate: {portfolio_loss_rate:.2f}%",
    f"Expected Annual Loss: £{total_expected_loss/1e6:.2f}M",
    f"Stress Test Peak Loss: £{peak_loss/1e6:.2f}M (+{loss_increase:.1f}%)",
    sep="\n",
)