# Credit Portfolio Data Simulation

This notebook generates a realistic credit portfolio dataset for risk analysis and strategy simulation.

**Portfolio Characteristics:**
- Portfolio Size: £1.3B (target value)
- Customer Count: ~3M in production (sampled down to 50,000 for this demo)
- Product Type: Retail Credit Cards
- Geographic Coverage: UK Regions

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Faker instance configured with the en_GB locale.
# NOTE(review): `fake` is not referenced in the cells visible here — confirm it is still needed.
fake = Faker('en_GB')
# Seed NumPy's global RNG, the stdlib `random` module and Faker with the same
# fixed value so repeated runs of the notebook draw the same random numbers.
np.random.seed(42)
random.seed(42)
Faker.seed(42)

In [None]:
# Configuration
# Number of simulated applicants (demo size; the production book is ~3M).
N_CUSTOMERS = 50_000
# Target portfolio balance in pounds (£1.3B).
PORTFOLIO_VALUE = 1_300_000_000.0

# UK regions used for the geographic split.
UK_REGIONS = [
    'London',
    'South East',
    'North West',
    'East of England',
    'West Midlands',
    'South West',
    'Yorkshire',
    'North East',
    'East Midlands',
    'Wales',
    'Scotland',
    'Northern Ireland',
]

# Income band -> sampling probability; kept in one mapping so labels and
# weights cannot drift out of alignment.
_INCOME_DISTRIBUTION = {'Low': 0.4, 'Medium': 0.45, 'High': 0.15}
INCOME_BANDS = list(_INCOME_DISTRIBUTION.keys())
INCOME_WEIGHTS = list(_INCOME_DISTRIBUTION.values())

print(f"Simulating portfolio with {N_CUSTOMERS:,} customers")
print(f"Target portfolio value: £{PORTFOLIO_VALUE/1e9:.1f}B")

In [None]:
def generate_customer_data(n_customers):
    """
    Generate a synthetic credit-card applicant dataset with correlated risk fields.

    Each applicant is drawn independently from NumPy's global RNG: an income
    band is sampled first, an application score is drawn from a band-specific
    normal distribution, and an approve/decline decision is drawn from a
    logistic function of the score. Approved applicants then receive a credit
    limit, balance, utilisation, repayment history, delinquency bucket and
    marketing response; declined applicants get zero/placeholder values.

    Args:
        n_customers: Number of applicant records to generate.

    Returns:
        pandas.DataFrame with one row per applicant and the columns
        customer_id, application_score, credit_limit, balance,
        utilization_rate (as a percentage, 1 decimal place),
        repayment_history, delinquency_status (days past due), income_band,
        region, marketing_offer_response and acceptance_decision. The columns
        are present even when ``n_customers`` is 0.
    """
    columns = [
        'customer_id',
        'application_score',
        'credit_limit',
        'balance',
        'utilization_rate',
        'repayment_history',
        'delinquency_status',
        'income_band',
        'region',
        'marketing_offer_response',
        'acceptance_decision',
    ]

    # Per income band: (score mean, score std dev,
    #                   limit multiplier low, limit multiplier high,
    #                   marketing response probability)
    band_params = {
        'High': (750, 100, 1.2, 2.0, 0.3),
        'Medium': (650, 120, 0.8, 1.5, 0.2),
        'Low': (550, 140, 0.5, 1.2, 0.15),
    }

    repayment_outcomes = ['good', 'late', 'default']

    # Per repayment history: (days-past-due buckets, bucket probabilities)
    delinquency_by_history = {
        'good': ([0, 30], [0.95, 0.05]),
        'late': ([0, 30, 60], [0.6, 0.3, 0.1]),
        'default': ([60, 90], [0.3, 0.7]),
    }

    customers = []

    # NOTE: the order of the RNG calls below is deliberate — changing it would
    # change the dataset produced for a given seed.
    for customer_id in range(1, n_customers + 1):
        # Income band drives the score, limit and marketing parameters.
        income_band = np.random.choice(INCOME_BANDS, p=INCOME_WEIGHTS)
        # Any band without an entry falls back to the 'Low' parameters.
        score_mean, score_std, limit_lo, limit_hi, marketing_prob = band_params.get(
            income_band, band_params['Low']
        )

        # Application score (correlated with income), bounded to 300-900.
        application_score = np.random.normal(score_mean, score_std)
        application_score = np.clip(application_score, 300, 900)

        # Acceptance decision: logistic in the score, centred on 500.
        acceptance_prob = 1 / (1 + np.exp(-(application_score - 500) / 100))
        approved = np.random.random() < acceptance_prob
        acceptance_decision = 'Approved' if approved else 'Declined'

        if approved:
            # Credit limit: the base is £0-£30k before the income multiplier,
            # then bounded to £500-£50k.
            base_limit = (application_score - 300) * 50
            credit_limit = base_limit * np.random.uniform(limit_lo, limit_hi)
            credit_limit = np.clip(credit_limit, 500, 50000)

            # Utilisation: a Beta(2, 3) draw scaled by a score-dependent cap,
            # so lower scores can reach higher utilisation. The cap is an upper
            # bound on the rate, not its mean.
            utilization_cap = max(0.1, 0.9 - (application_score - 300) / 600)
            utilization_rate = np.random.beta(2, 3) * utilization_cap
            utilization_rate = np.clip(utilization_rate, 0, 1)

            balance = credit_limit * utilization_rate

            # Repayment history: better scores shift probability towards 'good'.
            if application_score > 700:
                repayment_probs = [0.85, 0.13, 0.02]
            elif application_score > 600:
                repayment_probs = [0.70, 0.25, 0.05]
            else:
                repayment_probs = [0.55, 0.30, 0.15]
            repayment_history = np.random.choice(repayment_outcomes, p=repayment_probs)

            # Delinquency bucket (days past due) conditioned on repayment history.
            buckets, bucket_probs = delinquency_by_history.get(
                repayment_history, delinquency_by_history['default']
            )
            delinquency_status = np.random.choice(buckets, p=bucket_probs)

            # Marketing response, more likely for higher income bands.
            responded = np.random.random() < marketing_prob
            marketing_offer_response = 'Yes' if responded else 'No'
        else:
            # Declined applicants carry no portfolio exposure.
            credit_limit = 0
            balance = 0
            utilization_rate = 0
            # NOTE: pandas.read_csv treats 'N/A' as missing by default, so this
            # sentinel comes back as NaN if the saved CSV is reloaded without
            # keep_default_na=False.
            repayment_history = 'N/A'
            delinquency_status = 0
            marketing_offer_response = 'No'

        # Region (uniform distribution)
        region = np.random.choice(UK_REGIONS)

        customers.append({
            'customer_id': customer_id,
            'application_score': round(application_score),
            'credit_limit': round(credit_limit, 2),
            'balance': round(balance, 2),
            'utilization_rate': round(utilization_rate * 100, 1),
            'repayment_history': repayment_history,
            'delinquency_status': delinquency_status,
            'income_band': income_band,
            'region': region,
            'marketing_offer_response': marketing_offer_response,
            'acceptance_decision': acceptance_decision,
        })

        if customer_id % 10000 == 0:
            print(f"Generated {customer_id:,} customers...")

    # Passing the column list keeps the schema intact when no rows were generated.
    return pd.DataFrame(customers, columns=columns)

# Run the generator for the configured population and report the row count.
print("Starting data generation...")
portfolio_df = generate_customer_data(n_customers=N_CUSTOMERS)
generated_rows = portfolio_df.shape[0]
print(f"\nGenerated {generated_rows:,} total records")

In [None]:
# Portfolio Summary Statistics
# Only approved applicants carry limits and balances, so the portfolio metrics
# are computed on that subset.
approved_customers = portfolio_df[portfolio_df['acceptance_decision'] == 'Approved']
total_portfolio_value = approved_customers['balance'].sum()
total_credit_limit = approved_customers['credit_limit'].sum()
# Unweighted mean of the per-customer utilisation percentages.
avg_utilization = approved_customers['utilization_rate'].mean()
# Share of approved customers with any days past due, as a percentage.
delinquency_rate = (approved_customers['delinquency_status'] > 0).mean() * 100
# Guarded so an empty dataset reports 0% instead of raising ZeroDivisionError.
approval_rate = len(approved_customers) / len(portfolio_df) * 100 if len(portfolio_df) else 0.0

print("=== PORTFOLIO SUMMARY ===")
print(f"Total Customers: {len(portfolio_df):,}")
print(f"Approved Customers: {len(approved_customers):,}")
print(f"Approval Rate: {approval_rate:.1f}%")
print(f"Total Portfolio Balance: £{total_portfolio_value/1e6:.1f}M")
print(f"Total Credit Limits: £{total_credit_limit/1e6:.1f}M")
print(f"Average Utilization: {avg_utilization:.1f}%")
print(f"Portfolio Delinquency Rate: {delinquency_rate:.1f}%")

# Report the multiplier needed to reach the target portfolio size. The factor
# is only printed here; balances are not rescaled in this cell.
if total_portfolio_value > 0:
    scaling_factor = PORTFOLIO_VALUE / total_portfolio_value
    print(f"\nScaling factor to reach £{PORTFOLIO_VALUE/1e9:.1f}B: {scaling_factor:.2f}x")
else:
    # No simulated balance to scale from, so the factor is undefined.
    scaling_factor = float('nan')
    print("\nScaling factor undefined: simulated portfolio balance is zero")

In [None]:
# Data Quality Checks
# Compute the per-column diagnostics first, then print them in one place.
missing_by_column = portfolio_df.isnull().sum()
dtype_by_column = portfolio_df.dtypes

print("=== DATA QUALITY CHECKS ===")
print(f"Missing values:\n{missing_by_column}")
print(f"\nData types:\n{dtype_by_column}")
print("\nUnique values per column:")
# Cardinality of every column, one line each.
for column_name in portfolio_df.columns:
    distinct_count = portfolio_df[column_name].nunique()
    print(f"{column_name}: {distinct_count}")

In [None]:
# Visualize Portfolio Distribution
# 2x3 grid: histograms on the top row, categorical bar charts on the bottom row.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Credit Portfolio Distribution Analysis', fontsize=16)

# Top row: three histograms sharing the same binning and styling.
# Each spec is (axis, data, extra hist kwargs, title, x-axis label).
histogram_specs = [
    (axes[0, 0], portfolio_df['application_score'], {},
     'Application Score Distribution', 'Application Score'),
    (axes[0, 1], approved_customers['credit_limit'], {'color': 'green'},
     'Credit Limit Distribution (Approved)', 'Credit Limit (£)'),
    (axes[0, 2], approved_customers['utilization_rate'], {'color': 'orange'},
     'Utilization Rate Distribution', 'Utilization Rate (%)'),
]
for hist_ax, hist_values, extra_kwargs, hist_title, x_label in histogram_specs:
    hist_ax.hist(hist_values, bins=30, alpha=0.7, edgecolor='black', **extra_kwargs)
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel(x_label)
    hist_ax.set_ylabel('Frequency')

# Bottom-left: applicants per income band (all applicants).
income_counts = portfolio_df['income_band'].value_counts()
income_ax = axes[1, 0]
income_ax.bar(income_counts.index, income_counts.values, alpha=0.7, color='purple')
income_ax.set_title('Income Band Distribution')
income_ax.set_xlabel('Income Band')
income_ax.set_ylabel('Count')

# Bottom-middle: approved customers per days-past-due bucket, in ascending order.
delinq_counts = approved_customers['delinquency_status'].value_counts().sort_index()
delinq_ax = axes[1, 1]
delinq_ax.bar(delinq_counts.index.astype(str), delinq_counts.values, alpha=0.7, color='red')
delinq_ax.set_title('Delinquency Status Distribution')
delinq_ax.set_xlabel('Days Past Due')
delinq_ax.set_ylabel('Count')

# Bottom-right: applicants per region; bars sit at integer positions so the
# long region names can be attached as rotated tick labels.
region_counts = portfolio_df['region'].value_counts()
region_positions = range(len(region_counts))
region_ax = axes[1, 2]
region_ax.bar(region_positions, region_counts.values, alpha=0.7, color='teal')
region_ax.set_title('Regional Distribution')
region_ax.set_xticks(region_positions)
region_ax.set_xticklabels(region_counts.index, rotation=45, ha='right')
region_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Risk Correlation Analysis
# Numeric risk drivers for approved customers, plus a 0/1 flag marking any
# account with days past due.
risk_columns = ['application_score', 'credit_limit', 'balance', 'utilization_rate']
is_delinquent = approved_customers['delinquency_status'] > 0
correlation_df = approved_customers[risk_columns].copy()
correlation_df['delinquency_flag'] = is_delinquent.astype(int)

# Pairwise correlations between the drivers and the flag.
correlation_matrix = correlation_df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('Risk Factor Correlation Matrix')
plt.tight_layout()
plt.show()

# Pull out the two headline relationships with delinquency.
score_vs_delinquency = correlation_matrix.loc['application_score', 'delinquency_flag']
utilization_vs_delinquency = correlation_matrix.loc['utilization_rate', 'delinquency_flag']

print("Key Correlations:")
print(f"Application Score vs Delinquency: {score_vs_delinquency:.3f}")
print(f"Utilization vs Delinquency: {utilization_vs_delinquency:.3f}")

In [None]:
# Save the dataset
from pathlib import Path

output_path = '../data/credit_portfolio.csv'
output_file = Path(output_path)
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
output_file.parent.mkdir(parents=True, exist_ok=True)
portfolio_df.to_csv(output_path, index=False)
print(f"Portfolio dataset saved to: {output_path}")
print(f"Dataset shape: {portfolio_df.shape}")
# Report the size of the file actually written; the DataFrame's in-memory
# footprint is a different number and is shown separately.
print(f"File size: {output_file.stat().st_size / 1024 / 1024:.1f} MB")
print(f"In-memory size: {portfolio_df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

# Display sample records (`display` is provided by the IPython/Jupyter runtime).
print("\nSample records:")
display(portfolio_df.head(10))

print("\n=== DATA GENERATION COMPLETE ===")
print(f"✓ Generated {len(portfolio_df):,} customer records")
print(f"✓ Portfolio balance: £{approved_customers['balance'].sum()/1e6:.1f}M")
print("✓ Realistic risk correlations established")
print("✓ Dataset ready for risk analysis and strategy simulation")