In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

In [2]:
# Load the dataset
file_path = 'b_df_3_encoded_loan_data.csv'  # Make sure this file is accessible in the same directory or update the path accordingly
df = pd.read_csv(file_path)

In [3]:
# Calculate total amount to be paid over loan term including interest for the entire dataset
df['total_amount_due'] = df['instalment'] * df['term']

# Calculate total payments received to date for the entire dataset
if 'total_payment' in df.columns:
    df['total_payments_received'] = df['total_payment']
elif 'last_payment_amount' in df.columns:
    df['total_payments_received'] = df['last_payment_amount']
else:
    df['total_payments_received'] = 0  # If neither column exists, set default to 0

# Filtering out charged off loans based on 'loan_status' column (if available)
if 'loan_status' in df.columns:
    df_filtered = df[df['loan_status'] != 1]
else:
    df_filtered = df  # If 'loan_status' is not available, use entire dataset

# Task 1: Current state of the loans
# Calculate the percentage of loans recovered compared to the total amount due
df_filtered['percentage_recovered'] = (df_filtered['total_payments_received'] / df_filtered['total_amount_due']) * 100

# Aggregating the percentage of loans recovered for the entire dataset
total_recovered = df_filtered['total_payments_received'].sum()
total_due = df_filtered['total_amount_due'].sum()
overall_percentage_recovered = (total_recovered / total_due) * 100

print(f"Overall percentage of loans recovered: {overall_percentage_recovered:.2f}%")

# Converting dates to datetime format
df_filtered['issue_date'] = pd.to_datetime(df_filtered['issue_date'], format='%Y-%m-%d')
df_filtered['last_payment_date'] = pd.to_datetime(df_filtered['last_payment_date'], format='%Y-%m-%d')

# Calculate the number of months since the issue date
df_filtered['months_so_far'] = ((df_filtered['last_payment_date'] - df_filtered['issue_date']).dt.days / 30).fillna(0).astype(int)

# Calculate months remaining in the loan term
df_filtered['months_remaining'] = df_filtered['term'] - df_filtered['months_so_far']

# Calculating expected payment over the next 6 months considering remaining term
def calculate_six_months_payment(row):
    if row['months_remaining'] <= 0:
        return 0  # Loan is fully paid off
    elif row['months_remaining'] < 6:
        return row['instalment'] * row['months_remaining']  # Less than 6 months remaining
    else:
        return row['instalment'] * 6  # At least 6 months remaining

# Apply the calculation
df_filtered['six_months_payment'] = df_filtered.apply(calculate_six_months_payment, axis=1)

# Aggregating the total expected payment over the next 6 months
total_six_months_payment = df_filtered['six_months_payment'].sum()
print(f"Total expected payment in the next 6 months: ${total_six_months_payment:,.2f}")

# Task 2: Percentage of loans that have been a loss to the company
# Loans marked as 'Charged Off' in the loan_status column represent a loss to the company.
# Assuming 'Charged Off' was encoded as 1 based on the transformations observed in the notebook.

# Filtering the charged off loans
charged_off_loans = df[df['loan_status'] == 1]

# Calculate the percentage of loans that are charged off
num_charged_off = charged_off_loans.shape[0]
total_loans = df.shape[0]
percentage_charged_off = (num_charged_off / total_loans) * 100

# Calculate the total amount that was paid towards these loans before being charged off
total_paid_charged_off = charged_off_loans['total_payments_received'].sum()

print(f"\nPercentage of loans that have been charged off: {percentage_charged_off:.2f}%")
print(f"Total amount paid towards charged off loans: ${total_paid_charged_off:,.2f}")

# Task 3: Calculating projected loss
# Calculate the expected loss of the loans marked as Charged Off.
# Calculate the loss in revenue these loans would have generated for the company if they had finished their term.

# Calculate the total expected amount due for the charged off loans
charged_off_loans = charged_off_loans.copy()  # Avoid SettingWithCopyWarning by working with a copy
charged_off_loans['expected_amount_due'] = charged_off_loans['total_amount_due']

# Calculate the expected loss by subtracting the amount paid from the total amount due
charged_off_loans['expected_loss'] = charged_off_loans['expected_amount_due'] - charged_off_loans['total_payments_received']

# Calculate the total expected loss
total_expected_loss = charged_off_loans['expected_loss'].sum()
print(f"\nTotal expected loss from charged off loans: ${total_expected_loss:,.2f}")

# Task 4: Possible loss
# Customers who are currently behind with their loan payments represent a risk to company revenue.
# Assuming 'Late' is encoded as 2 in the loan_status column.

# Filtering loans that are late
late_loans = df[df['loan_status'] == 2]

# Calculate the percentage of loans that are late
num_late_loans = late_loans.shape[0]
percentage_late_loans = (num_late_loans / total_loans) * 100

# Calculate the total amount that was paid towards these loans so far
total_paid_late_loans = late_loans['total_payments_received'].sum()

# Calculate the total expected amount due for the late loans
late_loans = late_loans.copy()
late_loans['expected_amount_due'] = late_loans['total_amount_due']

# Calculate the expected loss by subtracting the amount paid from the total amount due
late_loans['expected_loss'] = late_loans['expected_amount_due'] - late_loans['total_payments_received']

# Calculate the total expected loss for late loans
total_expected_loss_late = late_loans['expected_loss'].sum()

print(f"\nPercentage of loans that are currently late: {percentage_late_loans:.2f}%")
print(f"Total amount paid towards late loans: ${total_paid_late_loans:,.2f}")
print(f"Total expected loss from late loans: ${total_expected_loss_late:,.2f}")

# Task 4: Combined Loss Analysis
# Calculate the combined percentage of total expected revenue loss from both charged off and late loans

# Calculate the total expected revenue for the entire dataset
total_expected_revenue = df['total_amount_due'].sum()

# Calculate the combined expected loss
combined_expected_loss = total_expected_loss + total_expected_loss_late

# Calculate the combined percentage of total expected revenue loss
combined_percentage_loss = (combined_expected_loss / total_expected_revenue) * 100

print(f"\nCombined expected loss from charged off and late loans: ${combined_expected_loss:,.2f}")
print(f"Combined percentage of total expected revenue loss: {combined_percentage_loss:.2f}%")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['percentage_recovered'] = (df_filtered['total_payments_received'] / df_filtered['total_amount_due']) * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['issue_date'] = pd.to_datetime(df_filtered['issue_date'], format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Overall percentage of loans recovered: 79.02%
Total expected payment in the next 6 months: $66,356,966.27

Percentage of loans that have been charged off: 35.53%
Total amount paid towards charged off loans: $212,459,097.82

Total expected loss from charged off loans: $167,704,298.39

Percentage of loans that are currently late: 0.10%
Total amount paid towards late loans: $497,390.48
Total expected loss from late loans: $553,295.37

Combined expected loss from charged off and late loans: $168,257,593.76
Combined percentage of total expected revenue loss: 17.89%


In [4]:
# Visualization using Plotly with gradient colors using coloraxis
# Plotly for Percentage of Loans Recovered
fig1 = px.histogram(df, x='percentage_recovered', nbins=30, title='Distribution of Loan Recovery Percentage',
                   labels={'percentage_recovered': 'Percentage of Loan Recovered'}, template='plotly_white')
fig1.update_traces(marker=dict(colorscale='Blues', coloraxis='coloraxis'))
fig1.update_layout(coloraxis=dict(colorscale='Blues', colorbar_title='Recovery Percentage'))
fig1.show()

# Plotly for Projected Loss for Charged Off Loans
fig2 = px.histogram(charged_off_loans, x='expected_loss', nbins=30, title='Projected Loss for Charged Off Loans',
                   labels={'expected_loss': 'Projected Loss ($)'}, template='plotly_white')
fig2.update_traces(marker=dict(colorscale='Blues', coloraxis='coloraxis'))
fig2.update_layout(coloraxis=dict(colorscale='Blues', colorbar_title='Loss Amount'))
fig2.show()

# Plotly for Projected Loss for Late Loans
fig3 = px.histogram(late_loans, x='expected_loss', nbins=30, title='Projected Loss for Late Loans',
                   labels={'expected_loss': 'Projected Loss ($)'}, template='plotly_white')
fig3.update_traces(marker=dict(colorscale='Blues', coloraxis='coloraxis'))
fig3.update_layout(coloraxis=dict(colorscale='Blues', colorbar_title='Loss Amount'))
fig3.show()

# Plotly for Combined Projected Loss for Charged Off and Late Loans
combined_loss_df = pd.concat([charged_off_loans[['expected_loss']], late_loans[['expected_loss']]])
fig4 = px.histogram(combined_loss_df, x='expected_loss', nbins=30, title='Combined Projected Loss for Charged Off and Late Loans',
                   labels={'expected_loss': 'Combined Projected Loss ($)'}, template='plotly_white')
fig4.update_traces(marker=dict(colorscale='Blues', coloraxis='coloraxis'))
fig4.update_layout(coloraxis=dict(colorscale='Blues', colorbar_title='Combined Loss Amount'))
fig4.show()

# Plotly for Expected Payments in the Next 6 Months
fig5 = px.histogram(df, x='six_months_payment', nbins=30, title='Expected Payments in the Next 6 Months',
                   labels={'six_months_payment': 'Expected Payment in Next 6 Months ($)'}, template='plotly_white')
fig5.update_traces(marker=dict(colorscale='Blues', coloraxis='coloraxis'))
fig5.update_layout(coloraxis=dict(colorscale='Blues', colorbar_title='Payments'))
fig5.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['id', 'member_id', 'loan_amount', 'funded_amount', 'funded_amount_inv', 'term', 'int_rate', 'instalment', 'grade', 'sub_grade', 'employment_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_date', 'loan_status', 'payment_plan', 'purpose', 'dti', 'delinq_2yrs', 'earliest_credit_line', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_accounts', 'total_accounts', 'out_prncp', 'out_prncp_inv', 'total_payment', 'total_payment_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_payment_date', 'last_payment_amount', 'last_credit_pull_date', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'total_amount_due', 'total_payments_received'] but received: percentage_recovered

In [None]:
# Create subsets for different loan statuses
charged_off_loans = df[df['loan_status'] == 1]
late_loans = df[df['loan_status'] == 2]
current_loans = df[df['loan_status'] == 0]  # 0 is for current loans

# Analysis 1: Grade Distribution
fig1 = px.histogram(df, x='grade', color='loan_status',
                   title='Loan Grade Distribution by Status',
                   labels={'grade': 'Loan Grade', 'count': 'Number of Loans'},
                   barmode='group',
                   color_discrete_map={0: 'green', 1: 'red', 2: 'yellow'})
fig1.show()

# Analysis 2: Purpose Distribution
fig2 = px.histogram(df, x='purpose', color='loan_status',
                   title='Loan Purpose Distribution by Status',
                   labels={'purpose': 'Loan Purpose', 'count': 'Number of Loans'},
                   barmode='group',
                   color_discrete_map={0: 'green', 1: 'red', 2: 'yellow'})
fig2.show()

# Analysis 3: Home Ownership Impact
fig3 = px.histogram(df, x='home_ownership', color='loan_status',
                   title='Home Ownership Distribution by Loan Status',
                   labels={'home_ownership': 'Home Ownership Status', 'count': 'Number of Loans'},
                   barmode='group',
                   color_discrete_map={0: 'green', 1: 'red', 2: 'yellow'})
fig3.show()

# Analysis 4: DTI Ratio Distribution
fig4 = px.box(df, x='loan_status', y='dti',
              title='Debt-to-Income Ratio Distribution by Loan Status',
              labels={'dti': 'Debt-to-Income Ratio', 'loan_status': 'Loan Status'})
fig4.show()

# Analysis 5: Interest Rate vs Loan Status
fig5 = px.box(df, x='loan_status', y='int_rate',
              title='Interest Rate Distribution by Loan Status',
              labels={'int_rate': 'Interest Rate (%)', 'loan_status': 'Loan Status'})
fig5.show()

# Calculate correlation matrix for numerical columns
correlation_cols = ['loan_amount', 'int_rate', 'instalment', 'dti', 
                   'annual_inc', 'delinq_2yrs', 'inq_last_6mths', 'loan_status']
correlation_matrix = df[correlation_cols].corr()

# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Key Features')
plt.show()

# Calculate and print risk factors
print("\nRISK FACTOR ANALYSIS\n")
print("\nAverage DTI Ratio\n")
print(f"Charged Off Loans: {charged_off_loans['dti'].mean():.2f}")
print(f"Late Loans: {late_loans['dti'].mean():.2f}")
print(f"Current Loans: {current_loans['dti'].mean():.2f}")

print("\n\nAverage Interest Rate\n")
print(f"Charged Off Loans: {charged_off_loans['int_rate'].mean():.2f}%")
print(f"Late Loans: {late_loans['int_rate'].mean():.2f}%")
print(f"Current Loans: {current_loans['int_rate'].mean():.2f}%")

In [None]:
# Create more detailed analysis of DTI and Interest Rate distributions

# Create subplots for DTI and Interest Rate analysis
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('DTI Distribution by Loan Status', 'Interest Rate Distribution by Loan Status'),
    vertical_spacing=0.15
)

# Add violin plots for DTI
fig.add_trace(
    go.Violin(x=[0]*len(current_loans), y=current_loans['dti'], name='Current',
              side='negative', line_color='green', meanline=dict(visible=True)),
    row=1, col=1
)
fig.add_trace(
    go.Violin(x=[1]*len(charged_off_loans), y=charged_off_loans['dti'], name='Charged Off',
              side='negative', line_color='red', meanline=dict(visible=True)),
    row=1, col=1
)
fig.add_trace(
    go.Violin(x=[2]*len(late_loans), y=late_loans['dti'], name='Late',
              side='negative', line_color='yellow', meanline=dict(visible=True)),
    row=1, col=1
)

# Add violin plots for Interest Rate
fig.add_trace(
    go.Violin(x=[0]*len(current_loans), y=current_loans['int_rate'], name='Current',
              side='negative', line_color='green', meanline=dict(visible=True)),
    row=2, col=1
)
fig.add_trace(
    go.Violin(x=[1]*len(charged_off_loans), y=charged_off_loans['int_rate'], name='Charged Off',
              side='negative', line_color='red', meanline=dict(visible=True)),
    row=2, col=1
)
fig.add_trace(
    go.Violin(x=[2]*len(late_loans), y=late_loans['int_rate'], name='Late',
              side='negative', line_color='yellow', meanline=dict(visible=True)),
    row=2, col=1
)

# Update layout
fig.update_layout(
    height=800,
    width=1000,
    title_text="Detailed Distribution Analysis of Risk Factors",
    showlegend=True
)

# Update x-axes labels
fig.update_xaxes(title_text="Loan Status", row=1, col=1)
fig.update_xaxes(title_text="Loan Status", row=2, col=1)

# Update y-axes labels
fig.update_yaxes(title_text="DTI Ratio", row=1, col=1)
fig.update_yaxes(title_text="Interest Rate (%)", row=2, col=1)

fig.show()

# Print statistical summary
print("\nDetailed Risk Factor Statistics:")
print("\nDTI Ratio Statistics:")
for status, name in [(current_loans, "Current"), (charged_off_loans, "Charged Off"), (late_loans, "Late")]:
    print(f"\n{name} Loans:")
    print(f"Mean DTI: {status['dti'].mean():.2f}")
    print(f"Median DTI: {status['dti'].median():.2f}")
    print(f"DTI Std Dev: {status['dti'].std():.2f}")

print("\nInterest Rate Statistics:")
for status, name in [(current_loans, "Current"), (charged_off_loans, "Charged Off"), (late_loans, "Late")]:
    print(f"\n{name} Loans:")
    print(f"Mean Interest Rate: {status['int_rate'].mean():.2f}%")
    print(f"Median Interest Rate: {status['int_rate'].median():.2f}%")
    print(f"Interest Rate Std Dev: {status['int_rate'].std():.2f}%")

# Key Findings from the Risk Factor Analysis

## 1. DTI (Debt-to-Income) Ratio
- Both Charged Off (4.08) and Late loans (4.05) show higher average DTI ratios compared to Current loans (3.90)
- This suggests that borrowers with higher debt relative to their income are more likely to default
- The difference is relatively small (~0.15-0.18 points) but consistent

## 2. Interest Rates
- Late loans have the highest average interest rate (3.94%)
- Charged Off loans actually have the lowest average rate (3.69%) 
- Current loans fall in the middle (3.85%)
- This is an interesting finding as it suggests that interest rate alone may not be the best predictor of default

## Recommendations

### DTI-Based Risk Assessment
- Consider implementing stricter DTI thresholds in loan approval processes
- Pay extra attention to loans where DTI exceeds 4.0
- Consider additional monitoring for borrowers with higher DTI ratios

### Interest Rate Considerations  
- The relationship between interest rates and default risk appears more complex than expected
- Other factors may be more important in predicting default risk
- Consider developing a more comprehensive risk assessment model that includes multiple factors

### Monitoring Strategy
- Set up early warning systems for loans where DTI exceeds the average of charged-off loans (4.08)
- Look for combinations of risk factors rather than relying on single metrics
- Consider implementing more frequent check-ins with borrowers who have higher DTI ratios

## Key Observations

1. **DTI Distribution**:
   - The median DTI is consistently higher than the mean across all loan statuses
   - Late loans show the highest median DTI (4.35)
   - The spread of DTI values is relatively consistent (std dev ~1.0-1.1)

2. **Interest Rate Patterns**:
   - Late loans show the highest median and mean interest rates
   - Interest rates are more tightly clustered (lower std dev) than DTI ratios
   - Charged off loans surprisingly show the lowest interest rates

In [None]:
# 1. Sub-grade Analysis
# Create sub-grade mapping (0-34 for A1-G5)
def create_subgrade_mapping():
    grades = 'ABCDEFG'
    subgrades = {}
    counter = 0
    for grade in grades:
        for num in range(1, 6):
            subgrades[counter] = f'{grade}{num}'
            counter += 1
    return subgrades

subgrade_mapping = create_subgrade_mapping()
df_decoded = df.copy()
df_decoded['subgrade_decoded'] = df['sub_grade'].map(subgrade_mapping)

# Create visualization for sub-grade analysis
fig1 = px.histogram(df_decoded, 
                   x='subgrade_decoded', 
                   color='loan_status',
                   title='Loan Sub-Grade Distribution by Status',
                   labels={'subgrade_decoded': 'Loan Sub-Grade'},
                   color_discrete_map={0: 'green', 1: 'red', 2: 'yellow'})

fig1.update_layout(
    xaxis_title="Loan Sub-Grade",
    yaxis_title="Number of Loans",
    showlegend=True,
    legend_title="Loan Status (0=Current, 1=Charged Off, 2=Late)"
)
fig1.show()

# 2. Interest Rate Analysis by Sub-grade
fig2 = px.box(df_decoded, 
              x='subgrade_decoded', 
              y='int_rate',
              color='loan_status',
              title='Interest Rate Distribution by Sub-Grade and Loan Status',
              labels={'int_rate': 'Interest Rate (%)', 'subgrade_decoded': 'Loan Sub-Grade'},
              color_discrete_map={0: 'green', 1: 'red', 2: 'yellow'})
fig2.show()

# 3. Annual Income Analysis
# Create income brackets
df_decoded['income_bracket'] = pd.qcut(df_decoded['annual_inc'], 
                                     q=5, 
                                     labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

fig3 = px.histogram(df_decoded, 
                   x='income_bracket', 
                   color='loan_status',
                   title='Loan Status Distribution by Income Bracket',
                   labels={'income_bracket': 'Income Bracket'},
                   color_discrete_map={0: 'green', 1: 'red', 2: 'yellow'})
fig3.show()

# 4. Home Ownership Analysis
fig4 = px.histogram(df_decoded, 
                   x='home_ownership', 
                   color='loan_status',
                   title='Loan Status Distribution by Home Ownership',
                   labels={'home_ownership': 'Home Ownership Status'},
                   color_discrete_map={0: 'green', 1: 'red', 2: 'yellow'})
fig4.show()

# 5. Purpose Analysis
fig5 = px.histogram(df_decoded, 
                   x='purpose', 
                   color='loan_status',
                   title='Loan Status Distribution by Purpose',
                   labels={'purpose': 'Loan Purpose'},
                   color_discrete_map={0: 'green', 1: 'red', 2: 'yellow'})
fig5.update_xaxes(tickangle=45)
fig5.show()

# Calculate risk metrics for each factor
def calculate_risk_metrics(df, column):
    metrics = pd.DataFrame({
        'Total_Loans': df.groupby(column).size(),
        'Charged_Off': df[df['loan_status'] == 1].groupby(column).size(),
        'Late': df[df['loan_status'] == 2].groupby(column).size()
    })
    
    metrics['Default_Rate'] = (metrics['Charged_Off'] / metrics['Total_Loans'] * 100).round(2)
    metrics['Late_Rate'] = (metrics['Late'] / metrics['Total_Loans'] * 100).round(2)
    metrics['Avg_Interest_Rate'] = df.groupby(column)['int_rate'].mean().round(2)
    metrics['Avg_DTI'] = df.groupby(column)['dti'].mean().round(2)
    metrics['Avg_Annual_Inc'] = df.groupby(column)['annual_inc'].mean().round(2)
    
    return metrics

# Calculate metrics for each factor
subgrade_metrics = calculate_risk_metrics(df_decoded, 'subgrade_decoded')
income_metrics = calculate_risk_metrics(df_decoded, 'income_bracket')
home_ownership_metrics = calculate_risk_metrics(df_decoded, 'home_ownership')
purpose_metrics = calculate_risk_metrics(df_decoded, 'purpose')

print("\nSub-grade Risk Analysis:")
print(subgrade_metrics)
print("\nIncome Bracket Risk Analysis:")
print(income_metrics)
print("\nHome Ownership Risk Analysis:")
print(home_ownership_metrics)
print("\nLoan Purpose Risk Analysis:")
print(purpose_metrics)

# Key Findings and Risk Analysis

## 1. Sub-grade Analysis

### Default Rate Patterns
- Highest default rates are in C4 (42.60%) and C5 (41.96%)
- Lowest default rates are in G4 (12.16%) and G5 (16.90%)
- Generally, A-grade loans show lower default rates (27-33%)

### Interest Rate Progression
- Clear progression from A1 (2.56%) to G3 (4.79%)
- Interest rates increase systematically with lower grades
- Rate spread of approximately 2.23 percentage points across grades

### Volume Distribution
- Highest concentration in B-grade loans (B1-B5: 2,929-3,641 loans)
- Lowest concentration in G-grade loans (G1-G5: 71-156 loans)
- Decreasing volume trend from B to G grades

## 2. Income Analysis

### Default Rates by Income
- Counter-intuitive finding: Higher income brackets show higher default rates
- Very High income: 38.10% default rate
- Very Low income: 32.22% default rate
- Consistent Late Rates across brackets (0.05-0.14%)

### DTI Patterns
- Higher DTI in Low income bracket (3.99)
- Lower DTI in Very High income bracket (3.54)
- Suggests better debt management in higher income groups

## 3. Home Ownership Analysis
(Using decoded values where 0=Rent, 3=Own, 4=Mortgage)

### Default Rates
- Highest in Own category (38.88%)
- Rent shows similar rate (38.04%)
- Mortgage shows lowest rate (31.92%)

### Interest Rate Distribution
- Mortgage holders: 3.67%
- Renters: 3.59%
- Owners: 3.65%
- Minimal variation across ownership types

## 4. Loan Purpose Analysis
(Using decoded values)

### Highest Risk Purposes
1. Purpose 1: 44.40% default rate
2. Purpose 2: 37.92% default rate
3. Purpose 4: 30.70% default rate

### Lowest Risk Purposes
1. Purpose 13: 9.28% default rate
2. Purpose 10: 12.66% default rate
3. Purpose 11: 15.78% default rate

## Risk Indicators for Potential Defaults

### Primary Risk Factors
1. Sub-grade: C4-C5 loans show highest default probability
2. Income: Counter-intuitively, higher income brackets show higher default rates
3. Purpose: Certain loan purposes (1 & 2) show significantly higher default rates

### Secondary Risk Factors
1. DTI: Higher DTI ratios in certain sub-grades correlate with higher defaults
2. Home Ownership: Mortgage holders show lower default rates than owners or renters
3. Interest Rates: Progressive increase with risk grade but not strongly predictive of defaults

## Recommendations

### Immediate Actions
1. Increase scrutiny on C4-C5 grade loans
2. Implement additional verification for high-income borrowers
3. Apply stricter criteria for high-risk loan purposes (1 & 2)

### Monitoring Strategy
1. Set up early warning system for loans with multiple risk factors
2. Regular monitoring of C-grade loans
3. Enhanced due diligence for high-income borrowers with high DTI

### Policy Adjustments
1. Consider adjusting interest rates for C-grade loans
2. Implement purpose-specific underwriting criteria
3. Develop composite risk scoring incorporating multiple factors

# Comprehensive Risk Factor Analysis and Interactions

## 1. Sub-grade and Interest Rate Correlation

### A-Grade Profile
- A1: 2.56% interest, 33.45% default
- A2: 2.67% interest, 31.33% default
- A3: 2.79% interest, 29.28% default
- A4: 2.86% interest, 27.37% default
- A5: 3.00% interest, 32.31% default
**Key Finding**: A4 shows best performance despite higher rate than A1-A3

### B-Grade Profile
- B1: 3.18% interest, 39.06% default
- B2: 3.31% interest, 37.83% default
- B3: 3.41% interest, 37.27% default
- B4: 3.51% interest, 37.21% default
- B5: 3.56% interest, 31.63% default
**Key Finding**: B5 shows surprisingly better performance despite highest rate

### C-Grade Profile (Critical Risk Area)
- C1: 3.67% interest, 36.91% default
- C2: 3.74% interest, 37.41% default
- C3: 3.82% interest, 39.69% default
- C4: 3.89% interest, 42.60% default
- C5: 3.97% interest, 41.96% default
**Key Finding**: Direct correlation between interest rate and default risk

## 2. Income Bracket Analysis

### Default Rate Paradox

- Very High Income: 38.10% default, 3.54 DTI
- High Income: 36.55% default, 3.81 DTI
- Medium Income: 35.38% default, 3.92 DTI
- Low Income: 35.53% default, 3.99 DTI
- Very Low Income: 32.22% default, 3.93 DTI

**Key Finding**: Inverse relationship between income and loan performance

## 3. Combined Risk Factors

### High-Risk Combinations
1. C4-C5 grade + Purpose 1:
   - Highest combined risk profile
   - Recommend automatic review

2. High Income + High DTI:
   - Counter-intuitive risk factor
   - Enhanced verification needed

3. B1-B3 grade + Purpose 2:
   - High volume, high risk combination
   - Consider specialized monitoring

### Low-Risk Combinations
1. A-grade + Purpose 13:
   - Lowest combined default risk
   - Potential for rate reduction

2. Mortgage holders + A/B grade:
   - Strong performance history
   - Preferred borrower profile

## 4. Risk Scoring Model Recommendations

### Primary Risk Factors (Weighted)
1. Sub-grade (40% weight)
   - C4-C5: High Risk (+3 points)
   - A1-A5: Low Risk (-2 points)
   - B1-B5: Medium Risk (+0 points)

2. Loan Purpose (30% weight)
   - Purpose 1: High Risk (+3 points)
   - Purpose 13: Low Risk (-2 points)
   - Other Purposes: Varied (+0 to +2 points)

3. DTI Ratio (20% weight)
   - >4.0: High Risk (+2 points)
   - 3.8-4.0: Medium Risk (+1 point)
   - <3.8: Low Risk (-1 point)

4. Income & Home Ownership (10% weight)
   - High Income: Additional scrutiny needed
   - Mortgage holders: Positive factor (-1 point)

### Risk Score Thresholds
- High Risk: >3 points
- Medium Risk: 1-3 points
- Low Risk: <1 point

## 5. Monitoring Framework

### Daily Monitoring
1. Track payments for:
   - C4-C5 grades
   - Purpose 1 loans
   - DTI > 4.0

### Weekly Review
1. Portfolio composition:
   - Grade distribution
   - Purpose distribution
   - DTI trends

### Monthly Analysis
1. Default rate trends by:
   - Sub-grade
   - Purpose
   - Income bracket

2. Interest rate effectiveness:
   - Rate vs default correlation
   - Risk-adjusted returns

## 6. Action Items

### Immediate Implementation
1. Enhanced screening for:
   - C4-C5 grade applications
   - Purpose 1 loans
   - High income + high DTI combinations

2. Rate adjustments:
   - Increase C4-C5 rates by 0.5-1.0%
   - Review Purpose 1 pricing
   - Consider rate reductions for low-risk combinations

3. Process changes:
   - Additional verification for high-income applications
   - Automated monitoring for high-risk combinations
   - Regular portfolio risk assessment

# Detailed Financial Risk Analysis and Recommendations

## 1. Grade-Based Risk Thresholds

### High-Risk Sub-grades (>40% Default Rate)
- C4: 42.60% default rate
- C5: 41.96% default rate
- Action: Implement enhanced screening for these sub-grades

### Medium-Risk Sub-grades (35-40% Default Rate)
- B1: 39.06%
- D1: 39.23%
- C3: 39.69%
- Action: Regular monitoring and stricter DTI requirements

### Lower-Risk Sub-grades (<30% Default Rate)
- G4: 12.16%
- G5: 16.90%
- G3: 22.99%
- Action: Consider for portfolio expansion

## 2. Interest Rate Analysis

### Rate Progression by Grade
- A1: 2.56% → G3: 4.79%
- Average spread: 2.23 percentage points
- Recommendation: Review pricing for C-grade loans where default risk increases significantly

### Risk-Adjusted Returns
- High-risk grades (C4-C5) interest rates: 3.89-3.97%
- Consider increasing rates for these grades given their >40% default rates

## 3. DTI Thresholds

### Warning Levels
- High Risk: DTI > 4.0 (C4, C5, F4, G5 average)
- Medium Risk: DTI 3.8-4.0
- Low Risk: DTI < 3.8 (A-grade average)

### Monitoring Triggers
- Immediate Review: DTI > 4.06 (C4 average)
- Enhanced Monitoring: DTI > 3.90 (Overall portfolio average)

## 4. Income-Based Risk Assessment

### Counter-intuitive Findings
- Very High Income: 38.10% default rate
- Very Low Income: 32.22% default rate
- Recommendation: Additional verification for high-income applications

### DTI by Income Level
- Very High Income: 3.54 DTI
- Low Income: 3.99 DTI
- Action: Implement income-specific DTI thresholds

## 5. Purpose-Based Risk Management

### High-Risk Purposes
1. Purpose 1: 44.40% default rate
   - Highest risk category
   - Requires strictest criteria
   
2. Purpose 2: 37.92% default rate
   - Second highest risk
   - Enhanced monitoring needed

### Low-Risk Purposes
1. Purpose 13: 9.28% default rate
2. Purpose 10: 12.66% default rate
   - Consider for portfolio expansion
   - Potentially lower interest rates

## 6. Home Ownership Considerations

### Default Rates by Ownership
- Own (3): 38.88%
- Rent (0): 38.04%
- Mortgage (4): 31.92%
- Action: Factor ownership status into risk assessment

## 7. Early Warning System Triggers

### Immediate Review Required When:
1. DTI exceeds 4.06
2. Sub-grade is C4 or C5
3. Purpose code is 1 or 2
4. High income with high DTI

### Enhanced Monitoring When:
1. DTI between 3.90-4.06
2. Sub-grade is C1-C3
3. Purpose code is 4 or 9
4. Recent missed payment

## 8. Portfolio Optimization Recommendations

### Target Portfolio Distribution
- A-grade loans: Increase allocation (currently showing stable performance)
- B-grade loans: Maintain current levels
- C-grade loans: Reduce exposure to C4-C5
- D-G grade loans: Selective approval based on other factors

### Risk-Based Pricing Adjustments
1. Increase rates for:
   - C4-C5 grades by 0.5-1.0 percentage points
   - Purpose 1 loans by 0.3-0.5 percentage points
   
2. Consider rate reductions for:
   - Purpose 13 loans (9.28% default rate)
   - Mortgage holders (31.92% default rate)