# Task Three

The risk manager has collected data on loan borrowers. The data is in tabular format, with each row providing details of a borrower, including their income, total loans outstanding, and a few other metrics. There is also a column indicating whether the borrower has previously defaulted on a loan. You must use this data to build a model that, given the details of any loan described above, will predict the probability that the borrower will default (also known as PD: the probability of default). Use the provided data to train a function that will estimate the probability of default for a borrower. Assuming a recovery rate of 10%, this can be used to give the expected loss on a loan: expected loss = PD × (1 − recovery rate) × loan amount outstanding.

* You should produce a function that can take in the properties of a loan and output the expected loss.
* You can explore any technique ranging from a simple regression or a decision tree to something more advanced. You can also use multiple methods and provide a comparative analysis.


In [1]:
# First, we read in the loan data that consists of columns for
# customer_id, credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding,
# income, years_employed, fico_score and default (1 if the customer defaulted, 0 otherwise)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the loan table and discard every row that has a missing value.
data = pd.read_csv('Loan_Data.csv').dropna()

# Target is the default flag; the features are all remaining columns except
# the customer identifier and the target itself.
y = data['default']
X = data.drop(columns=['customer_id', 'default'])


In [2]:
# Replace the raw monetary columns with two income-normalised ratios:
#   debt_to_income_ratio    = total_debt_outstanding / income
#   payment_to_income_ratio = loan_amt_outstanding / income
# The new columns are appended after the existing ones, and the three source
# columns are then removed, so the final column order is:
#   credit_lines_outstanding, years_employed, fico_score,
#   debt_to_income_ratio, payment_to_income_ratio
X = X.assign(
    debt_to_income_ratio=X['total_debt_outstanding'] / X['income'],
    payment_to_income_ratio=X['loan_amt_outstanding'] / X['income'],
).drop(columns=['total_debt_outstanding', 'loan_amt_outstanding', 'income'])


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# (and therefore the reported metrics) reproducible between runs.
# NOTE(review): the split is not stratified on `y` - consider `stratify=y`
# if the default class turns out to be rare.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Logistic regression baseline. `fit` returns the fitted estimator, so the
# model can be constructed and trained in one expression.
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard-label predictions on the held-out set, summarised as a confusion
# matrix followed by the per-class precision/recall/F1 report.
y_pred_LR = LR.predict(X_test)
print(
    confusion_matrix(y_test, y_pred_LR),
    classification_report(y_test, y_pred_LR),
    sep='\n',
)

[[1649    3]
 [  10  338]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1652
           1       0.99      0.97      0.98       348

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [5]:
# Single decision tree with a fixed seed for reproducibility. `fit` returns
# the fitted estimator, so construction and training are chained.
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard-label predictions on the held-out set, summarised as a confusion
# matrix followed by the per-class precision/recall/F1 report.
y_pred_DT = DT.predict(X_test)
print(
    confusion_matrix(y_test, y_pred_DT),
    classification_report(y_test, y_pred_DT),
    sep='\n',
)

[[1641   11]
 [   8  340]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1652
           1       0.97      0.98      0.97       348

    accuracy                           0.99      2000
   macro avg       0.98      0.99      0.98      2000
weighted avg       0.99      0.99      0.99      2000



In [6]:
# Random forest of 100 trees with a fixed seed for reproducibility. `fit`
# returns the fitted estimator, so construction and training are chained.
RF = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard-label predictions on the held-out set, summarised as a confusion
# matrix followed by the per-class precision/recall/F1 report.
y_pred_RF = RF.predict(X_test)
print(
    confusion_matrix(y_test, y_pred_RF),
    classification_report(y_test, y_pred_RF),
    sep='\n',
)

[[1649    3]
 [   6  342]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       0.99      0.98      0.99       348

    accuracy                           1.00      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000



In [7]:
# Multi-layer perceptron with two hidden layers (50 and 25 units) and a fixed
# seed for reproducibility. `fit` returns the fitted estimator, so
# construction and training are chained.
from sklearn.neural_network import MLPClassifier
NN = MLPClassifier(
    hidden_layer_sizes=(50, 25),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Hard-label predictions on the held-out set, summarised as a confusion
# matrix followed by the per-class precision/recall/F1 report.
y_pred_NN = NN.predict(X_test)
print(
    confusion_matrix(y_test, y_pred_NN),
    classification_report(y_test, y_pred_NN),
    sep='\n',
)

[[1649    3]
 [  13  335]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1652
           1       0.99      0.96      0.98       348

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [8]:
# Write a function that takes in the properties of a loan and outputs the expected loss
# for every choice of model (LR, DT, RF, NN)
# Assume a recovery rate of 10% to calculate the expected loss

def expected_loss(credit_lines_outstanding: int, loan_amt_outstanding: float, total_debt_outstanding: float,
                   income: float, years_employed: int, fico_score: int, model='LR') -> float:
    # Create a DataFrame for the input loan
    loan_data = pd.DataFrame({
        'credit_lines_outstanding': [credit_lines_outstanding],
        'years_employed': [years_employed],
        'fico_score': [fico_score],
        'debt_to_income_ratio': [total_debt_outstanding / income],
        'payment_to_income_ratio': [loan_amt_outstanding / income]
    })
    
    # Select the appropriate model
    if model == 'LR':
        clf = LR
    elif model == 'DT':
        clf = DT
    elif model == 'RF':
        clf = RF
    elif model == 'NN':
        clf = NN
    else:
        raise ValueError("Model must be one of: 'LR', 'DT', 'RF', 'NN'")
    
    # Predict the probability of default
    prob_default = clf.predict_proba(loan_data)[:, 1][0]
    
    # Calculate expected loss
    recovery_rate = 0.1
    expected_loss_value = prob_default * loan_amt_outstanding * (1 - recovery_rate)
    
    return expected_loss_value

In [11]:
# Smoke-test expected_loss with one sample loan against every model.
# Positional order: credit_lines_outstanding, loan_amt_outstanding,
# total_debt_outstanding, income, years_employed, fico_score.
sample_loan = (5, 20000, 50000, 60000, 3, 680)

test_loss = expected_loss(*sample_loan, model='LR')
print('Expected loss for test loan in USD using LR:', test_loss)

test_loss = expected_loss(*sample_loan, model='RF')
print('Expected loss for test loan in USD using RF:', test_loss)

test_loss = expected_loss(*sample_loan, model='DT')
print('Expected loss for test loan in USD using DT:', test_loss)

test_loss = expected_loss(*sample_loan, model='NN')
print('Expected loss for test loan in USD using NN:', test_loss)


Expected loss for test loan in USD using LR: 17999.99860288509
Expected loss for test loan in USD using RF: 18000.0
Expected loss for test loan in USD using DT: 18000.0
Expected loss for test loan in USD using NN: 17999.968237625297
