In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
!pip install xgboost --quiet
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the loan dataset; the path is resolved relative to the current working directory.
data = pd.read_csv(filepath_or_buffer='Loan_Data.csv')

In [3]:
# Construct validation data: six predictor columns and the `default` target column.
feature_columns = [
    'credit_lines_outstanding',
    'loan_amt_outstanding',
    'total_debt_outstanding',
    'income',
    'years_employed',
    'fico_score',
]
X = data[feature_columns]
y = data['default']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2022,
)

In [4]:
# Fit a logistic regression model on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Predicted PD for the test rows: second column of predict_proba
# (presumably the default == 1 class -- confirm against model.classes_).
class_probabilities = model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

# Evaluate: accuracy on labels thresholded at 0.5, ROC AUC on the raw probabilities.
predicted_default = y_pred_prob > 0.5
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_default)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

# Expected loss per test row = PD * exposure * (1 - recovery rate).
recovery_rate = 0.1
exposure = 10000  # Replace with the loan amount
expected_loss = y_pred_prob * exposure * (1 - recovery_rate)

# One print call, one line per item -- same text as four separate prints.
print(
    "Logistic Regression Results:",
    f'Accuracy: {accuracy}',
    f'ROC AUC: {roc_auc}',
    f'Expected Loss: {expected_loss.mean()}',
    sep='\n',
)

Logistic Regression Results:
Accuracy: 0.98
ROC AUC: 0.9985759493670886
Expected Loss: 1855.7003668631414


In [5]:
# We can compare this with a Decision Tree model.
# Train a Decision Tree model. The seed is pinned because the tree builder
# permutes features at each split; without random_state the fitted tree (and
# therefore the metrics printed below) can differ from run to run. The value
# matches the seed used for the train/test split.
model = DecisionTreeClassifier(random_state=2022)
model.fit(X_train, y_train)

# Predict PD for test data: second column of predict_proba
# (presumably the default == 1 class -- confirm against model.classes_).
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Evaluate the model: accuracy on labels thresholded at 0.5,
# ROC AUC on the predicted probabilities.
accuracy = accuracy_score(y_test, y_pred_prob > 0.5)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate expected loss per test row = PD * exposure * (1 - recovery rate).
recovery_rate = 0.1
exposure = 10000  # Replace with the loan amount
expected_loss = y_pred_prob * exposure * (1 - recovery_rate)

print("Decision Tree Model Results:")
print(f'Accuracy: {accuracy}')
print(f'ROC AUC: {roc_auc}')
print(f'Expected Loss: {expected_loss.mean()}')

Decision Tree Model Results:
Accuracy: 0.9935
ROC AUC: 0.9888939119951778
Expected Loss: 1876.5


In [6]:
# Fit an XGBoost classifier (library defaults) on the training split.
model = xgb.XGBClassifier().fit(X_train, y_train)

# Predicted PD for the test rows: second column of predict_proba
# (presumably the default == 1 class -- confirm against model.classes_).
class_probabilities = model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

# Evaluate: accuracy on labels thresholded at 0.5, ROC AUC on the raw probabilities.
predicted_default = y_pred_prob > 0.5
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_default)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

# Expected loss per test row = PD * exposure * (1 - recovery rate).
recovery_rate = 0.1
exposure = 10000  # Replace with the loan amount
expected_loss = y_pred_prob * exposure * (1 - recovery_rate)

# One print call, one line per item -- same text as four separate prints.
print(
    "XGBoost Model Results:",
    f'Accuracy: {accuracy}',
    f'ROC AUC: {roc_auc}',
    f'Expected Loss: {expected_loss.mean()}',
    sep='\n',
)

XGBoost Model Results:
Accuracy: 0.9975
ROC AUC: 0.9998915009041591
Expected Loss: 1876.2840576171875


Some Observations:

All three models perform well. In the run shown above, the decision tree and XGBoost models achieve higher accuracy than logistic regression, while on ROC AUC XGBoost ranks first, logistic regression second, and the decision tree third. A high AUC means our model is good at identifying potential defaulters, which can help mitigate financial losses. Since accurately identifying potential loan defaults is a primary concern of the client, a high AUC is more important than a high accuracy score. Of the three models compared here, XGBoost is the best choice.