# **Loan Default Prediction (Model Prediction)**

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

from xgboost import XGBClassifier

## Load The Data

In [2]:
# Read the engineered feature tables. Each CSV carries a leftover
# 'Unnamed: 0' column, which is discarded as part of the same expression so
# the cell stays idempotent on re-run.
train_df = pd.read_csv("data/train_feat_data.csv").drop(columns=['Unnamed: 0'])
test_df = pd.read_csv("data/test_feat_data.csv").drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first ten rows of the training frame (includes the Default target).
train_df.head(10)

Unnamed: 0,LoanID,Default,age_bin,log_income,loan_to_income,credit_score_bin,employment_tenure,credit_line_category,interest_rate_bin,short_long_term,...,has_cosigner,loan_income_dti_interaction,credit_interest_interaction,loan_term_income_interaction,employment_age_interaction,income_creditline_interaction,interest_loan_interaction,debt_burden,employment_credit_interaction,income_education_interaction
0,I38PQUQS96,0,Senior,11.362044,0.588262,Poor,5+ years,Medium,High,Short Term,...,1,0.258835,7919.6,0.000419,1.428571,21498.5,0.000301,20.361886,10400.0,171988
1,HPSK72WA7R,0,Senior,10.828401,2.467481,Poor,1-5 years,Low,Low,Long Term,...,1,1.677887,2202.98,0.00119,0.217391,50432.0,3.9e-05,17.453799,6870.0,151296
2,C1OZ6DPJ8Y,1,Middle-aged,11.341057,1.534154,Poor,1-5 years,Low,High,Short Term,...,0,0.475588,9547.67,0.000285,0.565217,28069.333333,0.000164,104.767839,3908.666667,252624
3,V2KKSFM3UN,0,Middle-aged,10.364514,1.412638,Excellent,<1 year,Low,Medium,Short Term,...,0,0.324907,5253.01,0.000757,0.0,10571.0,0.000158,43.423275,0.0,31713
4,EY08JDHTZP,0,Senior,9.925151,0.447179,Fair,<1 year,Medium,Medium,Long Term,...,0,0.326441,4120.83,0.002349,0.133333,5109.25,0.000712,3.987858,1266.0,40874
5,A9S62RQ7US,1,Young,11.410882,1.001661,Good,1-5 years,Low,High,Short Term,...,1,0.100166,16358.4,0.000266,0.72,45149.0,0.000251,227.577417,6480.0,90298
6,H8GXPAOS71,0,Middle-aged,11.618987,1.592123,Poor,5+ years,Low,High,Short Term,...,1,0.25474,8198.19,0.000108,2.105263,111188.0,0.000108,190.159221,34320.0,222376
7,0HGZQKJ36W,0,Senior,11.75039,1.226408,Poor,5+ years,Medium,Medium,Long Term,...,1,0.527355,4327.65,0.000473,1.196429,31700.5,5.2e-05,23.244712,8894.25,507208
8,1R0N3LGNRJ,1,Middle-aged,10.64671,2.196205,Excellent,5+ years,Low,High,Long Term,...,0,0.439241,19798.38,0.001141,2.305556,42053.0,0.000259,262.885713,68641.0,84106
9,CM9L1GTT2P,0,Middle-aged,11.796487,1.720915,Poor,5+ years,Medium,Medium,Long Term,...,1,0.567902,4363.2,0.000361,2.85,33196.0,4e-05,47.403391,13680.0,132784


In [4]:
# Preview the first ten rows of the test frame.
test_df.head(10)

Unnamed: 0,LoanID,age_bin,log_income,loan_to_income,credit_score_bin,employment_tenure,credit_line_category,interest_rate_bin,short_long_term,dti_bin,...,has_cosigner,loan_income_dti_interaction,credit_interest_interaction,loan_term_income_interaction,employment_age_interaction,income_creditline_interaction,interest_loan_interaction,debt_burden,employment_credit_interaction,income_education_interaction
0,7RYZGMKJIR,Senior,11.787872,0.33269,Excellent,1-5 years,Low,Medium,Short Term,Low,...,0,0.04325,4892.2,0.000182,0.71875,65822.5,0.000139,15.610847,9223.0,131645
1,JDL5RH07AM,Senior,11.807928,0.137009,Poor,5+ years,Low,High,Long Term,High,...,1,0.080836,4793.31,0.000447,1.42623,67156.0,0.000706,3.016528,16051.5,134312
2,STAL716Y79,Middle-aged,11.659706,1.310554,Poor,<1 year,Low,Medium,Long Term,High,...,1,1.074655,3102.13,0.000414,0.054545,38603.0,3.6e-05,8.806287,563.0,231618
3,SO0KKJ3IQB,Middle-aged,11.461327,0.587438,Poor,1-5 years,Low,High,Short Term,High,...,0,0.452327,8064.41,0.000379,0.413793,94970.0,0.000429,18.256357,8088.0,189940
4,T99CWTYDCP,Senior,11.180636,2.646117,Poor,1-5 years,Low,High,Long Term,Medium,...,0,1.164291,9944.55,0.000669,0.825397,23909.0,0.000116,132.606521,7817.333333,286908
5,0SNHFWV4UP,Young,10.682675,4.496846,Poor,5+ years,Low,High,Long Term,High,...,0,3.057855,8729.5,0.001376,1.390625,14531.0,0.000101,130.606919,13112.666667,174372
6,S6ITP6LGYS,Middle-aged,11.194097,0.442633,Fair,5+ years,Low,Medium,Short Term,High,...,0,0.296564,4024.8,0.00033,2.05,36349.5,0.0002,4.261172,25584.0,145398
7,A6I7U12IRJ,Senior,10.378199,3.168958,Fair,5+ years,Low,High,Short Term,High,...,1,2.091512,8230.44,0.000373,2.377778,10716.666667,0.000126,61.554609,22898.0,128600
8,8W6KY50JU4,Middle-aged,11.760879,0.090215,Poor,5+ years,Low,High,Short Term,Low,...,1,0.028869,8633.52,0.000281,4.375,128139.0,0.001976,6.439062,39690.0,128139
9,THFQ08OLMU,Middle-aged,11.812215,1.276249,Poor,<1 year,Low,High,Short Term,High,...,0,0.855087,5152.64,8.9e-05,0.037736,134889.0,7.7e-05,25.296405,776.0,269778


In [5]:
# Summarise column dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 33 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   LoanID                         255347 non-null  object 
 1   Default                        255347 non-null  int64  
 2   age_bin                        255347 non-null  object 
 3   log_income                     255347 non-null  float64
 4   loan_to_income                 255347 non-null  float64
 5   credit_score_bin               255347 non-null  object 
 6   employment_tenure              255347 non-null  object 
 7   credit_line_category           255347 non-null  object 
 8   interest_rate_bin              255347 non-null  object 
 9   short_long_term                255347 non-null  object 
 10  dti_bin                        255347 non-null  object 
 11  education_level                255347 non-null  int64  
 12  EmploymentType_Part-time      

In [6]:
# Count missing values per column in the training frame.
train_df.isna().sum()

LoanID                           0
Default                          0
age_bin                          0
log_income                       0
loan_to_income                   0
credit_score_bin                 0
employment_tenure                0
credit_line_category             0
interest_rate_bin                0
short_long_term                  0
dti_bin                          0
education_level                  0
EmploymentType_Part-time         0
EmploymentType_Self-employed     0
EmploymentType_Unemployed        0
MaritalStatus_Married            0
MaritalStatus_Single             0
has_mortgage                     0
has_dependents                   0
LoanPurpose_Business             0
LoanPurpose_Education            0
LoanPurpose_Home                 0
LoanPurpose_Other                0
has_cosigner                     0
loan_income_dti_interaction      0
credit_interest_interaction      0
loan_term_income_interaction     0
employment_age_interaction       0
income_creditline_in

In [7]:
# Partition the training columns by dtype.
# - categorial_col: object-dtype columns, minus the 'LoanID' identifier.
# - numerical_col:  every non-object column (this also picks up the
#   integer 'Default' target, since no name is excluded here).
categorial_col = []
numerical_col = []
for column_name, column_dtype in train_df.dtypes.items():
    if column_dtype != 'object':
        numerical_col.append(column_name)
    elif column_name != 'LoanID':
        categorial_col.append(column_name)
categorial_col

['age_bin',
 'credit_score_bin',
 'employment_tenure',
 'credit_line_category',
 'interest_rate_bin',
 'short_long_term',
 'dti_bin']

In [8]:
# Display only the columns named in categorial_col.
train_df[categorial_col]

Unnamed: 0,age_bin,credit_score_bin,employment_tenure,credit_line_category,interest_rate_bin,short_long_term,dti_bin
0,Senior,Poor,5+ years,Medium,High,Short Term,Medium
1,Senior,Poor,1-5 years,Low,Low,Long Term,High
2,Middle-aged,Poor,1-5 years,Low,High,Short Term,Low
3,Middle-aged,Excellent,<1 year,Low,Medium,Short Term,Low
4,Senior,Fair,<1 year,Medium,Medium,Long Term,High
...,...,...,...,...,...,...,...
255342,Young,Poor,5+ years,Medium,High,Short Term,High
255343,Middle-aged,Poor,1-5 years,Low,High,Short Term,Low
255344,Senior,Fair,5+ years,Low,Medium,Long Term,Medium
255345,Middle-aged,Excellent,1-5 years,Low,High,Long Term,Medium


In [9]:
# Print the level frequencies of every categorical column, leaving two blank
# lines after each table.
for column_name in categorial_col:
    level_counts = train_df[column_name].value_counts()
    print(level_counts, end='\n\n\n')

age_bin
Middle-aged    98601
Senior         98123
Young          58623
Name: count, dtype: int64


credit_score_bin
Poor         129767
Excellent     51695
Fair          41816
Good          32069
Name: count, dtype: int64


employment_tenure
5+ years     127724
1-5 years    102116
<1 year       25507
Name: count, dtype: int64


credit_line_category
Low       191518
Medium     63829
Name: count, dtype: int64


interest_rate_bin
High      166396
Medium     55659
Low        33292
Name: count, dtype: int64


short_long_term
Short Term    153027
Long Term     102320
Name: count, dtype: int64


dti_bin
High      126129
Low        81157
Medium     48061
Name: count, dtype: int64




## Train / Test Split

In [10]:
# Separate the target and strip the identifier from both feature tables.
y = train_df['Default']
X = train_df.drop(columns=['LoanID', 'Default'])
X_test = test_df.drop(columns=['LoanID'])

# Stratified 75/25 split of the labelled data; X_test has no labels, so
# X_valid / y_valid is the only split that can be scored.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

X_train.shape, X_test.shape

((191510, 31), (109435, 31))

## Pipeline

In [None]:
class FeatureEngineetingTransformer(BaseEstimator, TransformerMixin):
    """Replace categorical columns with integer-coded ``ordinal_<name>`` columns.

    Parameters
    ----------
    ordinal_columns : list of str, optional
        Columns to encode. When left as ``None`` the notebook-level
        ``categorial_col`` list is used, so ``FeatureEngineetingTransformer()``
        keeps working exactly as before.

    Attributes
    ----------
    ordinal_columns_ : list of str
        Columns actually encoded, resolved during ``fit``.
    ordinal_encoder_ : OrdinalEncoder
        Encoder fitted on the training data. Categories unseen during ``fit``
        are encoded as ``-1``.
    """

    def __init__(self, ordinal_columns=None):
        # Only store the parameter here: scikit-learn clones estimators by
        # re-calling __init__ with get_params(), so no work should happen here.
        self.ordinal_columns = ordinal_columns

    def fit(self, X, y=None):
        """Learn the category -> code mapping from the training frame ``X``."""
        if self.ordinal_columns is None:
            self.ordinal_columns_ = list(categorial_col)
        else:
            self.ordinal_columns_ = list(self.ordinal_columns)

        # NOTE(review): no explicit category order is given, so codes follow
        # the encoder's default ordering of the observed values rather than a
        # semantic Low < Medium < High order - confirm this is intended.
        self.ordinal_encoder_ = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.ordinal_encoder_.fit(X[self.ordinal_columns_])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the categorical columns swapped for their codes."""
        encoded = pd.DataFrame(
            self.ordinal_encoder_.transform(X[self.ordinal_columns_]),
            columns=[f"ordinal_{col}" for col in self.ordinal_columns_],
            index=X.index,
        )
        # Drop the raw columns this instance encoded (not whatever the global
        # list holds at call time) and append the encoded ones. The previous
        # debug print of the whole frame is removed: it fired on every
        # fit/predict call, including inside cross-validation.
        return pd.concat([X.drop(columns=self.ordinal_columns_), encoded], axis=1)
    

## Model Implementation & Prediction

## **Logistic Regression**

In [None]:
# Hyper-parameter grid for the 'lr' pipeline step (single candidate).
param_grid = {
    'lr__C': [0.01], 
    'lr__max_iter': [100], 
    'lr__penalty': ['l2'], 
    'lr__solver': ['liblinear']
}
logRegression = LogisticRegression()

pipefinallr = Pipeline(steps=[
    ('feature_engineering', FeatureEngineetingTransformer()),
    ('scaler', StandardScaler()),
    ('lr', logRegression)    
])

grid_search_lr = GridSearchCV(estimator=pipefinallr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search_lr.fit(X_train, y_train)

# Score on the labelled hold-out split. The previous code predicted on X_test
# (unlabelled, different row count) and compared against y_valid, which makes
# accuracy_score raise on inconsistent sample counts.
prediction_data_lr = grid_search_lr.predict(X_valid)
# Predictions for the unlabelled test set are kept under their own name.
test_predictions_lr = grid_search_lr.predict(X_test)

train_accuracy_lr = accuracy_score(y_train, grid_search_lr.predict(X_train))
test_accuracy_lr = accuracy_score(y_valid, prediction_data_lr)

print("Best parameters found: ", grid_search_lr.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_lr.best_score_))
print("Training accuracy: {:.3f}".format(train_accuracy_lr))
print("Test accuracy: {:.3f}".format(test_accuracy_lr))

# Rough heuristic only: any train accuracy above hold-out accuracy, however
# small the gap, is reported as overfitting.
if train_accuracy_lr < test_accuracy_lr and test_accuracy_lr < grid_search_lr.best_score_:
    print("Underfitting: The model may be too simple for the data.")
elif train_accuracy_lr > test_accuracy_lr:
    print("Overfitting: The model may be overfitting to the training data.")
else:
    print("Good fit: The model seems to generalize well.")

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [None]:
# Evaluate on the labelled hold-out split: y_valid can only be compared with
# predictions made for X_valid. The report goes on its own line so its column
# header stays aligned.
print("Classification report:")
print(classification_report(y_valid, grid_search_lr.predict(X_valid)))

In [None]:
# confusion_matrix orders rows/columns by the given labels: index 0 is class 0
# and index 1 is class 1. Assumes Default == 1 means the loan defaulted, so the
# tick labels must read 'Not Default' first (they were previously swapped).
class_names = ['Not Default', 'Defaulted']
cm = confusion_matrix(y_valid, grid_search_lr.predict(X_valid), labels=[0, 1])

plt.figure(figsize=(10, 6), dpi=500)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
             xticklabels=class_names, 
             yticklabels=class_names)
for row in range(len(class_names)):
    for col, predicted_name in enumerate(class_names):
        outcome = 'True' if row == col else 'False'
        # White text on the dark (high-count) cells, black on the light ones.
        text_color = 'white' if cm[row, col] > cm.max() / 2 else 'black'
        plt.text(col + 0.5, row + 0.6, f'{outcome} {predicted_name}',
                 ha='center', va='center', color=text_color)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Map class codes to display names explicitly instead of relying on the
# frequency order returned by value_counts (assumes Default == 1 is a default).
class_labels = {1: 'Defaulted', 0: 'Not Default'}

# Predicted class shares (%) on the hold-out split; reindex keeps both rows
# even if the model predicts a single class.
predicted_counts = (
    pd.Series(grid_search_lr.predict(X_valid))
    .value_counts(normalize=True)
    .reindex(list(class_labels), fill_value=0.0)
    .rename(index=class_labels) * 100
)
# Actual class shares (%) measured on the same split, replacing the previous
# hard-coded 68/32 figures.
actual_distribution = (
    y_valid.value_counts(normalize=True)
    .reindex(list(class_labels), fill_value=0.0)
    .rename(index=class_labels) * 100
)
comparison_df = pd.DataFrame({
    'Predicted': predicted_counts,
    'Actual': actual_distribution
})
print("\nPredicted vs Actual Loan Default:", comparison_df, sep="\n")

In [None]:
# Grouped bar chart of the two columns of comparison_df, drawn through the
# Axes object returned by pandas.
ax = comparison_df.plot(kind='bar', figsize=(8, 6), color=['#00008B', '#ADD8E6'])
ax.set_title('Predicted vs Actual Loan Defaulted Distribution')
ax.set_ylabel('Percentage')
ax.set_xlabel('')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

In [None]:
# ROC/AUC are ranking metrics and need a continuous score. Hard 0/1 labels
# give a single operating point, and the previous predictions were made on
# X_test, which has no matching labels. Use the positive-class probability on
# the hold-out split instead.
valid_scores_lr = grid_search_lr.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, valid_scores_lr)
print(f"ROC AUC Score: {roc_auc:.2f}")
fpr, tpr, _ = roc_curve(y_valid, valid_scores_lr)
plt.figure(figsize=(8, 6), dpi=500)
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right", frameon=False)
plt.show()


## **Random Forest Classification**

In [None]:
# Hyper-parameter grid for the 'rfc' pipeline step (single candidate).
param_grid_rfc = {
    'rfc__n_estimators': [2000],
    'rfc__max_depth': [None],
    'rfc__min_samples_split': [2],
    'rfc__min_samples_leaf': [4],
    'rfc__max_features': ['sqrt'],
    'rfc__bootstrap': [True],
    'rfc__class_weight': [None]
}

model_rf = RandomForestClassifier(random_state=42)

# The final step must be the forest registered under the name 'rfc'.
# Previously the logistic-regression estimator was wired in as 'lr', so the
# 'rfc__*' grid keys were invalid and no forest was ever trained.
pipefinalrf = Pipeline(steps=[
    ('feature_engineering', FeatureEngineetingTransformer()),
    ('scaler', StandardScaler()),
    ('rfc', model_rf)    
])

grid_search_rf = GridSearchCV(estimator=pipefinalrf, param_grid=param_grid_rfc, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Score on the labelled hold-out split; X_test is unlabelled and has a
# different row count than y_valid.
prediction_data_rf = grid_search_rf.predict(X_valid)
# Predictions for the unlabelled test set are kept under their own name.
test_predictions_rf = grid_search_rf.predict(X_test)

# Training accuracy must come from this search, not the logistic-regression one.
train_accuracy_rf = accuracy_score(y_train, grid_search_rf.predict(X_train))
test_accuracy_rf = accuracy_score(y_valid, prediction_data_rf)

print("Best parameters found: ", grid_search_rf.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_rf.best_score_))
print("Training accuracy: {:.3f}".format(train_accuracy_rf))
print("Test accuracy: {:.3f}".format(test_accuracy_rf))

# Rough heuristic only: any train accuracy above hold-out accuracy, however
# small the gap, is reported as overfitting.
if train_accuracy_rf < test_accuracy_rf and test_accuracy_rf < grid_search_rf.best_score_:
    print("Underfitting: The model may be too simple for the data.")
elif train_accuracy_rf > test_accuracy_rf:
    print("Overfitting: The model may be overfitting to the training data.")
else:
    print("Good fit: The model seems to generalize well.")


In [None]:
# Evaluate on the labelled hold-out split: y_valid can only be compared with
# predictions made for X_valid. The report goes on its own line so its column
# header stays aligned.
print("Classification report:")
print(classification_report(y_valid, grid_search_rf.predict(X_valid)))

In [None]:
# confusion_matrix orders rows/columns by the given labels: index 0 is class 0
# and index 1 is class 1. Assumes Default == 1 means the loan defaulted, so the
# tick labels must read 'Not Default' first (they were previously swapped).
class_names = ['Not Default', 'Defaulted']
cm = confusion_matrix(y_valid, grid_search_rf.predict(X_valid), labels=[0, 1])

plt.figure(figsize=(10, 6), dpi=500)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
             xticklabels=class_names, 
             yticklabels=class_names)
for row in range(len(class_names)):
    for col, predicted_name in enumerate(class_names):
        outcome = 'True' if row == col else 'False'
        # White text on the dark (high-count) cells, black on the light ones.
        text_color = 'white' if cm[row, col] > cm.max() / 2 else 'black'
        plt.text(col + 0.5, row + 0.6, f'{outcome} {predicted_name}',
                 ha='center', va='center', color=text_color)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Map class codes to display names explicitly instead of relying on the
# frequency order returned by value_counts (assumes Default == 1 is a default).
class_labels = {1: 'Defaulted', 0: 'Not Default'}

# Predicted class shares (%) on the hold-out split; reindex keeps both rows
# even if the model predicts a single class.
predicted_counts = (
    pd.Series(grid_search_rf.predict(X_valid))
    .value_counts(normalize=True)
    .reindex(list(class_labels), fill_value=0.0)
    .rename(index=class_labels) * 100
)
# Actual class shares (%) measured on the same split, replacing the previous
# hard-coded 68/32 figures.
actual_distribution = (
    y_valid.value_counts(normalize=True)
    .reindex(list(class_labels), fill_value=0.0)
    .rename(index=class_labels) * 100
)
comparison_df = pd.DataFrame({
    'Predicted': predicted_counts,
    'Actual': actual_distribution
})
print("\nPredicted vs Actual Loan Default:", comparison_df, sep="\n")

In [None]:
# Grouped bar chart of the two columns of comparison_df, drawn through the
# Axes object returned by pandas.
ax = comparison_df.plot(kind='bar', figsize=(8, 6), color=['#00008B', '#ADD8E6'])
ax.set_title('Predicted vs Actual Loan Defaulted Distribution')
ax.set_ylabel('Percentage')
ax.set_xlabel('')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

In [None]:
# ROC/AUC are ranking metrics and need a continuous score. Hard 0/1 labels
# give a single operating point, and the previous predictions were made on
# X_test, which has no matching labels. Use the positive-class probability on
# the hold-out split instead.
valid_scores_rf = grid_search_rf.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, valid_scores_rf)
print(f"ROC AUC Score: {roc_auc:.2f}")
fpr, tpr, _ = roc_curve(y_valid, valid_scores_rf)
plt.figure(figsize=(8, 6), dpi=500)
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right", frameon=False)
plt.show()


## **Gradient Boosting Classification**

In [None]:
# Hyper-parameter grid for the 'gb' pipeline step (two learning rates).
param_grid_gb = {
    'gb__n_estimators': [100], 
    'gb__learning_rate': [0.05, 0.001],
    'gb__max_depth': [7],
    'gb__min_samples_split': [2],  
    'gb__min_samples_leaf': [4], 
    'gb__subsample': [0.8], 
    'gb__max_features': ['sqrt'] 
}

gb = GradientBoostingClassifier(n_iter_no_change=5, validation_fraction=0.2, random_state=42)

pipefinalgb = Pipeline(steps=[
    ('feature_engineering', FeatureEngineetingTransformer()),
    ('scaler', StandardScaler()),
    ('gb', gb)    
])

grid_search_gb = GridSearchCV(estimator=pipefinalgb, param_grid=param_grid_gb, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)

# Score on the labelled hold-out split; X_test is unlabelled and has a
# different row count than y_valid, so comparing the two raises.
predictions_gb = grid_search_gb.predict(X_valid)
# Predictions for the unlabelled test set are kept under their own name.
test_predictions_gb = grid_search_gb.predict(X_test)

train_accuracy_gb = accuracy_score(y_train, grid_search_gb.predict(X_train))
test_accuracy_gb = accuracy_score(y_valid, predictions_gb)

print("Best parameters found: ", grid_search_gb.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_gb.best_score_))
print("Training accuracy: {:.3f}".format(train_accuracy_gb))
print("Test accuracy: {:.3f}".format(test_accuracy_gb))

# Rough heuristic only: any train accuracy above hold-out accuracy, however
# small the gap, is reported as overfitting.
if train_accuracy_gb < test_accuracy_gb and test_accuracy_gb < grid_search_gb.best_score_:
    print("Underfitting: The model may be too simple for the data.")
elif train_accuracy_gb > test_accuracy_gb:
    print("Overfitting: The model may be overfitting to the training data.")
else:
    print("Good fit: The model seems to generalize well.")

In [None]:
# Evaluate on the labelled hold-out split: y_valid can only be compared with
# predictions made for X_valid. The report goes on its own line so its column
# header stays aligned.
print("Classification report:")
print(classification_report(y_valid, grid_search_gb.predict(X_valid)))

In [None]:
# confusion_matrix orders rows/columns by the given labels: index 0 is class 0
# and index 1 is class 1. Assumes Default == 1 means the loan defaulted, so the
# tick labels must read 'Not Default' first (they were previously swapped).
class_names = ['Not Default', 'Defaulted']
cm = confusion_matrix(y_valid, grid_search_gb.predict(X_valid), labels=[0, 1])

plt.figure(figsize=(10, 6), dpi=500)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
             xticklabels=class_names, 
             yticklabels=class_names)
for row in range(len(class_names)):
    for col, predicted_name in enumerate(class_names):
        outcome = 'True' if row == col else 'False'
        # White text on the dark (high-count) cells, black on the light ones.
        text_color = 'white' if cm[row, col] > cm.max() / 2 else 'black'
        plt.text(col + 0.5, row + 0.6, f'{outcome} {predicted_name}',
                 ha='center', va='center', color=text_color)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Map class codes to display names explicitly instead of relying on the
# frequency order returned by value_counts (assumes Default == 1 is a default).
class_labels = {1: 'Defaulted', 0: 'Not Default'}

# Predicted class shares (%) on the hold-out split; reindex keeps both rows
# even if the model predicts a single class.
predicted_counts = (
    pd.Series(grid_search_gb.predict(X_valid))
    .value_counts(normalize=True)
    .reindex(list(class_labels), fill_value=0.0)
    .rename(index=class_labels) * 100
)
# Actual class shares (%) measured on the same split, replacing the previous
# hard-coded 68/32 figures.
actual_distribution = (
    y_valid.value_counts(normalize=True)
    .reindex(list(class_labels), fill_value=0.0)
    .rename(index=class_labels) * 100
)
comparison_df = pd.DataFrame({
    'Predicted': predicted_counts,
    'Actual': actual_distribution
})
print("\nPredicted vs Actual Loan Default:", comparison_df, sep="\n")

In [None]:
# Grouped bar chart of the two columns of comparison_df, drawn through the
# Axes object returned by pandas.
ax = comparison_df.plot(kind='bar', figsize=(8, 6), color=['#00008B', '#ADD8E6'])
ax.set_title('Predicted vs Actual Loan Defaulted Distribution')
ax.set_ylabel('Percentage')
ax.set_xlabel('')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

In [None]:
# ROC/AUC are ranking metrics and need a continuous score. Hard 0/1 labels
# give a single operating point, and the previous predictions were made on
# X_test, which has no matching labels. Use the positive-class probability on
# the hold-out split instead.
valid_scores_gb = grid_search_gb.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, valid_scores_gb)
print(f"ROC AUC Score: {roc_auc:.2f}")
fpr, tpr, _ = roc_curve(y_valid, valid_scores_gb)
plt.figure(figsize=(8, 6), dpi=500)
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right", frameon=False)
plt.show()


## **XGBClassifier**

In [None]:
# Hyper-parameter grid for the 'xgb' pipeline step (single candidate).
param_grid_xgb = {
    'xgb__n_estimators': [300],
    'xgb__learning_rate': [0.05],
    'xgb__max_depth': [5],
    'xgb__min_child_weight': [6],
    'xgb__subsample': [0.7],
    'xgb__colsample_bytree': [0.7],
    'xgb__gamma': [0.1],
    'xgb__reg_alpha': [0.01],
    'xgb__reg_lambda': [1], 
}
xgb = XGBClassifier(objective='binary:logistic', random_state=42)

pipefinalxgb = Pipeline(steps=[
    ('feature_engineering', FeatureEngineetingTransformer()),
    ('scaler', StandardScaler()),
    ('xgb', xgb)
])

grid_search_xgb = GridSearchCV(estimator=pipefinalxgb, param_grid=param_grid_xgb, 
                      cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Score on the labelled hold-out split; X_test is unlabelled and has a
# different row count than y_valid, so comparing the two raises.
predictions_xgb = grid_search_xgb.predict(X_valid)
# Predictions for the unlabelled test set are kept under their own name.
test_predictions_xgb = grid_search_xgb.predict(X_test)

train_accuracy_xgb = accuracy_score(y_train, grid_search_xgb.predict(X_train))
test_accuracy_xgb = accuracy_score(y_valid, predictions_xgb)

print("Best parameters found: ", grid_search_xgb.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_xgb.best_score_))
print("Training accuracy: {:.3f}".format(train_accuracy_xgb))
print("Test accuracy: {:.3f}".format(test_accuracy_xgb))

# Rough heuristic only: any train accuracy above hold-out accuracy, however
# small the gap, is reported as overfitting.
if train_accuracy_xgb < test_accuracy_xgb and test_accuracy_xgb < grid_search_xgb.best_score_:
    print("Underfitting: The model may be too simple for the data.")
elif train_accuracy_xgb > test_accuracy_xgb:
    print("Overfitting: The model may be overfitting to the training data.")
else:
    print("Good fit: The model seems to generalize well.")


In [None]:
# Evaluate on the labelled hold-out split: y_valid can only be compared with
# predictions made for X_valid. The report goes on its own line so its column
# header stays aligned.
print("Classification report:")
print(classification_report(y_valid, grid_search_xgb.predict(X_valid)))

In [None]:
# confusion_matrix orders rows/columns by the given labels: index 0 is class 0
# and index 1 is class 1. Assumes Default == 1 means the loan defaulted, so the
# tick labels must read 'Not Default' first (they were previously swapped).
class_names = ['Not Default', 'Defaulted']
cm = confusion_matrix(y_valid, grid_search_xgb.predict(X_valid), labels=[0, 1])

plt.figure(figsize=(10, 6), dpi=500)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
             xticklabels=class_names, 
             yticklabels=class_names)
for row in range(len(class_names)):
    for col, predicted_name in enumerate(class_names):
        outcome = 'True' if row == col else 'False'
        # White text on the dark (high-count) cells, black on the light ones.
        text_color = 'white' if cm[row, col] > cm.max() / 2 else 'black'
        plt.text(col + 0.5, row + 0.6, f'{outcome} {predicted_name}',
                 ha='center', va='center', color=text_color)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Map class codes to display names explicitly instead of relying on the
# frequency order returned by value_counts (assumes Default == 1 is a default).
class_labels = {1: 'Defaulted', 0: 'Not Default'}

# Predicted class shares (%) on the hold-out split; reindex keeps both rows
# even if the model predicts a single class.
predicted_counts = (
    pd.Series(grid_search_xgb.predict(X_valid))
    .value_counts(normalize=True)
    .reindex(list(class_labels), fill_value=0.0)
    .rename(index=class_labels) * 100
)
# Actual class shares (%) measured on the same split, replacing the previous
# hard-coded 68/32 figures.
actual_distribution = (
    y_valid.value_counts(normalize=True)
    .reindex(list(class_labels), fill_value=0.0)
    .rename(index=class_labels) * 100
)
comparison_df = pd.DataFrame({
    'Predicted': predicted_counts,
    'Actual': actual_distribution
})
print("\nPredicted vs Actual Loan Default:", comparison_df, sep="\n")

In [None]:
# Grouped bar chart of the two columns of comparison_df, drawn through the
# Axes object returned by pandas.
ax = comparison_df.plot(kind='bar', figsize=(8, 6), color=['#00008B', '#ADD8E6'])
ax.set_title('Predicted vs Actual Loan Defaulted Distribution')
ax.set_ylabel('Percentage')
ax.set_xlabel('')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

In [None]:
# ROC/AUC are ranking metrics and need a continuous score. Hard 0/1 labels
# give a single operating point, and the previous predictions were made on
# X_test, which has no matching labels. Use the positive-class probability on
# the hold-out split instead.
valid_scores_xgb = grid_search_xgb.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, valid_scores_xgb)
print(f"ROC AUC Score: {roc_auc:.2f}")
fpr, tpr, _ = roc_curve(y_valid, valid_scores_xgb)
plt.figure(figsize=(8, 6), dpi=500)
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right", frameon=False)
plt.show()
