# **Loan Default Prediction**

## Import Modules

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

## Load The Data

In [2]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [3]:
train_df.head(5)

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [4]:
test_df.head(5)

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,7RYZGMKJIR,32,131645,43797,802,23,2,6.1,24,0.13,High School,Full-time,Divorced,Yes,No,Other,No
1,JDL5RH07AM,61,134312,18402,369,87,2,12.99,60,0.59,High School,Self-employed,Single,No,No,Business,Yes
2,STAL716Y79,55,115809,151774,563,3,3,5.51,48,0.82,Bachelor's,Full-time,Single,Yes,Yes,Other,Yes
3,SO0KKJ3IQB,58,94970,55789,337,24,1,23.93,36,0.77,Bachelor's,Unemployed,Divorced,No,No,Business,No
4,T99CWTYDCP,63,71727,189798,451,52,3,22.05,48,0.44,PhD,Unemployed,Single,Yes,No,Auto,No


## Train Test Split

In [5]:
X = train_df.drop(columns=['LoanID', 'Default'], axis=1)
y = train_df['Default']
X_test = test_df.drop(columns=['LoanID'], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2, stratify=y, random_state=42)

X_train.shape

(204277, 16)

## Pipeline

In [6]:
education_mapping = {'High School': 1, "Bachelor's": 2, "Master's": 3, 'PhD': 4}

class FeatureEngineetingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ordinal_columns = ['age_bin', 'credit_score_bin', 'employment_tenure', 'short_long_term', 
                           'credit_line_category', 'interest_rate_bin', 'dti_bin', 'education_level']
        
        self.onehot_columns = ['HasMortgage', 'HasDependents', 'HasCoSigner', 'LoanPurpose', 'EmploymentType', 'MaritalStatus']
        self.ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    
    def create_bins(self, X):
        X_copy = X.copy()

        # Create binned columns
        X_copy['age_bin'] = pd.cut(X_copy['Age'], bins=[18, 30, 50, 100], labels=['Young', 'Middle-aged', 'Senior'])
        X_copy['credit_score_bin'] = pd.cut(X_copy['CreditScore'], bins=[300, 579, 669, 739, 850], labels=['Poor', 'Fair', 'Good', 'Excellent'])
        X_copy['employment_tenure'] = pd.cut(X_copy['MonthsEmployed'], bins=[0, 12, 60, 1000], labels=['<1 year', '1-5 years', '5+ years'])
        X_copy['short_long_term'] = np.where(X_copy['LoanTerm'] <= 36, 'Short Term', 'Long Term')
        X_copy['credit_line_category'] = pd.cut(X_copy['NumCreditLines'], bins=[0, 3, 6, 100], labels=['Low', 'Medium', 'High'])
        X_copy['interest_rate_bin'] = pd.cut(X_copy['InterestRate'], bins=[0, 5, 10, 100], labels=['Low', 'Medium', 'High'])
        X_copy['dti_bin'] = pd.cut(X_copy['DTIRatio'], bins=[0, 0.35, 0.5, 1], labels=['Low', 'Medium', 'High'])
        X_copy['education_level'] = X_copy['Education'].map(education_mapping)

        return X_copy
        
    def fit(self, X, y=None):
        
        X_transformed = X.copy()
        X_transformed = self.create_bins(X)
        
        self.ordinal_encoder.fit(X_transformed[self.ordinal_columns])
        self.one_hot_encoder.fit(X_transformed[self.onehot_columns])

        return self

    def transform(self, X, y=None):
        X_copy = X.copy()
        X_copy = self.create_bins(X_copy)

        X_copy['log_income'] = np.log1p(X_copy['Income'])
        X_copy['loan_to_income'] = X_copy['LoanAmount'] / X_copy['Income']

        to_encode_ordinal = self.ordinal_encoder.transform(X_copy[self.ordinal_columns])
        to_encode_ordinal_df =  pd.DataFrame(to_encode_ordinal, columns=self.ordinal_encoder.get_feature_names_out(), index=X_copy.index)
        X_copy = pd.concat([X_copy, to_encode_ordinal_df], axis=1)

        one_hot_encoded = self.one_hot_encoder.transform(X_copy[self.onehot_columns])
        one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=self.one_hot_encoder.get_feature_names_out(), index=X_copy.index)
        X_copy = pd.concat([X_copy, one_hot_encoded_df], axis=1)

        X_copy['loan_income_dti_interaction'] = X_copy['loan_to_income'] * X_copy['DTIRatio']
        X_copy['credit_interest_interaction'] = X_copy['CreditScore'] * X_copy['InterestRate']

        columns_to_drop = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 
                           'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 
                           'Education', 'HasMortgage', 'HasDependents', 'HasCoSigner'] + self.ordinal_columns + self.onehot_columns
        
        X_copy.drop(columns=columns_to_drop, inplace=True)
        
        return X_copy


## Model Implementation

In [7]:
param_grid = {
    'lr__C': [0.01, 0.1], 
    'lr__penalty': ['l2'],  
    'lr__solver': ['saga'], 
    'lr__max_iter': [25, 50, 75, 100] 
}

logRegression = LogisticRegression()

pipefinallr = Pipeline(steps=[
    ('feature_engineering', FeatureEngineetingTransformer()),
    ('scaler', StandardScaler()),
    ('lr', logRegression)    
])

CV_lr = GridSearchCV(estimator=pipefinallr, param_grid=param_grid, cv=StratifiedKFold(n_splits=3), n_jobs=-1,  verbose=1)
CV_lr.fit(X_train, y_train)
train_score = CV_lr.score(X_train, y_train)

print(CV_lr.best_params_)
print("Training accuracy:", train_score)
print("Validation accuracy (Cross-Validation):", CV_lr.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
{'lr__C': 0.01, 'lr__max_iter': 25, 'lr__penalty': 'l2', 'lr__solver': 'saga'}
Training accuracy: 0.8845831885136359
Validation accuracy (Cross-Validation): 0.8845831898021591


In [8]:
test_predictions_lr = CV_lr.predict(X_test)
test_predictions_lr

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [9]:
param_grid_gb = {
    'gb__n_estimators': [100], 
    'gb__learning_rate': [0.05, 0.001],
    'gb__max_depth': [7,9],
    'gb__min_samples_split': [2],  
    'gb__min_samples_leaf': [2,3,4,5], 
    'gb__subsample': [0.8, 1.0], 
    'gb__max_features': ['sqrt'] 
}

gb = GradientBoostingClassifier(n_iter_no_change=5, validation_fraction=0.2, random_state=42)

pipefinalgb = Pipeline(steps=[
    ('feature_engineering', FeatureEngineetingTransformer()),
    ('scaler', StandardScaler()),
    ('gb', gb)
])

CV_gb = GridSearchCV(estimator=pipefinalgb, param_grid=param_grid_gb,  cv=StratifiedKFold(n_splits=3), n_jobs=-1, verbose=1)
CV_gb.fit(X_train, y_train)

print(CV_gb.best_params_)
print("Training accuracy:", CV_gb.score(X_train, y_train))
print("Validation accuracy (Cross-Validation):", CV_gb.best_score_)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
{'gb__learning_rate': 0.05, 'gb__max_depth': 7, 'gb__max_features': 'sqrt', 'gb__min_samples_leaf': 4, 'gb__min_samples_split': 2, 'gb__n_estimators': 100, 'gb__subsample': 1.0}
Training accuracy: 0.8862671764320016
Validation accuracy (Cross-Validation): 0.8843482143124195


In [10]:
test_predictions_gb = CV_gb.predict(X_test)
test_predictions_gb

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [11]:
param_grid_rfc = {
    'rfc__n_estimators': [2000],
    'rfc__max_depth': [None],
    'rfc__min_samples_split': [2],
    'rfc__min_samples_leaf': [4],
    'rfc__max_features': ['sqrt'],
    'rfc__bootstrap': [True],
    'rfc__class_weight': [None]
}
## {'rfc__n_estimators': 1000, 'rfc__min_samples_split': 2, 'rfc__min_samples_leaf': 4, 'rfc__max_features': 'sqrt', 'rfc__max_depth': None, 'rfc__class_weight': None, 'rfc__bootstrap': True}
rfc = RandomForestClassifier()

pipefinalrfc = Pipeline(steps=[
    ('feature_engineering', FeatureEngineetingTransformer()),
    ('scaler', StandardScaler()),
    ('rfc', rfc)    
])

CV_rfc = GridSearchCV(estimator=pipefinalrfc, param_grid=param_grid_rfc,  cv=StratifiedKFold(n_splits=3), n_jobs=-1, verbose=1)
CV_rfc.fit(X_train, y_train)
train_score = CV_rfc.score(X_train, y_train)

print(CV_rfc.best_params_)
print("Training accuracy:", train_score)
print("Validation accuracy (Cross-Validation):", CV_rfc.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
{'rfc__bootstrap': True, 'rfc__class_weight': None, 'rfc__max_depth': None, 'rfc__max_features': 'sqrt', 'rfc__min_samples_leaf': 4, 'rfc__min_samples_split': 2, 'rfc__n_estimators': 2000}
Training accuracy: 0.8928464780665469
Validation accuracy (Cross-Validation): 0.8843139470924407


In [12]:
test_predictions_rfc = CV_rfc.predict(X_test)
test_predictions_rfc

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)