### Machine Learning Model Training and Evaluation:

In [9]:
import numpy as np
import pandas as pd
import kerastuner as kt

In [10]:
# Importing Transformed Dataset:
df = pd.read_csv('../Dataset/Loan_default_transformed.csv')
df.sample(4)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,...,HasMortgage,HasDependents,HasCoSigner,Default,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
188630,0.461538,0.022519,0.776162,0.774135,0.848739,2.0,0.279565,0.75,0.22,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
180975,0.461538,0.204298,0.891571,0.162113,0.134454,2.0,0.78913,0.5,0.11,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43665,0.307692,0.311825,0.996012,0.559199,0.378151,4.0,0.57,0.75,0.47,2.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
151360,0.153846,0.923088,0.602235,0.905282,0.588235,3.0,0.91913,0.5,0.11,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,...,HasMortgage,HasDependents,HasCoSigner,Default,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other


In [12]:
## Splitting the Dataset:
X = df.drop('Default',axis=1)
y = df[['Default']]

from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((204277, 20), (51070, 20), (204277, 1), (51070, 1))

#### Create an Evaluate Function to give all metrics after model Training:

In [13]:
## All classification models:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

models = {
   'Logistic Regression': LogisticRegression(),
   'Decision Tree': DecisionTreeClassifier(),
   # 'SVM': SVC(),
   'Guassian': GaussianNB(),
   # 'KNN': KNeighborsClassifier(),
   'AdaBoost': AdaBoostClassifier(),
   'Gradient Boost': GradientBoostingClassifier(),
   'XGBoost': XGBClassifier()
}

In [14]:
from sklearn.metrics import accuracy_score

model_list = []
acc_list = []

## Fit each model, predict and store it with model's accuracy
for name, model in models.items():
   # Fitting model
   model.fit(X_train,y_train.values.ravel())
   # Prediction
   y_pred = model.predict(X_test)
   
   acc = accuracy_score(y_test,y_pred)
   model_list.append(name)
   acc_list.append(acc)
   
   print(f'Model performance of {name} for Test set')
   print("- Accuracy score: {:.4f}".format(acc))
   
   print('='*35)
   print('\n')

Model performance of Logistic Regression for Test set
- Accuracy score: 0.8851


Model performance of Decision Tree for Test set
- Accuracy score: 0.8023


Model performance of Guassian for Test set
- Accuracy score: 0.8850


Model performance of AdaBoost for Test set
- Accuracy score: 0.8856


Model performance of Gradient Boost for Test set
- Accuracy score: 0.8865


Model performance of XGBoost for Test set
- Accuracy score: 0.8856




XGboost and Logistic Regression came out to best models

In [15]:
## Fitting Logistic Regression to its best:
LR = LogisticRegression()

param_grid = [
   {
      'penalty': ['l1', 'l2'],
      'solver': ['liblinear'],  # Only solvers that support l1 and elasticnet
      'max_iter': [100, 200, 500]
   },  {
      'penalty': ['l2'],
      'solver': ['lbfgs', 'newton-cg', 'sag'],  # solvers that only support l2
      'max_iter': [100, 200, 500]
   },  {
      'penalty': ['l1','l2','elasticnet'],
      'solver': ['saga'],  # solvers that only support l2
      'max_iter': [100, 200, 500],
      'l1_ratio': [0.1, 0.5, 0.9],
   }
]

In [16]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

clf = GridSearchCV(LR, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1, error_score='raise')
best_LR = clf.fit(X_train, y_train.values.ravel())

Fitting 3 folds for each of 42 candidates, totalling 126 fits


In [17]:
Best_LR = best_LR.best_params_

# Get the best parameters and score
print("Best parameters found: ", Best_LR)

pred_LR = best_LR.predict(X_test)
print("Best Accuracy from Logistic Regression: ", accuracy_score(y_test, pred_LR))

Best parameters found:  {'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Accuracy from Logistic Regression:  0.8850597219502644


In [18]:
## Parameters for XGBoost
Param_dict = {
   'max_depth': [3, 4, 5, 6, 8, 10],
   'min_child_weight': [3, 5, 7],
   'gamma': [0, 0.1, 0.2, 0.3, 0.4],
   'max_iter': [100, 200, 500],
}

In [19]:
from sklearn.model_selection import RandomizedSearchCV

XG = XGBClassifier()

clf = RandomizedSearchCV(XG, param_distributions=Param_dict, n_iter=30, cv=3, verbose=2, n_jobs=-1)
best_XG = clf.fit(X_train, y_train.values.reshape(-1,1))

Fitting 3 folds for each of 30 candidates, totalling 90 fits


Parameters: { "max_iter" } are not used.



In [20]:
Best_XG = best_XG.best_params_

# Get the best parameters and score
print("Best parameters found: ", Best_XG)

pred_XG = best_XG.predict(X_test)
print("Best Accuracy XGBoost: ", accuracy_score(y_test, pred_XG))

Best parameters found:  {'min_child_weight': 5, 'max_iter': 500, 'max_depth': 3, 'gamma': 0.3}
Best Accuracy XGBoost:  0.8866066183669473


#### Now, We will Fit a Neural Network for Same problem and compare its performance with Normal Machine Learning:

In [21]:
import tensorflow as tf
from tensorflow import keras as kr
# import optuna

In [22]:
## Creating Neural Network:
model = kr.Sequential() # Network base
model.add(kr.layers.Input(shape=(20,))) # Input layer

## Neural Network
# model.add(kr.layers.Dense(256, activation='relu'))
model.add(kr.layers.Dense(128, activation='relu'))
model.add(kr.layers.Dense(64, activation='relu'))
model.add(kr.layers.Dense(32, activation='relu'))
model.add(kr.layers.Dense(16, activation='relu'))

## Output layer
model.add(kr.layers.Dense(1, activation='sigmoid'))

In [23]:
model.summary()

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], )

model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m5102/5107[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8845 - loss: 0.3285

In [None]:
pred_ANN = model.predict(X_test)
pred_ANN_labels = (pred_ANN > 0.5).astype(int)

print("Best Accuracy Neural Network: ", accuracy_score(y_test, pred_ANN_labels))

Now, we will Tune this Neural Network:

In [None]:
## Function to build mode using various values from hyperparameters
def Build_best_classifier(hp):
  model = kr.Sequential()
  model.add(kr.Input(shape=(20,)))

  for i in range(hp.Int('num_layers', min_value=1, max_value=5)):
    ## Adding model layer structure
    model.add(  kr.layers.Dense(units=hp.Int("Unit_count",16,128, step=16),kernel_initializer='he_normal', kernel_regularizer=kr.regularizers.l2(0.01))
    )

    ## Adding activation func and batch normalizer
    model.add(kr.layers.LeakyReLU(negative_slope=0.2))
    model.add(kr.layers.BatchNormalization())

    ## Adding a dropout layer
    rate = hp.Float('rate', min_value=0.0, max_value=0.5, step=0.1)
    if hp.Boolean('dropout'):
      model.add(kr.layers.Dropout(rate))

  ## Output layer
  model.add(kr.layers.Dense(1, activation='sigmoid'))

  model.compile(optimizer=kr.optimizers.Adam(hp.Choice('learning_rate', [0.1, 1e-2, 1e-3])), loss='binary_crossentropy', metrics=['accuracy'])

  return model

In [None]:
tuner = kt.RandomSearch(Build_best_classifier, objective='val_acc', max_trials=25, directory='project', project_name='Loan_default')

## Fitting model
tuner.search(X_train, y_train, epochs=3, validation_data=(X_test,y_test))

Note: Hyper tuning Neural network or even using ANN dont show any significant improvement. XGBoost gives 