## Notebook description

This notebook contains codes for:
- Part 1: Class imbalance techniques - SMOTE, ADASYN, ENN, SMOTEENN, SMOTETomek
- Part 2: Applying adaboost on the ENN-handled model
- Part 3: Hyperparameter tune it

In [1]:
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics

from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek 

import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
#adjusting the working directory to correspond to individual file path 
# Load the Drive helper and mount
from google.colab import drive
# This will prompt for authorization.
drive.mount('/content/drive')
# After executing, drive files will be present in "/content/drive/My Drive".

Mounted at /content/drive


In [3]:
#Load the dataset
df = pd.read_csv('/content/drive/My Drive/IS460/encoded_df.csv')

In [4]:
#view the first 5 rows of the dataframe 
df.head()

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,1,445412.0,1,709.0,1167493.0,8.0,1,5,5214.74,17.2,6.0,1.0,228190.0,416746.0,1.0,0.0
1,1,262328.0,1,724.0,1174162.0,1.0,1,3,33295.98,21.1,35.0,0.0,229976.0,850784.0,0.0,0.0
2,1,99999999.0,1,741.0,2231892.0,8.0,2,3,29200.53,14.9,18.0,1.0,297996.0,750090.0,0.0,0.0
3,1,347666.0,0,721.0,806949.0,3.0,2,3,8741.9,12.0,9.0,0.0,256329.0,386958.0,0.0,0.0
4,1,176220.0,1,724.0,1174162.0,5.0,3,3,20639.7,6.1,15.0,0.0,253460.0,427174.0,0.0,0.0


In [5]:
# Shape of df
df.shape

(92814, 16)

### Part 1 - Class imbalance techniques

In [6]:
#Dropping 5 columns which have VIF > 5
df1 = df.drop(['Annual Income', 'Number of Credit Problems', 
                     'Number of Open Accounts', 'Years of Credit History', 
                     'Current Credit Balance'], axis = 1)

#dropping rows with 99999999.0 as value under Current Loan Amount column
df1 = df1[df1["Current Loan Amount"]!=99999999.0]

In [7]:
df1.shape

(82389, 11)

In [8]:
#Set the target and features column
y= df1["Loan Status"]
x = df1.drop(["Loan Status"],axis=1)

In [9]:
# Split dataset into 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, 
                                                          stratify = y, random_state = 42)

#Standardising with Robust Scaler-- robust to outliers
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)

Base model

In [10]:
# base model without class imbalance handling
model=DecisionTreeClassifier(criterion="gini",max_depth=5, random_state = 42)
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

report = classification_report(y_test, y_pred)

#Get model performance
print(report)

              precision    recall  f1-score   support

           0       1.00      0.20      0.33      4308
           1       0.78      1.00      0.88     12170

    accuracy                           0.79     16478
   macro avg       0.89      0.60      0.60     16478
weighted avg       0.84      0.79      0.73     16478



SMOTE

In [11]:
#SMOTE

y= df1["Loan Status"]
x = df1.drop(["Loan Status"],axis=1)
# Split dataset into 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, 
                                                          stratify = y, random_state = 42)

#Standardising with Robust Scaler-- robust to outliers
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)
print("Before resampling: ", Counter(y_train))

smote = SMOTE(random_state = 42)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

print("After resampling: ", Counter(y_resampled))

Before resampling:  Counter({1: 48677, 0: 17234})
After resampling:  Counter({1: 48677, 0: 48677})


In [12]:
#initialise base classifier then fit resampled data into it
model=DecisionTreeClassifier(criterion="gini",max_depth=5, random_state = 42)
model.fit(x_resampled,y_resampled)
y_pred=model.predict(x_test)

report = classification_report(y_test, y_pred)

#Get model performance
print(report)

              precision    recall  f1-score   support

           0       0.54      0.35      0.43      4308
           1       0.80      0.89      0.84     12170

    accuracy                           0.75     16478
   macro avg       0.67      0.62      0.63     16478
weighted avg       0.73      0.75      0.73     16478



ADASYN

In [13]:
#ADASYN

y= df1["Loan Status"]
x = df1.drop(["Loan Status"],axis=1)
# Split dataset into 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, 
                                                          stratify = y, random_state = 42)

#Standardising with Robust Scaler-- robust to outliers
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)

print("Before resampling: ", Counter(y_train))

adasyn = ADASYN(random_state=42)

x_resampled, y_resampled = adasyn.fit_resample(x_train, y_train)

print("After resampling: ", Counter(y_resampled))

Before resampling:  Counter({1: 48677, 0: 17234})
After resampling:  Counter({1: 48677, 0: 46877})


In [14]:
#initialise base classifier then fit resampled data into it
model=DecisionTreeClassifier(criterion="gini",max_depth=5, random_state = 42)
model.fit(x_resampled,y_resampled)
y_pred=model.predict(x_test)

report = classification_report(y_test, y_pred)

#Get model performance
print(report)

              precision    recall  f1-score   support

           0       0.41      0.49      0.44      4308
           1       0.80      0.75      0.78     12170

    accuracy                           0.68     16478
   macro avg       0.61      0.62      0.61     16478
weighted avg       0.70      0.68      0.69     16478



ENN

In [15]:
#ENN
y= df1["Loan Status"]
x = df1.drop(["Loan Status"],axis=1)
# Split dataset into 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, 
                                                          stratify = y, random_state = 42)

#Standardising with Robust Scaler-- robust to outliers
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)

print("Before resampling: ", Counter(y_train))

enn = EditedNearestNeighbours()

x_resampled, y_resampled = enn.fit_resample(x_train, y_train)

print("After resampling: ", Counter(y_resampled))


Before resampling:  Counter({1: 48677, 0: 17234})
After resampling:  Counter({1: 25614, 0: 17234})


In [16]:
#initialise base classifier then fit resampled data into it
model=DecisionTreeClassifier(criterion="gini",max_depth=5, random_state = 42)
model.fit(x_resampled,y_resampled)
y_pred=model.predict(x_test)

report = classification_report(y_test, y_pred)

#Get model performance
print(report)

              precision    recall  f1-score   support

           0       0.57      0.34      0.42      4308
           1       0.79      0.91      0.85     12170

    accuracy                           0.76     16478
   macro avg       0.68      0.62      0.64     16478
weighted avg       0.74      0.76      0.74     16478



SMOTEENN

In [17]:
#SMOTENN
y= df1["Loan Status"]
x = df1.drop(["Loan Status"],axis=1)
# Split dataset into 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, 
                                                          stratify = y, random_state = 42)

#Standardising with Robust Scaler-- robust to outliers
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)

print("Before resampling: ", Counter(y_train))

smote_enn = SMOTEENN(random_state = 42)

x_resampled, y_resampled = smote_enn.fit_resample(x_train, y_train)

print("After resampling: ", Counter(y_resampled))


Before resampling:  Counter({1: 48677, 0: 17234})
After resampling:  Counter({0: 34360, 1: 22623})


In [18]:
#initialise base classifier then fit resampled data into it
model=DecisionTreeClassifier(criterion="gini",max_depth=5, random_state = 42)
model.fit(x_resampled,y_resampled)
y_pred=model.predict(x_test)

report = classification_report(y_test, y_pred)

#Get model performance
print(report)

              precision    recall  f1-score   support

           0       0.32      0.74      0.45      4308
           1       0.83      0.44      0.58     12170

    accuracy                           0.52     16478
   macro avg       0.57      0.59      0.51     16478
weighted avg       0.69      0.52      0.54     16478



SmoteTomek

In [19]:
#SMOTETomek

y= df1["Loan Status"]
x = df1.drop(["Loan Status"],axis=1)
# Split dataset into 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, 
                                                          stratify = y, random_state = 42)

#Standardising with Robust Scaler-- robust to outliers
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)

print("Before resampling: ", Counter(y_train))

smote_tomek = SMOTETomek(random_state = 42)

x_resampled, y_resampled = smote_tomek.fit_resample(x_train, y_train)

print("After resampling: ", Counter(y_resampled))


Before resampling:  Counter({1: 48677, 0: 17234})
After resampling:  Counter({1: 47320, 0: 47320})


In [20]:
#initialise base classifier then fit resampled data into it
model=DecisionTreeClassifier(criterion="gini",max_depth=5, random_state = 42)
model.fit(x_resampled,y_resampled)
y_pred=model.predict(x_test)

report = classification_report(y_test, y_pred)

#Get model performance
print(report)

              precision    recall  f1-score   support

           0       0.54      0.35      0.43      4308
           1       0.80      0.89      0.84     12170

    accuracy                           0.75     16478
   macro avg       0.67      0.62      0.63     16478
weighted avg       0.73      0.75      0.73     16478



### Part 2- Adaboost

Applying adaptive boosting on the model.

In [21]:
#Initialise the base classifier
## ENN for class imbalance handling

y= df1["Loan Status"]
x = df1.drop(["Loan Status"],axis=1)
# Split dataset into 80% train, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, 
                                                          stratify = y, random_state = 42)
#Standardising with Robust Scaler-- robust to outliers
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
x_test = ro_scaler.transform(x_test)

enn = EditedNearestNeighbours()
x_resampled, y_resampled = enn.fit_resample(x_train, y_train)

#initialise base classifier then fit resampled data into it
model=DecisionTreeClassifier(criterion="gini",max_depth=5, random_state = 42)
model.fit(x_resampled,y_resampled)


DecisionTreeClassifier(max_depth=5, random_state=42)

In [22]:
#Adaboost the above model
adaboosted_model = AdaBoostClassifier(base_estimator= model, algorithm='SAMME', random_state= 42)
adaboosted_model.fit(x_resampled, y_resampled)
y_pred = adaboosted_model.predict(x_test)

report=classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.48      0.43      0.45      4308
           1       0.81      0.83      0.82     12170

    accuracy                           0.73     16478
   macro avg       0.64      0.63      0.64     16478
weighted avg       0.72      0.73      0.72     16478



### Part 3 - Hyperparameter tuning
Tuning booosted model's hyperparameters.

In [23]:
#Tune the adaboosted model
parameters = {'n_estimators' : [5, 10, 15, 20,25, 30, 35, 40, 45, 50, 55, 60],
              'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.5, 1]
              }

grid_search = GridSearchCV( estimator= adaboosted_model,
                     param_grid = parameters,
                     scoring = "f1",
                     cv = 10,
                     n_jobs = -1,
                     verbose = 20)

grid_search= grid_search.fit(x_resampled, y_resampled)
best_parameters = grid_search.best_params_

print("Best Paramters : ", best_parameters)

Fitting 10 folds for each of 72 candidates, totalling 720 fits
Best Paramters :  {'learning_rate': 0.1, 'n_estimators': 40}


In [25]:
#Fitting hyperparameters onto adaboosted model
adaboosted_model_2 = AdaBoostClassifier(base_estimator= adaboosted_model, learning_rate= 0.1, n_estimators= 40, algorithm='SAMME', random_state= 42)
adaboosted_model_2.fit(x_resampled, y_resampled)
y_pred = adaboosted_model_2.predict(x_test)

report=classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.47      0.47      0.47      4308
           1       0.81      0.81      0.81     12170

    accuracy                           0.72     16478
   macro avg       0.64      0.64      0.64     16478
weighted avg       0.72      0.72      0.72     16478

