In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime

%matplotlib inline

In [2]:
# Load train and test files
training = pd.read_csv('train.csv', infer_datetime_format = True)
testing = pd.read_csv('test_bqCt9Pv.csv', infer_datetime_format = True)

In [3]:
training.head(5)

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1


In [4]:
print('Shape of training data:', training.shape)
print('Shape of testing data:', testing.shape)

Shape of training data: (233154, 41)
Shape of testing data: (112392, 40)


In [5]:
# training.describe()
# training.info()

In [6]:
training.isna().sum() # Checking for NULL values

UniqueID                                  0
disbursed_amount                          0
asset_cost                                0
ltv                                       0
branch_id                                 0
supplier_id                               0
manufacturer_id                           0
Current_pincode_ID                        0
Date.of.Birth                             0
Employment.Type                        7661
DisbursalDate                             0
State_ID                                  0
Employee_code_ID                          0
MobileNo_Avl_Flag                         0
Aadhar_flag                               0
PAN_flag                                  0
VoterID_flag                              0
Driving_flag                              0
Passport_flag                             0
PERFORM_CNS.SCORE                         0
PERFORM_CNS.SCORE.DESCRIPTION             0
PRI.NO.OF.ACCTS                           0
PRI.ACTIVE.ACCTS                

Only Employment.Type has 7661 NULL values

In [7]:
# training.nunique()
training['Employment.Type'].value_counts()

Self employed    127635
Salaried          97858
Name: Employment.Type, dtype: int64

In [8]:
training['Employment.Type'] = training['Employment.Type'].fillna(training['Employment.Type'].mode()[0]) # Filling missing values

In [9]:
training.isna().sum()

UniqueID                               0
disbursed_amount                       0
asset_cost                             0
ltv                                    0
branch_id                              0
supplier_id                            0
manufacturer_id                        0
Current_pincode_ID                     0
Date.of.Birth                          0
Employment.Type                        0
DisbursalDate                          0
State_ID                               0
Employee_code_ID                       0
MobileNo_Avl_Flag                      0
Aadhar_flag                            0
PAN_flag                               0
VoterID_flag                           0
Driving_flag                           0
Passport_flag                          0
PERFORM_CNS.SCORE                      0
PERFORM_CNS.SCORE.DESCRIPTION          0
PRI.NO.OF.ACCTS                        0
PRI.ACTIVE.ACCTS                       0
PRI.OVERDUE.ACCTS                      0
PRI.CURRENT.BALA

In [10]:
training.loan_default.value_counts()  # Checking number of observations for each target label --> Imbalanced Dataset

0    182543
1     50611
Name: loan_default, dtype: int64

## Feature Engineering

In [11]:
# Adding Age
training['Date.of.Birth'] = pd.to_datetime(training['Date.of.Birth'], infer_datetime_format = True, format = '%d-%m-%y')
training['DisbursalDate'] = pd.to_datetime(training['DisbursalDate'], infer_datetime_format = True, format = '%d-%m-%y')
training['Age'] = (training['DisbursalDate'] - training['Date.of.Birth']).astype('<m8[Y]').astype(int)
training['Age'].head(5)

0 in training['Age'].values # Checking if any row has Age == 0

False

In [12]:
# Converting 00 yrs 00 months format to months 
training['AVERAGE.ACCT.AGE'] = training['AVERAGE.ACCT.AGE'].apply(lambda x: int(re.findall(r'\d+', x)[0])*12 + int(re.findall(r'\d+', x)[1]))
training['CREDIT.HISTORY.LENGTH'] = training['CREDIT.HISTORY.LENGTH'].apply(lambda x: int(re.findall(r'\d+', x)[0])*12 + int(re.findall(r'\d+', x)[1]))
training.head(5)

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default,Age
0,420825,50578,58400,89.55,67,22807,45,1441,1984-01-01,Salaried,...,0,0,0,0,0,0,0,0,0,34
1,537409,47145,65550,73.23,67,22807,45,1502,1985-07-31,Self employed,...,0,1991,0,0,1,23,23,0,1,33
2,417566,53278,61360,89.63,67,22807,45,1497,1985-08-24,Self employed,...,0,0,0,0,0,0,0,0,0,32
3,624493,57513,66113,88.48,67,22807,45,1501,1993-12-30,Self employed,...,0,31,0,0,0,8,15,1,1,24
4,539055,52378,60300,88.39,67,22807,45,1495,1977-12-09,Self employed,...,0,0,0,0,0,0,0,1,1,40


In [13]:
training['PERFORM_CNS.SCORE.DESCRIPTION'].unique()

array(['No Bureau History Available', 'I-Medium Risk', 'L-Very High Risk',
       'A-Very Low Risk',
       'Not Scored: Not Enough Info available on the customer',
       'D-Very Low Risk', 'M-Very High Risk', 'B-Very Low Risk',
       'C-Very Low Risk', 'E-Low Risk', 'H-Medium Risk', 'F-Low Risk',
       'K-High Risk',
       'Not Scored: No Activity seen on the customer (Inactive)',
       'Not Scored: Sufficient History Not Available',
       'Not Scored: No Updates available in last 36 months', 'G-Low Risk',
       'J-High Risk', 'Not Scored: Only a Guarantor',
       'Not Scored: More than 50 active Accounts found'], dtype=object)

In [14]:
# Downsizing the categorical values of PERFORM_CNS.SCORE.DESCRIPTION
training['PERFORM_CNS.SCORE.DESCRIPTION'].replace({'No Bureau History Available' : 'No History'}, inplace = True)

training['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'Not Scored: Not Enough Info available on the customer',
    'Not Scored: No Activity seen on the customer (Inactive)',
    'Not Scored: Sufficient History Not Available',
    'Not Scored: No Updates available in last 36 months',
    'Not Scored: Only a Guarantor',
    'Not Scored: More than 50 active Accounts found'],
    value = 'Not Scored', inplace = True)

training['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'A-Very Low Risk',
    'B-Very Low Risk',
    'C-Very Low Risk',
    'D-Very Low Risk'],
    value = 'Very Low Risk', inplace = True)

training['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'E-Low Risk',
    'F-Low Risk',
    'G-Low Risk'],
    value = 'Low Risk', inplace = True)

training['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'H-Medium Risk',
    'I-Medium Risk'],
    value = 'Medium Risk', inplace = True)

training['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'J-High Risk',
    'K-High Risk'],
    value = 'High Risk', inplace = True)

training['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'L-Very High Risk',
    'M-Very High Risk'],
    value = 'Very High Risk', inplace = True)

In [15]:
training['PERFORM_CNS.SCORE.DESCRIPTION'].unique()

array(['No History', 'Medium Risk', 'Very High Risk', 'Very Low Risk',
       'Not Scored', 'Low Risk', 'High Risk'], dtype=object)

In [16]:
training['TOTAL_NO_OF_ACCTS'] = training['PRI.NO.OF.ACCTS'] + training['SEC.NO.OF.ACCTS']
training['TOTAL_ACTIVE_ACCTS'] = training['PRI.ACTIVE.ACCTS'] + training['SEC.ACTIVE.ACCTS']
training['TOTAL_OVERDUE_ACCTS'] = training['PRI.OVERDUE.ACCTS'] + training['SEC.OVERDUE.ACCTS']
training['TOTAL_CURRENT_BALANCE'] = training['PRI.CURRENT.BALANCE'] + training['SEC.CURRENT.BALANCE']
training['TOTAL_SANCTIONED_AMOUNT'] = training['PRI.SANCTIONED.AMOUNT'] + training['SEC.SANCTIONED.AMOUNT']
training['TOTAL_DISBURSED_AMOUNT'] = training['PRI.DISBURSED.AMOUNT'] + training['SEC.DISBURSED.AMOUNT']
training['TOTAL_INSTALL_AMOUNT'] = training['PRIMARY.INSTAL.AMT'] + training['SEC.INSTAL.AMT']

In [17]:
# Labelling categorical data
cns_risk_label = {'No History' : 0, 'Not Scored' : 0, 'Very Low Risk' : 1, 'Low Risk' : 2, 
              'Medium Risk': 3, 'High Risk': 4, 'Very High Risk': 5}
employment_label = {'Self employed' : 0, 'Salaried' : 1}
training.loc[:,'employment_label'] = training.loc[:,'Employment.Type'].apply(lambda x: employment_label[x])
training.loc[:,'cns_risk_label'] = (training.loc[:,'PERFORM_CNS.SCORE.DESCRIPTION'].apply(lambda x: cns_risk_label[x]))

In [18]:
# Dropping unwanted columns
columns_to_drop = ['UniqueID', 'ltv', 'branch_id', 'supplier_id', 'manufacturer_id', 'Current_pincode_ID',
                   'Date.of.Birth', 'Employment.Type','DisbursalDate', 'State_ID', 'Employee_code_ID', 'PERFORM_CNS.SCORE.DESCRIPTION']

In [19]:
train_df = training.drop(columns = columns_to_drop)

In [20]:
train_df.head()

Unnamed: 0,disbursed_amount,asset_cost,MobileNo_Avl_Flag,Aadhar_flag,PAN_flag,VoterID_flag,Driving_flag,Passport_flag,PERFORM_CNS.SCORE,PRI.NO.OF.ACCTS,...,Age,TOTAL_NO_OF_ACCTS,TOTAL_ACTIVE_ACCTS,TOTAL_OVERDUE_ACCTS,TOTAL_CURRENT_BALANCE,TOTAL_SANCTIONED_AMOUNT,TOTAL_DISBURSED_AMOUNT,TOTAL_INSTALL_AMOUNT,employment_label,cns_risk_label
0,50578,58400,1,1,0,0,0,0,0,0,...,34,0,0,0,0,0,0,0,1,0
1,47145,65550,1,1,0,0,0,0,598,1,...,33,1,1,1,27600,50200,50200,1991,0,3
2,53278,61360,1,1,0,0,0,0,0,0,...,32,0,0,0,0,0,0,0,0,0
3,57513,66113,1,1,0,0,0,0,305,3,...,24,3,0,0,0,0,0,31,0,5
4,52378,60300,1,1,0,0,0,0,0,0,...,40,0,0,0,0,0,0,0,0,0


### Doing the same for testing data

In [21]:
testing = pd.read_csv('test_bqCt9Pv.csv', infer_datetime_format = True)

In [22]:
testing['Employment.Type'] = testing['Employment.Type'].fillna(testing['Employment.Type'].mode()[0]) # Filling missing values

testing['Date.of.Birth'] = pd.to_datetime(testing['Date.of.Birth'], infer_datetime_format = True, format = '%d-%m-%y')
testing['DisbursalDate'] = pd.to_datetime(testing['DisbursalDate'], infer_datetime_format = True, format = '%d-%m-%y')
testing['Age'] = (testing['DisbursalDate'] - testing['Date.of.Birth']).astype('<m8[Y]').astype(int)

testing['AVERAGE.ACCT.AGE'] = testing['AVERAGE.ACCT.AGE'].apply(lambda x: int(re.findall(r'\d+', x)[0])*12 + int(re.findall(r'\d+', x)[1]))
testing['CREDIT.HISTORY.LENGTH'] = testing['CREDIT.HISTORY.LENGTH'].apply(lambda x: int(re.findall(r'\d+', x)[0])*12 + int(re.findall(r'\d+', x)[1]))

testing['PERFORM_CNS.SCORE.DESCRIPTION'] = testing['PERFORM_CNS.SCORE.DESCRIPTION'].replace({'No Bureau History Available' : 'No History'})

testing['PERFORM_CNS.SCORE.DESCRIPTION'] = testing['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'Not Scored: Not Enough Info available on the customer',
    'Not Scored: No Activity seen on the customer (Inactive)',
    'Not Scored: Sufficient History Not Available',
    'Not Scored: No Updates available in last 36 months',
    'Not Scored: Only a Guarantor',
    'Not Scored: More than 50 active Accounts found'],
    value = 'Not Scored')

testing['PERFORM_CNS.SCORE.DESCRIPTION'] = testing['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'A-Very Low Risk',
    'B-Very Low Risk',
    'C-Very Low Risk',
    'D-Very Low Risk'],
    value = 'Very Low Risk')

testing['PERFORM_CNS.SCORE.DESCRIPTION'] = testing['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'E-Low Risk',
    'F-Low Risk',
    'G-Low Risk'],
    value = 'Low Risk')

testing['PERFORM_CNS.SCORE.DESCRIPTION'] = testing['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'H-Medium Risk',
    'I-Medium Risk'],
    value = 'Medium Risk')

testing['PERFORM_CNS.SCORE.DESCRIPTION'] = testing['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'J-High Risk',
    'K-High Risk'],
    value = 'High Risk')

testing['PERFORM_CNS.SCORE.DESCRIPTION'] = testing['PERFORM_CNS.SCORE.DESCRIPTION'].replace(to_replace = [
    'L-Very High Risk',
    'M-Very High Risk'],
    value = 'Very High Risk')

testing['TOTAL_NO_OF_ACCTS'] = testing['PRI.NO.OF.ACCTS'] + testing['SEC.NO.OF.ACCTS']
testing['TOTAL_ACTIVE_ACCTS'] = testing['PRI.ACTIVE.ACCTS'] + testing['SEC.ACTIVE.ACCTS']
testing['TOTAL_OVERDUE_ACCTS'] = testing['PRI.OVERDUE.ACCTS'] + testing['SEC.OVERDUE.ACCTS']
testing['TOTAL_CURRENT_BALANCE'] = testing['PRI.CURRENT.BALANCE'] + testing['SEC.CURRENT.BALANCE']
testing['TOTAL_SANCTIONED_AMOUNT'] = testing['PRI.SANCTIONED.AMOUNT'] + testing['SEC.SANCTIONED.AMOUNT']
testing['TOTAL_DISBURSED_AMOUNT'] = testing['PRI.DISBURSED.AMOUNT'] + testing['SEC.DISBURSED.AMOUNT']
testing['TOTAL_INSTALL_AMOUNT'] = testing['PRIMARY.INSTAL.AMT'] + testing['SEC.INSTAL.AMT']

cns_risk_label = {'No History' : 0, 'Not Scored' : 0, 'Very Low Risk' : 1, 'Low Risk' : 2, 
              'Medium Risk': 3, 'High Risk': 4, 'Very High Risk': 5}
employment_label = {'Self employed' : 0, 'Salaried' : 1}

testing.loc[:,'employment_label'] = testing.loc[:,'Employment.Type'].apply(lambda x: employment_label[x])
testing.loc[:,'cns_risk_label'] = (testing.loc[:,'PERFORM_CNS.SCORE.DESCRIPTION'].apply(lambda x: cns_risk_label[x]))

In [23]:
test_df = testing.drop(columns = columns_to_drop)

In [24]:
print('Shape of training data:', train_df.shape)
print('Shape of testing data:', test_df.shape)

Shape of training data: (233154, 39)
Shape of testing data: (112392, 38)


In [25]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233154 entries, 0 to 233153
Data columns (total 39 columns):
 #   Column                               Non-Null Count   Dtype
---  ------                               --------------   -----
 0   disbursed_amount                     233154 non-null  int64
 1   asset_cost                           233154 non-null  int64
 2   MobileNo_Avl_Flag                    233154 non-null  int64
 3   Aadhar_flag                          233154 non-null  int64
 4   PAN_flag                             233154 non-null  int64
 5   VoterID_flag                         233154 non-null  int64
 6   Driving_flag                         233154 non-null  int64
 7   Passport_flag                        233154 non-null  int64
 8   PERFORM_CNS.SCORE                    233154 non-null  int64
 9   PRI.NO.OF.ACCTS                      233154 non-null  int64
 10  PRI.ACTIVE.ACCTS                     233154 non-null  int64
 11  PRI.OVERDUE.ACCTS                    23

Since the data is varying for each column dramatically(E.g. asset_cost: 10^3 and PAN_flag: 1-0), we need to scale the data so the model is not thrown off with the differences in columns.

In [26]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,StratifiedKFold,cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,recall_score,roc_auc_score,roc_curve,auc

### Scaling Data

In [27]:
features = ['disbursed_amount', 'asset_cost', 'MobileNo_Avl_Flag','Aadhar_flag', 'PAN_flag', 'VoterID_flag', 'Driving_flag', 'Passport_flag',
'PERFORM_CNS.SCORE', 'PRI.NO.OF.ACCTS','PRI.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT','PRI.DISBURSED.AMOUNT',
'SEC.NO.OF.ACCTS','SEC.ACTIVE.ACCTS','SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT','SEC.DISBURSED.AMOUNT',
'PRIMARY.INSTAL.AMT','SEC.INSTAL.AMT','NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS','AVERAGE.ACCT.AGE',
'CREDIT.HISTORY.LENGTH','NO.OF_INQUIRIES','Age','TOTAL_NO_OF_ACCTS','TOTAL_ACTIVE_ACCTS','TOTAL_OVERDUE_ACCTS','TOTAL_CURRENT_BALANCE',
'TOTAL_SANCTIONED_AMOUNT','TOTAL_DISBURSED_AMOUNT', 'TOTAL_INSTALL_AMOUNT','employment_label','cns_risk_label']

# features = ['disbursed_amount', 'asset_cost', 'MobileNo_Avl_Flag','Aadhar_flag', 'PAN_flag', 'VoterID_flag', 'Driving_flag', 'Passport_flag',
# 'PERFORM_CNS.SCORE','NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS','AVERAGE.ACCT.AGE',
# 'CREDIT.HISTORY.LENGTH','NO.OF_INQUIRIES','Age','TOTAL_NO_OF_ACCTS','TOTAL_ACTIVE_ACCTS','TOTAL_OVERDUE_ACCTS','TOTAL_CURRENT_BALANCE',
# 'TOTAL_SANCTIONED_AMOUNT','TOTAL_DISBURSED_AMOUNT', 'TOTAL_INSTALL_AMOUNT','employment_label','cns_risk_label']

scaler = RobustScaler()

scaled_train = train_df.copy()
scaled_test = test_df.copy()

scaled_train[features] = scaler.fit_transform(train_df[features])
scaled_test[features] = scaler.fit_transform(test_df[features])

X = scaled_train[features]
y = scaled_train['loan_default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print('Shape of X_train:', X_train.shape)
print('Shape of X_test:', X_test.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_train: (186523, 38)
Shape of X_test: (46631, 38)
Shape of y_train: (186523,)
Shape of y_test: (46631,)


### Training the Model

In [28]:
def train_model(model):
    model = model.fit(X_train,y_train)
    pred = model.predict(X_test)
    print('------', model, '------')
    print('Accuracy Score',accuracy_score(y_test, pred))
    print('Recall_score:',round(recall_score(y_test, pred),2))
    print('F1_score:',round(f1_score(y_test, pred),2))
    print('roc_auc_score:',round(roc_auc_score(y_test, pred), 2))
    print('Confusion_matrix')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('-'*50)
    return model

In [29]:
lr = train_model(LogisticRegression(max_iter = 150))
rfc = train_model(RandomForestClassifier(n_estimators=100, min_samples_split=25, max_depth=7, max_features=1))
dtc = train_model(DecisionTreeClassifier(criterion="entropy", max_depth=3))
etc = train_model(ExtraTreesClassifier())

------ LogisticRegression(max_iter=150) ------
Accuracy Score 0.7847783663228325
Recall_score: 0.0
F1_score: 0.0
roc_auc_score: 0.5
Confusion_matrix
       0   1
0  36595  19
1  10017   0
--------------------------------------------------
------ RandomForestClassifier(max_depth=7, max_features=1, min_samples_split=25) ------
Accuracy Score 0.785185820591452
Recall_score: 0.0
F1_score: 0.0
roc_auc_score: 0.5
Confusion_matrix
       0  1
0  36614  0
1  10017  0
--------------------------------------------------
------ DecisionTreeClassifier(criterion='entropy', max_depth=3) ------
Accuracy Score 0.785185820591452
Recall_score: 0.0
F1_score: 0.0
roc_auc_score: 0.5
Confusion_matrix
       0  1
0  36614  0
1  10017  0
--------------------------------------------------
------ ExtraTreesClassifier() ------
Accuracy Score 0.7565567969805495
Recall_score: 0.08
F1_score: 0.13
roc_auc_score: 0.51
Confusion_matrix
       0     1
0  34442  2172
1   9180   837
---------------------------------------

Since it is an imbalanced dataset, the recall score is low and the model is biased towards detecting the loan_default as 0 even when the true value is 1.

In [30]:
train_df.loan_default.value_counts()

0    182543
1     50611
Name: loan_default, dtype: int64

### Balancing the data using SMOTE

In [31]:
# Oversampling using SMOTE
from imblearn.over_sampling import SMOTE
print('Class Label Distribution BEFORE Oversampling:')
print(train_df.loan_default.value_counts())
print('-'*50)

oversample = SMOTE(random_state = 2)
X, y = oversample.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
print('Class Label Distribution AFTER Oversampling:')
print(y.value_counts())

Class Label Distribution BEFORE Oversampling:
0    182543
1     50611
Name: loan_default, dtype: int64
--------------------------------------------------
Class Label Distribution AFTER Oversampling:
1    182543
0    182543
Name: loan_default, dtype: int64


The dataset is now balanced.

In [32]:
lr = train_model(LogisticRegression(max_iter = 100))
rfc = train_model(RandomForestClassifier(n_estimators=100, min_samples_split=25, max_depth=7, max_features=1))
dtc = train_model(DecisionTreeClassifier(criterion="entropy", max_depth=3))
etc = train_model(ExtraTreesClassifier())

------ LogisticRegression() ------
Accuracy Score 0.5410994549289216
Recall_score: 0.38
F1_score: 0.45
roc_auc_score: 0.54
Confusion_matrix
       0      1
0  25814  10833
1  22675  13696
--------------------------------------------------
------ RandomForestClassifier(max_depth=7, max_features=1, min_samples_split=25) ------
Accuracy Score 0.6253800432770001
Recall_score: 0.72
F1_score: 0.66
roc_auc_score: 0.63
Confusion_matrix
       0      1
0  19469  17178
1  10176  26195
--------------------------------------------------
------ DecisionTreeClassifier(criterion='entropy', max_depth=3) ------
Accuracy Score 0.5721876797501986
Recall_score: 0.67
F1_score: 0.61
roc_auc_score: 0.57
Confusion_matrix
       0      1
0  17364  19283
1  11955  24416
--------------------------------------------------
------ ExtraTreesClassifier() ------
Accuracy Score 0.8425045879098304
Recall_score: 0.83
F1_score: 0.84
roc_auc_score: 0.84
Confusion_matrix
       0      1
0  31500   5147
1   6353  30018
----

Extra Trees Classifier gives better recall and roc-auc score with better accuracy.

### Predicting on test data

In [33]:
y_pred = etc.predict(scaled_test[features])
prediction = pd.DataFrame({'predicted_loan_default':y_pred})

In [34]:
prediction_df = pd.DataFrame(data = testing['UniqueID'], columns = ['UniqueID'])
prediction_df['predicted_loan_default'] = prediction['predicted_loan_default']
prediction_df.to_csv('test_prediction.csv', index = False)
prediction_df.head()

Unnamed: 0,UniqueID,predicted_loan_default
0,655269,0
1,723482,0
2,758529,0
3,763449,0
4,708663,1
