In [1]:
#Importing the library
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.preprocessing import StandardScaler

#import imblearn
#from imblearn.over_sampling import SMOTE
from scipy.stats import chi2_contingency


import warnings
warnings.filterwarnings('ignore')

In [2]:
#loading the training and testing data
df_train = pd.read_csv('E:/Data Science/NBFC Loan Default/Train_set_(1)_(1).csv')
df_test = pd.read_csv('E:/Data Science/NBFC Loan Default/Test_set_(1)_(2).csv')

# Data Exploration

In [3]:
#Checking the first 5 rows
df_train.head()

Unnamed: 0,ID,loan_amnt,loan_term,interest_rate,loan_grade,loan_subgrade,job_experience,home_ownership,annual_income,income_verification_status,...,delinq_2yrs,public_records,revolving_balance,total_acc,interest_receive,application_type,last_week_pay,total_current_balance,total_revolving_limit,default
0,72199369,9000,3 years,9.17,B,B2,<5 Years,OWN,85000.0,Not Verified,...,0.0,0.0,39519,20.0,59.6,INDIVIDUAL,4.0,95493.0,84100.0,0
1,14257956,18000,3 years,13.65,C,C1,<5 Years,OWN,64000.0,Verified,...,0.0,1.0,9783,24.0,3348.25,INDIVIDUAL,95.0,185433.0,13500.0,0
2,66216451,16000,3 years,7.26,A,A4,<5 Years,MORTGAGE,150000.0,Source Verified,...,2.0,0.0,13641,27.0,276.69,INDIVIDUAL,13.0,180519.0,19300.0,0
3,46974169,25000,3 years,13.99,C,C4,,MORTGAGE,59800.0,Verified,...,0.0,0.0,35020,35.0,1106.72,INDIVIDUAL,17.0,183208.0,55400.0,0
4,46725961,17000,3 years,6.39,A,A2,10+ years,MORTGAGE,72000.0,Source Verified,...,0.0,0.0,23990,26.0,725.29,INDIVIDUAL,39.0,23990.0,81300.0,0


In [4]:
#Checking the shape
df_train.shape

(93174, 23)

In [5]:
#Checking the info
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93174 entries, 0 to 93173
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          93174 non-null  int64  
 1   loan_amnt                   93174 non-null  int64  
 2   loan_term                   93174 non-null  object 
 3   interest_rate               93174 non-null  float64
 4   loan_grade                  93174 non-null  object 
 5   loan_subgrade               93174 non-null  object 
 6   job_experience              88472 non-null  object 
 7   home_ownership              93174 non-null  object 
 8   annual_income               93173 non-null  float64
 9   income_verification_status  93174 non-null  object 
 10  loan_purpose                93174 non-null  object 
 11  state_code                  93174 non-null  object 
 12  debt_to_income              93174 non-null  float64
 13  delinq_2yrs                 931

In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39933 entries, 0 to 39932
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          39933 non-null  int64  
 1   loan_amnt                   39933 non-null  int64  
 2   loan_term                   39933 non-null  object 
 3   interest_rate               39933 non-null  float64
 4   loan_grade                  39933 non-null  object 
 5   loan_subgrade               39933 non-null  object 
 6   job_experience              37844 non-null  object 
 7   home_ownership              39933 non-null  object 
 8   annual_income               39933 non-null  float64
 9   income_verification_status  39933 non-null  object 
 10  loan_purpose                39933 non-null  object 
 11  state_code                  39933 non-null  object 
 12  debt_to_income              39933 non-null  float64
 13  delinq_2yrs                 399

# Data Cleaning 

In [7]:
#Checking duplicate values
df_train.duplicated().sum()

0

In [8]:
#Checking missing values
df_train.isnull().sum()

ID                               0
loan_amnt                        0
loan_term                        0
interest_rate                    0
loan_grade                       0
loan_subgrade                    0
job_experience                4702
home_ownership                   0
annual_income                    1
income_verification_status       0
loan_purpose                     0
state_code                       0
debt_to_income                   0
delinq_2yrs                      2
public_records                   2
revolving_balance                0
total_acc                        2
interest_receive                 0
application_type                 0
last_week_pay                 1924
total_current_balance         7386
total_revolving_limit         7386
default                          0
dtype: int64

In [9]:
#dropping the ID column
df_train = df_train.drop(['ID','state_code','loan_grade'],axis=1)

In [10]:
#Encode categorical variable
cat_col = ['loan_term','loan_subgrade','job_experience','home_ownership','income_verification_status','loan_purpose','application_type']
df_train['loan_term'] = df_train.loan_term.map({'3 years':0,'5 years':1})
df_train['loan_subgrade'] = df_train.loan_subgrade.map({'A1':0,'A2':1,'A3':2,'A4':3,'A5':4,'B1':5,'B2':6,'B3':7,'B4':8,'B5':9,'C1':10,'C2':11,'C3':12,'C4':13,'C5':14,
                                                    'D1':15,'D2':16,'D3':17,'D4':18,'D5':19,'E1':20,'E2':21,'E3':22,'E4':23,'E5':24,'F1':25,'F2':26,'F3':27,'F4':28,'F5':29,
                                                'G1':30,'G2':31,'G3':32,'G4':33,'G5':34})
df_train['job_experience'] =df_train.job_experience.map({'<5 Years':0,'6-10 years':1,'10+ years':2})
df_train['home_ownership'] = df_train.home_ownership.map({'MORTGAGE':0,'RENT':1,'OWN':2,'OTHER':3,'NONE':4})
df_train['income_verification_status'] = df_train.income_verification_status.map({'Source Verified':0,'Verified':1,'Not Verified':2})
df_train['loan_purpose'] = df_train.loan_purpose.map({'debt_consolidation':0,'credit_card':1,'other':2,'home_improvement':3})
df_train['application_type'] = df_train.application_type.map({'INDIVIDUAL':0,'JOINT':1})

In [11]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=7)
df_train[['annual_income','delinq_2yrs','public_records','total_acc','last_week_pay','total_current_balance']] = imputer.fit_transform(X=df_train[['annual_income','delinq_2yrs','public_records','total_acc','last_week_pay','total_current_balance']])

# EDA

In [12]:
#Dropping total_revolving_limit
df_train = df_train.drop('total_revolving_limit',axis=1)
df_test = df_test.drop('total_revolving_limit',axis=1)

In [13]:
#Chi Square test for all categorical variable
cat_col = ['loan_term','loan_subgrade','job_experience','home_ownership','income_verification_status','loan_purpose',
           'application_type']
for col in cat_col:
    cross_tab = pd.crosstab(df_train[col],df_train['default'])
    chi2, p, dof, expected = chi2_contingency(cross_tab)
    
    if p < 0.05:
        print('For',col,': p =',p,',Since p < 0.05, There is a relationship between',col,'and default.')
    else:
        print('For',col,': p =',p,',Since p > 0.05, There is no relationship between',col,'and default.')

For loan_term : p = 0.0 ,Since p < 0.05, There is a relationship between loan_term and default.
For loan_subgrade : p = 7.20260619090703e-100 ,Since p < 0.05, There is a relationship between loan_subgrade and default.
For job_experience : p = 3.139835375705397e-23 ,Since p < 0.05, There is a relationship between job_experience and default.
For home_ownership : p = 1.8474415714935548e-19 ,Since p < 0.05, There is a relationship between home_ownership and default.
For income_verification_status : p = 5.615080262506217e-183 ,Since p < 0.05, There is a relationship between income_verification_status and default.
For loan_purpose : p = 2.1886850683537214e-89 ,Since p < 0.05, There is a relationship between loan_purpose and default.
For application_type : p = 5.803574384959491e-05 ,Since p < 0.05, There is a relationship between application_type and default.


In [14]:
#Calculating the number of outlier in each feature of the dataset
des = df_train.describe()
for col in des.columns:
    q1 = des.loc['25%',col]
    q3 = des.loc['75%',col]
    iqr = q3-q1
    upper_fence = q3 + 1.5*iqr
    lower_fence = q1 -1.5*iqr
    print("Number of outlier in", col ,"feature is", 
          len(df_train[col][(df_train[col]>upper_fence) | (df_train[col]<lower_fence)]))

Number of outlier in loan_amnt feature is 0
Number of outlier in loan_term feature is 0
Number of outlier in interest_rate feature is 610
Number of outlier in loan_subgrade feature is 785
Number of outlier in job_experience feature is 0
Number of outlier in home_ownership feature is 26
Number of outlier in annual_income feature is 4201
Number of outlier in income_verification_status feature is 0
Number of outlier in loan_purpose feature is 5391
Number of outlier in debt_to_income feature is 9
Number of outlier in delinq_2yrs feature is 17858
Number of outlier in public_records feature is 14238
Number of outlier in revolving_balance feature is 5044
Number of outlier in total_acc feature is 1950
Number of outlier in interest_receive feature is 6766
Number of outlier in application_type feature is 56
Number of outlier in last_week_pay feature is 618
Number of outlier in total_current_balance feature is 3267
Number of outlier in default feature is 22129


In [15]:
#Imputing outliers 
des1 = df_train.describe()
columns=['interest_rate','annual_income','debt_to_income','delinq_2yrs','public_records','revolving_balance','total_acc','interest_receive',
         'last_week_pay','total_current_balance']
for col in columns:
    q1 = des1.loc['25%',col]
    q3 = des1.loc['75%',col]
    iqr = q3-q1
    upper_fence = q3 + 1.5*iqr
    lower_fence = q1 -1.5*iqr
    df_include = df_train.loc[(df_train[col] >= lower_fence) & (df_train[col] <= upper_fence)]
    mean = df_include[col].mean()
    df_train[col]=np.where(df_train[col]>upper_fence,upper_fence,df_train[col])
    df_train[col]=np.where(df_train[col] < lower_fence, lower_fence,df_train[col])
    print("Number of outlier after imputing outlier with mean in", col ,"feature is", 
          len(df_train[col][(df_train[col]>upper_fence) | (df_train[col]<lower_fence)]))

Number of outlier after imputing outlier with mean in interest_rate feature is 0
Number of outlier after imputing outlier with mean in annual_income feature is 0
Number of outlier after imputing outlier with mean in debt_to_income feature is 0
Number of outlier after imputing outlier with mean in delinq_2yrs feature is 0
Number of outlier after imputing outlier with mean in public_records feature is 0
Number of outlier after imputing outlier with mean in revolving_balance feature is 0
Number of outlier after imputing outlier with mean in total_acc feature is 0
Number of outlier after imputing outlier with mean in interest_receive feature is 0
Number of outlier after imputing outlier with mean in last_week_pay feature is 0
Number of outlier after imputing outlier with mean in total_current_balance feature is 0


# Data Preparation

In [16]:
x=df_train.drop('default',axis=1)
y=df_train['default'].values

In [17]:
x = pd.get_dummies(data=x,columns=cat_col)

In [18]:
sc = StandardScaler()
x = sc.fit_transform(x)

# Testing Data Preparation

In [19]:
#Encode categorical variable
cat_col = ['loan_term','loan_subgrade','job_experience','home_ownership','income_verification_status','loan_purpose','application_type']
df_test['loan_term'] = df_test.loan_term.map({'3 years':0,'5 years':1})
df_test['loan_subgrade'] = df_test.loan_subgrade.map({'A1':0,'A2':1,'A3':2,'A4':3,'A5':4,'B1':5,'B2':6,'B3':7,'B4':8,'B5':9,'C1':10,'C2':11,'C3':12,'C4':13,'C5':14,
                                                    'D1':15,'D2':16,'D3':17,'D4':18,'D5':19,'E1':20,'E2':21,'E3':22,'E4':23,'E5':24,'F1':25,'F2':26,'F3':27,'F4':28,'F5':29,
                                                'G1':30,'G2':31,'G3':32,'G4':33,'G5':34})
df_test['job_experience'] = df_test.job_experience.map({'<5 Years':0,'6-10 years':1,'10+ years':2})
df_test['home_ownership'] = df_test.home_ownership.map({'MORTGAGE':0,'RENT':1,'OWN':2,'OTHER':3,'NONE':4})
df_test['income_verification_status'] = df_test.income_verification_status.map({'Source Verified':0,'Verified':1,'Not Verified':2})
df_test['loan_purpose'] = df_test.loan_purpose.map({'debt_consolidation':0,'credit_card':1,'other':2,'home_improvement':3})
df_test['application_type'] = df_test.application_type.map({'INDIVIDUAL':0,'JOINT':1})

In [20]:
df_test[['annual_income','delinq_2yrs','public_records','total_acc','last_week_pay','total_current_balance']] = imputer.transform(X=df_test[['annual_income','delinq_2yrs','public_records','total_acc','last_week_pay','total_current_balance']])

In [21]:
x_sample = df_test.drop(['ID','state_code','loan_grade'],axis=1)

In [22]:
x_sample = pd.get_dummies(data=x_sample, columns=cat_col)

In [23]:
x_sample = sc.transform(x_sample)

# Training Neural Networks

In [33]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=512,activation = 'relu',kernel_initializer='he_uniform'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(units=264,activation = 'relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.35))
model.add(tf.keras.layers.Dense(units=128,activation = 'relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(units=64,activation = 'relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(units=1,activation = 'sigmoid'))

In [34]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['Accuracy','Precision','Recall'])

In [35]:
model.fit(x,y,batch_size=224,epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x2b2179d7a60>

In [36]:
model.save('model.h5')

In [37]:
ypred = model.predict_classes(x_sample)

In [38]:
df_test['default'] = ypred

In [39]:
sub = df_test[['ID','default']]

In [40]:
sub.to_csv('submission_2.csv',index=False)

In [76]:
#Creating function to trained the model
def train_test(model, x_train,y_train,x_test,y_test):
    model.fit(x_train,y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    print('\nTraining Accuracy: ',model.score(x_train,y_train))
    print('Testing Accuracy: ',model.score(x_test,y_test))
    
    
    print('\nConfusion matrix for training set:\n')
    model_cm_train = confusion_matrix(y_train, y_train_pred)
    sns.heatmap(model_cm_train, annot=True,  fmt='.2f', xticklabels = ["0", "1"] , yticklabels = ["0", "1"] )
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
    
    print('\nConfusion matrix for testing set:\n')
    model_cm_test = confusion_matrix(y_test, y_test_pred)
    sns.heatmap(model_cm_test, annot=True,  fmt='.2f', xticklabels = ["0", "1"] , yticklabels = ["0", "1"] )
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
    print('************************************************************************')
    print('\nClassification report for Training set:\n')
    print(classification_report(y_train,y_train_pred))
    
    print('\nClassification report for Testing set:\n')
    print(classification_report(y_test,y_test_pred))
    print('************************************************************************')

In [78]:
x_train,x_test,y_train,y_test =train_test_split(x,y,train_size=.8,random_state=1)

In [None]:
#Model Training 
lr = SVC()
train_test(lr,x_train, y_train, x_test, y_test)