In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("./data/pre_processed_credit.csv")
df.head(2)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,loan_to_income_ratio,loan_to_emp_length_ratio,...,medium,large,very large,A,B,C,D,E,F,G
0,21,9600,5.0,1000,11.14,0,0.1,2,0.104167,0.005,...,0,0,0,0,1,0,0,0,0,0
1,25,9600,1.0,5500,12.87,1,0.57,3,0.572917,0.000182,...,1,0,0,0,0,1,0,0,0,0


In [5]:
df.columns

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_cred_hist_length', 'loan_to_income_ratio',
       'loan_to_emp_length_ratio', 'int_rate_to_loan_amt_ratio', 'MORTGAGE',
       'OTHER', 'OWN', 'RENT', 'DEBTCONSOLIDATION', 'EDUCATION',
       'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE', 'N', 'Y', '20-25',
       '26-35', '36-45', '46-55', '56-65', 'low', 'low-middle', 'middle',
       'high-middle', 'high', 'small', 'medium', 'large', 'very large', 'A',
       'B', 'C', 'D', 'E', 'F', 'G'],
      dtype='object')

In [6]:
y = df["loan_status"]

In [7]:
df.drop(['loan_status'],  axis =1 ,inplace=True)

In [8]:
df.columns

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'loan_to_income_ratio', 'loan_to_emp_length_ratio',
       'int_rate_to_loan_amt_ratio', 'MORTGAGE', 'OTHER', 'OWN', 'RENT',
       'DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL',
       'PERSONAL', 'VENTURE', 'N', 'Y', '20-25', '26-35', '36-45', '46-55',
       '56-65', 'low', 'low-middle', 'middle', 'high-middle', 'high', 'small',
       'medium', 'large', 'very large', 'A', 'B', 'C', 'D', 'E', 'F', 'G'],
      dtype='object')

# SMOTE

In [10]:
from collections import Counter
from imblearn.over_sampling import SMOTE
print('Original dataset shape {}'.format(Counter(y)))
smt = SMOTE(random_state=20)
train_input_new, train_output_new = smt.fit_resample(df, y)
print('New dataset shape {}'.format(Counter(train_output_new)))
train_input_new = pd.DataFrame(train_input_new, columns = list(df.columns))
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=42)

Original dataset shape Counter({0: 22428, 1: 6202})
New dataset shape Counter({0: 22428, 1: 22428})


In [11]:
X_train.shape

(19182, 43)

In [12]:
X_test.shape

(9448, 43)

In [13]:
y_train.shape

(19182,)

In [14]:
y_test.shape

(9448,)

# Normalize data

In [15]:
# data normalization with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing dataabs
X_test_norm = norm.transform(X_test)

In [16]:
X_train = X_train_norm
X_test = X_test_norm

# LOGISTIC REGRESSION

In [17]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(penalty= 'l1', solver='saga')

In [18]:
param_grid = [    
    {#'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20)
    }
]

In [19]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(LR, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

In [20]:
best_clf = clf.fit(X_train,y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits




In [21]:
best_clf.best_estimator_

LogisticRegression(C=1438.44988828766, penalty='l1', solver='saga')

In [22]:
print (f'Accuracy: {best_clf.score(X_train,y_train):.3f}')

Accuracy: 0.879


In [23]:
y_pred_LR = best_clf.predict(X_test)

In [24]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred_LR)

array([[7016,  349],
       [ 860, 1223]], dtype=int64)

In [25]:
from sklearn.metrics import classification_report
print("Classification report - \n", classification_report(y_test,y_pred_LR))

Classification report - 
               precision    recall  f1-score   support

           0       0.89      0.95      0.92      7365
           1       0.78      0.59      0.67      2083

    accuracy                           0.87      9448
   macro avg       0.83      0.77      0.79      9448
weighted avg       0.87      0.87      0.87      9448



### penalty= 'l2', solver='saga'

In [26]:
LR = LogisticRegression(penalty= 'l2', solver='liblinear')
LR.fit(X_train,y_train)

LogisticRegression(solver='liblinear')

In [27]:
y_pred_LR = LR.predict(X_test)

In [28]:
confusion_matrix(y_test,y_pred_LR)

array([[7016,  349],
       [ 890, 1193]], dtype=int64)

In [29]:
print("Classification report - \n", classification_report(y_test,y_pred_LR))

Classification report - 
               precision    recall  f1-score   support

           0       0.89      0.95      0.92      7365
           1       0.77      0.57      0.66      2083

    accuracy                           0.87      9448
   macro avg       0.83      0.76      0.79      9448
weighted avg       0.86      0.87      0.86      9448



In [30]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred_LR)

0.868861134631668

# Decision Tree

In [31]:
from sklearn.tree import DecisionTreeClassifier#for checking testing results
from sklearn.metrics import classification_report, confusion_matrix#for visualizing tree 

In [32]:
# Defining the decision tree algorithm
dtree=DecisionTreeClassifier(max_depth=100)
dtree.fit(X_train,y_train)
print('Decision Tree Classifier Created')

Decision Tree Classifier Created


In [33]:
# Predicting the values of test data
y_pred_DT = dtree.predict(X_test)
print("Classification report - \n", classification_report(y_test,y_pred_DT))

Classification report - 
               precision    recall  f1-score   support

           0       0.93      0.92      0.93      7365
           1       0.73      0.77      0.75      2083

    accuracy                           0.89      9448
   macro avg       0.83      0.84      0.84      9448
weighted avg       0.89      0.89      0.89      9448



In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred_DT)

array([[6778,  587],
       [ 486, 1597]], dtype=int64)

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred_DT)

0.8864309906858594

# RANDOM FOREST

In [36]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=50)

#RF.fit(X_train_scaled,y_train)
RF.fit(X_train,y_train)

RandomForestClassifier(n_estimators=50)

In [37]:
#y_pred_RF = RF.predict(X_test_scaled)
y_pred_RF = RF.predict(X_test)

In [38]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred_RF)

array([[7302,   63],
       [ 600, 1483]], dtype=int64)

In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred_RF)

0.9298264182895851

In [40]:
print("Classification report - \n", classification_report(y_test,y_pred_RF))

Classification report - 
               precision    recall  f1-score   support

           0       0.92      0.99      0.96      7365
           1       0.96      0.71      0.82      2083

    accuracy                           0.93      9448
   macro avg       0.94      0.85      0.89      9448
weighted avg       0.93      0.93      0.93      9448



# Neural Network

In [41]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import keras

In [42]:
# define the keras model
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(lr=0.001)
model = Sequential()
model.add(Dense(32, input_dim=43, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(X_train, y_train, epochs=200, batch_size=1000)
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/200


  super(Adam, self).__init__(name, **kwargs)


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 15

Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
Accuracy: 91.81


In [43]:
loss, acc = model.evaluate(X_test, y_test, verbose=0)

In [44]:
loss

0.2327815741300583

In [45]:
acc

0.9180778861045837

In [46]:
predict_x=model.predict(X_test) 
yhat_classes=np.argmax(predict_x,axis=1)

In [47]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, yhat_classes)
print("Accuracy: %f" % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes, average='weighted', labels=np.unique(yhat_classes))
print("Precision: %f" % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes, average='weighted', labels=np.unique(yhat_classes))
print("Recall: %f" % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, yhat_classes, average='weighted', labels=np.unique(yhat_classes))
print("F1 score: %f" % f1)

# confusion matrix
matrix = confusion_matrix(y_test, yhat_classes)
print(matrix)

Accuracy: 0.779530
Precision: 0.779530
Recall: 1.000000
F1 score: 0.876108
[[7365    0]
 [2083    0]]
