# Import Modules

In [254]:
# numpy,pandas,scipy,math,matplotlib
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt

# Import Data

In [255]:
# Loading preprocessed dataset from previous EDA section
rawData = pd.read_csv('new_credit.csv')
rawData.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,Sep_Status,Aug_Status,July_Status,June_Status,May_Status,...,June_Bill,May_Bill,April_Bill,Sep_Paid,Aug_Paid,July_Paid,June_Paid,May_Paid,April_Paid,Default_Payment_Next_Month
0,20000,2,2,1,0,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,0,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,0,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,1,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,2,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [256]:
# To check the data type of each variable to make sure all data are numerical
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
LIMIT_BAL                      30000 non-null int64
SEX                            30000 non-null int64
EDUCATION                      30000 non-null int64
MARRIAGE                       30000 non-null int64
AGE                            30000 non-null int64
Sep_Status                     30000 non-null int64
Aug_Status                     30000 non-null int64
July_Status                    30000 non-null int64
June_Status                    30000 non-null int64
May_Status                     30000 non-null int64
April_Status                   30000 non-null int64
Sep_Bill                       30000 non-null int64
Aug_Bill                       30000 non-null int64
July_Bill                      30000 non-null int64
June_Bill                      30000 non-null int64
May_Bill                       30000 non-null int64
April_Bill                     30000 non-null int64
Sep_Pai

In [257]:
rawData.iloc[:,0:4]

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE
0,20000,2,2,1
1,120000,2,2,2
2,90000,2,2,2
3,50000,2,2,1
4,50000,1,2,1
...,...,...,...,...
29995,220000,1,3,1
29996,150000,1,3,2
29997,30000,1,2,2
29998,80000,1,3,1


# Build Training and Testing Sets

In [258]:
# Select the Independent Variables (except "Default_Payment_Next_Month ")
features = rawData.iloc[:,0:23]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,Sep_Status,Aug_Status,July_Status,June_Status,May_Status,...,July_Bill,June_Bill,May_Bill,April_Bill,Sep_Paid,Aug_Paid,July_Paid,June_Paid,May_Paid,April_Paid
0,20000,2,2,1,0,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,120000,2,2,2,0,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,2,2,2,0,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,1,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,2,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [259]:
# Select the Dependent Variable
depVar = rawData['Default_Payment_Next_Month ']

In [260]:
# Randomly split training and testing datasets by Sci-Kit Learn
from sklearn.model_selection import train_test_split

In [261]:
# Split into 4 Dataframes
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size=0.25, random_state = 123)

In [262]:
# Check for X_train to see if it is randomly listed
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,Sep_Status,Aug_Status,July_Status,June_Status,May_Status,...,July_Bill,June_Bill,May_Bill,April_Bill,Sep_Paid,Aug_Paid,July_Paid,June_Paid,May_Paid,April_Paid
16095,140000,2,2,1,1,1,2,3,2,0,...,61459,59798,61287,8383,5200,0,0,3009,1000,94000
28548,210000,2,2,2,0,0,0,0,-2,-2,...,0,0,0,0,1000,0,0,0,0,0
25096,20000,1,3,2,2,-1,0,-1,-1,-1,...,390,18280,2880,1600,1105,390,18280,2880,1600,0
12260,90000,2,2,2,0,2,4,4,3,4,...,37825,40299,39093,38167,2000,0,3400,0,0,1000
21549,50000,2,3,2,0,-2,-2,-2,-2,-2,...,1697,0,0,5000,0,1699,0,0,5000,0


In [263]:
X_test.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,Sep_Status,Aug_Status,July_Status,June_Status,May_Status,...,July_Bill,June_Bill,May_Bill,April_Bill,Sep_Paid,Aug_Paid,July_Paid,June_Paid,May_Paid,April_Paid
25665,40000,2,2,2,0,-1,0,0,0,-1,...,13600,0,44024,18697,1300,1000,0,22373,680,10000
16464,80000,2,3,1,2,0,0,0,0,0,...,53169,50875,50372,49470,3212,2106,2000,1603,1903,2006
22386,170000,2,1,2,0,2,2,2,2,2,...,161487,157577,168094,170922,6800,6500,0,13000,5500,1000
10149,200000,2,2,1,1,-2,-2,-2,-2,-2,...,0,0,0,0,742,0,0,0,0,0
8729,50000,1,2,1,1,0,0,0,0,0,...,9044,9225,9417,9617,1140,1150,331,341,356,330


In [264]:
y_train.head()

16095    0
28548    0
25096    1
12260    0
21549    0
Name: Default_Payment_Next_Month , dtype: int64

In [265]:
y_test.head()

25665    0
16464    0
22386    1
10149    1
8729     0
Name: Default_Payment_Next_Month , dtype: int64

In [266]:
# Use the shape function to double check that the split was made as needed:
X_train.shape, X_test.shape

((22500, 23), (7500, 23))

# Model Building
## Using train_test_split Method

* Here we use five different classification classifiers 
 * Random Forest
 * Supportive Vector Machine (SVM) 
 * Gradient Boosting
 * Logistics Regression
 * Decision Trees

### Random Forest

In [163]:
from sklearn.ensemble import RandomForestClassifier

In [164]:
modelRF = RandomForestClassifier(n_estimators=100)

In [66]:
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [72]:
preds= modelRF.predict(X_test)

In [75]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,preds)

0.8128

In [67]:
modelRF.score(X_train,y_train)

0.9966222222222222

In [235]:
# fit the model and time it
import time

t0 = time.time()
modelRF.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
total

6.23421311378479

### Support Vector Machines (SVM)

In [76]:
from sklearn.svm import SVC

In [289]:
modelSVM = SVC(gamma='scale')

In [290]:
modelSVM.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [105]:
preds= modelSVM.predict(X_test)

In [106]:
accuracy_score(y_test,preds)

0.7853333333333333

In [107]:
modelSVM.score(X_train,y_train)

0.9899555555555556

In [236]:
# fit the model and time it
import time

t0 = time.time()
modelSVM.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
total

143.12159276008606

### Gradient Boosting 

In [110]:
from sklearn.ensemble import GradientBoostingClassifier

In [121]:
modelGB = GradientBoostingClassifier()

In [122]:
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [123]:
preds= modelGB.predict(X_test)

In [124]:
accuracy_score(y_test,preds)

0.8228

In [125]:
modelGB.score(X_train,y_train)

0.8258222222222222

In [237]:
# fit the model and time it
import time

t0 = time.time()
modelGB.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
total

3.8397457599639893

### Logistics Regression

In [267]:
from sklearn.linear_model import LogisticRegression

In [274]:
modelLR = LogisticRegression(solver='lbfgs')

In [275]:
modelLR.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [283]:
preds= modelLR.predict(X_test)

In [284]:
accuracy_score(y_test,preds)

0.7825333333333333

In [285]:
modelLR.score(X_train,y_train)

0.7773333333333333

In [239]:
# fit the model and time it
import time

t0 = time.time()
modelLR.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
total

0.2474079132080078

###  Decision Trees

In [147]:
from sklearn import tree

In [149]:
modelTREE = tree.DecisionTreeClassifier()

In [150]:
modelTREE.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [151]:
preds= modelLR.predict(X_test)

In [152]:
accuracy_score(y_test,preds)

0.7828

In [153]:
modelTREE.score(X_train,y_train)

0.9966666666666667

In [240]:
# fit the model and time it
import time

t0 = time.time()
modelTREE.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
total

0.5205094814300537

# Apply Cross Validation (another way)
  * Random Forest, SVM, and Decision Trees models are overfitting
  * We are going to use Cross Validation method to avoid overfitting

In [277]:
from sklearn.model_selection import cross_val_score

In [298]:
import warnings
warnings.filterwarnings("ignore")

In [194]:
# Random Forest cross_val_score with 3-fold validation
print(cross_val_score(RandomForestClassifier(n_estimators=100, random_state=0), X_train, y_train, cv=3))

[0.81069191 0.8172     0.81664222]


In [299]:
# Support Vector Machines cross_val_score
cross_val_score(SVC(random_state=0), X_train, y_train, cv=3)

array([0.77736302, 0.77826667, 0.77917056])

In [185]:
# Gradient Boosting cross_val_score
print(cross_val_score(GradientBoostingClassifier(n_estimators=100, random_state=0), X_train, y_train, cv=3))

[0.81629116 0.8216     0.82224297]


In [300]:
# Logistic Regression cross_val_score
print(cross_val_score(LogisticRegression(random_state=0), X_train, y_train, cv=3))

[0.7772297  0.77706667 0.77743699]


In [184]:
# Decision Trees cross_val_score
print(cross_val_score(tree.DecisionTreeClassifier(n_estimators=100, random_state=0), X_train, y_train, cv=3))

[0.71643781 0.72426667 0.72703027]


# Make Prediction and Evaluation
  * Gradient Boosting model shows the best result, and we will apply this model to our final prediction.
  * We will make the predictions in two ways.
    1. Use train_test_split method
    2. Use cross_val_predict() function
  * We will use confusion matrix, accuracy score, and classification report to evaluate the predictions.

#### Apply train_test_split method to gradient boosting model to make the predictions.

In [197]:
from sklearn.metrics import confusion_matrix

In [199]:
prediction_GB = modelGB.predict(X_test)
confusion_matrix(y_test, prediction_GB)

array([[5553,  320],
       [1009,  618]], dtype=int64)

In [202]:
accuracy_score(y_test,prediction_GB)

0.8228

In [213]:
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(y_test, prediction_GB)

0.3841614289753238

In [293]:
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction_GB))

              precision    recall  f1-score   support

           0       0.85      0.95      0.89      5873
           1       0.66      0.38      0.48      1627

    accuracy                           0.82      7500
   macro avg       0.75      0.66      0.69      7500
weighted avg       0.81      0.82      0.80      7500



#### Use cross_val_predict to make the prediction

In [205]:
from sklearn.model_selection import cross_val_predict

In [207]:
prediction_GB_cv = cross_val_predict(modelGB, X_test, y_test, cv=3)
confusion_matrix(y_test, prediction_GB_cv)

array([[5552,  321],
       [1031,  596]], dtype=int64)

In [208]:
accuracy_score(y_test, prediction_GB_cv)

0.8197333333333333

In [214]:
cohen_kappa_score(y_test, prediction_GB_cv)

0.37003303039832924

In [292]:
print(classification_report(y_test, prediction_GB_cv))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      5873
           1       0.65      0.37      0.47      1627

    accuracy                           0.82      7500
   macro avg       0.75      0.66      0.68      7500
weighted avg       0.80      0.82      0.80      7500



### We chose Gradient Boosting Model using train_test_split() function as our final model based on the highest accuracy score