# Evaluate Classification Models For Loan Default Prediction

In [1]:
#Import Pandas, NumPy, SciPy, Math, Matplotlib
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

#Import Estimators for Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

#Model metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

#Cross-Validation
from sklearn.model_selection import train_test_split

#Recursive Feature Elimination with Cross-validation
from sklearn.feature_selection import RFECV

In [2]:
#Import data
rawData = pd.read_csv('Preprocessed_DefaultData_v2.csv', header=1)
np.random.seed(0)
rawData.head()

Unnamed: 0,LIMITBAL_Small,LIMITBAL_Medium,LIMITBAL_Large,LIMITBAL_Super,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,BILL_AMT2,BILL_AMT3,BILL_AMT4,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,0,0,0,2,2,-1,-1,-2,-2,...,3102,689,0,0,689,0,0,0,0,1
1,0,1,0,0,-1,2,0,0,0,2,...,1725,2682,3272,0,1000,1000,1000,0,2000,1
2,0,1,0,0,0,0,0,0,0,0,...,14027,13559,14331,1518,1500,1000,1000,1000,5000,0
3,1,0,0,0,0,0,0,0,0,0,...,48233,49291,28314,2000,2019,1200,1100,1069,1000,0
4,1,0,0,0,-1,0,-1,0,0,0,...,5670,35835,20940,2000,36681,10000,9000,689,679,0


In [3]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 21 columns):
LIMITBAL_Small     30000 non-null int64
LIMITBAL_Medium    30000 non-null int64
LIMITBAL_Large     30000 non-null int64
LIMITBAL_Super     30000 non-null int64
PAY_0              30000 non-null int64
PAY_2              30000 non-null int64
PAY_3              30000 non-null int64
PAY_4              30000 non-null int64
PAY_5              30000 non-null int64
PAY_6              30000 non-null int64
BILL_AMT1          30000 non-null int64
BILL_AMT2          30000 non-null int64
BILL_AMT3          30000 non-null int64
BILL_AMT4          30000 non-null int64
PAY_AMT1           30000 non-null int64
PAY_AMT2           30000 non-null int64
PAY_AMT3           30000 non-null int64
PAY_AMT4           30000 non-null int64
PAY_AMT5           30000 non-null int64
PAY_AMT6           30000 non-null int64
default            30000 non-null int64
dtypes: int64(21)
memory usage: 4.8 MB


In [4]:
#Select features for Classification
features = rawData.iloc[:,:20]
print('Summary of Feature Sample ')
features.head()

Summary of Feature Sample 


Unnamed: 0,LIMITBAL_Small,LIMITBAL_Medium,LIMITBAL_Large,LIMITBAL_Super,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,1,0,0,0,2,2,-1,-1,-2,-2,3913,3102,689,0,0,689,0,0,0,0
1,0,1,0,0,-1,2,0,0,0,2,2682,1725,2682,3272,0,1000,1000,1000,0,2000
2,0,1,0,0,0,0,0,0,0,0,29239,14027,13559,14331,1518,1500,1000,1000,1000,5000
3,1,0,0,0,0,0,0,0,0,0,46990,48233,49291,28314,2000,2019,1200,1100,1069,1000
4,1,0,0,0,-1,0,-1,0,0,0,8617,5670,35835,20940,2000,36681,10000,9000,689,679


In [5]:
#Select the dependent variable
depVar = rawData['default']

In [38]:
#Use Model Selection from sklearn to create cross validation on training sets with 70/30 split
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size = 0.3, random_state=1)

In [39]:
#Verifying the training, validation and test sets
X_test_count = len(X_test.index)
print('The number of observations in the X testing set are:',str(X_test_count))
y_train_count = len(y_train.index)
print('The number of observations in the Y training set are:',str(y_train_count))
y_train.head()

The number of observations in the X testing set are: 9000
The number of observations in the Y training set are: 21000


4936     0
4788     0
8447     1
4535     1
27563    0
Name: default, dtype: int64

In [40]:
X_train.shape, X_test.shape

((21000, 20), (9000, 20))

In [41]:
#Define variable names for base classifier Models
modelRF = RandomForestClassifier(max_depth=10, n_estimators=50)
modelKNN = KNeighborsClassifier(n_neighbors = 5, algorithm = "kd_tree")
modelDT = DecisionTreeClassifier(max_depth=10, splitter = "random")

In [42]:
#Train the base Random Forest classifier
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [43]:
#Train the base K-Nearest Neighbor Classifier
modelKNN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [44]:
#Train the base Decision Tree Classifier
modelDT.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')

In [13]:
#Estimate metrics for cross validation for base models
print('Cross validation scores for base models on training set \n')
print(cross_val_score(modelRF, X_train, y_train))
print(cross_val_score(modelKNN, X_train, y_train))
print(cross_val_score(modelDT, X_train, y_train))

Cross validation scores for base models on training set 





[0.82066667 0.82283333 0.814     ]




[0.76116667 0.75766667 0.76016667]
[0.80233333 0.819      0.80683333]




In [14]:
#Obtain base model scores on training sets
print('Model scores for all classifiers on training set \n')
print('Model score for Random Forest Classifier: %.5f' % modelRF.score(X_train,y_train))
print('Model score for k-Nearest Neighbor Classifier: %.5f' % modelKNN.score(X_train,y_train))
print('Model score for Decision Tree Classifier: %.5f' % modelDT.score(X_train,y_train))

Model scores for all classifiers on training set 

Model score for Random Forest Classifier: 0.85794
Model score for k-Nearest Neighbor Classifier: 0.81272
Model score for Decision Tree Classifier: 0.83883


In [45]:
#Obtain training set Kappa scores for all base models
print('Kappa scores for all base models on validation set \n')
print('Kappa score for Random Forest Classifier: %.5f' % cohen_kappa_score(y_train,modelRF.predict(X_train)))
print('Kappa score for k-Nearest Neighbor Classifier: %.5f' % cohen_kappa_score(y_train,modelKNN.predict(X_train)))
print('Kappa score for Decision Tree Classifier: %.5f' % cohen_kappa_score(y_train,modelDT.predict(X_train)))

Kappa scores for all base models on validation set 

Kappa score for Random Forest Classifier: 0.50030
Kappa score for k-Nearest Neighbor Classifier: 0.33891
Kappa score for Decision Tree Classifier: 0.43523


In [46]:
#Obtain training set Accuracy scores for all base models
print('Accuracy scores for all base models on validation set \n')
print('Accuracy score for Random Forest Classifier: %.5f' % accuracy_score(y_train,modelRF.predict(X_train)))
print('Accuracy score for k-Nearest Neighbor Classifier: %.5f' % accuracy_score(y_train,modelKNN.predict(X_train)))
print('Accuracy score for Decision Tree Classifier: %.5f' % accuracy_score(y_train,modelDT.predict(X_train)))

Accuracy scores for all base models on validation set 

Accuracy score for Random Forest Classifier: 0.85624
Accuracy score for k-Nearest Neighbor Classifier: 0.81262
Accuracy score for Decision Tree Classifier: 0.83600


In [47]:
#Obtain training set Precision, Recall, Fscore and support for all base models
print('Precision, Recall, Fscore and Support for all base models on validation set\n')
print('Precision, Recall, Fscore and Support for Random Forest Classifier:') 
print(precision_recall_fscore_support(y_train,modelRF.predict(X_train)))
print('Precision, Recall, Fscore and Support for k-Nearest Neighbor Classifier:')
print(precision_recall_fscore_support(y_train,modelKNN.predict(X_train)))
print('Precision, Recall, Fscore and Support for Decision Tree Classifier:')
print(precision_recall_fscore_support(y_train,modelDT.predict(X_train)))

Precision, Recall, Fscore and Support for all base models on validation set

Precision, Recall, Fscore and Support for Random Forest Classifier:
(array([0.86100741, 0.82121573]), array([0.97250061, 0.4458585 ]), array([0.91336413, 0.57793933]), array([16364,  4636], dtype=int64))
Precision, Recall, Fscore and Support for k-Nearest Neighbor Classifier:
(array([0.83334227, 0.64870598]), array([0.94940112, 0.32981018]), array([0.88759391, 0.43729444]), array([16364,  4636], dtype=int64))
Precision, Recall, Fscore and Support for Decision Tree Classifier:
(array([0.85143075, 0.7276547 ]), array([0.95642875, 0.41091458]), array([0.90088068, 0.52522746]), array([16364,  4636], dtype=int64))


In [48]:
#Predictions on test set using base random forest classifier
predictionsRF = modelRF.predict(X_test)
print('Accuracy score for Random Forest Classifier on test set: %.5f' % accuracy_score(y_test,predictionsRF))

Accuracy score for Random Forest Classifier on test set: 0.81733


In [49]:
#Improve model accuracy with RFE using base random forest classifier
rfecvRF = RFECV(estimator = modelRF, step = 1,cv=5, scoring = "accuracy")
rfecvRF.fit(X_train,y_train)
print("Optimal number of features : %d" % rfecvRF.n_features_)
print('Feature ranking: \n', rfecvRF.ranking_)

Optimal number of features : 17
Feature ranking: 
 [1 2 3 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [50]:
#Run predictions with optimal parameters on test set
rfecvRF.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [51]:
#Obtain accuracy of base Random forest classifier with test sets after RFE
print('Accuracy score for Random Forest Classifier on test set: %.5f' % accuracy_score(y_test,rfecvRF.predict(X_test)))

Accuracy score for Random Forest Classifier on test set: 0.81700


In [52]:
#Obtain confusion matrix for base Random Forest Classifier results
cmRF = confusion_matrix(y_test,modelRF.predict(X_test))
print(cmRF)

[[6668  332]
 [1312  688]]


## Model Parameter Tuning

In [53]:
#Define variable names for tuned classifier Models
modelRF2 = RandomForestClassifier(max_depth=None, n_estimators=100)
modelKNN2 = KNeighborsClassifier(n_neighbors = 3, algorithm = "auto")
modelDT2 = DecisionTreeClassifier(max_depth=None, splitter = "best")

In [54]:
#Train the tuned Random Forest classifier
modelRF2.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [55]:
#Train the tuned K-Nearest Neighbor Classifier
modelKNN2.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [56]:
#Train the tuned Decision Tree Classifier
modelDT2.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [27]:
#Estimate metrics for cross validation for tuned models
print('Cross validation scores for tuned models on training set \n')
print(cross_val_score(modelRF2, X_train, y_train))
print(cross_val_score(modelKNN2, X_train, y_train))
print(cross_val_score(modelDT2, X_train, y_train))

Cross validation scores for tuned models on training set 





[0.81166667 0.8155     0.81383333]




[0.73916667 0.73133333 0.748     ]




[0.72833333 0.721      0.72516667]


In [57]:
#Obtain tuned model scores on training sets
print('Model scores for all tuned classifiers on training set \n')
print('Model score for Random Forest Classifier: %.5f' % modelRF2.score(X_train,y_train))
print('Model score for k-Nearest Neighbor Classifier: %.5f' % modelKNN2.score(X_train,y_train))
print('Model score for Decision Tree Classifier: %.5f' % modelDT2.score(X_train,y_train))

Model scores for all tuned classifiers on training set 

Model score for Random Forest Classifier: 0.98814
Model score for k-Nearest Neighbor Classifier: 0.83933
Model score for Decision Tree Classifier: 0.98819


In [58]:
#Obtain training set Kappa scores for all tuned models
print('Kappa scores for all tuned models on validation set \n')
print('Kappa score for Random Forest Classifier: %.5f' % cohen_kappa_score(y_train,modelRF2.predict(X_train)))
print('Kappa score for k-Nearest Neighbor Classifier: %.5f' % cohen_kappa_score(y_train,modelKNN2.predict(X_train)))
print('Kappa score for Decision Tree Classifier: %.5f' % cohen_kappa_score(y_train,modelDT2.predict(X_train)))

Kappa scores for all tuned models on validation set 

Kappa score for Random Forest Classifier: 0.96492
Kappa score for k-Nearest Neighbor Classifier: 0.46908
Kappa score for Decision Tree Classifier: 0.96504


In [59]:
#Obtain training set Accuracy scores for all tuned models
print('Accuracy scores for all tuned models on validation set \n')
print('Accuracy score for Random Forest Classifier: %.5f' % accuracy_score(y_train,modelRF2.predict(X_train)))
print('Accuracy score for k-Nearest Neighbor Classifier: %.5f' % accuracy_score(y_train,modelKNN2.predict(X_train)))
print('Accuracy score for Decision Tree Classifier: %.5f' % accuracy_score(y_train,modelDT2.predict(X_train)))

Accuracy scores for all tuned models on validation set 

Accuracy score for Random Forest Classifier: 0.98814
Accuracy score for k-Nearest Neighbor Classifier: 0.83933
Accuracy score for Decision Tree Classifier: 0.98819


In [60]:
#Obtain training set Precision, Recall, Fscore and support for all tuned models
print('Precision, Recall, Fscore and Support for all tuned models on validation set\n')
print('Precision, Recall, Fscore and Support for Random Forest Classifier:') 
print(precision_recall_fscore_support(y_train,modelRF2.predict(X_train)))
print('Precision, Recall, Fscore and Support for k-Nearest Neighbor Classifier:')
print(precision_recall_fscore_support(y_train,modelKNN2.predict(X_train)))
print('Precision, Recall, Fscore and Support for Decision Tree Classifier:')
print(precision_recall_fscore_support(y_train,modelDT2.predict(X_train)))

Precision, Recall, Fscore and Support for all tuned models on validation set

Precision, Recall, Fscore and Support for Random Forest Classifier:
(array([0.98559634, 0.99773088]), array([0.9993889 , 0.94844694]), array([0.9924447 , 0.97246489]), array([16364,  4636], dtype=int64))
Precision, Recall, Fscore and Support for k-Nearest Neighbor Classifier:
(array([0.86240375, 0.70500325]), array([0.94451234, 0.46807593]), array([0.90159249, 0.56261343]), array([16364,  4636], dtype=int64))
Precision, Recall, Fscore and Support for Decision Tree Classifier:
(array([0.98548018, 0.99840981]), array([0.99957223, 0.94801553]), array([0.99247618, 0.9725603 ]), array([16364,  4636], dtype=int64))


In [61]:
#Predictions on test set using tuned random forest classifier
predictionsRF2 = modelRF2.predict(X_test)
print('Accuracy score for tuned Random Forest Classifier on test set: %.5f' % accuracy_score(y_test,predictionsRF2))

Accuracy score for tuned Random Forest Classifier on test set: 0.81011


In [62]:
#Improve model accuracy with RFE using tuned random forest classifier
rfecvRF2 = RFECV(estimator = modelRF2, step = 1,cv=5, scoring = "accuracy")
rfecvRF2.fit(X_train,y_train)
print("Optimal number of features : %d" % rfecvRF2.n_features_)

Optimal number of features : 20


In [63]:
#Rankings of features from RFE after tuning
print('Feature ranking: \n', rfecvRF2.ranking_)
#Run predictions with optimal parameters on test set
rfecvRF2.predict(X_test)

Feature ranking: 
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [64]:
#Obtain accuracy of tuned Random forest classifier with validation and test sets after RFE
print('Accuracy score for Random Forest Classifier on test set: %.5f' % accuracy_score(y_test,rfecvRF2.predict(X_test)))

Accuracy score for Random Forest Classifier on test set: 0.81167


In [65]:
#Obtain confusion matrix for base Random Forest Classifier results
cmRF2 = confusion_matrix(y_test,modelRF2.predict(X_test))
print(cmRF2)

[[6616  384]
 [1325  675]]
