### Imports

In [90]:
#numpy, pandas, scipy, math, matplotlib, time
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt
from time import time

#preprocessing/feature selection
from sklearn.feature_selection import VarianceThreshold

#estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

#model metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

#cross validation/tuning
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Set random seed
# Seeds NumPy's legacy global generator; the split and the estimators that accept it
# are additionally given an explicit random_state=0 further down.
np.random.seed(0)

### Data Import

In [91]:
#data
# Read cleandata.csv (path is relative to the working directory) into a DataFrame
# and preview the first rows.
rawData = pd.read_csv('cleandata.csv')
rawData.head()

Unnamed: 0,ID,limit,sex,education,marriage,age,RepayStatSep2005,RepayStatAug2005,RepayStatJul2005,RepayStatJun2005,...,AmtBillStmtMay2005,AmtBillStmtApr2005,AmtPrevPaySep2005,AmtPrevPayAug2005,AmtPrevPayJul2005,AmtPrevPayJun2005,AmtPrevPayMay2005,AmtPrevPayApr2005,default,AgeBracket
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,689,0,0,0,0,1,18-24
1,2,120000,2,2,2,26,-1,2,0,0,...,3455,3261,0,1000,1000,1000,0,2000,1,25-34
2,3,90000,2,2,2,34,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,0,25-34
3,4,50000,2,2,1,37,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,0,35-44
4,5,50000,1,2,1,57,-1,0,-1,0,...,19146,19131,2000,36681,10000,9000,689,679,0,55+


### Data Preparation

In [140]:
#Export the correlation and covariance matrices to CSV so they can be checked later if overfitting is suspected
#corrMat = rawData.corr()
#corrMat.to_csv('corrMat.csv')

In [10]:
#covMat = rdyData.cov()
#covMat.to_csv('covMat.csv')

In [92]:
#Convert AgeBracket from categorical data type to binary columns
# get_dummies creates one indicator column per distinct AgeBracket value.
dummy = pd.get_dummies(rawData['AgeBracket'])
dummy.head()

Unnamed: 0,18-24,25-34,35-44,45-54,55+
0,1,0,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,0,1,0,0
4,0,0,0,0,1


In [93]:
#Merge new binary columns to dataset
# The AgeBracket indicator columns are bound to `dummy`; the name `dummyAge` that was
# used here is never defined in this notebook, so the cell failed with NameError on a
# fresh kernel. Indicator columns are placed first, followed by the original columns.
rdyData = pd.concat([dummy, rawData], axis=1)
rdyData.head()

Unnamed: 0,18-24,25-34,35-44,45-54,55+,ID,limit,sex,education,marriage,...,AmtBillStmtMay2005,AmtBillStmtApr2005,AmtPrevPaySep2005,AmtPrevPayAug2005,AmtPrevPayJul2005,AmtPrevPayJun2005,AmtPrevPayMay2005,AmtPrevPayApr2005,default,AgeBracket
0,1,0,0,0,0,1,20000,2,2,1,...,0,0,0,689,0,0,0,0,1,18-24
1,0,1,0,0,0,2,120000,2,2,2,...,3455,3261,0,1000,1000,1000,0,2000,1,25-34
2,0,1,0,0,0,3,90000,2,2,2,...,14948,15549,1518,1500,1000,1000,1000,5000,0,25-34
3,0,0,1,0,0,4,50000,2,2,1,...,28959,29547,2000,2019,1200,1100,1069,1000,0,35-44
4,0,0,0,0,1,5,50000,1,2,1,...,19146,19131,2000,36681,10000,9000,689,679,0,55+


In [94]:
#Remove original AgeBracket column and ID column as they are not needed for analysis
# Rebind to a new frame instead of dropping in place, and ignore columns that are
# already gone, so re-running this cell does not raise KeyError.
rdyData = rdyData.drop(columns=['ID', 'AgeBracket'], errors='ignore')
rdyData.head()

Unnamed: 0,18-24,25-34,35-44,45-54,55+,limit,sex,education,marriage,age,...,AmtBillStmtJun2005,AmtBillStmtMay2005,AmtBillStmtApr2005,AmtPrevPaySep2005,AmtPrevPayAug2005,AmtPrevPayJul2005,AmtPrevPayJun2005,AmtPrevPayMay2005,AmtPrevPayApr2005,default
0,1,0,0,0,0,20000,2,2,1,24,...,0,0,0,0,689,0,0,0,0,1
1,0,1,0,0,0,120000,2,2,2,26,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,0,1,0,0,0,90000,2,2,2,34,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,0,0,1,0,0,50000,2,2,1,37,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,0,0,0,0,1,50000,1,2,1,57,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [95]:
# Check column dtypes and non-null counts of the prepared frame before modelling.
rdyData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27963 entries, 0 to 27962
Data columns (total 29 columns):
18-24                 27963 non-null uint8
25-34                 27963 non-null uint8
35-44                 27963 non-null uint8
45-54                 27963 non-null uint8
55+                   27963 non-null uint8
limit                 27963 non-null int64
sex                   27963 non-null int64
education             27963 non-null int64
marriage              27963 non-null int64
age                   27963 non-null int64
RepayStatSep2005      27963 non-null int64
RepayStatAug2005      27963 non-null int64
RepayStatJul2005      27963 non-null int64
RepayStatJun2005      27963 non-null int64
RepayStatMay2005      27963 non-null int64
RepayStatApr2005      27963 non-null int64
AmtBillStmtSep2005    27963 non-null int64
AmtBillStmtAug2005    27963 non-null int64
AmtBillStmtJul2005    27963 non-null int64
AmtBillStmtJun2005    27963 non-null int64
AmtBillStmtMay2005    27963 non

### Select Features

In [96]:
#features
# Take every column except the target, selected by name. A positional slice (0:28)
# silently picks the wrong columns if the number or order of columns in rdyData
# changes, whereas drop() returns a new frame that does not alias rdyData.
features = rdyData.drop(columns=['default'])
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,18-24,25-34,35-44,45-54,55+,limit,sex,education,marriage,age,...,AmtBillStmtJul2005,AmtBillStmtJun2005,AmtBillStmtMay2005,AmtBillStmtApr2005,AmtPrevPaySep2005,AmtPrevPayAug2005,AmtPrevPayJul2005,AmtPrevPayJun2005,AmtPrevPayMay2005,AmtPrevPayApr2005
0,1,0,0,0,0,20000,2,2,1,24,...,689,0,0,0,0,689,0,0,0,0
1,0,1,0,0,0,120000,2,2,2,26,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,0,1,0,0,0,90000,2,2,2,34,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,0,0,1,0,0,50000,2,2,1,37,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,0,0,0,0,1,50000,1,2,1,57,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


### Select Dependent Variable

In [97]:
#dependent variable
# The 'default' column is the classification target for every model below.
depVar = rdyData['default']

### Split Train/Test data at a 70/30 ratio

In [98]:
# Hold out 30% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    depVar,
    test_size=.30,
    random_state=0,
)
# Report the shape of each partition.
print(
    " X_train", X_train.shape, '\n',
    "X_test", X_test.shape, '\n',
    "y_train", y_train.shape, '\n',
    "y_test", y_test.shape,
)

 X_train (19574, 28) 
 X_test (8389, 28) 
 y_train (19574,) 
 y_test (8389,)


### Build Models

In [99]:
# Baseline estimators built with library-default hyperparameters; random_state=0 is
# passed to the three constructors that are given one here.
modelRF = RandomForestClassifier( random_state=0)
modelSV = SVC(random_state=0)
modelKN = KNeighborsClassifier()
modelGB = GradientBoostingClassifier(random_state=0)

### Feature Selection

In [100]:
#Train Random Forest model
# Fitted on the full candidate feature set so its feature importances can be ranked next.
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [101]:
#Rank features by importance
importances = modelRF.feature_importances_
# Spread of each feature's importance across the individual trees of the forest.
std = np.std(
    [tree.feature_importances_ for tree in modelRF.estimators_],
    axis=0,
)
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for rank, position in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, position, importances[position]))

Feature ranking:
1. feature 10 (0.104926)
2. feature 11 (0.076693)
3. feature 16 (0.060433)
4. feature 9 (0.051619)
5. feature 5 (0.051435)
6. feature 19 (0.051402)
7. feature 17 (0.051196)
8. feature 18 (0.050530)
9. feature 23 (0.049882)
10. feature 21 (0.048723)
11. feature 22 (0.046532)
12. feature 20 (0.046145)
13. feature 26 (0.044401)
14. feature 24 (0.042923)
15. feature 27 (0.042493)
16. feature 25 (0.041529)
17. feature 12 (0.024267)
18. feature 13 (0.020538)
19. feature 7 (0.018575)
20. feature 15 (0.014791)
21. feature 14 (0.012618)
22. feature 8 (0.012389)
23. feature 6 (0.009612)
24. feature 2 (0.007791)
25. feature 1 (0.006960)
26. feature 3 (0.005174)
27. feature 0 (0.003704)
28. feature 4 (0.002721)


In [102]:
#Remove features with < 4% importance
# Column positions refer to the 28 feature columns that precede the target in rdyData.
cols = [0,1,2,3,4,12,13,7,15,14,8,6]
# Rebuild the reduced frame from rdyData rather than dropping in place: an in-place
# positional drop removes a different set of columns every time the cell is re-run
# and can raise SettingWithCopyWarning because `features` was sliced from rdyData.
features = rdyData.iloc[:, 0:28].drop(columns=rdyData.columns[cols])
features.head()

Unnamed: 0,limit,age,RepayStatSep2005,RepayStatAug2005,AmtBillStmtSep2005,AmtBillStmtAug2005,AmtBillStmtJul2005,AmtBillStmtJun2005,AmtBillStmtMay2005,AmtBillStmtApr2005,AmtPrevPaySep2005,AmtPrevPayAug2005,AmtPrevPayJul2005,AmtPrevPayJun2005,AmtPrevPayMay2005,AmtPrevPayApr2005
0,20000,24,2,2,3913,3102,689,0,0,0,0,689,0,0,0,0
1,120000,26,-1,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,90000,34,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,50000,37,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,50000,57,-1,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [103]:
#Reassign train and test data with features removed
# Same test_size and random_state as the first split, now on the reduced feature frame.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    depVar,
    test_size=.30,
    random_state=0,
)
print(
    " X_train", X_train.shape, '\n',
    "X_test", X_test.shape, '\n',
    "y_train", y_train.shape, '\n',
    "y_test", y_test.shape,
)

 X_train (19574, 16) 
 X_test (8389, 16) 
 y_train (19574,) 
 y_test (8389,)


### Train and score all models with selected features

In [104]:
#Random Forest
# Refit the baseline forest on the reduced feature set.
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [105]:
# Cross-validated scores on the training partition (one value per fold).
print(cross_val_score(modelRF, X_train, y_train))
# Score of the fitted model on the data it was trained on; a large gap to the fold
# scores above points to overfitting.
modelRF.score(X_train,y_train)

[0.81394636 0.81318008 0.81391784]


0.9799734341473383

In [106]:
#Support Vector Classification
# NOTE(review): the features are passed in unscaled; SVC is usually sensitive to
# feature scale, so standardising first may be worth trying.
modelSV.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [107]:
# Cross-validated scores on the training partition (one value per fold).
print(cross_val_score(modelSV, X_train, y_train))
# Score of the fitted model on the data it was trained on; compare with the fold scores above.
modelSV.score(X_train,y_train)

[0.78344828 0.78421456 0.78372164]


0.9979564728721774

In [108]:
#K Nearest Neighbor
# Fit the baseline k-NN classifier on the reduced feature set.
modelKN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [109]:
# Cross-validated scores on the training partition (one value per fold).
print(cross_val_score(modelKN, X_train, y_train))
# Score of the fitted model on the data it was trained on; compare with the fold scores above.
modelKN.score(X_train,y_train)

[0.75739464 0.75632184 0.75337216]


0.8210381117809339

In [110]:
#Gradient Boosting Classifier
# Fit the baseline gradient-boosting model on the reduced feature set.
modelGB.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [111]:
# Cross-validated scores on the training partition (one value per fold).
print(cross_val_score(modelGB, X_train, y_train))
# Score of the fitted model on the data it was trained on; compare with the fold scores above.
modelGB.score(X_train,y_train)

[0.82697318 0.82574713 0.82633354]


0.833963420864412

### Initial Predictions

In [112]:
#Random Forest
# Predict the held-out rows, then report accuracy and Cohen's kappa against y_test.
RFpred = modelRF.predict(X_test)
print(
    "Accuracy: ", accuracy_score(y_test, RFpred), "\n",
    "  Kappa: ", cohen_kappa_score(y_test, RFpred),
)

Accuracy:  0.8102276791035881 
   Kappa:  0.32361678690716456


In [113]:
#Support Vector Classification
# Predict the held-out rows, then report accuracy and Cohen's kappa against y_test.
SVpred = modelSV.predict(X_test)
print(
    "Accuracy: ", accuracy_score(y_test, SVpred), "\n",
    "  Kappa: ", cohen_kappa_score(y_test, SVpred),
)

Accuracy:  0.7884133984980332 
   Kappa:  0.015170910677263727


In [114]:
#K Nearest Neighbor
# Predict the held-out rows, then report accuracy and Cohen's kappa against y_test.
KNpred = modelKN.predict(X_test)
print(
    "Accuracy: ", accuracy_score(y_test, KNpred), "\n",
    "  Kappa: ", cohen_kappa_score(y_test, KNpred),
)

Accuracy:  0.7601621170580523 
   Kappa:  0.12860619267087592


In [115]:
#Gradient Boosted Classification
# Predict the held-out rows, then report accuracy and Cohen's kappa against y_test.
GBpred = modelGB.predict(X_test)
print(
    "Accuracy: ", accuracy_score(y_test, GBpred), "\n",
    "  Kappa: ", cohen_kappa_score(y_test, GBpred),
)

Accuracy:  0.8275122183812135 
   Kappa:  0.39798876019948737


### Tuning Models

In [116]:
# List the Random Forest's current hyperparameters to pick candidates for tuning.
modelRF.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [118]:
# Random Forest search space: 3 * 4 * 3 * 3 * 2 * 2 = 432 parameter combinations.
RF_grid = {
    "max_depth": [1, 5, None],
    "max_features": [1, 5, 10, 16],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

In [119]:
# Exhaustive search over RF_grid, fitted on the training partition only.
# `grid_search` is rebound by each later search, so read its results before moving on.
grid_search = GridSearchCV(modelRF, param_grid=RF_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 5, None], 'max_features': [1, 5, 10, 16], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 5, 10], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [120]:
# Best parameter combination found by the most recently fitted search (Random Forest here).
grid_search.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [121]:
# Cross-validated score achieved by that best combination.
grid_search.best_score_

0.8268110759170328

In [34]:
# List the SVC's current hyperparameters to pick candidates for tuning.
modelSV.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 0,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [35]:
# SVC search space: only the regularisation parameter C is varied (3 candidates).
SV_grid = dict(C=[1, 2, 3])

In [36]:
# Exhaustive search over SV_grid, fitted on the training partition only.
# This rebinds `grid_search`, replacing the result of any search run before it.
grid_search = GridSearchCV(modelSV, param_grid=SV_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1, param_grid={'C': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [37]:
# Best parameter combination found by the most recently fitted search (SVC here).
grid_search.best_params_

{'C': 1}

In [38]:
# Cross-validated score achieved by that best combination.
grid_search.best_score_

0.7837948298763666

In [122]:
# List the k-NN classifier's current hyperparameters to pick candidates for tuning.
modelKN.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [123]:
# k-NN search space: 3 neighbourhood sizes * 2 weighting schemes = 6 combinations.
KN_grid = {
    "n_neighbors": [1, 10, 100],
    "weights": ['uniform', 'distance'],
}

In [124]:
# Exhaustive search over KN_grid, fitted on the training partition only.
# This rebinds `grid_search`, replacing the result of any search run before it.
grid_search = GridSearchCV(modelKN, param_grid=KN_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 10, 100], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [125]:
# Best parameter combination found by the most recently fitted search (k-NN here).
grid_search.best_params_

{'n_neighbors': 100, 'weights': 'distance'}

In [126]:
# Cross-validated score achieved by that best combination.
grid_search.best_score_

0.7852252988658425

In [127]:
# List the gradient-boosting model's current hyperparameters to pick candidates for tuning.
modelGB.get_params()

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': 0,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

In [128]:
# Gradient-boosting search space: 2 * 4 * 4 = 32 parameter combinations.
GB_grid = {
    "loss": ['deviance', 'exponential'],
    "max_depth": [1, 2, 3, 4],
    "max_features": [1, 5, 10, 16],
}

In [129]:
# Exhaustive search over GB_grid, fitted on the training partition only.
# This rebinds `grid_search`, replacing the result of any search run before it.
grid_search = GridSearchCV(modelGB, param_grid=GB_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ['deviance', 'exponential'], 'max_depth': [1, 2, 3, 4], 'max_features': [1, 5, 10, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [130]:
# Best parameter combination found by the most recently fitted search (gradient boosting here).
grid_search.best_params_

{'loss': 'exponential', 'max_depth': 4, 'max_features': 16}

In [131]:
# Cross-validated score achieved by that best combination.
grid_search.best_score_

0.828088280371922

### Rebuild models with best tuning results and rescore

In [132]:
# Re-create every estimator with the hyperparameter values written out explicitly;
# these replace the baseline objects bound to the same names.
modelRF = RandomForestClassifier(
    bootstrap=False,
    criterion='entropy',
    max_features=10,
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=0,
)
modelSV = SVC(C=1, random_state=0)
modelKN = KNeighborsClassifier(n_neighbors=100, weights='distance')
modelGB = GradientBoostingClassifier(
    loss='exponential',
    max_depth=4,
    max_features=16,
    random_state=0,
)

In [133]:
#Train Random Forest model
# Fit the current modelRF, print its cross-validated fold scores, then show its
# score on the training data for comparison.
modelRF.fit(X_train,y_train)
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.82789272 0.8251341  0.8274065 ]


0.8310513947072647

In [134]:
#Random Forest score
# Predict the held-out rows, then report accuracy and Cohen's kappa against y_test.
RFpred = modelRF.predict(X_test)
print(
    "Accuracy: ", accuracy_score(y_test, RFpred), "\n",
    "  Kappa: ", cohen_kappa_score(y_test, RFpred),
)

Accuracy:  0.8275122183812135 
   Kappa:  0.3847721453119455


In [47]:
#ReTrain SVC model
# Fit the current modelSV, print its cross-validated fold scores, then show its
# score on the training data for comparison.
modelSV.fit(X_train,y_train)
print(cross_val_score(modelSV, X_train, y_train))
modelSV.score(X_train,y_train)

[0.78344828 0.78421456 0.78372164]


0.9979564728721774

In [48]:
#SVC score
# Predict the held-out rows, then report accuracy and Cohen's kappa against y_test.
SVpred = modelSV.predict(X_test)
print(
    "Accuracy: ", accuracy_score(y_test, SVpred), "\n",
    "  Kappa: ", cohen_kappa_score(y_test, SVpred),
)

Accuracy:  0.7884133984980332 
   Kappa:  0.015170910677263727


In [135]:
#ReTrain KNN model
# Fit the current modelKN, print its cross-validated fold scores, then show its
# score on the training data for comparison.
# NOTE(review): if modelKN uses weights='distance', each training row is its own
# zero-distance neighbour, so a near-perfect training score is expected and says
# little about generalisation - rely on the fold scores instead.
modelKN.fit(X_train,y_train)
print(cross_val_score(modelKN, X_train, y_train))
modelKN.score(X_train,y_train)

[0.78390805 0.78544061 0.78632741]


0.9994891182180443

In [136]:
#KNN score
# Predict the held-out rows, then report accuracy and Cohen's kappa against y_test.
KNpred = modelKN.predict(X_test)
print(
    "Accuracy: ", accuracy_score(y_test, KNpred), "\n",
    "  Kappa: ", cohen_kappa_score(y_test, KNpred),
)

Accuracy:  0.7903206580045298 
   Kappa:  0.07860244176190234


In [137]:
#ReTrain Gradient Boosted Classification
# Fit the current modelGB, print its cross-validated fold scores, then show its
# score on the training data for comparison.
modelGB.fit(X_train,y_train)
print(cross_val_score(modelGB, X_train, y_train))
modelGB.score(X_train,y_train)

[0.82850575 0.82819923 0.82755978]


0.8387146214365996

In [138]:
#GB score
# Predict the held-out rows, then report accuracy and Cohen's kappa against y_test.
GBpred = modelGB.predict(X_test)
print(
    "Accuracy: ", accuracy_score(y_test, GBpred), "\n",
    "  Kappa: ", cohen_kappa_score(y_test, GBpred),
)

Accuracy:  0.8263201811896531 
   Kappa:  0.3903357924028632


### Summary of Findings

Best model Gradient Boosted Classifier with:  
 • Accuracy = 82.6%  
 • Kappa     = 39.0%  

Most important prediction features:  
 • 3 most recent payment status  
 • 3 most recent amount previous paid  
 • Credit limit  