In [1]:
%matplotlib inline
import math
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, classification
from sklearn.model_selection import cross_val_score
from utilities import *

#import data and organize table for easier usage
DATA = Path('.').resolve().parent/'data'
bank_df = pd.read_csv(DATA/'UniversalBank.csv')
bank_df.drop(columns = ['ID', 'ZIP Code'], inplace = True)
columns = list(bank_df.columns)
columns.remove('Personal Loan')
columns.append('Personal Loan')
bank_df = bank_df[columns]
bank_df.columns = [s.strip().replace(' ', '_') for s in bank_df.columns]
train_df = bank_df.sample(frac=0.6, random_state=12345)
valid_df = bank_df.drop(train_df.index)
train_df.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities_Account,CD_Account,Online,CreditCard,Personal_Loan
3183,44,17,12,3,0.67,2,0,0,0,1,0,0
1071,39,14,61,3,0.5,3,137,0,0,1,0,0
2640,39,13,81,2,2.8,1,0,0,0,1,0,0
2282,38,14,90,2,2.7,1,0,0,0,1,1,0
1595,56,26,38,3,1.0,3,110,1,0,1,0,0


In [2]:
newCustomer = pd.DataFrame([
    [40, 10, 84, 2, 2, 2, 0, 0, 0, 1, 1],[25, 6, 50, 1, 1.8, 1, 1, 0, 0, 1, 1],
    [59, 30, 120, 3, 1.9, 3, 0, 0, 1, 1, 0]], 
    columns=['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 
                                'Securities_Account', 'CD_Account', 'Online', 'CreditCard'])
newCustomer

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities_Account,CD_Account,Online,CreditCard
0,40,10,84,2,2.0,2,0,0,0,1,1
1,25,6,50,1,1.8,1,1,0,0,1,1
2,59,30,120,3,1.9,3,0,0,1,1,0


# Fit models to the data for (1) k-nearest neighbors with k = 3,

In [3]:
scaler = preprocessing.StandardScaler()
scaler.fit(train_df.iloc[:, :-1]) #preprocess data except response column
trainNorm_df = train_df.copy()
trainNorm_df.iloc[:,:-1] = scaler.transform(train_df.iloc[:,:-1])
validNorm_df = valid_df.copy()
validNorm_df.iloc[:,:-1] = scaler.transform(valid_df.iloc[:,:-1])
newCustomerNorm_df = newCustomer.copy()
#fit using k=3 and predict, create confusion matrix
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(trainNorm_df.iloc[:,:-1], trainNorm_df.loc[:,'Personal_Loan'])
knnPred = knn.predict(validNorm_df.iloc[:,:-1])
knnPredProb = knn.predict_proba(validNorm_df.iloc[:,:-1])
newCustomer['predict_knn'] = knn.predict(newCustomerNorm_df.iloc[:,:])
printConfusionMatrix(validNorm_df.iloc[:,-1:], knnPred,
                     class_names=['Personal_Loan(0)', 'Personal_Loan(1)'])

Confusion Matrix (Accuracy 0.9660)

                Reference
      Prediction Personal_Loan(0) Personal_Loan(1)
Personal_Loan(0)             1792                8
Personal_Loan(1)               60              140


# Fit models to the data for (2) Naive Bayes

In [4]:
# We convert several of the columns to categorical data
bank_cat = bank_df.copy()
for column in ('Personal_Loan', 'Family', 'Online', 'CreditCard', 'Education', 
               'Securities_Account', 'CD_Account'):
    bank_cat[column] = bank_df[column].astype('category')
# The remaining columns (Age, Expereince, Income, Mortgae and CCAvg) will be binned
bank_cat['Age'] = pd.cut(bank_df['Age'], 5, labels=range(1, 6)).astype('category')
bank_cat['Experience'] = pd.cut(bank_df['Experience'], 10, labels=range(1, 11)).astype('category')
bank_cat['Income'] = pd.cut(bank_df['Income'], 5, labels=range(1, 6)).astype('category')
bank_cat['CCAvg'] = pd.cut(bank_df['CCAvg'], 6, labels=range(1, 7)).astype('category')
bank_cat['Mortgage'] = pd.cut(bank_df['Mortgage'], 10, labels=range(1, 11)).astype('category')
newColumns = []
classes = []
for column in bank_cat.columns:
    le = preprocessing.LabelEncoder()
    bank_cat.loc[:, column] = le.fit_transform(bank_cat[column])
    if column == 'Personal_Loan':
        newColumns.append(column)
        classes = le.classes_
    else:
        newColumns.extend('{}_{}'.format(column, c) for c in le.classes_)

# one-hot-encoding of the training data
indVariables = bank_cat.drop(columns=['Personal_Loan'])
depVariables = bank_cat['Personal_Loan']
enc = preprocessing.OneHotEncoder()
bank_cat = pd.concat([pd.DataFrame(enc.fit_transform(indVariables).toarray()), depVariables], axis=1)
bank_cat.columns = newColumns

trainNB = bank_cat.iloc[train_df.index]
validNB = bank_cat.iloc[valid_df.index]

loan_nb = MultinomialNB(alpha=0.01)
loan_nb.fit(trainNB.drop(columns=['Personal_Loan']), trainNB['Personal_Loan'])
nbPred = loan_nb.predict(validNB.drop(columns=['Personal_Loan']))
nbPredProb = loan_nb.predict_proba(validNB.drop(columns=['Personal_Loan']))
printConfusionMatrix(validNB['Personal_Loan'], nbPred, class_names=['Personal_Loan(0)', 'Personal_Loan(1)'])

Confusion Matrix (Accuracy 0.9300)

                Reference
      Prediction Personal_Loan(0) Personal_Loan(1)
Personal_Loan(0)             1732               68
Personal_Loan(1)               72              128


# Fit models to the data for (3) classification trees

In [5]:
trainTree = train_df.copy()
validTree = valid_df.copy()
trainTree.Personal_Loan = train_df.Personal_Loan.astype('category')
validTree.Personal_Loan = valid_df.Personal_Loan.astype('category')
indVars = trainTree.drop(columns=['Personal_Loan'])
depVars = trainTree['Personal_Loan']
results = []
for max_depth in range(1,31):
    treeClassifier = DecisionTreeClassifier(max_depth=max_depth)
    scores = cross_val_score(treeClassifier, indVars, depVars, cv=5)
    treeClassifier.fit(indVars, depVars)
    results.append({'depth': max_depth, 
                    'meanAccuracy': scores.mean(), 'stdAcc': scores.std(),
                    'validAccuracy': accuracy_score(validTree['Personal_Loan'], 
                                                    treeClassifier.predict(validTree.drop(columns=['Personal_Loan'])))})
results_df = pd.DataFrame(results)
print(results_df.head(10))
#the best depth = 5
treeClassifier = DecisionTreeClassifier(max_depth=5)
treeClassifier.fit(indVars, depVars)
treePred = treeClassifier.predict(validTree.drop(columns=['Personal_Loan']))
treePredProb = treeClassifier.predict_proba(validTree.drop(columns=['Personal_Loan']))
newCustomer['predict_tree'] = treeClassifier.predict(newCustomer.iloc[:,:11])
printConfusionMatrix(validTree['Personal_Loan'], treePred, class_names=['Personal_Loan(0)', 'Personal_Loan(1)'])
newCustomer

   depth  meanAccuracy    stdAcc  validAccuracy
0      1      0.906667  0.000000         0.9000
1      2      0.963000  0.008393         0.9620
2      3      0.983333  0.005774         0.9795
3      4      0.983667  0.004397         0.9800
4      5      0.983000  0.004876         0.9835
5      6      0.982667  0.004028         0.9825
6      7      0.981667  0.003162         0.9810
7      8      0.980333  0.004137         0.9790
8      9      0.980667  0.001333         0.9820
9     10      0.980000  0.003944         0.9805
Confusion Matrix (Accuracy 0.9840)

                Reference
      Prediction Personal_Loan(0) Personal_Loan(1)
Personal_Loan(0)             1788               12
Personal_Loan(1)               20              180


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities_Account,CD_Account,Online,CreditCard,predict_knn,predict_tree
0,40,10,84,2,2.0,2,0,0,0,1,1,1,0
1,25,6,50,1,1.8,1,1,0,0,1,1,1,0
2,59,30,120,3,1.9,3,0,0,1,1,0,1,1


# actual outcome, predicted outcome, and probability of being a "1" for each of the three models. 

In [6]:
output_df = pd.concat([pd.DataFrame(valid_df.Personal_Loan.values), pd.DataFrame(knnPred), pd.DataFrame(nbPred), 
                       pd.DataFrame(treePred), pd.DataFrame(knnPredProb[:,1]), pd.DataFrame(nbPredProb[:,1]), 
                       pd.DataFrame(treePredProb[:,1])], axis=1)
output_df.columns = ['Loan_true', 'Loan_knn', 'Loan_nb', 'Loan_tree', 'Loan_knnprob', 'Loan_nbprob', 'Loan_treeprob']
output_df.head(10)

Unnamed: 0,Loan_true,Loan_knn,Loan_nb,Loan_tree,Loan_knnprob,Loan_nbprob,Loan_treeprob
0,0,0,0,0,0.0,4e-06,0.0
1,0,0,0,0,0.0,2e-06,0.0
2,0,0,0,0,0.0,2e-06,0.0
3,0,0,0,0,0.0,0.275161,0.0
4,0,0,0,0,0.0,5e-06,0.0
5,0,0,1,1,0.333333,0.523489,1.0
6,0,0,0,0,0.0,7e-06,0.0
7,0,0,0,0,0.0,0.09256,0.0
8,0,0,0,0,0.0,3e-06,0.0
9,1,1,1,1,1.0,0.652646,1.0


#  (1) a majority vote of predicted outcomes, and (2) the average of the predicted probabilities.

In [7]:
col_list= list(['Loan_knn', 'Loan_nb', 'Loan_tree'])
output_df['sumVote'] = output_df[col_list].sum(axis = 1)
output_df['Loan_vote'] = [1 if x >= 2 else 0 for x in output_df['sumVote']]
output_df.drop(columns = ['sumVote'], inplace = True)
col_list= list(['Loan_knnprob', 'Loan_nbprob', 'Loan_treeprob'])
output_df['avgProb'] = output_df[col_list].sum(axis = 1)/3
output_df['Loan_avgprob'] = [1 if x > 0.5 else 0 for x in output_df['avgProb']]
output_df.drop(columns = ['avgProb'], inplace = True)
output_df.head(10)

Unnamed: 0,Loan_true,Loan_knn,Loan_nb,Loan_tree,Loan_knnprob,Loan_nbprob,Loan_treeprob,Loan_vote,Loan_avgprob
0,0,0,0,0,0.0,4e-06,0.0,0,0
1,0,0,0,0,0.0,2e-06,0.0,0,0
2,0,0,0,0,0.0,2e-06,0.0,0,0
3,0,0,0,0,0.0,0.275161,0.0,0,0
4,0,0,0,0,0.0,5e-06,0.0,0,0
5,0,0,1,1,0.333333,0.523489,1.0,1,1
6,0,0,0,0,0.0,7e-06,0.0,0,0
7,0,0,0,0,0.0,0.09256,0.0,0,0
8,0,0,0,0,0.0,3e-06,0.0,0,0
9,1,1,1,1,1.0,0.652646,1.0,1,1


# confusion matrix for ensemble methods and report the overall accuracy

In [8]:
printConfusionMatrix(output_df['Loan_true'], output_df['Loan_vote'], class_names=['Personal_Loan(0)', 'Personal_Loan(1)'])
printConfusionMatrix(output_df['Loan_true'], output_df['Loan_avgprob'], class_names=['Personal_Loan(0)', 'Personal_Loan(1)'])

Confusion Matrix (Accuracy 0.9740)

                Reference
      Prediction Personal_Loan(0) Personal_Loan(1)
Personal_Loan(0)             1789               11
Personal_Loan(1)               41              159
Confusion Matrix (Accuracy 0.9780)

                Reference
      Prediction Personal_Loan(0) Personal_Loan(1)
Personal_Loan(0)             1792                8
Personal_Loan(1)               36              164


# the error rates for the three individual methods and the two ensemble methods

In [9]:
error_knn = 1 - classification.accuracy_score(output_df['Loan_true'], output_df['Loan_knn'])
error_nb = 1 - classification.accuracy_score(output_df['Loan_true'], output_df['Loan_nb'])
error_tree = 1 - classification.accuracy_score(output_df['Loan_true'], output_df['Loan_tree'])
error_vote = 1 - classification.accuracy_score(output_df['Loan_true'], output_df['Loan_vote'])
error_avgprob = 1 - classification.accuracy_score(output_df['Loan_true'], output_df['Loan_avgprob'])
print("Error Rate for KNN model = %0.4f" %error_knn)
print("Error Rate for Naive Bayes model = %0.4f" %error_nb)
print("Error Rate for Classification Tree model = %0.4f" %error_tree)
print("Error Rate for Majority Vote model = %0.4f" %error_vote)
print("Error Rate for average probability model = %0.4f" %error_avgprob)

Error Rate for KNN model = 0.0340
Error Rate for Naive Bayes model = 0.0700
Error Rate for Classification Tree model = 0.0160
Error Rate for Majority Vote model = 0.0260
Error Rate for average probability model = 0.0220


#  Bagging and Boosted Trees, accuracy can see boost tree has very good performance

In [14]:
bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=5), 
                            max_samples=0.5, max_features=0.5)
bagging.fit(train_df.drop(columns=['Personal_Loan']), train_df['Personal_Loan'])
baggingPred = bagging.predict(valid_df.drop(columns=['Personal_Loan']))
newCustomer['predict_bagging'] = bagging.predict(newCustomer.iloc[:,:11])
printConfusionMatrix(valid_df['Personal_Loan'], baggingPred, class_names=['Personal_Loan(0)', 'Personal_Loan(1)'])
newCustomer

Confusion Matrix (Accuracy 0.9515)

                Reference
      Prediction Personal_Loan(0) Personal_Loan(1)
Personal_Loan(0)             1799                1
Personal_Loan(1)               96              104


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities_Account,CD_Account,Online,CreditCard,predict_knn,predict_tree,predict_bagging,predict_boost
0,40,10,84,2,2.0,2,0,0,0,1,1,1,0,0,0
1,25,6,50,1,1.8,1,1,0,0,1,1,1,0,0,0
2,59,30,120,3,1.9,3,0,0,1,1,0,1,1,1,1


In [11]:
boost = AdaBoostClassifier(n_estimators=100, base_estimator=DecisionTreeClassifier(max_depth=5))
boost.fit(train_df.drop(columns=['Personal_Loan']), train_df['Personal_Loan'])
boostPred = boost.predict(valid_df.drop(columns=['Personal_Loan']))
newCustomer['predict_boost'] = boost.predict(newCustomer.iloc[:,:11])
printConfusionMatrix(valid_df['Personal_Loan'], boostPred, class_names=['Personal_Loan(0)', 'Personal_Loan(1)'])
newCustomer

Confusion Matrix (Accuracy 0.9860)

                Reference
      Prediction Personal_Loan(0) Personal_Loan(1)
Personal_Loan(0)             1797                3
Personal_Loan(1)               25              175


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities_Account,CD_Account,Online,CreditCard,predict_knn,predict_tree,predict_bagging,predict_boost
0,40,10,84,2,2.0,2,0,0,0,1,1,1,0,0,0
1,25,6,50,1,1.8,1,1,0,0,1,1,1,0,0,0
2,59,30,120,3,1.9,3,0,0,1,1,0,1,1,1,1
