In [1]:
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
import sklearn

from itertools import *
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from sklearn import linear_model
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('bank.csv', sep = ';')

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [4]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [5]:
df_dummies = pd.DataFrame(pd.get_dummies(df['job']))

In [6]:
df_dummies.head()

Unnamed: 0,admin.,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,unknown
0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0


In [7]:
def build_dummies(df, list_of_cate):
    
    dummies = pd.DataFrame()
    for col in list_of_cate:
        dummy = pd.get_dummies(df[col]).astype(np.int64).drop(pd.get_dummies(df[col]).columns[0], axis = 1)

        dummies = pd.concat([dummies, dummy], axis = 1)
    
    pro_df = df.drop(list_of_cate, axis = 1)
    new_df = pd.concat([pro_df, dummies], axis = 1)
    
    return new_df

def turn_binary(df, list_of_bi):
    
    new_df = df
    for col in list_of_bi:
        new_df[col] = (new_df[col] == 'yes')*1

#    print(new_df)
#    return new_df
        

In [8]:
cate_list = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
bi_list = ['default', 'housing', 'loan', 'y']
df_dum = build_dummies(df, cate_list)
turn_binary(df_dum, bi_list)

In [9]:
df_dum.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,unknown,married,single,secondary,tertiary,unknown.1,telephone,unknown.2,aug,dec,feb,jan,jul,jun,mar,may,nov,oct,sep,other,success,unknown.3
0,30,0,1787,0,0,19,79,1,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
1,33,0,4789,1,1,11,220,1,339,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,35,0,1350,1,0,16,185,1,330,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,30,0,1476,1,1,3,199,4,-1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
4,59,0,0,1,0,5,226,1,-1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [10]:
df_dum.dtypes

age              int64
default          int32
balance          int64
housing          int32
loan             int32
day              int64
duration         int64
campaign         int64
pdays            int64
previous         int64
y                int32
blue-collar      int64
entrepreneur     int64
housemaid        int64
management       int64
retired          int64
self-employed    int64
services         int64
student          int64
technician       int64
unemployed       int64
unknown          int64
married          int64
single           int64
secondary        int64
tertiary         int64
unknown          int64
telephone        int64
unknown          int64
aug              int64
dec              int64
feb              int64
jan              int64
jul              int64
jun              int64
mar              int64
may              int64
nov              int64
oct              int64
sep              int64
other            int64
success          int64
unknown          int64
dtype: obje

In [11]:
def pd_std(df, list_std):
    for col in list_std:
        df[col] = ((df[col] - np.mean(df[col])) / np.std(df[col]))

In [12]:
numerical_list = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
pd_std(df_dum, numerical_list)

In [13]:
df_dum.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,unknown,married,single,secondary,tertiary,unknown.1,telephone,unknown.2,aug,dec,feb,jan,jul,jun,mar,may,nov,oct,sep,other,success,unknown.3
0,-1.05627,0,0.121072,0,0,0.374052,-0.711861,-0.576829,-0.407218,-0.320413,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
1,-0.772583,0,1.118644,1,1,-0.596026,-0.169194,-0.576829,2.989044,2.041734,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,-0.583458,0,-0.024144,1,0,0.010273,-0.303898,-0.576829,2.899143,0.270124,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,-1.05627,0,0.017726,1,1,-1.566105,-0.250017,0.387967,-0.407218,-0.320413,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
4,1.686036,0,-0.472753,1,0,-1.323585,-0.146102,-0.576829,-0.407218,-0.320413,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [14]:
from sklearn.metrics import accuracy_score
train, test = train_test_split(range(df_dum.shape[0]), train_size = 0.7, random_state = 66)
mask = np.zeros(df_dum.shape[0])
mask[train] = 1
mask = (mask == 1)

In [15]:
def do_classify(clf, parameters, df, target, mask):

    X =  df.drop([target], axis = 1).values
    y = df[target].values
    
    clf.fit(X[mask], y[mask])
    
    ypred = clf.predict(X[~mask])
    cm = confusion_matrix(y[~mask], ypred)
    
    accuracy_train = accuracy_score(y[mask], clf.predict(X[mask]))
    accuracy_test = accuracy_score(y[~mask], ypred)

    # Recall is true positive rate (recall = TP / (OP + TP))
    recall = cm[1,1] / (cm[1,1]+cm[1,0])

    print('Confusion Matrix: \n', cm)
    print('=============================')
    print('Training Accuracy is: %0.2f%%' % (accuracy_train*100))
    print('Testing Accuracy is: %0.2f%%' % (accuracy_test*100))
    print('=============================')
    print('Recall is: %0.2f%%' % (recall*100))

In [16]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf = LogisticRegression()
do_classify(clf, None, df_dum, 'y', mask)

Confusion Matrix: 
 [[1172   25]
 [ 115   45]]
Training Accuracy is: 90.71%
Testing Accuracy is: 89.68%
Recall is: 28.12%


### SVM (Linear / RBF Kernel)

In [17]:
# Linear Kernel
clf = sklearn.svm.SVC(kernel='linear')
do_classify(clf, None, df_dum, 'y', mask)

Confusion Matrix: 
 [[1177   20]
 [ 135   25]]
Training Accuracy is: 89.60%
Testing Accuracy is: 88.58%
Recall is: 15.62%


In [18]:
# RBF Kernel
clf = sklearn.svm.SVC(kernel='rbf')
do_classify(clf, None, df_dum, 'y', mask)

Confusion Matrix: 
 [[1185   12]
 [ 137   23]]
Training Accuracy is: 90.27%
Testing Accuracy is: 89.02%
Recall is: 14.37%


In [19]:
# Try out C = 0.05, C = 100 for linear Kernel
clf = sklearn.svm.SVC(kernel = 'linear', C = 0.05)
do_classify(clf, None, df_dum, 'y', mask)

Confusion Matrix: 
 [[1177   20]
 [ 135   25]]
Training Accuracy is: 89.60%
Testing Accuracy is: 88.58%
Recall is: 15.62%


In [20]:
# Try out C = 0.05, C = 100 for linear Kernel
clf = sklearn.svm.SVC(kernel = 'linear', C = 50)
do_classify(clf, None, df_dum, 'y', mask)

Confusion Matrix: 
 [[1177   20]
 [ 135   25]]
Training Accuracy is: 89.60%
Testing Accuracy is: 88.58%
Recall is: 15.62%


In [23]:
# Try out C = 0.05 / gamma = 0.1
clf = sklearn.svm.SVC(kernel = 'rbf', C = 1, gamma = 0.1)
do_classify(clf, None, df_dum, 'y', mask)

Confusion Matrix: 
 [[1181   16]
 [ 129   31]]
Training Accuracy is: 92.32%
Testing Accuracy is: 89.31%
Recall is: 19.38%


In [22]:
# Try out C = 50 / gamma = 10
clf = sklearn.svm.SVC(kernel = 'rbf', C = 50, gamma = 10)
do_classify(clf, None, df_dum, 'y', mask)

Confusion Matrix: 
 [[1197    0]
 [ 160    0]]
Training Accuracy is: 100.00%
Testing Accuracy is: 88.21%
Recall is: 0.00%


## Conclusion

When we set our parameter C to a high value, we are less tolerant in terms of the error of the margin. This means we try our best to find a line which have least misclassification rate, which may cause overfitting. Similarily, having higher gamma value also will make the model prone to the trap of overfitting.