In [8]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt


data=pd.read_csv('bank.csv', header=0, delimiter=';') 
data.head()
data.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
1165,34,blue-collar,married,secondary,no,1897,yes,yes,cellular,19,nov,441,1,-1,0,unknown,no
1891,32,management,single,tertiary,no,656,yes,yes,cellular,20,apr,148,2,-1,0,unknown,no
1175,60,housemaid,married,primary,no,517,no,no,unknown,12,jun,1178,3,-1,0,unknown,no
398,38,blue-collar,married,primary,no,168,yes,yes,telephone,13,may,212,3,-1,0,unknown,no
2345,26,services,single,secondary,no,512,yes,yes,cellular,30,jan,206,5,239,3,failure,no


In [7]:
# Dealing with categorical data
"""First way is called Label Encoding: convert each value in a column to a number between 0 and (number of levels - 1)."""
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data['education'] = encoder.fit_transform(data['education'])
data['job'] = encoder.fit_transform(data['job'])
data.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,education_code
4450,53,1,married,0,no,3316,yes,no,unknown,23,may,331,6,-1,0,unknown,no,0
1615,34,4,married,2,no,417,yes,no,cellular,18,nov,138,1,-1,0,unknown,no,2
1503,56,5,married,1,no,-113,yes,yes,cellular,12,aug,614,2,-1,0,unknown,no,1
4328,49,0,married,1,no,3608,yes,no,unknown,13,nov,231,1,-1,0,unknown,yes,1
3752,47,7,married,1,no,0,yes,no,telephone,9,jul,250,1,-1,0,unknown,no,1


In [9]:
"""Second method is called One-Hot encoding, creating 0-1 dummy variable for each """

copy_data = data.copy()
copy_data = pd.get_dummies(copy_data, columns=['education'], prefix = ['education'])
copy_data = pd.get_dummies(copy_data, columns=['job'], prefix = ['job'])
copy_data.head()



Unnamed: 0,age,marital,default,balance,housing,loan,contact,day,month,duration,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,30,married,no,1787,no,no,cellular,19,oct,79,...,0,0,0,0,0,0,0,0,1,0
1,33,married,no,4789,yes,yes,cellular,11,may,220,...,0,0,0,0,0,1,0,0,0,0
2,35,single,no,1350,yes,no,cellular,16,apr,185,...,0,0,1,0,0,0,0,0,0,0
3,30,married,no,1476,yes,yes,unknown,3,jun,199,...,0,0,1,0,0,0,0,0,0,0
4,59,married,no,0,yes,no,unknown,5,may,226,...,0,0,0,0,0,0,0,0,0,0


In [11]:
y=data.iloc[:,-1]
#y=data['y']
n=data['education'].value_counts().count()+ data['job'].value_counts().count()
print(n)
X=copy_data.iloc[:,-n:]  # Just picking the dummy variables, the last 16 columns
X = pd.concat([X, copy_data.iloc[:,[3,9]]], axis =1)
X.sample(10)
#y.head()

16


Unnamed: 0,education_primary,education_secondary,education_tertiary,education_unknown,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,balance,duration
1584,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3516,447
2509,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,232
2872,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,308,114
3334,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,23,344
1387,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,553,113
2098,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,32,476
96,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,5426,149
1293,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,743,36
4073,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,176,8
3004,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,7,224


In [12]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))


Accuracy of logistic regression classifier on test set: 0.89




In [49]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[783  16]
 [ 90  16]]


In [50]:
logreg.coef_

array([[-6.68721661e-01, -6.66232908e-01, -3.56380177e-01,
        -8.25536548e-01, -1.33865026e-01, -8.65143462e-01,
        -6.31104649e-01, -3.22024071e-01, -3.76764012e-01,
         6.64689115e-01, -4.83376063e-01, -4.84201033e-01,
         4.40546346e-01, -2.34847875e-01, -3.85871772e-01,
         2.95091210e-01,  3.12490725e-06,  3.97539034e-03]])

In [51]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
# macro average and weighted average


             precision    recall  f1-score   support

         no       0.90      0.98      0.94       799
        yes       0.50      0.15      0.23       106

avg / total       0.85      0.88      0.85       905



In [52]:
# To see whether or not the classes are balanced/imbalanced: In this case is highly impbalanced
data['y'].value_counts()

no     4000
yes     521
Name: y, dtype: int64