In [1]:
#The book is to predict whether a loan should be granted to a customer or not.

#Log Reg Caveats
#Great for quick, fast performance on any given problem.

#When to USE IT
#Binary Target Variable
#Only for clean, well behaved data.

#DO NOT USE IT
#Not for continuous variable
#Not for massive amounts of data.
#Should not be used on ill behaving data - Outliers, Skewed Features, Missing Values, Really Complex relationships
#Rarely be the best.

In [2]:
#Import relevant libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn

from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing

from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

In [3]:
%matplotlib inline
rcParams['figure.figsize'] = 5, 4 #plotting params
sb.set_style('whitegrid')

In [4]:
bank = pd.read_csv('/Users/kashs/Datasets/bank/bank-full.csv', sep = ';')
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
bank.info()

#bank.isnull().sum()
#bank.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [6]:
#Let's see how many converted vs how many that did not.
bank['y'].value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [7]:
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# y-target variable

In [8]:
#Since the data is clean and this project is about Logistic Regression classifier, let's dummify the variables.
#Make new columns and append to DF.

#y target variable - mapping 1 for Yes and 0 for No.
y_mapping = {'yes':1,'no':0}
bank['y_binary'] = bank ['y'].map(y_mapping)
bank.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,y_binary
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,0


# Dummifying Feature : Job

In [9]:
from sklearn.preprocessing import OneHotEncoder #import OnehotEncoder which is a binary encoder
ohe = OneHotEncoder(categories='auto') #create the hotencoder object
job = bank.iloc[:,1:2].values  #select job column as array
job_fit = ohe.fit_transform(job).toarray() #fit the encoder to job isolated variable.
column_name = ohe.get_feature_names(['job']) #gets all the dummified column names
job_df = pd.DataFrame(job_fit, columns=column_name) #creates a dataframe with dummified numbers

#Concatenating the job_df to bank_df
bank_full = pd.concat([bank, job_df], axis = 1) 

In [10]:
#Check new dataframe with dummified job categories.
bank_full.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,35,management,married,tertiary,no,231,yes,no,unknown,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,58,retired,married,primary,no,121,yes,no,unknown,5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,43,technician,single,secondary,no,593,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Dummifying Feature: Marital

In [11]:
ohe = OneHotEncoder(categories='auto') #create the hotencoder object

marital = bank.iloc[:,2:3].values  #select job column as array
marital_fit = ohe.fit_transform(marital).toarray() #fit the encoder to job isolated variable.

column_names_marital = ohe.get_feature_names(['marital']) #gets all the dummified column names
marital_df = pd.DataFrame(marital_fit, columns=column_names_marital) #creates a dataframe with dummified numbers

#Concatenating the job_df to bank_df
bank_full = pd.concat([bank_full, marital_df], axis = 1) 

In [12]:
bank_full.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


# Dummifying Feature: Default

In [13]:
default_map = {'yes': 1,'no':0}
bank_full['credit_in_default']= bank_full['default'].map(default_map)
bank_full.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,credit_in_default
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


# Dummifying Feature: Housing, Personal Loan

In [14]:
#Housing
housing_map = {'yes': 1,'no':0}
bank_full['housing_loan']= bank_full['housing'].map(default_map)

#Personal Loan
personal_map = {'yes': 1,'no':0}
bank_full['personal_loan']= bank_full['loan'].map(default_map)
bank_full.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,credit_in_default,housing_loan,personal_loan
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,1,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1,1
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0


# Dummifying: Previous Outcome

In [15]:
ohe = OneHotEncoder(categories='auto') #create the hotencoder object

p_outcome = bank.iloc[:,15:16].values  #select job column as array

p_outcome_fit = ohe.fit_transform(p_outcome).toarray() #fit the encoder to job isolated variable.

column_names_p_outcome = ohe.get_feature_names(['poutcome']) #gets all the dummified column names
p_outcome_df = pd.DataFrame(p_outcome_fit, columns=column_names_p_outcome) #creates a dataframe with dummified numbers

#Concatenating the job_df to bank_df
bank_full = pd.concat([bank_full, p_outcome_df], axis = 1) 

In [16]:
bank_full.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y', 'y_binary', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'credit_in_default', 'housing_loan', 'personal_loan',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')

In [17]:
#Inspecting columns to be dropped - None seem likely to be dropped as they're all in good numbers except two:
#poutcome_other and poutcome_unknown.

#bank_full['job'].value_counts()
#bank_full['education'].value_counts()
bank_full['poutcome'].value_counts()


unknown    36959
failure     4901
other       1840
success     1511
Name: poutcome, dtype: int64

In [18]:
bank_full.drop(['poutcome_other','poutcome_unknown'], axis=1, inplace=True)

In [19]:
bank_full.iloc[:,16:38]

Unnamed: 0,y,y_binary,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,...,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,credit_in_default,housing_loan,personal_loan,poutcome_failure,poutcome_success
0,no,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0,1,0,0.0,0.0
1,no,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0,1,0,0.0,0.0
2,no,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0,1,1,0.0,0.0
3,no,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0,1,0,0.0,0.0
4,no,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,yes,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0,0,0,0.0,0.0
45207,yes,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0,0,0,0.0,0.0
45208,yes,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0,0,0,0.0,1.0
45209,no,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0,0,0,0.0,0.0


In [20]:
# #Checking distriubtion
# def DensityPlot(df,feature):
#     fig = plt.figure(figsize=(8,4))
#     sb.kdeplot(df.loc[df.loan_repaid == 0,feature], color='r', shade=True, label="didn't repay")
#     sb.kdeplot(df.loc[df.loan_repaid == 1,feature], color='b', shade=True, label="repaid",)
#     plt.xlabel(feature)
#     plt.ylabel('kernel density')
#     plt.show()

# Model Ready : Applying Logistic Regression

In [21]:
#Predictors and target variable.
predictors = bank_full.iloc[:,18:38].values #these are the predictors.
target = bank_full.iloc[:,17:18].values

In [22]:
#Split the dataset.

#Set aside 25% of the data for test. Keep 75% for training.
X_train, X_test, y_train, y_test = train_test_split (predictors, target, test_size=.75, random_state = 200)

# Check Parameters Before Building The Model

In [23]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [24]:
logreg = LogisticRegression()
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

cv = GridSearchCV(logreg, parameters, cv=5)
cv.fit(X_train, y_train.ravel())

print_results(cv)

#When C is close to 1, it's performing accuracy is at 89.3%

BEST PARAMS: {'C': 1}

0.881 (+/-0.0) for {'C': 0.001}
0.881 (+/-0.0) for {'C': 0.01}
0.893 (+/-0.004) for {'C': 0.1}
0.893 (+/-0.003) for {'C': 1}
0.893 (+/-0.003) for {'C': 10}
0.893 (+/-0.003) for {'C': 100}
0.893 (+/-0.003) for {'C': 1000}


In [25]:
cv.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Build and Fit the Model

In [26]:
#Build and fit the model.
logreg = LogisticRegression(C=1, solver='liblinear')
logreg.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
y_pred=logreg.predict(X_test)

In [28]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

#Actual predictions: 29602,672

array([[29602,   364],
       [ 3271,   672]])

In [29]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8928013211831667


In [30]:
test_user = np.array([0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1]).reshape(1,-1)
logreg.predict(test_user) #the user WILL BUY (1)

array([1])

In [31]:
print (classification_report (y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     29966
           1       0.65      0.17      0.27      3943

    accuracy                           0.89     33909
   macro avg       0.77      0.58      0.61     33909
weighted avg       0.87      0.89      0.86     33909



In [32]:
#Precision: Model's revelancy (87%) - of all the offers made, 87% users liked them.
#Recall: Model's completeness (89%) - of all the products users liked, 89% of those products were offered to them.

# Decision Trees

In [80]:
predictors_dt = bank_full.iloc[:,18:38] #these are the predictor features.
target_dt = bank_full.iloc[:,17:18]
target_dt = pd.get_dummies(target_dt)
#Split the dataset.

#Set aside 25% of the data for test. Keep 75% for training.
X_train, X_test, y_train, y_test = train_test_split (predictors_dt, target_dt, test_size=.75, random_state = 200)

In [81]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [85]:
# from sklearn.tree import export_graphviz
# from sklearn.externals.six import StringIO 
# from IPython.display import Image 
# from pydot import graph_from_dot_data

# dot_data = StringIO()
# export_graphviz(dt, out_file=dot_data)
# (graph, ) = graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())


In [86]:
y_pred = dt.predict(X_test)

In [88]:
#confusion_matrix(np.array(y_test).argmax(axis=1), np.array(y_pred).argmax(axis=1))