
# Machine Learning Based Recommendation Systems
## Evaluating Recommendation Systems

In [3]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

This bank marketing dataset is open-sourced and available for download at the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/Bank+Marketing#).

It was originally created by: [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014

In [22]:
# Load the bank-marketing table from the CSV in the current working directory.
bank_full = pd.read_csv(filepath_or_buffer='bank_full_w_dummy_vars.csv')
# Preview the first five rows as the cell's rich output.
bank_full.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_unknown,job_retired,job_services,job_self_employed,job_unemployed,job_maid,job_student,married,single,divorced
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0,0,0,0,0,0,0,1,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0,0,0,0,0,0,0,0,1,1
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,0,0,0,0,0,0,0,1,0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0,0,0,0,0,0,0,1,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,1,0,0,0,0,0,0,0,1,1


In [23]:
# Print a structural summary of the frame: index range, column count,
# and per-column non-null counts and dtypes.
bank_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 37 columns):
age                             45211 non-null int64
job                             45211 non-null object
marital                         45211 non-null object
education                       45211 non-null object
default                         45211 non-null object
balance                         45211 non-null int64
housing                         45211 non-null object
loan                            45211 non-null object
contact                         45211 non-null object
day                             45211 non-null int64
month                           45211 non-null object
duration                        45211 non-null int64
campaign                        45211 non-null int64
pdays                           45211 non-null int64
previous                        45211 non-null int64
poutcome                        45211 non-null object
y                               45

In [33]:
# Pick the model inputs by column position: 18 through 36 inclusive (19 columns).
# Positions are used rather than labels; per the preview below these are the
# indicator columns from housing_loan through divorced.
feature_positions = list(range(18, 37))
in_features = bank_full.iloc[:, feature_positions]
in_features.head()

Unnamed: 0,housing_loan,credit_in_default,personal_loans,prev_failed_to_subscribe,prev_subscribed,job_management,job_tech,job_entrepreneur,job_bluecollar,job_unknown,job_retired,job_services,job_self_employed,job_unemployed,job_maid,job_student,married,single,divorced
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
2,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1


In [35]:
# Convert the selected feature frame to a plain NumPy array; this is the
# matrix later passed to the classifier's fit/predict calls.
X = in_features.values
X

array([[1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 1, 1],
       [1, 0, 1, ..., 1, 0, 0],
       ..., 
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [54]:
# Re-display the first rows of the full frame for reference.
bank_full.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_unknown,job_retired,job_services,job_self_employed,job_unemployed,job_maid,job_student,married,single,divorced
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0,0,0,0,0,0,0,1,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0,0,0,0,0,0,0,0,1,1
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,0,0,0,0,0,0,0,1,0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0,0,0,0,0,0,0,1,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,1,0,0,0,0,0,0,0,1,1


In [60]:
# Investigation of a label-based selection problem ("wtf_loc_bug") seen for
# columns at high positions in the frame.
# NOTE(review): in this cell's output, .loc returns empty (NaN) values for
# 'job_retired' and 'divorced', while 'job', 'marital' and 'balance' come back
# populated. Presumably the requested labels do not exactly match the frame's
# real column names (e.g. stray whitespace in the CSV header) -- TODO confirm
# by inspecting bank_full.columns.
# Full feature list originally intended for label-based selection (kept for reference):
#x_features = ['housing_loan', 'credit_in_default', 'personal_loans', 'prev_failed_to_subscribe'
#              , 'prev_subscribed', 'job_management', 'job_tech', 'job_entrepreneur', 'job_bluecollar'
#              , 'job_unknown', 'job_retired', 'job_services', 'job_self_employed', 'job_unemployed'
#              , 'job_maid', 'job_student', 'married', 'single', 'divorced']
# Reduced list mixing columns that resolve correctly with two that do not.
x_features = ['job', 'marital','balance', 'job_retired', 'divorced']
wtf_loc_bug = bank_full.loc[:, x_features]
wtf_loc_bug.head()

Unnamed: 0,job,marital,balance,job_retired,divorced
0,management,married,2143,,
1,technician,single,29,,
2,entrepreneur,married,2,,
3,blue-collar,married,1506,,
4,unknown,single,1,,


In [61]:
# Target vector: the column at position 17, extracted as a NumPy array.
# NOTE(review): presumably the binary-encoded form of the `y` column -- confirm
# against bank_full.columns, since the position is hard-coded.
y = bank_full.iloc[:,17].values
y

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [62]:
# Train a default-configured logistic regression on the full feature matrix.
# fit() returns the estimator itself, so construction and training are chained.
LogReg = LogisticRegression().fit(X, y)
# NOTE(review): predictions are made on the same rows used for fitting, so any
# score derived from y_pred is an in-sample estimate.
y_pred = LogReg.predict(X)

In [65]:
# Build the per-class precision / recall / F1 summary, then print it so the
# plain-text table keeps its line breaks.
report_text = classification_report(y, y_pred)
print(report_text)

             precision    recall  f1-score   support

          0       0.90      0.99      0.94     39922
          1       0.67      0.17      0.27      5289

avg / total       0.87      0.89      0.86     45211



More info: http://onlineconfusionmatrix.com/  
Measure	Value	Derivations  
Sensitivity		TPR = TP / (TP + FN)  
Specificity		SPC = TN / (FP + TN)  
Precision		PPV = TP / (TP + FP)  
Negative Predictive Value		NPV = TN / (TN + FN)  
False Positive Rate		FPR = FP / (FP + TN)  
False Discovery Rate		FDR = FP / (FP + TP)  
False Negative Rate		FNR = FN / (FN + TP)  
Accuracy		ACC = (TP + TN) / (P + N), where P = TP + FN and N = FP + TN  
F1 Score		F1 = 2TP / (2TP + FP + FN)  
Matthews Correlation Coefficient		MCC = (TP*TN - FP*FN) / sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))  