## Illustration of Binary Classification on Predicting Website Purchases

In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Read in the data and split into training/test sets.
data = pd.read_csv('./data/website-purchases.csv')
data.head()

Unnamed: 0,Buy,Income,Is Female,Is Married,Has College,Is Professional,Is Retired,Unemployed,Residence Length,Dual Income,Minors,Own,House,White,English,Prev Child Mag,Prev Parent Mag
0,0,24000,1,0,1,1,0,0,26,0,0,0,1,0,0,0,0
1,1,75000,1,1,1,1,0,0,15,1,0,1,1,1,1,1,0
2,0,46000,1,1,0,0,0,0,36,1,1,1,1,1,1,0,0
3,1,70000,0,1,0,1,0,0,55,0,0,1,1,1,1,1,0
4,0,43000,1,0,0,0,0,0,27,0,0,0,0,1,1,0,1


In [2]:
# X is all non-buy columns, y is the buy column.
data_x = data[list(data)[1:]]
data_y = data['Buy']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3, random_state=4)


In [9]:
# Build the model
log_mod = linear_model.LogisticRegression()
log_mod.fit(x_train, y_train)

# Make predictions and look at the different error metrics.
preds = log_mod.predict(x_test)
pred_probs = log_mod.predict_proba(x_test)
prob_pos = pred_probs.transpose()[1]
prob_neg = pred_probs.transpose()[0]

results_df = pd.DataFrame({'Actual':y_test, 'Predicted':preds, 'P(Y=1)':prob_pos, 'P(Y=0)':prob_neg})
#results_df # Uncomment to see comparison of all the predictions/predicted class probabilities

In [10]:
print('Accuracy: ' + str(accuracy_score(y_test, preds)))
print('Precision: ' + str(precision_score(y_test, preds)))
print('Recall: ' + str(recall_score(y_test, preds)))
print('F1: ' + str(f1_score(y_test, preds)))
print('ROC AUC: ' + str(roc_auc_score(y_test, preds)))
print('Confusion Matrix:\n' + str(confusion_matrix(y_test, preds)))

Accuracy: 0.876237623762
Precision: 0.59375
Recall: 0.612903225806
F1: 0.603174603175
ROC AUC: 0.768439916997
Confusion Matrix:
[[158  13]
 [ 12  19]]
