# Logistic Regression & KNN Examples

This notebook shows how to build a simple logistic regression binary classification method using the website purchases data, as well as how to use the K-nearest-neighbors algoithm for classification.

In [20]:
import pandas as pd
from sklearn import linear_model
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from data_util import *

In [21]:
# Read data and split, first into X and y and then into training and test.
data = pd.read_csv('./website-purchases.csv')
data.head()

data_x = data[list(data[1:])]
data_y = data['Buy']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3, 
                                                    random_state=4)

### 1. Build and Evaluate a Logistic Regression Model

In [13]:
# Build the model
log_mod = linear_model.LogisticRegression()
log_mod.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Make predictions - both class labels and predicted probabilities
preds = log_mod.predict(x_test) # predicted class labels - 0 or 1
pred_probs = log_mod.predict_proba(x_test) # Prob. of being 0 and 1 for each test row.
print(pred_probs[:10])
pred_pos = pred_probs.transpose()[1] # Prob of each test row being a 1
pred_neg = pred_probs.transpose()[0] # Prob of each test row being a 0
print(pred_pos[:10])
print(pred_neg[:10])

[[0.06536885 0.93463115]
 [0.96586903 0.03413097]
 [0.91530187 0.08469813]
 [0.94162692 0.05837308]
 [0.91395401 0.08604599]
 [0.99434322 0.00565678]
 [0.97192527 0.02807473]
 [0.74295855 0.25704145]
 [0.94191101 0.05808899]
 [0.84294797 0.15705203]]
[0.93463115 0.03413097 0.08469813 0.05837308 0.08604599 0.00565678
 0.02807473 0.25704145 0.05808899 0.15705203]
[0.06536885 0.96586903 0.91530187 0.94162692 0.91395401 0.99434322
 0.97192527 0.74295855 0.94191101 0.84294797]


In [18]:
# Look at the results
pred_df = pd.DataFrame({'Actual':y_test, 'Predicted':preds,
                        'P(1)':pred_pos, 'P(0)':pred_neg})
pred_df.head(20)

Unnamed: 0,Actual,Predicted,P(1),P(0)
507,1,1,0.934631,0.065369
320,0,0,0.034131,0.965869
615,0,0,0.084698,0.915302
379,0,0,0.058373,0.941627
70,0,0,0.086046,0.913954
53,0,0,0.005657,0.994343
365,0,0,0.028075,0.971925
330,0,0,0.257041,0.742959
554,0,0,0.058089,0.941911
108,0,0,0.157052,0.842948


In [19]:
# Look at the error metrics and evaluate.
print_binary_classif_error_report(y_test, preds)

Accuracy: 0.995049504950495
Precison: 0.96875
Recall: 1.0
F1: 0.9841269841269841
ROC AUC: 0.9970760233918129
Confusion Matrix:
[[170   1]
 [  0  31]]


### 2. K-Nearest-Neighbors Classification

In [23]:
# Get some different values of k to try
ks = [1,3,5,7,9,11,13,15,17,19,21]

# For each value k, build a model and see how well it performed.
for k in ks:
    print('------------ EVALUATING MODEL: k =' + str(k) + ' -----------------')
    mod = neighbors.KNeighborsClassifier(n_neighbors=k)
    mod.fit(x_train, y_train)
    
    # Make predictions on the current model
    preds = mod.predict(x_test)
    
    # Look at the error metrics and evaluate.
    print_binary_classif_error_report(y_test, preds)

------------ EVALUATING MODEL: k =1 -----------------
Accuracy: 0.9108910891089109
Precison: 0.7096774193548387
Recall: 0.7096774193548387
F1: 0.7096774193548389
ROC AUC: 0.8285229202037352
Confusion Matrix:
[[162   9]
 [  9  22]]
------------ EVALUATING MODEL: k =3 -----------------
Accuracy: 0.905940594059406
Precison: 0.6578947368421053
Recall: 0.8064516129032258
F1: 0.7246376811594202
ROC AUC: 0.8652141105451802
Confusion Matrix:
[[158  13]
 [  6  25]]
------------ EVALUATING MODEL: k =5 -----------------
Accuracy: 0.9108910891089109
Precison: 0.6585365853658537
Recall: 0.8709677419354839
F1: 0.75
ROC AUC: 0.8945481984531222
Confusion Matrix:
[[157  14]
 [  4  27]]
------------ EVALUATING MODEL: k =7 -----------------
Accuracy: 0.9108910891089109
Precison: 0.6585365853658537
Recall: 0.8709677419354839
F1: 0.75
ROC AUC: 0.8945481984531222
Confusion Matrix:
[[157  14]
 [  4  27]]
------------ EVALUATING MODEL: k =9 -----------------
Accuracy: 0.9306930693069307
Precison: 0.7179487179