# KNN Example using the Website Purchases dataset

In [1]:
import pandas as pd
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
data = pd.read_csv('./data/website-purchases.csv')
data.head()


Unnamed: 0,Buy,Income,Is Female,Is Married,Has College,Is Professional,Is Retired,Unemployed,Residence Length,Dual Income,Minors,Own,House,White,English,Prev Child Mag,Prev Parent Mag
0,0,24000,1,0,1,1,0,0,26,0,0,0,1,0,0,0,0
1,1,75000,1,1,1,1,0,0,15,1,0,1,1,1,1,1,0
2,0,46000,1,1,0,0,0,0,36,1,1,1,1,1,1,0,0
3,1,70000,0,1,0,1,0,0,55,0,0,1,1,1,1,1,0
4,0,43000,1,0,0,0,0,0,27,0,0,0,0,1,1,0,1


In [2]:
#get x/y data and split into train and test sets
data_x = data[list(data[1:])]
data_y = data['Buy']
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3, random_state=4)

In [3]:
#second component is a minmax scaler object
min_max_scaler = preprocessing.MinMaxScaler()

#run training data through the pipeline first
train_x_pp = min_max_scaler.fit_transform(x_train)
test_x_pp = min_max_scaler.transform(x_test)

  return self.partial_fit(X, y)


## Look at different values of K and see how it impacts the results

In [4]:
ks = [3, 5, 7, 9, 11, 13, 15, 17, 19]
for k in ks:
    #Create model and fit.
    mod = neighbors.KNeighborsClassifier(n_neighbors=k)
    mod.fit(train_x_pp, y_train)
    
    #Make Predictions and look at results.
    y_hat = mod.predict(test_x_pp)
    print('------------EVALUATING MODEL: k = ' + str(k) + '------------')
    print('Accuracy: ' + str(accuracy_score(y_test, y_hat)))
    print('Precision: ' + str(precision_score(y_test, y_hat)))
    print('Recall: ' + str(recall_score(y_test, y_hat)))
    print('F1: ' + str(f1_score(y_test, y_hat)))
    print('ROC AUC: ' + str(roc_auc_score(y_test, y_hat)))
    print('Confusion Matrix: \n' + str(confusion_matrix(y_test, y_hat)))

------------EVALUATING MODEL: k = 3------------
Accuracy: 0.9801980198019802
Precision: 0.9655172413793104
Recall: 0.9032258064516129
F1: 0.9333333333333333
ROC AUC: 0.9486889266176194
Confusion Matrix: 
[[170   1]
 [  3  28]]
------------EVALUATING MODEL: k = 5------------
Accuracy: 0.9851485148514851
Precision: 1.0
Recall: 0.9032258064516129
F1: 0.9491525423728813
ROC AUC: 0.9516129032258065
Confusion Matrix: 
[[171   0]
 [  3  28]]
------------EVALUATING MODEL: k = 7------------
Accuracy: 0.9801980198019802
Precision: 1.0
Recall: 0.8709677419354839
F1: 0.9310344827586207
ROC AUC: 0.935483870967742
Confusion Matrix: 
[[171   0]
 [  4  27]]
------------EVALUATING MODEL: k = 9------------
Accuracy: 0.9851485148514851
Precision: 1.0
Recall: 0.9032258064516129
F1: 0.9491525423728813
ROC AUC: 0.9516129032258065
Confusion Matrix: 
[[171   0]
 [  3  28]]
------------EVALUATING MODEL: k = 11------------
Accuracy: 0.9801980198019802
Precision: 0.9655172413793104
Recall: 0.9032258064516129
F1: