In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier

In [10]:
mow = pd.read_csv(r"Datasets\RidingMowers.csv")

In [11]:
dum_mow = pd.get_dummies(mow, drop_first = True)

In [12]:
X = dum_mow.drop("Response_Not Bought", axis = 1)
y = dum_mow["Response_Not Bought"]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify = y, # explained
                                                    random_state = 2022,
                                                    train_size = 0.7
                                                   )


What does Stratify do? </b>

This stratify parameter makes a split so that the proportion of values in the sample produced will be the same as the proportion of values provided to parameter stratify.

For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.


In [14]:
from sklearn.metrics import roc_curve, roc_auc_score

In [17]:
# Loop

acc = []
Ks = [x for x in range(1,16,2)]

for i in Ks:
    knn = KNeighborsClassifier( n_neighbors = i )
    knn.fit(X_train, y_train)
    y_pred_prob = knn.predict_proba(X_test)[:,1]
    acc.append(roc_auc_score(y_test, y_pred_prob))

In [16]:
i_max = np.argmax(acc)
best_k = Ks[i_max]
print("Best n_neighbors = ", best_k)
print(roc_auc_score(y_test, y_pred_prob))

Best n_neighbors =  3
0.977961432506887


# Using Log Loss

In [19]:
from sklearn.metrics import log_loss

In [22]:
# Log loss

acc = []
Ks = [x for x in range(1,16,2)]

for i in Ks:
    knn = KNeighborsClassifier( n_neighbors = i )
    knn.fit(X_train, y_train)
    y_pred_prob = knn.predict_proba(X_test)[:,1]
    acc.append(-log_loss(y_test, y_pred_prob))


In [23]:
i_max = np.argmax(acc)
best_k = Ks[i_max]
print("Best n_neighbors = ", best_k)
print(roc_auc_score(y_test, y_pred_prob))

Best n_neighbors =  15
0.977961432506887
