## Part 3 - K-Nearest Neighbors Classifier

In [1]:
# Restore the scaled data frame persisted in Part 1 (IPython `%store -r`
# reloads a variable previously saved with `%store` in another notebook).
# NOTE(review): scaled_df is not referenced in the cells visible here -
# confirm it is still needed in this part.
%store -r scaled_df

# Restore the train/test split persisted in Part 2 so that this part uses
# the same partition. These restores must succeed before any cell below
# runs, because every model in this notebook is fit on X_train / y_train
# and evaluated on X_test / y_test.
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

In [2]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Baseline: a K-Nearest Neighbors classifier with default hyperparameters,
# fit on the training split. The cell's last expression displays the mean
# accuracy on the held-out test split.
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9666666666666667

In [4]:
# Coarse hyperparameter space explored by the randomized search.
params = {"n_neighbors": np.arange(1, 15),
          "weights": ["uniform", "distance"],
          "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
          "metric": ["minkowski", "euclidean", "manhattan", "chebyshev"], # Distance functions (i.e. different ways to evaluate distances between points)
          "leaf_size": np.linspace(10, 150, 15).astype(int)}

# Randomly sample candidate combinations from `params` and score each with
# 5-fold cross-validated accuracy on the training split. random_state pins
# the sampling so the selected candidates are reproducible.
rand_search = RandomizedSearchCV(KNeighborsClassifier(), params, scoring = "accuracy",
                                 random_state = 1, cv = 5)
rand_search.fit(X_train, y_train)

# Best combination found; reused by the grid-search cell to build a finer grid.
rand_params = rand_search.best_params_
print(rand_params, "\n")
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate (computed on held-out folds of the training split), not the
# accuracy on the full training set.
print("Train accuracy:", rand_search.best_score_)
preds = rand_search.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
print("Test accuracy:", accuracy_score(y_test, preds))

{'weights': 'uniform', 'n_neighbors': np.int64(7), 'metric': 'euclidean', 'leaf_size': np.int64(20), 'algorithm': 'brute'} 

Train accuracy: 0.9583333333333334
Test accuracy: 1.0


In [5]:
# Build a fine grid of +/-3 around the randomized-search optimum.
# np.arange excludes its stop value, so the upper bound is best + 4 to make
# the window symmetric. The lower bound is clamped to 1 because neither
# n_neighbors nor leaf_size may be zero or negative.
best_k = rand_params["n_neighbors"]
n_neighbors = np.arange(max(1, best_k - 3), best_k + 4)

# NOTE: leaf_size only affects the tree-based algorithms (ball_tree /
# kd_tree); when the selected algorithm is "brute", all leaf_size values
# score identically.
best_leaf = rand_params["leaf_size"]
leaf_size = np.arange(max(1, best_leaf - 3), best_leaf + 4)

# Categorical choices are frozen at the randomized-search optimum; only the
# two integer hyperparameters are refined.
params = {"n_neighbors": n_neighbors,
          "weights": [rand_params["weights"]],
          "algorithm": [rand_params["algorithm"]],
          "metric": [rand_params["metric"]],
          "leaf_size": leaf_size}

# Exhaustively evaluate every combination with 5-fold cross-validated accuracy.
grid_search = GridSearchCV(KNeighborsClassifier(), params, scoring = "accuracy", cv = 5)
grid_search.fit(X_train, y_train)

grid_params = grid_search.best_params_
print(grid_params, "\n")
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not the accuracy on the full training set.
print("Train accuracy:", grid_search.best_score_)
# `preds` is reused by the classification-report cell that follows.
preds = grid_search.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
print("Test accuracy:", accuracy_score(y_test, preds))

{'algorithm': 'brute', 'leaf_size': np.int64(17), 'metric': 'euclidean', 'n_neighbors': np.int64(7), 'weights': 'uniform'} 

Train accuracy: 0.9583333333333334
Test accuracy: 1.0


In [6]:
# Per-class precision / recall / F1 for the tuned model's test predictions
# (`preds` comes from the grid-search cell). classification_report expects
# (y_true, y_pred): passing them the other way round swaps the precision and
# recall columns and makes `support` count predictions instead of true labels.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         6

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

