## Part 3 - K-Nearest Neighbors Classifier

In [7]:
# Import the scaled data frame from Part 1 for reuse
%store -r scaled_df

# Import the training and testing sets from Part 2 for reuse
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

In [8]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

### Build the classifier

In [9]:
# Create the classifier
clf = KNeighborsClassifier()
# Train the classifier and then score it with the test set
clf.fit(X_train, y_train).score(X_test, y_test)

1.0

### Randomized search predictions and scoring

In [10]:
# We're now trying to find out what'd be the optimal solution (i.e. optimal set of parameters) for the training

# Set hyperparameters to fine-tune the k-nearest neighbors
params = {"n_neighbors": np.arange(1, 15),
          "weights": ["uniform", "distance"],
          "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
          "metric": ["minkowski", "euclidean", "manhattan", "chebyshev"], # Distance functions (i.e. different ways to evaluate distances between points)
          "leaf_size": np.linspace(10, 150, 15).astype(int)}

# Use the randomized search
# It'll randomly take a selection of the hyperparametes and try to find which ones that work out the best
# scoring = "accuracy" defines that the search will select the best parameters based on accuracy
# cv = 5 stands for 5-fold cross-validation
rand_search = RandomizedSearchCV(KNeighborsClassifier(), params, scoring = "accuracy", cv = 5)
# Train the search model
rand_search.fit(X_train, y_train)

# The most optimal parameters the model found
rand_params = rand_search.best_params_
# Print the optimal parameters the model found
print(rand_params, "\n")

# Print the best training accuracy, i.e., score the model found
print("Train accuracy:", rand_search.best_score_)
# Predict with the test data (will be iris species, 0, 1, or 2)
preds = rand_search.predict(X_test)
# Print the testing accuracy
print("Test accuracy:", accuracy_score(preds, y_test))

{'weights': 'uniform', 'n_neighbors': np.int64(12), 'metric': 'euclidean', 'leaf_size': np.int64(140), 'algorithm': 'kd_tree'} 

Train accuracy: 0.9583333333333334
Test accuracy: 1.0


### Grid search predictions and scoring

In [11]:
# Opposed to randomized search,
# grid search will try out every single combination of hyperparameters it's given

# n_neighbors and leaf_samples were the only numeric parameters
# For those parameters, let's take the values the randomized search gave us,
# and make the parameters new ranges from 3 below to 3 over the values
# The other parameters shall have the values the randomized search came back with

# Define a new range for the number of neighbors
n_neighbors = np.arange(rand_params["n_neighbors"] - 3, rand_params["n_neighbors"] + 3)
# Define a new range for the leaf size
leaf_size = np.arange(rand_params["leaf_size"] - 3, rand_params["leaf_size"] + 3)

# Use the weights, algorithm, and metric values the randomized search ended up with
params = {"n_neighbors": n_neighbors,
          "weights": [rand_params["weights"]],
          "algorithm": [rand_params["algorithm"]],
          "metric": [rand_params["metric"]],
          "leaf_size": leaf_size}

# Note that this time GridSearchCV doesn't take random_state as a parameter,
# because grid search goes through every hyperparameter
grid_search = GridSearchCV(KNeighborsClassifier(), params, scoring = "accuracy", cv = 5)
grid_search.fit(X_train, y_train)

grid_params = grid_search.best_params_
print(grid_params, "\n")

print("Train accuracy:", grid_search.best_score_)
preds = grid_search.predict(X_test)
print("Test accuracy:", accuracy_score(preds, y_test))

{'algorithm': 'kd_tree', 'leaf_size': np.int64(137), 'metric': 'euclidean', 'n_neighbors': np.int64(14), 'weights': 'uniform'} 

Train accuracy: 0.9666666666666668
Test accuracy: 0.9666666666666667


In [12]:
# How well the classification performed?
print(classification_report(preds, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.80      0.89         5
           2       0.92      1.00      0.96        12

    accuracy                           0.97        30
   macro avg       0.97      0.93      0.95        30
weighted avg       0.97      0.97      0.97        30

