# K Nearest Neighbors Classifier

**Basic steps:**

1. Import the learning algorithm
2. Instantiate the model (choose hyper-parameters)
3. Learn the model
4. Predict the response

# Get Example Data

In [None]:
import pandas as pd

# MichelinNY restaurant data set. The target column is `InMichelin`:
# whether or not a restaurant is in the Michelin guide.
data = pd.read_csv(
    "http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv",
    encoding="latin_1",
)
data.head()

Unnamed: 0,InMichelin,Restaurant Name,Food,Decor,Service,Price
0,0,14 Wall Street,19,20,19,50
1,0,212,17,17,16,43
2,0,26 Seats,23,17,21,35
3,1,44,19,23,16,52
4,0,A,23,12,19,24


In [None]:
# Remove the restaurant name: it is an extra variable that is not continuous.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
data = data.drop(columns='Restaurant Name', errors='ignore')
data.head()

Unnamed: 0,InMichelin,Food,Decor,Service,Price
0,0,19,20,19,50
1,0,17,17,16,43
2,0,23,17,21,35
3,1,19,23,16,52
4,0,23,12,19,24


# Change variable names to X, y to create train/test split

In [None]:
# Features are every column other than the target; the target is the InMichelin column
X = data.drop(columns='InMichelin')
y = data['InMichelin']

# Peek at the first five target values and the first five feature rows
print(y.head())
X.head()

0    0
1    0
2    0
3    1
4    0
Name: InMichelin, dtype: int64


Unnamed: 0,Food,Decor,Service,Price
0,19,20,19,50
1,17,17,16,43
2,23,17,21,35
3,19,23,16,52
4,23,12,19,24


# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Use train_test_split(X, y) to create four new data sets; with no size
# arguments it defaults to a .75/.25 split. The rows are shuffled before
# splitting, so random_state is pinned: without it every kernel restart
# produces a different split and different scores in the cells that follow.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Compare the size of the full feature table with the training portion
print(X.shape)
X_train.shape

(164, 4)


(123, 4)

### Train model with k=5

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the classifier with k=5 neighbors and learn from the training data
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Print accuracy rounded to two digits to the right of decimal
# (the .2f format spec does the rounding; the score itself is left untouched)
print(f'{knn.score(X_test, y_test):.2f}')

0.8048780487804879


### Train model with k=10

In [None]:
# Re-create the classifier with k=10 neighbors and fit it on the same training data
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# Print accuracy rounded to two digits to the right of decimal
# (the .2f format spec does the rounding; no str() call is needed inside an f-string)
print(f'accuracy: {knn.score(X_test, y_test):.2f}')

accuracy: 0.8780487804878049


In [None]:
# Predictions of the fitted knn model for the test data
y_pred = knn.predict(X_test)
y_pred  # show the predicted values

array([0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

## Using Cross-Validation for model evaluation

In [None]:
# Import cross-validation functions from scikit-learn

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold

from sklearn.model_selection import KFold
import numpy as np
from statistics import mean

# Set up the different cross-validation strategies.
# KFold is used without shuffling, so its folds are already deterministic.
# StratifiedKFold (shuffle=True) and RepeatedKFold both randomize the folds,
# so they get a fixed random_state to make the reported scores reproducible.
kfold = KFold(n_splits=5)
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)

# Evaluate the same k=5 model under each strategy and print the mean of the fold scores
cv_strategies = [("KFold", kfold), ("StratifiedKFold", skfold), ("RepeatedKFold", rkf)]
for position, (label, cv) in enumerate(cv_strategies):
    fold_scores = cross_val_score(KNeighborsClassifier(n_neighbors=5), X_train, y_train, cv=cv)
    # Blank line between reports, but none before the first one
    separator = "" if position == 0 else "\n"
    print("{}{}:\n{}".format(separator, label, mean(fold_scores)))

KFold:
0.764

StratifiedKFold:
0.7646666666666666

RepeatedKFold:
0.7713333333333333


## Tuning models with grid search

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Fresh train/test split, stratified on y. random_state is pinned because the
# split is shuffled: without it the best parameters and scores printed below
# change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Create a dictionary whose keys are the parameter name 'n_neighbors' of the
# knn model and whose values are the range of k values to create models for.
param_grid = {'n_neighbors': np.arange(1, 15, 2)}  # np.arange gives the odd k values 1, 3, ..., 13

# Grid search over param_grid using 10 cross-validation folds
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

# The grid object is used like a model: fit, then score / predict
grid.fit(X_train, y_train)

# Extract the best score and parameters from the attributes "best_score_" and "best_params_"
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(X_test, y_test)))

best mean cross-validation score: 0.815
best parameters: {'n_neighbors': 9}
test-set score: 0.780


In [None]:
# Put the complete tuning results from the grid search into a table and display it
results = pd.DataFrame.from_dict(grid.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002023,0.000235,0.00334,0.000219,1,{'n_neighbors': 1},0.615385,0.538462,0.846154,0.75,0.833333,0.833333,0.833333,0.666667,0.666667,0.75,0.733333,0.10201,7
1,0.001843,3.6e-05,0.003177,9.1e-05,3,{'n_neighbors': 3},0.846154,0.384615,0.846154,0.75,0.916667,0.833333,0.75,0.916667,0.416667,0.916667,0.757692,0.187828,6
2,0.001993,0.000412,0.003323,0.000356,5,{'n_neighbors': 5},0.769231,0.461538,0.846154,0.833333,0.916667,0.833333,0.833333,0.916667,0.583333,0.916667,0.791026,0.144201,3
3,0.001837,5.4e-05,0.003099,3.8e-05,7,{'n_neighbors': 7},0.692308,0.615385,0.769231,0.833333,0.833333,0.833333,0.833333,1.0,0.583333,0.916667,0.791026,0.12281,3
4,0.002428,0.000483,0.004128,0.001017,9,{'n_neighbors': 9},0.769231,0.615385,0.846154,0.833333,0.833333,0.833333,0.916667,0.916667,0.666667,0.916667,0.814744,0.098453,1
5,0.001946,0.00021,0.003394,0.000242,11,{'n_neighbors': 11},0.692308,0.615385,0.846154,0.833333,0.833333,0.833333,0.833333,0.916667,0.583333,0.916667,0.790385,0.112026,5
6,0.001865,0.000196,0.003142,0.000101,13,{'n_neighbors': 13},0.769231,0.615385,0.769231,0.833333,0.833333,0.833333,0.833333,0.916667,0.75,0.916667,0.807051,0.083454,2


### Using GridSearchCV, cross_val_score, and .score()

*  `.score()`
   *   `.score()` is the default method in sklearn for scoring a model *that has previously been fit to training data with `.fit()`.*
   *   You give it the `X` and `y` data.   It produces predictions for `y` (e.g. `y_pred`) based on plugging the `X` data into the fit model. Scores are generated by comparing the actual `y` to `y_pred`.
*  `cross_val_score()`
   *   You give it the **model**, **X_train**, **y_train**, and **number of cross-validation folds** (e.g. `cv=10`).
   *   It returns a list of scores with length `cv` from the cross-validation evaluation, *but it does not update the model with the data fit automatically*. It only returns the scores.
   *   `cross_val_score()` is often used when comparing different models or tuning hyperparamters. Once the optimal parameters have been selected, the model should be fit to the full training set with `model.fit(X_train, y_train)` and evaluated on the full test set with `model.score(X_test, y_test)`.
*  `GridSearchCV()`
   *   You give it the in the **model**, **parameter grid**, and **number of cross-validation folds** (e.g. `cv=10`).
   *   It returns a new object (e.g. '`grid`') that needs to then be fit to the training data with `.fit()`. This object will automatically find the optimal parameters during fitting by searching the parameter grid and utilizing cross-validation on the training data.
      *   `grid.best_params_` returns the best parameter combination from the grid.
     *   `grid.best_score_` returns the average cross-validation score achieved during fitting with the best parameters.
   *   A final evaluation is done with `grid.score(X_test, y_test)`. This is done using the *entire* test set rather than averaging the score from different cross-validation folds.
