**Load dataset**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from matplotlib import pyplot as plt
%matplotlib inline

wine = np.loadtxt("./winequality-white2.csv", delimiter=",") #required deleting the headers for this to work

print (wine.shape)

# store feature matrix in "X"
X = wine[:,0:10]

# store response vector in "y"
y = wine[:,11]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)

**MULTIVARIATE ANALYSIS / GRID SEARCH**


In [None]:
# define the parameter values that should be searched
k_range = list(range(1, 400))
weight_options = ['uniform', 'distance']

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range, weights=weight_options)

In [68]:
# instantiate and fit the grid
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False)
grid.fit(X_train, y_train)



GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [69]:
# view the results
pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

Unnamed: 0,mean_test_score,std_test_score,params
0,0.554878,0.018901,"{'n_neighbors': 1, 'weights': 'uniform'}"
1,0.554878,0.018901,"{'n_neighbors': 1, 'weights': 'distance'}"
2,0.482134,0.020512,"{'n_neighbors': 2, 'weights': 'uniform'}"
3,0.554878,0.018901,"{'n_neighbors': 2, 'weights': 'distance'}"
4,0.459162,0.025845,"{'n_neighbors': 3, 'weights': 'uniform'}"
...,...,...,...
993,0.616907,0.021816,"{'n_neighbors': 497, 'weights': 'distance'}"
994,0.456618,0.011086,"{'n_neighbors': 498, 'weights': 'uniform'}"
995,0.616651,0.021645,"{'n_neighbors': 498, 'weights': 'distance'}"
996,0.458150,0.010801,"{'n_neighbors': 499, 'weights': 'uniform'}"


In [70]:
# examine the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.6222682551281382
{'n_neighbors': 297, 'weights': 'distance'}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=297, p=2,
                     weights='distance')


In [1]:
# plot the results
grid_mean_scores = grid.cv_results_['mean_test_score'] # array of mean scores only
print (k_range)
plt.plot(k_range, grid_mean_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')

NameError: name 'grid' is not defined

**Randomized Search**

In [72]:
from sklearn.model_selection import RandomizedSearchCV

# specify "parameter distributions" rather than a "parameter grid"
param_dist = dict(n_neighbors=k_range, weights=weight_options)

# n_iter controls the number of searches
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5, return_train_score=False)
rand.fit(X, y)
pd.DataFrame(rand.cv_results_)[['mean_test_score', 'std_test_score', 'params']]



Unnamed: 0,mean_test_score,std_test_score,params
0,0.441421,0.032169,"{'weights': 'uniform', 'n_neighbors': 273}"
1,0.447135,0.034057,"{'weights': 'distance', 'n_neighbors': 258}"
2,0.443866,0.033022,"{'weights': 'distance', 'n_neighbors': 97}"
3,0.3969,0.026331,"{'weights': 'distance', 'n_neighbors': 6}"
4,0.447334,0.032006,"{'weights': 'distance', 'n_neighbors': 140}"
5,0.438148,0.030748,"{'weights': 'uniform', 'n_neighbors': 327}"
6,0.439374,0.031347,"{'weights': 'uniform', 'n_neighbors': 322}"
7,0.437539,0.033517,"{'weights': 'uniform', 'n_neighbors': 382}"
8,0.438966,0.030979,"{'weights': 'uniform', 'n_neighbors': 100}"
9,0.438968,0.031397,"{'weights': 'uniform', 'n_neighbors': 493}"


In [73]:
# examine the best model
print(rand.best_score_)
print(rand.best_params_)
print(grid.best_estimator_)

0.44733358374024457
{'weights': 'distance', 'n_neighbors': 140}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=297, p=2,
                     weights='distance')


In [74]:
# run RandomizedSearchCV 20 times (with n_iter=10) and record the best score
best_scores = []
for _ in range(20):
    rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, return_train_score=False)
    rand.fit(X, y)
    best_scores.append(round(rand.best_score_, 3))
print(best_scores)



[0.448, 0.452, 0.451, 0.451, 0.451, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.449, 0.45, 0.45, 0.449, 0.447, 0.451, 0.451, 0.45, 0.45]
