### setup

In [1]:
# Common imports
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
def sort_by_target(mnist):
    reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]
    reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]
    mnist.data[:60000] = mnist.data[reorder_train]
    mnist.target[:60000] = mnist.target[reorder_train]
    mnist.data[60000:] = mnist.data[reorder_test + 60000]
    mnist.target[60000:] = mnist.target[reorder_test + 60000]
    
try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings
    sort_by_target(mnist) # fetch_openml() returns an unsorted dataset
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
mnist["data"], mnist["target"]

(array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([0, 0, 0, ..., 9, 9, 9], dtype=int8))

### Exercises
### 1.

In [7]:
X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

shuffle_index = np.random.permutation(60000)#shuffle data
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn_clf = KNeighborsClassifier(n_jobs=-1)
k_range = list(range(1, 11))
weight_options = ['uniform', 'distance']
param_grid = dict(n_neighbors=k_range, weights=weight_options)

grid = GridSearchCV(knn_clf, param_grid, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)
#took too long

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
knn_clf = KNeighborsClassifier(n_jobs=-1)
k_range = list(range(3, 6))
weight_options = ['uniform', 'distance']
param_grid = dict(n_neighbors=k_range, weights=weight_options)
rand=RandomizedSearchCV(knn_clf, param_grid, cv=5,scoring='accuracy',n_iter=10,random_state=42,verbose=10)
rand.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] weights=uniform, n_neighbors=3 ..................................
[CV]  weights=uniform, n_neighbors=3, score=0.9717617659308622, total= 3.8min
[CV] weights=uniform, n_neighbors=3 ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 15.3min remaining:    0.0s


[CV]  weights=uniform, n_neighbors=3, score=0.9701716380603232, total= 3.1min
[CV] weights=uniform, n_neighbors=3 ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 30.2min remaining:    0.0s


[CV]  weights=uniform, n_neighbors=3, score=0.9703333333333334, total= 3.1min
[CV] weights=uniform, n_neighbors=3 ..................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 44.7min remaining:    0.0s


[CV]  weights=uniform, n_neighbors=3, score=0.9754105192964908, total= 3.1min
[CV] weights=uniform, n_neighbors=3 ..................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 59.4min remaining:    0.0s


[CV]  weights=uniform, n_neighbors=3, score=0.9720740246748917, total= 3.0min
[CV] weights=distance, n_neighbors=3 .................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 73.7min remaining:    0.0s


[CV]  weights=distance, n_neighbors=3, score=0.9729279466888796, total= 2.9min
[CV] weights=distance, n_neighbors=3 .................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 87.7min remaining:    0.0s


[CV]  weights=distance, n_neighbors=3, score=0.9702549575070821, total= 3.0min
[CV] weights=distance, n_neighbors=3 .................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 101.7min remaining:    0.0s


[CV]  weights=distance, n_neighbors=3, score=0.9709166666666667, total= 3.0min
[CV] weights=distance, n_neighbors=3 .................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 115.6min remaining:    0.0s


[CV]  weights=distance, n_neighbors=3, score=0.9761607068433775, total= 2.9min
[CV] weights=distance, n_neighbors=3 .................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 129.7min remaining:    0.0s


[CV]  weights=distance, n_neighbors=3, score=0.9735745248416139, total= 2.9min
[CV] weights=uniform, n_neighbors=4 ..................................
[CV]  weights=uniform, n_neighbors=4, score=0.9699291961682632, total= 2.9min
[CV] weights=uniform, n_neighbors=4 ..................................
[CV]  weights=uniform, n_neighbors=4, score=0.9689218463589402, total= 2.9min
[CV] weights=uniform, n_neighbors=4 ..................................
[CV] .... weights=uniform, n_neighbors=4, score=0.96775, total= 3.0min
[CV] weights=uniform, n_neighbors=4 ..................................
[CV]  weights=uniform, n_neighbors=4, score=0.9729098941402017, total= 2.9min
[CV] weights=uniform, n_neighbors=4 ..................................
[CV]  weights=uniform, n_neighbors=4, score=0.9706568856285428, total= 2.9min
[CV] weights=distance, n_neighbors=4 .................................
[CV]  weights=distance, n_neighbors=4, score=0.9735110370678883, total= 2.9min
[CV] weights=distance, n_neighbor

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 425.1min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_neighbors': [3, 4, 5], 'weights': ['uniform', 'distance']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=10)

In [10]:
rand.best_params_, rand.best_score_

({'weights': 'distance', 'n_neighbors': 4}, 0.97375)

In [11]:
from sklearn.metrics import accuracy_score
y_p= rand.predict(X_test)
accuracy_score(y_test,y_p)

0.97140000000000004