In [14]:
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix

In [15]:
# Download (or load from the local cache) version 1 of the OpenML "mnist_784" dataset.
# as_frame=False pins the return type to NumPy arrays: newer scikit-learn
# releases default to as_frame='auto', which yields a pandas DataFrame for
# this dataset and would make X.min() / X.max() below return per-column
# Series instead of the scalars this notebook expects.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [16]:
# List the fields available on the object returned by fetch_openml.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [17]:
# Pull the feature matrix and the label vector out of the fetched dataset.
X = mnist.data
y = mnist.target
# Display both shapes as a quick sanity check.
(X.shape, y.shape)

((70000, 784), (70000,))

In [18]:
# Smallest and largest raw feature values, used to choose the rescaling factor.
(X.min(), X.max())

(0.0, 255.0)

In [19]:
# Rescale the features from [0, 255] to [0, 1].
# The scaled matrix is derived from the untouched mnist.data rather than from
# X itself, so re-running this cell does not divide the values a second time.
X = mnist.data / 255.0

In [20]:
# Re-check the value range after rescaling.
(X.min(), X.max())

(0.0, 1.0)

In [21]:
# Hold out 12% of the full dataset as x_val / y_val; the fixed seed makes the
# shuffle repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.12,
    random_state=42,
)
# Show the shape of every resulting piece.
tuple(part.shape for part in (x_train, x_val, y_train, y_val))

((61600, 784), (8400, 784), (61600,), (8400,))

In [22]:
# Carve a further 12% out of the remaining training data as x_test / y_test.
# NOTE: x_train / y_train are overwritten here, so re-running this cell on its
# own shrinks the training set again.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.12,
    random_state=42,
)
# Show the shape of every resulting piece.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((54208, 784), (7392, 784), (54208,), (7392,))

In [23]:
# Baseline k-NN classifier with default hyperparameters; n_jobs=-1 asks
# scikit-learn to use every available CPU core for neighbor searches.
knc = KNeighborsClassifier(
    n_jobs=-1,
)

In [24]:
# Fit the baseline model on the training split (the fitted estimator is displayed).
knc.fit(X=x_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [25]:
# Predict labels for the held-out validation split with the baseline model.
preds = knc.predict(X=x_val)

In [26]:
# Fraction of validation samples the baseline model labels correctly.
accuracy_score(y_true=y_val, y_pred=preds)

0.9686904761904762

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
# Hyperparameter combinations to try: 2 weighting schemes x 3 neighbor counts.
param_grid = [
    {
        'weights': ["uniform", "distance"],
        'n_neighbors': [3, 4, 6],
    },
]

In [29]:
# Fresh, unfitted k-NN estimator that the grid search will clone and tune.
knn_clf = KNeighborsClassifier(
    n_jobs=-1,
)

In [33]:
# 5-fold cross-validated search over param_grid, run in parallel with
# progress messages enabled (verbose=3).
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=5,
    verbose=3,
    n_jobs=-1,
)

In [34]:
# Run the search on the training split. This is the expensive step: the
# recorded run below took roughly 80 minutes for 30 fits.
grid_search.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  30 | elapsed: 78.8min remaining: 12.1min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 79.5min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'n_neighbors': [3, 4, 6],
                          'weights': ['uniform', 'distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [35]:
# Estimator configured with the winning hyperparameters from the grid search.
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
                     weights='distance')

In [36]:
# Best mean cross-validation score reached during the search.
grid_search.best_score_

0.9724211349914201

In [37]:
# Hyperparameter combination that produced the best score.
grid_search.best_params_

{'n_neighbors': 4, 'weights': 'distance'}

In [38]:
# Keep a handle on the best estimator for evaluation on the held-out test split.
best_KNN_CLF = grid_search.best_estimator_

In [40]:
# Predict labels for the test split with the tuned model.
best_test_preds = best_KNN_CLF.predict(X=x_test)

In [41]:
# Fraction of test samples the tuned model labels correctly.
accuracy_score(y_true=y_test, y_pred=best_test_preds)

0.974025974025974

In [43]:
import pandas as pd
from sklearn.metrics import classification_report

In [46]:
# Per-class precision / recall / F1 / support on the test split, returned as a
# nested dict (output_dict=True) so it can be loaded into a DataFrame.
report = classification_report(
    y_true=y_test,
    y_pred=best_test_preds,
    output_dict=True,
)

In [47]:
# Tabulate the report: one column per class/aggregate, one row per metric.
report_df = pd.DataFrame(data=report)

In [48]:
# Display the classification report table.
report_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,accuracy,macro avg,weighted avg
precision,0.974425,0.963743,0.984463,0.974185,0.987143,0.97151,0.98103,0.967871,0.987569,0.95,0.974026,0.974194,0.974203
recall,0.993481,0.997579,0.970752,0.958556,0.973239,0.961918,0.986376,0.974394,0.948276,0.972222,0.974026,0.973679,0.974026
f1-score,0.983861,0.980369,0.97756,0.966307,0.980142,0.96669,0.983696,0.971122,0.967524,0.960983,0.974026,0.973825,0.974
support,767.0,826.0,718.0,748.0,710.0,709.0,734.0,742.0,754.0,684.0,0.974026,7392.0,7392.0
