In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

In [2]:
# Load the sample array from disk and wrap it in a DataFrame, naming column
# 784 'label' (the columns before it are used as features further down).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code embedded in the file -- only load files from a trusted source.
df = (
    pd.DataFrame(np.load('Data/200/sample_200.npy', allow_pickle=True))
    .rename(columns={784: 'label'})
)

In [6]:
# Features: the first 784 columns cast to float and divided by 255
# (presumably 8-bit pixel intensities scaled to [0, 1] -- TODO confirm).
img = df.iloc[:, :784].values.astype(float) / 255
# Targets: the 'label' column as a plain array.
label = df['label'].values

# Stratified split with a fixed seed so class proportions match in both parts.
# NOTE(review): train_size (0.6) + test_size (0.2) covers only 80% of the rows;
# the remaining 20% is not returned. Confirm this hold-back is intentional.
x_train, x_test, y_train, y_test = train_test_split(
    img,
    label,
    train_size=0.6,
    test_size=0.2,
    stratify=label,
    random_state=123,
)

In [8]:
# Pipeline: PCA for dimensionality reduction feeding a k-nearest-neighbours
# classifier. PCA is seeded (same seed as the train/test split) because its
# default solver selection may pick the randomized SVD, which would otherwise
# make the grid-search results vary from run to run.
pipe = make_pipeline(PCA(random_state=123),
                     KNeighborsClassifier())

# Keys follow make_pipeline's auto-generated step names (lowercased class names).
# 3 PCA sizes x 9 neighbour counts (5, 10, ..., 45) = 27 candidates.
param_grid = {'pca__n_components': [100, 200, 256],
              'kneighborsclassifier__n_neighbors': np.arange(5, 50, step=5)}

# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
# Without it, fold scores are averaged unweighted, which is what iid=False meant.
gs = GridSearchCV(pipe,
                  param_grid=param_grid,
                  refit=True,   # refit the best candidate on all of x_train
                  cv=5,
                  n_jobs=-1,    # use every available core
                  verbose=1)

gs.fit(x_train, y_train)

print('best params:')
print(gs.best_params_)

# Accuracy of the refit best estimator on the data it was trained on; this is
# an optimistic figure and is shown only for comparison with the test score.
print('train acc:')
print(gs.score(x_train, y_train))

# Accuracy on the held-out split that the search never saw.
print('test acc:')
print(gs.score(x_test, y_test))


Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 48.9min finished


best params:
{'kneighborsclassifier__n_neighbors': 15, 'pca__n_components': 100}
train acc:
0.34381642512077293
test acc:
0.22978260869565217
