In [262]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

In [263]:
# Load the MNIST test CSV: one row per image, a `label` column plus the pixel columns 1x1..28x28.
df = pd.read_csv("data/mnist_test.csv")
df

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [264]:
# Check for dirty data: count the missing values in every column.
df.isnull().sum()

label    0
1x1      0
1x2      0
1x3      0
1x4      0
        ..
28x24    0
28x25    0
28x26    0
28x27    0
28x28    0
Length: 785, dtype: int64

In [265]:
# Split the dataset from its target: x holds every column except `label`, y holds `label`.
x, y = df.drop(['label'], axis=1), df['label']

In [266]:
# Display the feature frame (all columns except `label`).
x

Unnamed: 0,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,1x10,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [267]:
# Display the target series (`label` column).
y

0       7
1       2
2       1
3       0
4       4
       ..
9995    2
9996    3
9997    4
9998    5
9999    6
Name: label, Length: 10000, dtype: int64

In [268]:
# Scale the features with StandardScaler.
# fit_transform() fits the scaler and returns the scaled array in a single pass,
# so there is no need to fit (and discard the result) and then transform again.
# NOTE(review): the scaler is fitted on the full dataset before any
# cross-validation split, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
x = scaler.fit_transform(x)

x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [269]:
# 5-fold splitter used for the outer splits; rows are shuffled with a fixed
# seed (random_state=1) so the same folds are produced on every run.
kf5 = KFold(n_splits=5, shuffle = True, random_state=1)

In [270]:
def gridSearch(dataX, dataY, k):
    """Compare a fixed-k KNN baseline with a grid search over k on each outer fold.

    For every split produced by the module-level ``kf5`` splitter, the
    training portion is used to:

    1. cross-validate (5-fold) a ``KNeighborsClassifier`` with ``n_neighbors=k``;
    2. run ``GridSearchCV`` (5-fold) over ``n_neighbors`` in 1..24.

    All results are printed; the function returns ``None``.

    Parameters
    ----------
    dataX : array-like
        Feature matrix, one row per sample.
    dataY : array-like
        Target labels aligned row-by-row with ``dataX``.
    k : int
        Number of neighbours used by the baseline classifier.

    Notes
    -----
    The held-out portion of each outer split is not scored; only the inner
    cross-validation results on the training portion are reported.
    """
    # Index positionally: on a pandas Series/DataFrame the integer arrays
    # yielded by KFold would be interpreted as labels (or column names).
    dataX = np.asarray(dataX)
    dataY = np.asarray(dataY)

    # The search space does not change between folds, so build it once.
    parameter = {'n_neighbors': np.arange(1, 25)}

    for i, (train, _) in enumerate(kf5.split(dataX), start=1):
        print("Split " + str(i))

        x_train = dataX[train]
        y_train = dataY[train]

        # Baseline: cross-validated accuracy for the caller-supplied k.
        knnCV = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knnCV, x_train, y_train, cv=5)
        # Report the k actually used instead of a hard-coded "5".
        print("Case : initial K = {}".format(k))
        print(scores)
        print("scores mean : {}".format(np.mean(scores)))

        # Exhaustive search over every candidate k.
        knn = KNeighborsClassifier()
        knnGS = GridSearchCV(knn, param_grid=parameter, cv=5)
        knnGS.fit(x_train, y_train)

        print("Best Hyperparameter computing by GridSearch")
        print(knnGS.best_params_)
        print(knnGS.best_score_)
        print('\n')

In [271]:
def randomGridSearch(dataX, dataY, k, random_state=None):
    """Compare a fixed-k KNN baseline with a randomized search over k on each outer fold.

    For every split produced by the module-level ``kf5`` splitter, the
    training portion is used to:

    1. cross-validate (5-fold) a ``KNeighborsClassifier`` with ``n_neighbors=k``;
    2. run ``RandomizedSearchCV`` (5-fold, accuracy scoring) sampling
       ``n_neighbors`` from 1..24.

    All results are printed; the function returns ``None``.

    Parameters
    ----------
    dataX : array-like
        Feature matrix, one row per sample.
    dataY : array-like
        Target labels aligned row-by-row with ``dataX``.
    k : int
        Number of neighbours used by the baseline classifier.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``RandomizedSearchCV``. Pass an integer to make the
        sampled candidates (and therefore the reported best k) reproducible;
        the default ``None`` keeps the original unseeded behaviour.

    Notes
    -----
    The held-out portion of each outer split is not scored; only the inner
    cross-validation results on the training portion are reported.
    """
    # Index positionally: on a pandas Series/DataFrame the integer arrays
    # yielded by KFold would be interpreted as labels (or column names).
    dataX = np.asarray(dataX)
    dataY = np.asarray(dataY)

    # The search space does not change between folds, so build it once.
    parameter = {'n_neighbors': np.arange(1, 25)}

    for i, (train, _) in enumerate(kf5.split(dataX), start=1):
        print("Split " + str(i))

        x_train = dataX[train]
        y_train = dataY[train]

        # Baseline: cross-validated accuracy for the caller-supplied k.
        knnCV = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knnCV, x_train, y_train, cv=5)
        # Report the k actually used instead of a hard-coded "5".
        print("Case : initial K = {}".format(k))
        print(scores)
        print("scores mean : {}".format(np.mean(scores)))

        # Randomized search: only a sample of the candidate k values is tried.
        knn = KNeighborsClassifier()
        knnRS = RandomizedSearchCV(
            knn, parameter, cv=5, scoring="accuracy", random_state=random_state
        )
        knnRS.fit(x_train, y_train)

        print("Best Hyperparameter computing by Randomize GridSearch")
        print(knnRS.best_params_)
        print(knnRS.best_score_)
        print('\n')

In [272]:
# Baseline k=3 versus the exhaustive grid search, reported for every outer split.
print("Case : K = 3, GridSearch\n")
gridSearch(x, y, 3)

Case : K = 3, GridSearch

Split 1
Case : initial K = 5
[0.8625   0.896875 0.896875 0.919375 0.9175  ]
scores mean : 0.898625
Best Hyperparameter computing by GridSearch
{'n_neighbors': 3}
0.898625


Split 2
Case : initial K = 5
[0.870625 0.8875   0.901875 0.924375 0.9275  ]
scores mean : 0.9023749999999999
Best Hyperparameter computing by GridSearch
{'n_neighbors': 3}
0.9023749999999999


Split 3
Case : initial K = 5
[0.869375 0.89125  0.89875  0.9275   0.909375]
scores mean : 0.89925
Best Hyperparameter computing by GridSearch
{'n_neighbors': 4}
0.899625


Split 4
Case : initial K = 5
[0.869375 0.8875   0.898125 0.91875  0.92625 ]
scores mean : 0.9
Best Hyperparameter computing by GridSearch
{'n_neighbors': 4}
0.90025


Split 5
Case : initial K = 5
[0.8725   0.890625 0.901875 0.930625 0.920625]
scores mean : 0.9032500000000001
Best Hyperparameter computing by GridSearch
{'n_neighbors': 3}
0.9032500000000001




In [273]:
# Baseline k=5 versus the exhaustive grid search, reported for every outer split.
print("Case : K = 5, GridSearch\n")
gridSearch(x, y, 5)

Case : K = 5, GridSearch

Split 1
Case : initial K = 5
[0.86     0.884375 0.899375 0.919375 0.911875]
scores mean : 0.8949999999999999
Best Hyperparameter computing by GridSearch
{'n_neighbors': 3}
0.898625


Split 2
Case : initial K = 5
[0.87     0.886875 0.89625  0.92875  0.920625]
scores mean : 0.9005000000000001
Best Hyperparameter computing by GridSearch
{'n_neighbors': 3}
0.9023749999999999


Split 3
Case : initial K = 5
[0.86625  0.889375 0.89625  0.93     0.914375]
scores mean : 0.89925
Best Hyperparameter computing by GridSearch
{'n_neighbors': 4}
0.899625


Split 4
Case : initial K = 5
[0.871875 0.879375 0.896875 0.93     0.921875]
scores mean : 0.9
Best Hyperparameter computing by GridSearch
{'n_neighbors': 4}
0.90025


Split 5
Case : initial K = 5
[0.86875 0.88875 0.9     0.9325  0.915  ]
scores mean : 0.9010000000000001
Best Hyperparameter computing by GridSearch
{'n_neighbors': 3}
0.9032500000000001




In [274]:
# Baseline k=3 versus the randomized search, reported for every outer split.
print("Case : K = 3, Randomized GridSearch\n")
randomGridSearch(x, y, 3)

Case : K = 3, Randomized GridSearch

Split 1
Case : initial K = 5
[0.8625   0.896875 0.896875 0.919375 0.9175  ]
scores mean : 0.898625
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 6}
0.895875


Split 2
Case : initial K = 5
[0.870625 0.8875   0.901875 0.924375 0.9275  ]
scores mean : 0.9023749999999999
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 5}
0.9005000000000001


Split 3
Case : initial K = 5
[0.869375 0.89125  0.89875  0.9275   0.909375]
scores mean : 0.89925
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 5}
0.89925


Split 4
Case : initial K = 5
[0.869375 0.8875   0.898125 0.91875  0.92625 ]
scores mean : 0.9
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 3}
0.9


Split 5
Case : initial K = 5
[0.8725   0.890625 0.901875 0.930625 0.920625]
scores mean : 0.9032500000000001
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 3}
0.9032500000000001




In [275]:
# Baseline k=5 versus the randomized search, reported for every outer split.
print("Case : K = 5, Randomized GridSearch\n")
randomGridSearch(x, y, 5)

Case : K = 5, Randomized GridSearch

Split 1
Case : initial K = 5
[0.86     0.884375 0.899375 0.919375 0.911875]
scores mean : 0.8949999999999999
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 4}
0.8952500000000001


Split 2
Case : initial K = 5
[0.87     0.886875 0.89625  0.92875  0.920625]
scores mean : 0.9005000000000001
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 3}
0.9023749999999999


Split 3
Case : initial K = 5
[0.86625  0.889375 0.89625  0.93     0.914375]
scores mean : 0.89925
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 3}
0.89925


Split 4
Case : initial K = 5
[0.871875 0.879375 0.896875 0.93     0.921875]
scores mean : 0.9
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 6}
0.9001250000000001


Split 5
Case : initial K = 5
[0.86875 0.88875 0.9     0.9325  0.915  ]
scores mean : 0.9010000000000001
Best Hyperparameter computing by Randomize GridSearch
{'n_neighbors': 3}
0.903250000