In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Load the training table from disk; it holds a 'label' column plus the
# feature columns used as model inputs.
df = pd.read_csv('data/mnist_train.csv')

# Count missing values per column and print the summary.
print("\nCheck dirty data")
missing_per_column = df.isnull().sum()
print(missing_per_column)
print("\n\n")
# result : no dirty data

# Split the frame into features (everything except 'label') and target.
X = df.drop(columns=['label'])
y = df['label']

# Standardize every feature column. The statistics are learned from the
# whole table and X is replaced by the transformed values in one step.
stdscaler = preprocessing.StandardScaler()
X = stdscaler.fit_transform(X)

# Outer cross-validation splitter: 5 shuffled folds with a fixed seed.
kf5 = KFold(n_splits=5, shuffle=True, random_state=1)

# case : K = 3, GridSearch
# For each outer split: report the inner 5-fold CV scores of a fixed K = 3
# classifier on the training part, then grid-search n_neighbors over 1..24
# on that same training part and report the best setting.
print("Case : K = 3, GridSearch\n")
for i, (train, test) in enumerate(kf5.split(X), start=1):

    print("Split {}\n".format(i))

    # KFold yields positional row indices. X is indexed directly with them;
    # y is a pandas Series, so select by position with .iloc rather than
    # relying on its index labels happening to equal the positions.
    X_train, X_test = X[train], X[test]
    y_train, y_test = y.iloc[train], y.iloc[test]
    # NOTE(review): X_test / y_test are never scored in this loop.

    # Baseline: fixed K = 3, scored with 5-fold CV on the training part.
    knn_cv = KNeighborsClassifier(n_neighbors=3)
    scores = cross_val_score(knn_cv, X_train, y_train, cv=5)
    print("Case : initial K = 3")
    print(scores)
    print("scores mean : {}".format(np.mean(scores)))

    # Exhaustive search over n_neighbors = 1, 2, ..., 24.
    knn = KNeighborsClassifier()
    param = {'n_neighbors': np.arange(1, 25)}
    knn_gs = GridSearchCV(knn, param_grid=param, cv=5)
    knn_gs.fit(X_train, y_train)

    print("\nCase : Hyperparameter tuning by GridSearch")
    print(knn_gs.best_params_)
    print(knn_gs.best_score_)
    print("\n")

print("\n\n")

# case : K = 3, RandomizedGridSearch
# For each outer split of kf5: report the inner 5-fold CV scores of a fixed
# K = 3 classifier on the training part, then run a randomized search over
# n_neighbors in 1..24 on that same training part.
print("Case : K = 3, RandomizedGridSearch\n")
for i, (train, test) in enumerate(kf5.split(X), start=1):

    print("Split {}\n".format(i))

    # KFold yields positional row indices. X is indexed directly with them;
    # y is a pandas Series, so select by position with .iloc rather than
    # relying on its index labels happening to equal the positions.
    X_train, X_test = X[train], X[test]
    y_train, y_test = y.iloc[train], y.iloc[test]
    # NOTE(review): X_test / y_test are never scored in this loop.

    # Baseline: fixed K = 3, scored with 5-fold CV on the training part.
    knn_cv = KNeighborsClassifier(n_neighbors=3)
    scores = cross_val_score(knn_cv, X_train, y_train, cv=5)
    print("Case : initial K = 3")
    print(scores)
    print("scores mean : {}".format(np.mean(scores)))

    # Randomized search: only a sample of the 24 candidate values is tried
    # (the number of candidates is left at the library default). The seed is
    # fixed, like the outer KFold's, so the sampled candidates and therefore
    # the reported best setting are reproducible between runs.
    knn = KNeighborsClassifier()
    param = {'n_neighbors': np.arange(1, 25)}
    knn_gs = RandomizedSearchCV(
        knn, param, cv=5, scoring='accuracy', random_state=1
    )
    knn_gs.fit(X_train, y_train)

    print("\nCase : Hyperparameter tuning by Randomized GridSearch")
    print(knn_gs.best_params_)
    print(knn_gs.best_score_)
    print("\n")

print("\n\n")

# case : K = 5, GridSearch
# For each outer split of kf5: report the inner 5-fold CV scores of a fixed
# K = 5 classifier on the training part, then grid-search n_neighbors over
# 1..24 on that same training part and report the best setting.
print("Case : K = 5, GridSearch\n")
for i, (train, test) in enumerate(kf5.split(X), start=1):

    print("Split {}\n".format(i))

    # KFold yields positional row indices. X is indexed directly with them;
    # y is a pandas Series, so select by position with .iloc rather than
    # relying on its index labels happening to equal the positions.
    X_train, X_test = X[train], X[test]
    y_train, y_test = y.iloc[train], y.iloc[test]
    # NOTE(review): X_test / y_test are never scored in this loop.

    # Baseline: fixed K = 5, scored with 5-fold CV on the training part.
    knn_cv = KNeighborsClassifier(n_neighbors=5)
    scores = cross_val_score(knn_cv, X_train, y_train, cv=5)
    print("Case : initial K = 5")
    print(scores)
    print("scores mean : {}".format(np.mean(scores)))

    # Exhaustive search over n_neighbors = 1, 2, ..., 24. The search starts
    # from a default-constructed classifier, so it does not depend on the
    # baseline K used above.
    knn = KNeighborsClassifier()
    param = {'n_neighbors': np.arange(1, 25)}
    knn_gs = GridSearchCV(knn, param_grid=param, cv=5)
    knn_gs.fit(X_train, y_train)

    print("\nCase : Hyperparameter tuning by GridSearch")
    print(knn_gs.best_params_)
    print(knn_gs.best_score_)
    print("\n")

print("\n\n")

# case : K = 5, RandomizedGridSearch
# For each outer split of kf5: report the inner 5-fold CV scores of a fixed
# K = 5 classifier on the training part, then run a randomized search over
# n_neighbors in 1..24 on that same training part.
print("Case : K = 5, RandomizedGridSearch\n")
for i, (train, test) in enumerate(kf5.split(X), start=1):

    print("Split {}\n".format(i))

    # KFold yields positional row indices. X is indexed directly with them;
    # y is a pandas Series, so select by position with .iloc rather than
    # relying on its index labels happening to equal the positions.
    X_train, X_test = X[train], X[test]
    y_train, y_test = y.iloc[train], y.iloc[test]
    # NOTE(review): X_test / y_test are never scored in this loop.

    # Baseline: fixed K = 5, scored with 5-fold CV on the training part.
    knn_cv = KNeighborsClassifier(n_neighbors=5)
    scores = cross_val_score(knn_cv, X_train, y_train, cv=5)
    print("Case : initial K = 5")
    print(scores)
    print("scores mean : {}".format(np.mean(scores)))

    # Randomized search: only a sample of the 24 candidate values is tried
    # (the number of candidates is left at the library default). The seed is
    # fixed, like the outer KFold's, so the sampled candidates and therefore
    # the reported best setting are reproducible between runs.
    knn = KNeighborsClassifier()
    param = {'n_neighbors': np.arange(1, 25)}
    knn_gs = RandomizedSearchCV(
        knn, param, cv=5, scoring='accuracy', random_state=1
    )
    knn_gs.fit(X_train, y_train)

    print("\nCase : Hyperparameter tuning by Randomized GridSearch")
    print(knn_gs.best_params_)
    print(knn_gs.best_score_)
    print("\n")


Check dirty data
label    0
1x1      0
1x2      0
1x3      0
1x4      0
        ..
28x24    0
28x25    0
28x26    0
28x27    0
28x28    0
Length: 785, dtype: int64



Case : K = 3, GridSearch

Split 1

Case : initial K = 3
[0.940625   0.9403125  0.94       0.93302083 0.94364583]
scores mean : 0.9395208333333332

Case : Hyperparameter tuning by GridSearch
{'n_neighbors': 3}
0.9395208333333332


Split 2

Case : initial K = 3
[0.93583333 0.94114583 0.93989583 0.93697917 0.9425    ]
scores mean : 0.9392708333333333

Case : Hyperparameter tuning by GridSearch
{'n_neighbors': 3}
0.9392708333333333


Split 3

Case : initial K = 3
[0.9371875  0.93604167 0.9428125  0.934375   0.9425    ]
scores mean : 0.9385833333333332

Case : Hyperparameter tuning by GridSearch
{'n_neighbors': 3}
0.9385833333333332


Split 4

Case : initial K = 3
[0.93833333 0.94041667 0.94041667 0.93510417 0.94666667]
scores mean : 0.9401875000000001

Case : Hyperparameter tuning by GridSearch
{'n_neighbors': 3}
0.940187500