Data: Fashion-MNIST CSV files (fashion-mnist_train.csv, fashion-mnist_test.csv) from https://www.kaggle.com/zalando-research/fashionmnist

In [11]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import cv2
from sklearn.preprocessing import StandardScaler

In [12]:
# Load the train / test splits from the CSV files in the working directory.
train_data = pd.read_csv('fashion-mnist_train.csv')
test_data = pd.read_csv('fashion-mnist_test.csv')

# Column 0 is used as the label; every remaining column is a feature.
# Both are stored as unsigned 8-bit integers.
y_train = train_data.iloc[:, 0].values.astype(np.uint8)
X_train = train_data.iloc[:, 1:].values.astype(np.uint8)

y_test = test_data.iloc[:, 0].values.astype(np.uint8)
X_test = test_data.iloc[:, 1:].values.astype(np.uint8)

In [13]:
# Build the training feature matrix: every row of X_train is viewed as a
# 28x28 image, median-blurred (kernel size 3), histogram-equalized and then
# flattened back into a 1-D feature vector.
n_samples = X_train.shape[0]
feat_list = [
    cv2.equalizeHist(cv2.medianBlur(pixels.reshape(28, 28), 3)).flatten()
    for pixels in X_train
]
feats_train = np.array(feat_list)

In [14]:
# Build the test feature matrix with the same per-image pipeline used for the
# training data: reshape to 28x28, median blur (kernel size 3), histogram
# equalization, flatten.
n_samples = X_test.shape[0]
feat_list = [
    cv2.equalizeHist(cv2.medianBlur(pixels.reshape(28, 28), 3)).flatten()
    for pixels in X_test
]
feats_test = np.array(feat_list)

In [15]:
# Standardize the features. The scaler's statistics are learned from the
# training features only and then applied unchanged to the test features.
std_scaler = StandardScaler().fit(feats_train.astype(np.float64))
proc_feats_train = std_scaler.transform(feats_train.astype(np.float64))
proc_feats_test = std_scaler.transform(feats_test.astype(np.float64))

In [16]:
def train_model(X_train, y_train, X_test, y_test, model_name, model, param_range):
    """Tune ``model`` with a 3-fold grid search and report its accuracy.

    The search is scored by accuracy and, because ``refit=True``, the best
    parameter combination is refit on the full training data before scoring.

    Args:
        X_train: Training features passed to the grid search.
        y_train: Training labels.
        X_test: Held-out features, used only for the final score.
        y_test: Held-out labels.
        model_name: Label of the model. Not used inside this function; kept
            so existing callers keep working.
        model: Estimator handed to ``GridSearchCV`` as ``estimator``.
        param_range: Parameter grid handed to ``GridSearchCV`` as
            ``param_grid``.

    Returns:
        A ``(clf, score, duration)`` tuple: the fitted ``GridSearchCV``
        object, its score on the test data, and the wall-clock time in
        seconds spent inside ``clf.fit`` (the whole search plus the refit).
    """
    clf = GridSearchCV(estimator=model,
                       param_grid=param_range,
                       cv=3,
                       scoring='accuracy',
                       refit=True)

    # Time the fit itself. Previously ``duration`` was returned without ever
    # being assigned, so every call ended in a NameError after the search.
    start = time.perf_counter()
    clf.fit(X_train, y_train)
    duration = time.perf_counter() - start

    print('train acc：{:.3f}'.format(clf.score(X_train, y_train)))

    score = clf.score(X_test, y_test)
    print('test acc：{:.3f}'.format(score))

    return clf, score, duration

In [None]:

# Candidate models: display name -> (unfitted estimator, parameter grid).
# Insertion order determines both the evaluation order and the row order of
# the results table.
model_name_param_dict = {
    'kNN': (KNeighborsClassifier(), {'n_neighbors': [5, 25, 55]}),
    'LR': (LogisticRegression(), {'C': [0.01, 1, 100]}),
    'SVM': (SVC(kernel='linear'), {'C': [0.01, 1, 100]}),
    'DT': (DecisionTreeClassifier(), {'max_depth': [50, 100, 150]}),
    'AdaBoost': (AdaBoostClassifier(), {'n_estimators': [100, 150, 200]}),
    # NOTE(review): learning_rate=100 is an unusually large value for
    # gradient boosting -- confirm it is intended.
    'GBDT': (GradientBoostingClassifier(), {'learning_rate': [0.01, 1, 100]}),
    'RF': (RandomForestClassifier(), {'n_estimators': [100, 150, 200]}),
}


# Empty results table: one row per candidate model, in declaration order.
results_df = pd.DataFrame(index=list(model_name_param_dict),
                          columns=['Accuracy (%)', 'Time (s)'])
results_df.index.name = 'Model'

# Tune and evaluate each model on the standardized features, then record the
# test score (as a percentage) and the timing value returned by train_model.
for model_name in model_name_param_dict:
    model, param_range = model_name_param_dict[model_name]
    best_clf, best_acc, mean_duration = train_model(
        proc_feats_train, y_train, proc_feats_test, y_test,
        model_name, model, param_range)
    results_df.loc[model_name, 'Accuracy (%)'] = best_acc * 100
    results_df.loc[model_name, 'Time (s)'] = mean_duration

# Bare expression so the notebook renders the finished table.
results_df