In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

sns.set_style("whitegrid")

In [5]:
from tensorflow.keras.datasets import fashion_mnist

(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

print('Training data:', X_train.shape, y_train.shape)
print('Test data:', X_test.shape, y_test.shape)

Training data: (60000, 28, 28) (60000,)
Test data: (10000, 28, 28) (10000,)


In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score
from skimage.feature import hog

In [7]:
class HogTransformer(BaseEstimator, TransformerMixin):
    """
    Expects an array of 2d arrays (1 channel images)
    Calculates hog features for each img
    """
 
    def __init__(self, y=None, orientations=9,
                 pixels_per_cell=(8, 8),
                 cells_per_block=(3, 3), block_norm='L2-Hys'):
        self.y = y
        self.orientations = orientations
        self.pixels_per_cell = pixels_per_cell
        self.cells_per_block = cells_per_block
        self.block_norm = block_norm
 
    def fit(self, X, y=None):
        return self
 
    def transform(self, X, y=None):
 
        def local_hog(X):
            return hog(X,
                       orientations=self.orientations,
                       pixels_per_cell=self.pixels_per_cell,
                       cells_per_block=self.cells_per_block,
                       block_norm=self.block_norm)
 
        try: # parallel
            return np.array([local_hog(img) for img in X])
        except:
            return np.array([local_hog(img) for img in X])

In [8]:
class FlattenTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.reshape((-1, 784))/255

In [12]:
lrg_pipeline = Pipeline(steps=[
    ("flatten_transformer", FlattenTransformer()),
    ("standard_scaler", StandardScaler()),
    ("logistic_reg", LogisticRegression(penalty='l2', random_state=100, solver='lbfgs', max_iter=200))
])

In [13]:
lrg_pipeline.fit(X_train, y_train)

Pipeline(steps=[('flatten_transformer', FlattenTransformer()),
                ('standard_scaler', StandardScaler()),
                ('logistic_reg',
                 LogisticRegression(max_iter=200, random_state=100))])

In [17]:
lrg_pred = lrg_pipeline.predict(X_test)
print('Accuracy:', accuracy_score(y_test, lrg_pred))

Accuracy: 0.8382


In [19]:
lrg_pred_train = lrg_pipeline.predict(X_train)
print('Accuracy:', accuracy_score(y_train, lrg_pred_train))

Accuracy: 0.8817333333333334


In [10]:
hogger = HogTransformer(
    pixels_per_cell=(4,4),
    cells_per_block=(2,2),
    orientations=9,
    block_norm='L2-Hys'
)

lrg_hog_pipeline = Pipeline(steps=[
    ("hog_transform", hogger),
    #("flatten_transformer", FlattenTransformer()),
    ("standard_scaler", StandardScaler()),
    ("logistic_reg", LogisticRegression(penalty='l2', random_state=100, solver='lbfgs', max_iter=200))
])

In [23]:
lrg_hog_pipeline.fit(X_train, y_train)

Pipeline(steps=[('hog_transform',
                 HogTransformer(cells_per_block=(2, 2),
                                pixels_per_cell=(4, 4))),
                ('standard_scaler', StandardScaler()),
                ('logistic_reg',
                 LogisticRegression(max_iter=200, random_state=100))])

In [24]:
lrg_hog_pred = lrg_hog_pipeline.predict(X_test)
print('Accuracy:', accuracy_score(y_test, lrg_hog_pred))

Accuracy: 0.8772


In [25]:
lrg_hog_pred_train = lrg_hog_pipeline.predict(X_train)
print('Accuracy:', accuracy_score(y_train, lrg_hog_pred_train))

Accuracy: 0.93165


# SVM

In [18]:
from sklearn.svm import SVC

In [19]:
svc = SVC(C=10, kernel='rbf')

svc_hog_pipeline = Pipeline(steps=[
    ("hog_transform", hogger),
    ("standard_scaler", StandardScaler()),
    ("svc", svc)
])

**Warning**: Do not run this, averages 30 mins

In [20]:
svc_hog_pipeline.fit(X_train, y_train)

Pipeline(steps=[('hog_transform',
                 HogTransformer(cells_per_block=(2, 2),
                                pixels_per_cell=(4, 4))),
                ('standard_scaler', StandardScaler()), ('svc', SVC(C=10))])

In [21]:
svc_hog_pred = svc_hog_pipeline.predict(X_test)
print('Accuracy:', accuracy_score(y_test, svc_hog_pred))

Accuracy: 0.9148


# Linear SVC

In [2]:
from sklearn.svm import LinearSVC

In [12]:
lsvc = LinearSVC(C=1, loss='hinge', multi_class='ovr', penalty='l2')


lsvc_hog_pipeline = Pipeline(steps=[
    ("hog_transform", hogger),
    ("standard_scaler", StandardScaler()),
    ("svc", lsvc)
])

In [13]:
lsvc_hog_pipeline.fit(X_train, y_train)

Pipeline(steps=[('hog_transform',
                 HogTransformer(cells_per_block=(2, 2),
                                pixels_per_cell=(4, 4))),
                ('standard_scaler', StandardScaler()),
                ('svc', LinearSVC(C=1, loss='hinge'))])

In [14]:
lsvc_hog_pred = lsvc_hog_pipeline.predict(X_test)
print('Accuracy:', accuracy_score(y_test, lsvc_hog_pred))

Accuracy: 0.8735


## Different HOG func

In [15]:
hogger2 = HogTransformer(
    pixels_per_cell=(4,4),
    cells_per_block=(3,3),
    orientations=9,
    block_norm='L2-Hys'
)

lsvc_hog2_pipeline = Pipeline(steps=[
    ("hog_transform", hogger2),
    ("standard_scaler", StandardScaler()),
    ("lsvc", lsvc)
])

In [16]:
lsvc_hog2_pipeline.fit(X_train, y_train)

Pipeline(steps=[('hog_transform', HogTransformer(pixels_per_cell=(4, 4))),
                ('standard_scaler', StandardScaler()),
                ('lsvc', LinearSVC(C=1, loss='hinge'))])

In [17]:
lsvc_hog2_pred = lsvc_hog2_pipeline.predict(X_test)
print('Accuracy:', accuracy_score(y_test, lsvc_hog2_pred))

Accuracy: 0.8666


# Pickle Errthing

In [23]:
import pickle

pickle.dump(svc_hog_pipeline, open('svc_hog_mnist.pkl', 'wb'))

# XGBoost

In [39]:
xgbc = xgb.XGBClassifier(objective="multi:softmax",
                                learning_rate=0.08,
                                max_depth=6,
                                subsample=0.8,
                                colsample_bytree=0.8,
                                reg_alpha=8,
                                n_estimators=600,
                                reg_lambda=2)

prep_xgb_pipeline = Pipeline(steps=[
    ("hog_transform", hogger),
    ("standard_scaler", StandardScaler())
])

xgb_pipeline = Pipeline(steps=[
    ("hog_transform", hogger),
    ("standard_scaler", StandardScaler()),
    ("xgb", xgbc)
])

In [40]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_train, y_train, test_size=0.1)

In [41]:
X_test_s_prepped = prep_xgb_pipeline.fit_transform(X_test_s)
X_train_s_prepped = prep_xgb_pipeline.fit_transform(X_train_s)

In [43]:
eval_set = [(X_train_s_prepped, y_train_s), (X_test_s_prepped, y_test_s)]
xgb_pipeline.fit(X_train_s, y_train_s, xgb__eval_metric="merror", xgb__eval_set=eval_set, xgb__early_stopping_rounds=50, xgb__verbose=True)

[0]	validation_0-merror:0.18117	validation_1-merror:0.25217
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 50 rounds.
[1]	validation_0-merror:0.15720	validation_1-merror:0.21450
[2]	validation_0-merror:0.14980	validation_1-merror:0.20817
[3]	validation_0-merror:0.14526	validation_1-merror:0.20200
[4]	validation_0-merror:0.13985	validation_1-merror:0.19717
[5]	validation_0-merror:0.13622	validation_1-merror:0.19633
[6]	validation_0-merror:0.13396	validation_1-merror:0.19500
[7]	validation_0-merror:0.13198	validation_1-merror:0.19233
[8]	validation_0-merror:0.13043	validation_1-merror:0.19250
[9]	validation_0-merror:0.12859	validation_1-merror:0.19167
[10]	validation_0-merror:0.12691	validation_1-merror:0.19083
[11]	validation_0-merror:0.12526	validation_1-merror:0.19050
[12]	validation_0-merror:0.12344	validation_1-merror:0.18933
[13]	validation_0-merror:0.12230	validation_1-merror:0

[132]	validation_0-merror:0.04181	validation_1-merror:0.12383
[133]	validation_0-merror:0.04144	validation_1-merror:0.12400
[134]	validation_0-merror:0.04091	validation_1-merror:0.12450
[135]	validation_0-merror:0.04043	validation_1-merror:0.12450
[136]	validation_0-merror:0.03983	validation_1-merror:0.12367
[137]	validation_0-merror:0.03952	validation_1-merror:0.12350
[138]	validation_0-merror:0.03920	validation_1-merror:0.12450
[139]	validation_0-merror:0.03872	validation_1-merror:0.12400
[140]	validation_0-merror:0.03850	validation_1-merror:0.12367
[141]	validation_0-merror:0.03813	validation_1-merror:0.12317
[142]	validation_0-merror:0.03778	validation_1-merror:0.12317
[143]	validation_0-merror:0.03739	validation_1-merror:0.12367
[144]	validation_0-merror:0.03700	validation_1-merror:0.12317
[145]	validation_0-merror:0.03657	validation_1-merror:0.12300
[146]	validation_0-merror:0.03633	validation_1-merror:0.12283
[147]	validation_0-merror:0.03583	validation_1-merror:0.12250
[148]	va

[265]	validation_0-merror:0.01072	validation_1-merror:0.11400
[266]	validation_0-merror:0.01065	validation_1-merror:0.11367
[267]	validation_0-merror:0.01052	validation_1-merror:0.11367
[268]	validation_0-merror:0.01046	validation_1-merror:0.11350
[269]	validation_0-merror:0.01028	validation_1-merror:0.11250
[270]	validation_0-merror:0.01022	validation_1-merror:0.11300
[271]	validation_0-merror:0.01017	validation_1-merror:0.11283
[272]	validation_0-merror:0.00993	validation_1-merror:0.11300
[273]	validation_0-merror:0.00978	validation_1-merror:0.11317
[274]	validation_0-merror:0.00970	validation_1-merror:0.11367
[275]	validation_0-merror:0.00967	validation_1-merror:0.11350
[276]	validation_0-merror:0.00950	validation_1-merror:0.11333
[277]	validation_0-merror:0.00944	validation_1-merror:0.11350
[278]	validation_0-merror:0.00933	validation_1-merror:0.11333
[279]	validation_0-merror:0.00924	validation_1-merror:0.11317
[280]	validation_0-merror:0.00919	validation_1-merror:0.11300
[281]	va

Pipeline(steps=[('hog_transform',
                 HogTransformer(cells_per_block=(2, 2),
                                pixels_per_cell=(4, 4))),
                ('standard_scaler', StandardScaler()),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.8, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='', learning_rate=0.08,
                               max_delta_step=0, max_depth=6,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=600,
                               n_jobs=0, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                               reg_alpha=8, reg_lambda=2, scale_pos_weight=None,
         

In [45]:
xgb_pred = xgb_pipeline.predict(X_test)
print('Accuracy:', accuracy_score(y_test, xgb_pred))

Accuracy: 0.899


In [46]:
pickle.dump(xgb_pipeline, open('xgb_pipeline.pkl', 'wb'))