# Fashion MNIST 

In [11]:
%matplotlib inline 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, accuracy_score

## 최종 score 

```python
# 3차 다항식의 non-linear SVM model을 사용하였습니다. 
poly_svm_clf2 = Pipeline([
                        ("scaler", StandardScaler()),
                        ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=10, random_state=2020)),
                        ])
```

In [71]:
# confusion matrix 
pd.crosstab(y_test, poly_svm_bagging_pred2, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,880,2,13,21,1,2,69,0,12,0,1000
1,2,980,4,10,0,0,4,0,0,0,1000
2,18,0,851,10,72,1,46,0,2,0,1000
3,32,12,14,903,25,0,13,0,1,0,1000
4,3,1,81,31,838,0,44,0,2,0,1000
5,2,0,0,1,0,928,1,43,4,21,1000
6,181,0,103,22,62,0,621,0,11,0,1000
7,0,0,0,0,0,27,0,934,0,39,1000
8,4,0,9,4,4,3,10,2,964,0,1000
9,0,0,0,0,0,14,0,33,0,953,1000


In [72]:
accuracy_score(poly_svm_bagging_pred2, y_test)

0.8852

## Data Load 

In [2]:
train = pd.read_csv("fashion-mnist_train.csv")
test  = pd.read_csv("fashion-mnist_test.csv")

In [4]:
train.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X_train = train.drop('label', axis=1)
X_test  = test.drop('label', axis=1)

In [6]:
y_train = train.label 
y_test  = test.label 

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((60000, 784), (10000, 784), (60000,), (10000,))

## Modeling 
* **SVM**은 성능이 좋지만, 시간/공간복잡도가 몹시 높다는 단점이 있습니다. 
* 이러한 단점을 극복하기 위해 앙상블(Ensemble) 기법을 이용하여 SVM의 성능을 유지 또는 향상시키고, 비용을 낮추었습니다. 
* **Bagging** (또는 bootstrap aggregating)은 앙상블 기법 중의 하나로, 훈련데이터(training data)에서 복원추출로 작은데이터 집합을 여러 만들고, 각 데이터집합으로 분류모델(classifier)를 생성하여, 시험데이터(test data)를 예측할 때 다수결원칙(voting)으로 클래스(class)를 결정합니다.
* SVM의 시간복잡도는 O(n^3)으로 굉장히 높기 때문에, Bagging을 통해 훈련 튜플(training tuple)의 수(n)를 줄이고, 병렬처리를 적용해 시간단축을 하였습니다. 
* (출처 : https://m.blog.naver.com/cjh226/221359032956) 

### 1. Linear SVM 

In [12]:
svm_clf = Pipeline([
                    ("scaler", StandardScaler()),
                    ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=2020)),
                    ])

In [10]:
svm_clf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('linear_svc',
                 LinearSVC(C=1, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='hinge', max_iter=1000, multi_class='ovr',
                           penalty='l2', random_state=2020, tol=0.0001,
                           verbose=0))],
         verbose=False)

In [12]:
svm_clf_pred = svm_clf.predict(X_test)

In [17]:
# confusion matrix 
pd.crosstab(y_test, svm_clf_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,811,6,16,69,4,2,76,2,14,0,1000
1,4,967,4,16,2,1,5,1,0,0,1000
2,20,6,730,11,143,0,79,0,11,0,1000
3,26,14,15,884,29,0,24,2,6,0,1000
4,1,3,60,32,812,1,88,1,1,1,1000
5,1,4,2,1,0,901,1,55,10,25,1000
6,166,4,95,58,91,2,561,3,19,1,1000
7,0,0,0,1,0,35,0,917,2,45,1000
8,6,2,8,16,4,15,26,5,916,2,1000
9,0,0,2,0,1,16,0,39,3,939,1000


In [20]:
accuracy_score(svm_clf_pred, y_test)

0.8438

#### linear SVM + Bagging 

In [14]:
n_estimators = 10
svm_clf_bagging = BaggingClassifier(base_estimator=svm_clf,
                                    n_estimators=n_estimators,
                                    max_samples=1./n_estimators,
                                    n_jobs=-1)

In [15]:
%%time
svm_clf_bagging.fit(X_train, y_train)

CPU times: user 113 ms, sys: 239 ms, total: 353 ms
Wall time: 50.9 s


BaggingClassifier(base_estimator=Pipeline(memory=None,
                                          steps=[('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True)),
                                                 ('linear_svc',
                                                  LinearSVC(C=1,
                                                            class_weight=None,
                                                            dual=True,
                                                            fit_intercept=True,
                                                            intercept_scaling=1,
                                                            loss='hinge',
                                                            max_iter=1000,
                                                         

In [16]:
svm_clf_bagging_pred = svm_clf_bagging.predict(X_test)

In [17]:
# confusion matrix 
pd.crosstab(y_test, svm_clf_bagging_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,829,9,24,44,5,7,68,1,13,0,1000
1,6,967,6,13,0,1,6,1,0,0,1000
2,24,1,766,13,114,3,62,3,14,0,1000
3,42,27,16,854,32,9,18,1,1,0,1000
4,7,6,104,36,770,2,73,2,0,0,1000
5,3,5,2,6,1,882,4,62,7,28,1000
6,178,13,115,47,104,10,509,6,18,0,1000
7,0,0,0,1,0,45,1,916,0,37,1000
8,18,2,19,15,10,13,30,11,880,2,1000
9,0,0,2,1,0,20,1,45,1,930,1000


In [18]:
accuracy_score(svm_clf_bagging_pred, y_test)

0.8303

#### linear SVM + Bagging + control penalty parameter C

In [41]:
svm_clf2 = Pipeline([
                    ("scaler", StandardScaler()),
                    ("linear_svc", LinearSVC(C=50, loss="hinge", random_state=2020)),
                    ])

In [42]:
svm_clf_bagging2 = BaggingClassifier(base_estimator=svm_clf2,
                                     n_estimators=n_estimators,
                                     max_samples=1./n_estimators,
                                     n_jobs=-1)

In [43]:
%%time
svm_clf_bagging2.fit(X_train, y_train)

CPU times: user 73.1 ms, sys: 114 ms, total: 187 ms
Wall time: 55.6 s


BaggingClassifier(base_estimator=Pipeline(memory=None,
                                          steps=[('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True)),
                                                 ('linear_svc',
                                                  LinearSVC(C=50,
                                                            class_weight=None,
                                                            dual=True,
                                                            fit_intercept=True,
                                                            intercept_scaling=1,
                                                            loss='hinge',
                                                            max_iter=1000,
                                                        

In [44]:
svm_clf_bagging_pred2 = svm_clf_bagging2.predict(X_test)

In [45]:
# confusion matrix 
pd.crosstab(y_test, svm_clf_bagging_pred2, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,826,4,25,52,7,3,69,3,11,0,1000
1,7,965,4,16,2,2,4,0,0,0,1000
2,21,5,777,11,111,4,62,0,9,0,1000
3,50,23,15,848,31,4,23,2,4,0,1000
4,4,6,128,35,730,2,91,3,1,0,1000
5,10,0,0,12,2,869,2,66,5,34,1000
6,191,14,133,45,97,5,498,4,13,0,1000
7,0,0,1,0,0,42,1,919,0,37,1000
8,15,0,29,22,13,27,24,12,850,8,1000
9,2,0,0,2,1,18,0,43,1,933,1000


In [46]:
accuracy_score(svm_clf_bagging_pred2, y_test)

0.8215

### 2. Non-Linear SVM 

#### Polynomial Kernel + Bagging 

In [47]:
poly_svm_clf = Pipeline([
                        ("scaler", StandardScaler()),
                        ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=1, random_state=2020)),
                        ])

In [48]:
poly_svm_bagging = BaggingClassifier(base_estimator=poly_svm_clf,
                                     n_estimators=n_estimators,
                                     max_samples=1./n_estimators,
                                     n_jobs=-1)

In [49]:
%%time
poly_svm_bagging.fit(X_train, y_train)

CPU times: user 301 ms, sys: 497 ms, total: 798 ms
Wall time: 22.8 s


BaggingClassifier(base_estimator=Pipeline(memory=None,
                                          steps=[('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True)),
                                                 ('svm_clf',
                                                  SVC(C=1, break_ties=False,
                                                      cache_size=200,
                                                      class_weight=None,
                                                      coef0=1,
                                                      decision_function_shape='ovr',
                                                      degree=3, gamma='scale',
                                                      kernel='poly',
                                                      max_iter=-1,
  

In [50]:
poly_svm_bagging_pred = poly_svm_bagging.predict(X_test)

In [51]:
# confusion matrix 
pd.crosstab(y_test, poly_svm_bagging_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,846,0,13,38,0,3,87,0,12,1,1000
1,5,973,4,15,0,0,3,0,0,0,1000
2,14,1,832,12,91,1,46,0,3,0,1000
3,17,9,12,921,22,0,18,0,1,0,1000
4,2,0,67,26,860,0,43,0,2,0,1000
5,2,0,0,1,0,920,1,51,2,23,1000
6,169,0,93,28,67,0,626,0,17,0,1000
7,0,0,0,0,0,25,0,933,0,42,1000
8,5,0,8,3,2,3,11,3,963,2,1000
9,0,0,0,0,0,16,0,43,0,941,1000


In [52]:
accuracy_score(poly_svm_bagging_pred, y_test)

0.8815

#### polynomial kernel + Bagging + control penalty parameter C 

In [65]:
poly_svm_clf2 = Pipeline([
                        ("scaler", StandardScaler()),
                        ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=10, random_state=2020)),
                        ])

In [66]:
poly_svm_bagging2 = BaggingClassifier(base_estimator=poly_svm_clf2,
                                     n_estimators=n_estimators,
                                     max_samples=1./n_estimators,
                                     n_jobs=-1)

In [67]:
%%time
poly_svm_bagging2.fit(X_train, y_train)

CPU times: user 296 ms, sys: 478 ms, total: 774 ms
Wall time: 23 s


BaggingClassifier(base_estimator=Pipeline(memory=None,
                                          steps=[('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True)),
                                                 ('svm_clf',
                                                  SVC(C=10, break_ties=False,
                                                      cache_size=200,
                                                      class_weight=None,
                                                      coef0=1,
                                                      decision_function_shape='ovr',
                                                      degree=3, gamma='scale',
                                                      kernel='poly',
                                                      max_iter=-1,
 

In [68]:
poly_svm_bagging_pred2 = poly_svm_bagging2.predict(X_test)

In [69]:
# confusion matrix 
pd.crosstab(y_test, poly_svm_bagging_pred2, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,880,2,13,21,1,2,69,0,12,0,1000
1,2,980,4,10,0,0,4,0,0,0,1000
2,18,0,851,10,72,1,46,0,2,0,1000
3,32,12,14,903,25,0,13,0,1,0,1000
4,3,1,81,31,838,0,44,0,2,0,1000
5,2,0,0,1,0,928,1,43,4,21,1000
6,181,0,103,22,62,0,621,0,11,0,1000
7,0,0,0,0,0,27,0,934,0,39,1000
8,4,0,9,4,4,3,10,2,964,0,1000
9,0,0,0,0,0,14,0,33,0,953,1000


In [70]:
accuracy_score(poly_svm_bagging_pred2, y_test)

0.8852

#### Gaussian RBF Kernel + Bagging 

In [73]:
# gamma를 다양하게 조정해 보았지만, 결과가 좋지 못했습니다. 
rbf_svm_clf = Pipeline([
                        ("scaler", StandardScaler()),
                        ("svm_clf", SVC(kernel="rbf", gamma=0.1, C=1)),
                        ])

In [74]:
gaus_svm_bagging = BaggingClassifier(base_estimator=rbf_svm_clf,
                                     n_estimators=n_estimators,
                                     max_samples=1./n_estimators,
                                     n_jobs=-1)

In [75]:
%%time 
gaus_svm_bagging.fit(X_train, y_train)

CPU times: user 575 ms, sys: 985 ms, total: 1.56 s
Wall time: 2min 39s


BaggingClassifier(base_estimator=Pipeline(memory=None,
                                          steps=[('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True)),
                                                 ('svm_clf',
                                                  SVC(C=1, break_ties=False,
                                                      cache_size=200,
                                                      class_weight=None,
                                                      coef0=0.0,
                                                      decision_function_shape='ovr',
                                                      degree=3, gamma=0.1,
                                                      kernel='rbf', max_iter=-1,
                                                      proba

In [76]:
gaus_svm_bagging_pred = gaus_svm_bagging.predict(X_test)

In [77]:
# confusion matrix 
pd.crosstab(y_test, gaus_svm_bagging_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,3,4,5,6,7,8,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1000,0,0,0,0,0,0,0,0,1000
1,825,175,0,0,0,0,0,0,0,1000
2,997,0,2,0,0,0,0,1,0,1000
3,996,0,0,4,0,0,0,0,0,1000
4,993,0,0,0,1,0,0,6,0,1000
5,997,0,0,0,0,3,0,0,0,1000
6,958,0,0,0,0,0,42,0,0,1000
7,990,0,0,0,0,0,0,10,0,1000
8,999,0,0,0,0,0,0,0,1,1000
9,1000,0,0,0,0,0,0,0,0,1000


In [78]:
accuracy_score(gaus_svm_bagging_pred, y_test)

0.1238