# Fashion MNIST

In [1]:
%matplotlib inline 

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
from graphviz import Source 
from sklearn.tree import export_graphviz

In [4]:
from sklearn.ensemble import VotingClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## 최종 score 

```python
# The hard-voting ensemble of the three tree models gives the best accuracy in this notebook.
hard_voting = VotingClassifier(estimators=[('dt', tree_clf), ('rf', rf_clf), ('et', et_clf)], voting="hard")
```

In [39]:
# Confusion matrix of the hard-voting ensemble: rows are true labels,
# columns are predicted labels, and `margins=True` appends the "All" totals.
# NOTE(review): this summary cell sits above the cells that define `y_test`
# and `hard_voting_pred`, so it only works after the rest of the notebook has
# been executed; on Restart & Run All it raises NameError.
pd.crosstab(
    index=y_test,
    columns=hard_voting_pred,
    rownames=["Actual"],
    colnames=["Predicted"],
    margins=True,
)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,870,1,12,31,1,1,74,0,10,0,1000
1,3,971,7,16,1,1,1,0,0,0,1000
2,10,2,810,15,104,0,52,0,7,0,1000
3,21,5,9,937,17,0,11,0,0,0,1000
4,2,2,63,29,859,0,42,0,3,0,1000
5,0,0,0,0,1,945,0,38,5,11,1000
6,178,1,112,29,61,0,604,0,15,0,1000
7,1,0,0,0,0,14,0,935,0,50,1000
8,3,1,8,0,4,1,6,2,975,0,1000
9,1,0,0,0,0,7,0,40,2,950,1000


In [40]:
# Test-set accuracy of the hard-voting ensemble.
# accuracy_score takes (y_true, y_pred), so the ground truth goes first.
# NOTE(review): this summary cell sits above the cells that define `y_test`
# and `hard_voting_pred`; it fails with NameError on Restart & Run All.
accuracy_score(y_test, hard_voting_pred)

0.8856

## Data Load 

In [10]:
# Read the train and test CSV files from the current working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("fashion-mnist_train.csv", "fashion-mnist_test.csv")
)

In [11]:
# Preview the first five rows: a `label` column followed by pixel1..pixel784.
train.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Targets: the `label` column of each split.
y_train = train["label"]
y_test = test["label"]

# Features: every remaining column (the pixel values).
X_train = train.drop(columns="label")
X_test = test.drop(columns="label")

In [13]:
# Sanity-check the dimensions of the feature matrices and label vectors.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((60000, 784), (10000, 784), (60000,), (10000,))

## 1. Tree Models

### 1. DecisionTree

In [14]:
# Decision-tree baseline: standardize the features, then fit a single tree.
# `max_features="auto"` was an alias for "sqrt" on DecisionTreeClassifier; it
# was deprecated in scikit-learn 1.1 and removed in 1.3. Writing "sqrt" keeps
# the same behaviour (sqrt(n_features) candidates per split) on old and new
# versions.
tree_clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("tree", DecisionTreeClassifier(max_features="sqrt", random_state=42)),
    ]
)

In [15]:
# Fit the scaler + decision-tree pipeline on the training split.
tree_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=42,
                                        splitter='best'))],
         verbose=False)

In [16]:
# Predict test-set labels with the fitted decision-tree pipeline.
tree_clf_pred = tree_clf.predict(X_test)

In [17]:
# Confusion matrix of the decision tree: rows are true labels, columns are
# predicted labels, and `margins=True` appends the "All" totals.
pd.crosstab(
    index=y_test,
    columns=tree_clf_pred,
    rownames=["Actual"],
    colnames=["Predicted"],
    margins=True,
)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,727,8,27,41,15,2,166,1,13,0,1000
1,7,948,9,21,6,1,4,0,3,1,1000
2,23,5,689,16,134,0,120,0,12,1,1000
3,59,46,20,786,42,2,37,0,8,0,1000
4,15,4,131,58,669,3,113,0,6,1,1000
5,3,2,0,6,2,884,0,64,15,24,1000
6,180,7,122,47,104,2,523,0,15,0,1000
7,1,0,0,0,0,62,0,866,3,68,1000
8,13,1,8,4,9,16,31,9,903,6,1000
9,2,1,0,0,0,19,2,76,9,891,1000


In [18]:
# Test-set accuracy of the decision tree.
# accuracy_score takes (y_true, y_pred), so the ground truth goes first.
accuracy_score(y_test, tree_clf_pred)

0.7886

### 2. RandomForest 

In [19]:
# Random-forest pipeline: standardize the features, then fit a
# RandomForestClassifier with a fixed seed and otherwise default settings.
rf_clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("tree", RandomForestClassifier(random_state=42)),
    ]
)

In [20]:
%time 
rf_clf.fit(X_train, y_train)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.25 µs


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('tree',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [21]:
# Predict test-set labels with the fitted random-forest pipeline.
rf_clf_pred = rf_clf.predict(X_test)

In [22]:
# Confusion matrix of the random forest: rows are true labels, columns are
# predicted labels, and `margins=True` appends the "All" totals.
pd.crosstab(
    index=y_test,
    columns=rf_clf_pred,
    rownames=["Actual"],
    colnames=["Predicted"],
    margins=True,
)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,860,0,12,31,4,1,79,0,13,0,1000
1,2,971,5,17,1,1,3,0,0,0,1000
2,8,1,801,13,113,0,56,0,8,0,1000
3,17,6,8,937,18,0,14,0,0,0,1000
4,1,1,64,26,863,0,42,0,3,0,1000
5,0,0,0,0,0,947,0,37,5,11,1000
6,166,1,99,28,75,0,612,0,19,0,1000
7,0,0,0,0,0,13,0,937,0,50,1000
8,1,1,7,0,3,1,10,2,975,0,1000
9,0,0,0,0,0,7,1,41,2,949,1000


In [23]:
# Test-set accuracy of the random forest.
# accuracy_score takes (y_true, y_pred), so the ground truth goes first.
accuracy_score(y_test, rf_clf_pred)

0.8852

### 3. ExtraTree

In [24]:
# Extra-trees pipeline: standardize the features, then fit an
# ExtraTreesClassifier with a fixed seed and otherwise default settings.
et_clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("tree", ExtraTreesClassifier(random_state=42)),
    ]
)

In [25]:
%time 
et_clf.fit(X_train, y_train)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('tree',
                 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                      class_weight=None, criterion='gini',
                                      max_depth=None, max_features='auto',
                                      max_leaf_nodes=None, max_samples=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=100, n_jobs=None,
                                      oob_score=False, random_state=42,
                                      verbose=0, warm_start=False))],
         verbose=False)

In [26]:
# Predict test-set labels with the fitted extra-trees pipeline.
et_clf_pred = et_clf.predict(X_test)

In [27]:
# Confusion matrix of the extra-trees model: rows are true labels, columns are
# predicted labels, and `margins=True` appends the "All" totals.
pd.crosstab(
    index=y_test,
    columns=et_clf_pred,
    rownames=["Actual"],
    colnames=["Predicted"],
    margins=True,
)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,858,1,13,32,0,2,84,0,10,0,1000
1,2,969,8,16,1,1,3,0,0,0,1000
2,9,1,816,16,103,0,47,0,8,0,1000
3,16,4,8,931,25,0,15,0,1,0,1000
4,1,0,61,31,853,0,51,0,3,0,1000
5,0,0,0,0,0,940,0,42,5,13,1000
6,175,1,109,30,65,0,604,0,16,0,1000
7,0,0,0,0,0,13,0,936,0,51,1000
8,1,1,9,1,5,2,4,2,975,0,1000
9,0,0,0,0,0,9,0,39,1,951,1000


In [28]:
# Test-set accuracy of the extra-trees model.
# accuracy_score takes (y_true, y_pred), so the ground truth goes first.
accuracy_score(y_test, et_clf_pred)

0.8833

| Model | Accuracy | 
| :-: | :-: | 
| DecisionTree | 0.7886 | 
| RandomForest | 0.8852 | 
| ExtraTree | 0.8833 | 

* 성능 : RandomForest > ExtraTree > DecisionTree 

## 2. Ensemble

### 1. Hard Voting 

In [29]:
# Majority-vote ensemble over the three tree pipelines defined above.
hard_voting = VotingClassifier(
    estimators=[
        ("dt", tree_clf),
        ("rf", rf_clf),
        ("et", et_clf),
    ],
    voting="hard",
)

In [30]:
%time
hard_voting.fit(X_train, y_train)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


VotingClassifier(estimators=[('dt',
                              Pipeline(memory=None,
                                       steps=[('scaler',
                                               StandardScaler(copy=True,
                                                              with_mean=True,
                                                              with_std=True)),
                                              ('',
                                               DecisionTreeClassifier(ccp_alpha=0.0,
                                                                      class_weight=None,
                                                                      criterion='gini',
                                                                      max_depth=None,
                                                                      max_features='auto',
                                                                      max_leaf_nodes=None,
                                            

In [31]:
# Predict test-set labels with the fitted hard-voting ensemble.
hard_voting_pred = hard_voting.predict(X_test)

In [32]:
# Confusion matrix of the hard-voting ensemble: rows are true labels, columns
# are predicted labels, and `margins=True` appends the "All" totals.
pd.crosstab(
    index=y_test,
    columns=hard_voting_pred,
    rownames=["Actual"],
    colnames=["Predicted"],
    margins=True,
)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,870,1,12,31,1,1,74,0,10,0,1000
1,3,971,7,16,1,1,1,0,0,0,1000
2,10,2,810,15,104,0,52,0,7,0,1000
3,21,5,9,937,17,0,11,0,0,0,1000
4,2,2,63,29,859,0,42,0,3,0,1000
5,0,0,0,0,1,945,0,38,5,11,1000
6,178,1,112,29,61,0,604,0,15,0,1000
7,1,0,0,0,0,14,0,935,0,50,1000
8,3,1,8,0,4,1,6,2,975,0,1000
9,1,0,0,0,0,7,0,40,2,950,1000


In [33]:
# Test-set accuracy of the hard-voting ensemble.
# accuracy_score takes (y_true, y_pred), so the ground truth goes first.
accuracy_score(y_test, hard_voting_pred)

0.8856

* 가장 좋은 성능을 보인 RandomForest (0.8852) 보다 성능이 조금 향상되었습니다. 

### 2. Soft Voting 

In [34]:
# Soft-voting ensemble over the same three tree pipelines.
soft_voting = VotingClassifier(
    estimators=[
        ("dt", tree_clf),
        ("rf", rf_clf),
        ("et", et_clf),
    ],
    voting="soft",
)

In [35]:
%time
soft_voting.fit(X_train, y_train)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


VotingClassifier(estimators=[('dt',
                              Pipeline(memory=None,
                                       steps=[('scaler',
                                               StandardScaler(copy=True,
                                                              with_mean=True,
                                                              with_std=True)),
                                              ('',
                                               DecisionTreeClassifier(ccp_alpha=0.0,
                                                                      class_weight=None,
                                                                      criterion='gini',
                                                                      max_depth=None,
                                                                      max_features='auto',
                                                                      max_leaf_nodes=None,
                                            

In [36]:
# Predict test-set labels with the fitted soft-voting ensemble.
soft_voting_pred = soft_voting.predict(X_test)

In [37]:
# Confusion matrix of the soft-voting ensemble: rows are true labels, columns
# are predicted labels, and `margins=True` appends the "All" totals.
pd.crosstab(
    index=y_test,
    columns=soft_voting_pred,
    rownames=["Actual"],
    colnames=["Predicted"],
    margins=True,
)

Predicted,0,1,2,3,4,5,6,7,8,9,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,787,6,24,33,14,1,122,1,12,0,1000
1,4,961,8,17,2,1,4,0,2,1,1000
2,21,3,732,13,121,0,99,0,10,1,1000
3,35,19,12,880,25,2,23,0,4,0,1000
4,9,2,101,43,755,3,84,0,3,0,1000
5,2,1,0,1,2,920,0,52,8,14,1000
6,179,7,112,42,93,2,551,0,14,0,1000
7,1,0,0,0,0,34,0,909,1,55,1000
8,6,1,6,3,2,8,15,2,955,2,1000
9,2,0,0,0,0,10,0,48,3,937,1000


In [38]:
# Test-set accuracy of the soft-voting ensemble.
# accuracy_score takes (y_true, y_pred), so the ground truth goes first.
accuracy_score(y_test, soft_voting_pred)

0.8387

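* 간접 투표(soft voting) 방식의 정확도(0.8387)는 DecisionTree 단일 모델(0.7886)보다는 높지만, RandomForest(0.8852)나 ExtraTree(0.8833) 단일 모델보다는 낮습니다. 즉, 간접 투표보다는 성능이 좋은 단일 모델을 사용하는 것이 더 좋은 성능을 보이고 있습니다.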