<a href="https://colab.research.google.com/github/kexinz8/fashionMNIST/blob/main/Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

from sklearn import metrics

# Import data

In [None]:
# Load the Fashion-MNIST train/test splits bundled with Keras.
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist
train_split, test_split = fashion_mnist.load_data()
train_data, train_labels = train_split
test_data, test_labels = test_split

In [None]:
# Flatten every image into a single float32 feature row (one row per sample).
n_train, n_test = len(train_data), len(test_data)
train_x = train_data.reshape(n_train, -1).astype('float32')
test_x = test_data.reshape(n_test, -1).astype('float32')

# Multiclassification
## LightGBM

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Tuning sweep (work in progress): refits one LightGBM model per `path_smooth`
# value in 0.0, 0.1, ..., 0.9 and records its misclassification rate.
# The tuning plot further down reads `err_list`, so this cell must have run first.
# NOTE(review): the sweep is scored on the test split, so the chosen value is
# tuned on test data - consider a validation split or cross-validation.
# NOTE(review): categorical_feature=[0, 3] marks flattened pixel columns 0 and 3
# as categorical - confirm this is intended.
err_list = []
for i in np.arange(0.0, 1.0, 0.1):
    model = LGBMClassifier(objective='multiclass', path_smooth=i)
    model.fit(train_x, train_labels, categorical_feature=[0, 3])
    predictions = model.predict(test_x)
    # Fraction of test samples whose predicted class differs from the true
    # label (the comparison must be against the labels, not the features).
    error = np.mean(predictions != test_labels)
    err_list.append(error)
err_list

In [None]:
# Fit the standalone LightGBM classifier with path_smooth fixed at 0.5.
# NOTE(review): categorical_feature=[0, 3] marks flattened pixel columns 0 and 3
# as categorical - confirm this is intended.
lgb_model = LGBMClassifier(path_smooth=0.5, objective='multiclass')
lgb_model.fit(train_x, train_labels, categorical_feature=[0, 3])

In [None]:
# Predict the test split with LightGBM and print the per-class report.
# The predictions are kept under their own name for the later model comparison.
lightGBM_pred_y = lgb_model.predict(test_x)
predicted_y = lightGBM_pred_y
expected_y = test_labels
print(metrics.classification_report(expected_y, predicted_y))

In [None]:
# Confusion matrix for the LightGBM predictions on the test split.
# Uses `lightGBM_pred_y` (set in the previous cell); the LDA predictions are
# not defined yet at this point in the notebook.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(test_labels, lightGBM_pred_y, cmap='Blues')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Plot the misclassification rate recorded by the tuning sweep against the
# `path_smooth` value that produced it. Requires `err_list` from the sweep cell.
tune = np.arange(0.0, 1.0, 0.1)
error = np.array(err_list)

plt.style.use("fivethirtyeight")
plt.plot(tune, error)

# x carries the swept hyper-parameter, y the resulting error rate.
plt.xlabel("Path Smooth")
plt.ylabel("Misclassification Error Rate")

plt.show()

## LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# Unfitted LDA estimator; it is fitted on the training split in the next cell.
# NOTE(review): 9 presumably equals n_classes - 1 for the ten labels 0-9 used
# later in this notebook - confirm against the scikit-learn limit on n_components.
lda = LDA(n_components=9)

In [None]:
# Fit LDA on the training split, predict the test split, and print the
# per-class report. Predictions are also stored under an LDA-specific name.
lda_model = lda.fit(train_x, train_labels)
lda_pred_y = lda_model.predict(test_x)
predicted_y = lda_pred_y
expected_y = test_labels
print(metrics.classification_report(expected_y, predicted_y))

In [None]:
# Same report as a dict, shown as a table with one row per class/average.
report = metrics.classification_report(
    expected_y,
    predicted_y,
    output_dict=True,
)
pd.DataFrame(report).T

In [None]:
# Confusion matrix for the LDA predictions on the test split.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(
    y_true=test_labels,
    y_pred=lda_pred_y,
    cmap='RdPu',
)

## SVM

In [None]:
from sklearn import svm
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
# Support-vector classifier with a degree-3 polynomial kernel, fitted on the
# training split (`fit` returns the fitted estimator itself).
poly_svc = svm.SVC(C=1, kernel='poly', degree=3)
svmpoly = poly_svc.fit(train_x, train_labels)

In [None]:
# Predict the test split with the polynomial SVM and print the per-class report.
# Predictions are also stored under an SVM-specific name for later comparison.
SVM_pred_y = svmpoly.predict(test_x)
predicted_y = SVM_pred_y
expected_y = test_labels
print(metrics.classification_report(expected_y, predicted_y))

In [None]:
# Evaluation table: the classification report as a dict, one row per
# class/average when displayed.
report = metrics.classification_report(
    expected_y,
    predicted_y,
    output_dict=True,
)
pd.DataFrame(report).T

In [None]:
# Confusion matrix for the SVM predictions on the test split.
# Uses `SVM_pred_y` (set two cells above) rather than the LDA predictions.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(test_labels, SVM_pred_y, cmap='Greens')

Combine the three models' results

In [None]:
# Per-class precision (rounded to 2 decimals) taken from the most recently
# computed `report` dict, keyed by the integer class label 0-9.
out = {
    label: round(report[str(label)]['precision'], 2)
    for label in range(10)
}

In [None]:
# One column of test-set predictions per model, in the order
# LightGBM, LDA, SVM (later cells rely on this column order).
y_pred_test = pd.DataFrame({
    'LightGBM': lightGBM_pred_y,
    'LDA': lda_pred_y,
    'SVM': SVM_pred_y,
})

In [None]:
# Save the per-model test-set predictions to a CSV file in the working directory.
# `index` is left at its default, so the row index is written as the first column.
y_pred_test.to_csv('y_pred_test.csv')

# Ensemble


In [None]:
import lightgbm as ltg
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier

In [None]:
# (name, estimator) pairs shared by the voting and stacking ensembles below.
# NOTE(review): path_smooth is 0.2 here but 0.5 for the standalone LightGBM
# model earlier in the notebook - confirm which value is intended.
estimator = [
    ('LDA', LDA(n_components=9)),
    ('SVC', svm.SVC(kernel='poly', degree=3, C=1)),
    ('LGBM', ltg.LGBMClassifier(objective='multiclass', path_smooth=0.2)),
]

## Voting

In [None]:
# Hard (majority-vote) ensemble over the shared base estimators,
# fitted on the training split.
vot_hard = VotingClassifier(
    estimators=estimator,
    voting='hard',
)
vot_hard.fit(train_x, train_labels)

In [None]:
# Test-set predictions from the hard-voting ensemble, followed by their
# overall accuracy (displayed as the cell output).
ypred = vot_hard.predict(test_x)
accuracy_score(y_true=test_labels, y_pred=ypred)

In [None]:
# Confusion matrix for the hard-voting ensemble on the test split.
# `ypred` is the voting prediction from the previous cell; run this cell before
# the "Voting++" section, which reuses the name `ypred` for a DataFrame.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(test_labels, ypred, cmap='magma')

## Voting++

In [34]:
# Construct the reference table: one {class label -> score} dict per model.
# NOTE(review): the values look like per-class precision rounded to 2 decimals,
# copied by hand from earlier classification reports - verify they are current.
# NOTE(review): if these were measured on the test split, using them to choose
# between models on that same split leaks test information - confirm.
ref_LTG = {0: 0.84, 1: 1.0, 2: 0.84, 3: 0.91, 4: 0.85, 5: 0.99, 6: 0.75, 7: 0.95, 8: 0.98, 9: 0.96}
ref_LDA = {0: 0.80, 1: 1.0, 2: 0.76, 3: 0.82, 4: 0.75, 5: 0.86, 6: 0.59, 7: 0.87, 8: 0.94, 9: 0.90}
ref_SVM = {0: 0.80, 1: 0.99, 2: 0.83, 3: 0.90, 4: 0.85, 5: 0.82, 6: 0.71, 7: 0.92, 8: 0.98,9: 0.96}
# The list is indexed by column position below, so its order must match the
# y_pred_test columns: LightGBM, LDA, SVM.
ref_df = [ref_LTG, ref_LDA, ref_SVM]

In [38]:
# Work on an independent copy so the original label predictions stay
# available under `ypred_org`.
ypred = y_pred_test.copy()
ypred_org = y_pred_test

In [39]:
# Represent every predicted label by the reference precision of the model that
# predicted it, so the three models can be compared sample by sample.
# The frame is rebuilt from the untouched `ypred_org` labels instead of writing
# floats into the integer columns in place: that avoids pandas' deprecated
# dtype-upcasting assignment and makes the cell safe to re-run.
# Columns are labelled 1, 2, 3 in the model order LightGBM, LDA, SVM.
# A label missing from a reference dict becomes NaN rather than staying a raw
# class number.
ypred = pd.DataFrame(
    {
        position + 1: ypred_org.iloc[:, position].map(ref_df[position])
        for position in range(ypred_org.shape[1])
    }
)
ypred

Unnamed: 0,1,2,3
0,0.97,0.92,0.96
1,0.80,0.71,0.80
2,1.00,1.00,0.99
3,1.00,1.00,0.99
4,0.72,0.56,0.66
...,...,...,...
9995,0.97,0.92,0.96
9996,1.00,1.00,0.99
9997,0.98,0.56,0.97
9998,1.00,1.00,0.99


In [40]:
# For each sample, keep the label predicted by the model whose reference
# precision is highest. `n` holds the winning column label (1, 2 or 3) per row,
# so subtracting 1 gives the column position in `ypred_org`.
n = ypred.idxmax(axis='columns')
result = [ypred_org.iloc[j, n[j] - 1] for j in range(len(n))]

In [41]:
# Classification report for the precision-weighted ("Voting++") predictions,
# displayed as a table with one row per class/average.
udf_report = metrics.classification_report(
    test_labels,
    result,
    output_dict=True,
)
pd.DataFrame(udf_report).T

Unnamed: 0,precision,recall,f1-score,support
0,0.79375,0.889,0.838679,1000.0
1,0.989837,0.974,0.981855,1000.0
2,0.80426,0.793,0.79859,1000.0
3,0.846435,0.926,0.884432,1000.0
4,0.78658,0.844,0.814279,1000.0
5,0.951866,0.969,0.960357,1000.0
6,0.824615,0.536,0.649697,1000.0
7,0.949704,0.963,0.956306,1000.0
8,0.930806,0.982,0.955718,1000.0
9,0.95825,0.964,0.961117,1000.0


In [None]:
# Confusion matrix for the "Voting++" predictions on the test split.
# Uses `result` (the per-sample labels chosen above) rather than the LDA predictions.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(test_labels, result, cmap='Greys')

## Stacking

In [None]:
# Stacked ensemble: the shared base estimators feed a logistic-regression
# meta-learner. The cell output is the accuracy on the test split.
# NOTE(review): solver='liblinear' on a 10-class problem - confirm it is still
# supported for multiclass targets in the installed scikit-learn version.
meta_learner = LogisticRegression(max_iter=200, solver='liblinear')
clf = StackingClassifier(estimators=estimator, final_estimator=meta_learner)
clf.fit(train_x, train_labels)
clf.score(test_x, test_labels)