In [20]:
import matplotlib.pyplot as plt 
import os
import sys
import cv2
import numpy as np 
import numba
import gzip 
from sklearn import datasets, svm, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from skimage import exposure
import pickle


In [21]:
"""----------------------load data function----------------------"""

def load_mnist(path, kind = "train"):
    """Load a gzip-compressed IDX image/label pair (MNIST layout).

    The files read are ``<path>/<kind>-labels-idx1-ubyte.gz`` and
    ``<path>/<kind>-images-idx3-ubyte.gz``.

    Parameters
    ----------
    path : str
        Directory containing the two ``.gz`` files.
    kind : str
        File-name prefix selecting the split (default ``"train"``).

    Returns
    -------
    images : np.ndarray
        float64 array of shape ``(num_images, rows, cols)``; the three sizes
        are taken from the image file header instead of being fixed at 28x28.
    labels : np.ndarray
        uint8 array with one entry per label byte in the label file.

    Raises
    ------
    ValueError
        If the image header is incomplete, if the header's image count does
        not match the number of labels, or if the pixel payload does not fit
        the dimensions announced by the header.
    """
    labels_path = os.path.join(path, "%s-labels-idx1-ubyte.gz" % kind)
    images_path = os.path.join(path, "%s-images-idx3-ubyte.gz" % kind)

    with gzip.open(labels_path, "rb") as lbpath:
        # Skip the 8-byte label header; everything after it is one byte per label.
        lbpath.read(8)
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8)

    with gzip.open(images_path, "rb") as imgpath:
        # The 16-byte image header is four big-endian 32-bit integers:
        # magic number, image count, rows, cols.
        header = np.frombuffer(imgpath.read(16), dtype=">i4")
        if header.size != 4:
            raise ValueError("truncated image header in %s" % images_path)
        num_images, rows, cols = (int(value) for value in header[1:])
        pixels = np.frombuffer(imgpath.read(), dtype=np.uint8)

    if num_images != len(labels):
        raise ValueError(
            "image/label count mismatch: %d images vs %d labels"
            % (num_images, len(labels))
        )

    # reshape raises ValueError when the payload size disagrees with the header.
    images = pixels.reshape(num_images, rows, cols).astype(np.float64)

    return images, labels


In [22]:
"""----------------------vectorize----------------------"""

def vectorize(arr):
    """Flatten every sample of ``arr`` into a one-dimensional feature vector.

    Parameters
    ----------
    arr : np.ndarray
        Array whose first axis indexes samples; all remaining axes are
        collapsed in C (row-major) order.

    Returns
    -------
    np.ndarray
        A new array of shape ``(arr.shape[0], n_features)`` where
        ``n_features`` is the product of the remaining axis lengths
        (1 when ``arr`` is one-dimensional). The result never shares
        memory with ``arr``. An empty batch keeps its feature dimension,
        e.g. ``(0, 28, 28)`` becomes ``(0, 784)``.
    """
    arr = np.asarray(arr)
    n_samples = arr.shape[0]
    # Computed explicitly because reshape(n, -1) is rejected for empty batches.
    n_features = int(np.prod(arr.shape[1:], dtype=np.int64))

    # np.array copies, so callers get an independent array as before.
    return np.array(arr.reshape(n_samples, n_features), order="C")
    

In [23]:
"""----------------------downsampling----------------------"""

def downsampling(arr, shape):
    """Resize every sample of ``arr`` with ``cv2.resize``.

    Each sample ``arr[i]`` is cast to ``uint8`` and resized to ``shape``;
    the resized samples are stacked into one array with ``np.asarray``.

    Parameters
    ----------
    arr : np.ndarray
        Array whose first axis indexes the samples.
    shape : tuple
        Target size, forwarded unchanged as the second argument of
        ``cv2.resize``.
        NOTE(review): OpenCV reads this tuple as (width, height) - confirm
        before using non-square targets.

    Returns
    -------
    np.ndarray
        The resized samples, one per entry along the first axis.
    """
    resized_samples = [
        cv2.resize(arr[idx].astype(np.uint8), shape)
        for idx in range(arr.shape[0])
    ]

    return np.asarray(resized_samples)

In [24]:
"""----------------------histogram----------------------"""

def histogram(arr):
    """Compute a 256-bin intensity histogram for every sample of ``arr``.

    Each sample ``arr[i]`` is cast to ``uint8`` and passed to
    ``cv2.calcHist`` for channel 0 with 256 bins over the range [0, 256)
    and no mask; the result is flattened to one dimension.

    Parameters
    ----------
    arr : np.ndarray
        Array whose first axis indexes the samples.

    Returns
    -------
    np.ndarray
        The flattened histograms stacked along the first axis, one row
        per sample.
    """
    per_sample_hist = [
        cv2.calcHist([arr[idx].astype(np.uint8)], [0], None, [256], [0, 256]).flatten()
        for idx in range(arr.shape[0])
    ]

    return np.asarray(per_sample_hist)
  

In [25]:
"""-------------------KNN Classifier (sklearn)-------------------------"""

def KNN(x_test, y_test, x_train, y_train, k, filename, feature_extraction, shape = -1):
    """Fit a k-nearest-neighbours classifier, save it, and report test metrics.

    Both sample sets go through ``feature_extraction`` and then through
    ``vectorize`` before a ``KNeighborsClassifier`` is fitted on the
    training features. The fitted model is pickled to ``filename`` and the
    accuracy, confusion matrix and classification report for the test set
    are printed.

    Parameters
    ----------
    x_test, y_test :
        Test samples and their true labels.
    x_train, y_train :
        Training samples and their labels.
    k : int
        Value passed as ``n_neighbors`` to ``KNeighborsClassifier``.
    filename : str
        Path the fitted model is pickled to (opened in ``'wb'`` mode, so an
        existing file is overwritten).
    feature_extraction : callable
        Called as ``feature_extraction(x)`` when ``shape`` is ``-1`` and as
        ``feature_extraction(x, shape)`` otherwise.
    shape :
        Extra argument for ``feature_extraction``; ``-1`` (the default)
        means "do not pass a second argument".

    Returns
    -------
    y_pred :
        Labels predicted by the model for the test samples.
    """
    # feature extraction
    if (shape == -1):
        x_test = feature_extraction(x_test)
        x_train = feature_extraction(x_train)
    else:
        x_test = feature_extraction(x_test, shape)
        x_train = feature_extraction(x_train, shape)

    # flatten data
    x_test = vectorize(x_test)
    x_train = vectorize(x_train)

    # build model
    model = KNeighborsClassifier(n_neighbors = k)
    model.fit(x_train, y_train)

    # save model; the context manager guarantees the file is flushed and closed
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    # predict label
    y_pred = model.predict(x_test)

    # sklearn metrics take (y_true, y_pred) in that order
    print("k = %d, accuracy = %.2f\n" % (k, accuracy_score(y_test, y_pred) * 100))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), end = "\n\n")
    print("EVALUATION ON TESTING DATA\n", classification_report(y_test, y_pred, digits = 4))

    return y_pred

In [26]:
"""-------------------Sample Mean Classifier (sklearn, 1NN)-------------------------"""

"""calculate sample mean"""
def SM_build(x_train, y_train):
    """Compute the per-class sample mean of the training data.

    Parameters
    ----------
    x_train : np.ndarray
        Training samples; the first axis indexes samples.
    y_train : array-like of non-negative integers
        Class label of each training sample.

    Returns
    -------
    class_means : np.ndarray
        float64 array of shape ``(n_present_classes,) + x_train.shape[1:]``
        holding the mean sample of each class that occurs in ``y_train``.
    class_labels : np.ndarray
        The label belonging to each row of ``class_means``, in ascending
        order. When every label from 0 to ``max(y_train)`` occurs this is
        ``0, 1, ..., max(y_train)``.

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` do not have the same number of entries.

    Notes
    -----
    Label values that never occur are left out of the result. Dividing
    their all-zero sum by a zero count would produce NaN rows.
    """
    y_train = np.asarray(y_train)
    if x_train.shape[0] != y_train.shape[0]:
        raise ValueError(
            "x_train and y_train differ in length: %d vs %d"
            % (x_train.shape[0], y_train.shape[0])
        )

    counts = np.bincount(y_train)

    # One accumulator slot per label value, each shaped like a single sample.
    sums = np.zeros((counts.size,) + tuple(x_train.shape[1:]))
    for sample, label in zip(x_train, y_train):
        sums[label] += sample

    # Keep only labels that were actually seen, so no slot is divided by zero.
    present = np.flatnonzero(counts)
    # Reshape the counts so they broadcast over all sample axes.
    divisors = counts[present].reshape((-1,) + (1,) * (sums.ndim - 1))

    return sums[present] / divisors, present

def SM(x_test, y_test, x_train, y_train, filename, feature_extraction, shape = -1):
    """Sample-mean classifier: 1-nearest-neighbour on the per-class means.

    Both sample sets go through ``feature_extraction``. The training
    features are then reduced to one mean sample per class by ``SM_build``,
    both sets are flattened with ``vectorize``, and a
    ``KNeighborsClassifier`` with ``n_neighbors = 1`` is fitted on the class
    means. The fitted model is pickled to ``filename`` and the confusion
    matrix and classification report for the test set are printed.

    Parameters
    ----------
    x_test, y_test :
        Test samples and their true labels.
    x_train, y_train :
        Training samples and their labels.
    filename : str
        Path the fitted model is pickled to (opened in ``'wb'`` mode, so an
        existing file is overwritten).
    feature_extraction : callable
        Called as ``feature_extraction(x)`` when ``shape`` is ``-1`` and as
        ``feature_extraction(x, shape)`` otherwise.
    shape :
        Extra argument for ``feature_extraction``; ``-1`` (the default)
        means "do not pass a second argument".

    Returns
    -------
    y_pred :
        Labels predicted by the model for the test samples.
    """
    # feature extraction
    if (shape == -1):
        x_test = feature_extraction(x_test)
        x_train = feature_extraction(x_train)
    else:
        x_test = feature_extraction(x_test, shape)
        x_train = feature_extraction(x_train, shape)

    # sample mean: one prototype per class replaces the full training set
    class_means, class_labels = SM_build(x_train, y_train)

    # flatten data
    x_test = vectorize(x_test)
    class_means = vectorize(class_means)

    # build model
    model = KNeighborsClassifier(n_neighbors = 1)
    model.fit(class_means, class_labels)

    # save model; the context manager guarantees the file is flushed and closed
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    # predict label
    y_pred = model.predict(x_test)

    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), end = "\n\n")
    print("EVALUATION ON TESTING DATA\n", classification_report(y_test, y_pred, digits = 4))

    return y_pred

In [27]:
"""----------------------load data----------------------"""

# Read both splits from the local "data/" directory.
x_train, y_train = load_mnist("data/", kind = "train")
x_test, y_test = load_mnist("data/", kind = "test")


In [28]:
"""----------------------test KNN----------------------""" 

# Every run uses the same neighbour count so that only the feature
# extraction differs between the reported results.
KNN_K = 179
KNN_SEPARATOR = "------------------------------------------------------------\n"

print("FEATURE EXTRACTION: VECTORIZE")
y_pred_vtr = KNN(x_test, y_test, x_train, y_train, KNN_K, "knn_model_vectorize.pkl", vectorize)
print(KNN_SEPARATOR)

print("FEATURE EXTRACTION: DOWNSAMPLING 7x7")
y_pred_down7 = KNN(x_test, y_test, x_train, y_train, KNN_K, "knn_model_downsampling7.pkl", downsampling, (7, 7))
print(KNN_SEPARATOR)

print("FEATURE EXTRACTION: DOWNSAMPLING 14x14")
y_pred_down14 = KNN(x_test, y_test, x_train, y_train, KNN_K, "knn_model_downsampling14.pkl", downsampling, (14, 14))
print(KNN_SEPARATOR)

print("FEATURE EXTRACTION: HISTOGRAM")
y_pred_htg = KNN(x_test, y_test, x_train, y_train, KNN_K, "knn_model_histogram.pkl", histogram)


FEATURE EXTRACTION: VECTORIZE
k = 179, accuracy = 93.17

Confusion Matrix:
 [[ 967    1    0    0    0    3    8    1    0    0]
 [   0 1131    2    1    0    0    1    0    0    0]
 [  23   63  885    7    3    2    8   30   11    0]
 [   0   11    3  957    1   11    0   13    8    6]
 [   1   26    0    0  893    1   10    3    1   47]
 [   5   17    0   20    2  815   15    2    0   16]
 [  10    9    0    0    4    2  933    0    0    0]
 [   0   58    2    0    3    0    0  938    0   27]
 [  12   18    2   25   12   19    5   10  849   22]
 [  10   11    2    8    9    3    1   16    0  949]]

EVALUATION ON TESTING DATA
               precision    recall  f1-score   support

           0     0.9407    0.9867    0.9631       980
           1     0.8409    0.9965    0.9121      1135
           2     0.9877    0.8576    0.9180      1032
           3     0.9401    0.9475    0.9438      1010
           4     0.9633    0.9094    0.9356       982
           5     0.9521    0.9137    0.

In [29]:
"""----------------------test Sample Mean Classifier----------------------""" 

# Printed between runs so the four reports are easy to tell apart.
SM_SEPARATOR = "------------------------------------------------------------\n"

print("FEATURE EXTRACTION: VECTORIZE")
sm_y_pred_vtr = SM(x_test, y_test, x_train, y_train, "sm_model_vectorize.pkl", vectorize)
print(SM_SEPARATOR)

print("FEATURE EXTRACTION: DOWNSAMPLING 7x7")
sm_y_pred_down7 = SM(x_test, y_test, x_train, y_train, "sm_model_downsampling7.pkl", downsampling, (7, 7))
print(SM_SEPARATOR)

print("FEATURE EXTRACTION: DOWNSAMPLING 14x14")
sm_y_pred_down14 = SM(x_test, y_test, x_train, y_train, "sm_model_downsampling14.pkl", downsampling, (14, 14))
print(SM_SEPARATOR)

print("FEATURE EXTRACTION: HISTOGRAM")
sm_y_pred_htg = SM(x_test, y_test, x_train, y_train, "sm_model_histogram.pkl", histogram)


FEATURE EXTRACTION: VECTORIZE
Confusion Matrix:
 [[ 878    0    7    2    2   58   25    1    7    0]
 [   0 1092   10    3    0    7    3    0   20    0]
 [  19   71  781   33   31    3   23   18   50    3]
 [   4   24   25  814    1   49    8   15   58   12]
 [   1   22    2    0  811    3   16    1   10  116]
 [  11   63    2  118   21  612   27   10   13   15]
 [  18   27   22    0   31   32  827    0    1    0]
 [   2   59   22    1   20    2    0  856   13   53]
 [  14   39   11   83   12   36   13   10  718   38]
 [  15   22    7   10   83   12    1   27   18  814]]

EVALUATION ON TESTING DATA
               precision    recall  f1-score   support

           0     0.9127    0.8959    0.9042       980
           1     0.7696    0.9621    0.8551      1135
           2     0.8785    0.7568    0.8131      1032
           3     0.7650    0.8059    0.7850      1010
           4     0.8014    0.8259    0.8134       982
           5     0.7518    0.6861    0.7175       892
           6