# KMNIST Classifier using PCA and three different classifiers

We decided to try classifying the KMNIST dataset using some unconventional methods. 

Firstly, the dimensionality of the set (28x28 = 784) was reduced using PCA. Then, we classified the data using three different classifiers:
- KNN Classifier
- Random Forest Classifier
- Naive-Bayesian Classifier

These were simpler to implement than the neural networks; however, as we will see, they give a lower accuracy score on the validation set than AlexNet or LeNet. Therefore, none of these classifiers were submitted on Kaggle.

## 1. Functions necessary to use methods in Utils.ipynb

In [0]:
import io, os, sys, types
import nbformat

from IPython import get_ipython
from IPython.core.interactiveshell import InteractiveShell

In [0]:
def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file that backs a dotted module name.

    Only the last component of ``fullname`` is used as the file name, so
    "foo.bar" is looked up as "bar.ipynb" inside each directory of ``path``.
    If that file does not exist, the file name is retried with underscores
    replaced by spaces, so ``import Foo_Bar`` can find "Foo Bar.ipynb".

    Input: fullname - fully qualified module name, e.g. "pkg.Utils", string
           path - directories to search; the current working directory is
                  searched when this is empty or None, iterable of strings

    Return: path of the first matching notebook, or None if no directory
            contains one
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']

    # The space substitution is applied to the file name only: the directory
    # may legitimately contain underscores (e.g. "KMNIST_ENTROPY") and must
    # not be rewritten.
    candidates = [name + ".ipynb"]
    spaced = name.replace("_", " ") + ".ipynb"
    if spaced != candidates[0]:
        candidates.append(spaced)

    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None
            
class NotebookLoader(object):
    """Module loader that runs the code cells of a notebook as a module."""

    def __init__(self, path=None):
        # IPython shell instance; its input transformer turns notebook
        # source into executable Python before each cell is run.
        self.shell = InteractiveShell.instance()
        # Directories handed to find_notebook when resolving a module name.
        self.path = path

    def load_module(self, fullname):
        """Import a notebook as a module.

        Every code cell of the notebook is executed, in order, inside the
        namespace of a freshly created module registered under ``fullname``.
        The cells run with the full privileges of the current process, so
        only import notebooks you trust.

        Input: fullname - fully qualified name of the module to load, string

        Return: the populated module object

        Raises: ImportError if no notebook file is found for ``fullname``;
                any exception raised by a cell propagates to the caller
        """
        path = find_notebook(fullname, self.path)
        if not path:
            raise ImportError("no notebook found for module %r" % fullname)

        print ("importing notebook from %s" % path)

        # load the notebook object
        nb = nbformat.read(path, as_version=4)

        # create the module and add it to sys.modules before running any
        # cell, so the module is importable while it is being executed
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # A failed import must not leave a half-initialised module
            # behind, otherwise a later import silently returns it.
            sys.modules.pop(fullname, None)
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod

class NotebookFinder(object):
    """Meta-path finder that lets ``import`` locate notebooks.

    Both the legacy ``find_module`` hook and the ``find_spec`` hook are
    provided: Python 3.12+ no longer consults ``find_module`` on meta-path
    finders, so without ``find_spec`` notebook imports would never be tried.
    """

    def __init__(self):
        # Cache of NotebookLoader objects, keyed by the joined search path
        # (or by the falsy path value itself when no path is given).
        self.loaders = {}

    def _loader_for(self, fullname, path):
        """Return the cached loader for ``path``, or None if no notebook matches."""
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None

        key = path
        if path:
            # lists aren't hashable
            key = os.path.sep.join(path)

        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]

    def find_module(self, fullname, path=None):
        """Legacy finder hook: return a loader for ``fullname`` or None."""
        return self._loader_for(fullname, path)

    def find_spec(self, fullname, path=None, target=None):
        """Finder hook: return a module spec for ``fullname`` or None.

        ``target`` is accepted for protocol compatibility and is not used.
        """
        # Imported here so the block does not depend on a new file-level import.
        from importlib.util import spec_from_loader

        loader = self._loader_for(fullname, path)
        if loader is None:
            return None
        return spec_from_loader(fullname, loader)
      

# Register the notebook finder at the END of sys.meta_path, so the finders
# already on the list are consulted first and notebooks are only a fallback.
sys.meta_path.append(NotebookFinder())

### Mount Google drive

In [17]:
# Mount Google Drive at /content/gdrive/ so the notebooks stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


### Show that file exists in Google path
**Note**:  You will need to ensure that you have a folder named "**KMNIST_ENTROPY**" in your Google Drive, that contains  Utils.ipynb (and  \__init__.py)

In [18]:
ls gdrive/My\ Drive/KMNIST_ENTROPY

AlexNet.ipynb  Model_Averaging.ipynb  __pycache__/
__init__.py    models/                results/
LeNet.ipynb    PCA_Classifier.ipynb   Utils.ipynb


### Append the system path and import Utils.ipynb

In [0]:
# Add the Drive root to the import search path so the KMNIST_ENTROPY folder
# can be imported as a package. The path is relative to the current working
# directory.
sys.path.append('gdrive/My Drive/')

In [0]:
# Star-import everything defined by Utils.ipynb; the notebook is resolved by
# the NotebookFinder registered on sys.meta_path.
from KMNIST_ENTROPY.Utils import *

# If this cell raises a "No module named ..." error, please restart the runtime of the Colab notebook.

## 2. PCA reduction with K-Neighbour classifier

In [0]:
def pca_kneighbours(trainset, tr_labels, testset, te_labels=None, n_component=20, neigh_num=4, evaluate=False):
    """
    This function performs PCA reduction followed by KNN classification.

    PCA is fitted on the training set only and then applied to both sets;
    the classifier is fitted on the projected training data.

    Input: trainset - training samples, 2-D (samples x features); converted
                      with np.array, callers in this notebook pass a torch.Tensor
           tr_labels - training labels, one per training sample
           testset - test/validation samples, same layout as trainset
           te_labels - ground-truth labels for testset; may be None only
                       when evaluate is True
           n_component - number of components to reduce to for PCA, integer
           neigh_num - number of neighbours for KNN classifier, integer
           evaluate - if True, testset is an unlabelled test set: only the
                      predictions are computed and accuracy/loss stay 0.0, boolean
                      Note: this functionality is only added to the KNN function
                      since this performs better on the validation set

    Return: accuracy: percentage of testset classified correctly, float
            loss: percentage of testset misclassified, float
            y_pred: the prediction labels of our model

    Raises: ValueError if evaluate is False and te_labels is None, or if
            te_labels holds fewer labels than there are predictions
    """
    # Fail before the expensive fit rather than after it.
    if not evaluate and te_labels is None:
        raise ValueError("te_labels is required unless evaluate=True")

    train_pca = np.array(trainset)
    test_pca = np.array(testset)

    pca = PCA(n_components=n_component, svd_solver='randomized').fit(train_pca)

    X_train = pca.transform(train_pca)
    X_test = pca.transform(test_pca)

    clf = KNeighborsClassifier(neigh_num)
    clf.fit(X_train, tr_labels)

    y_pred = clf.predict(X_test)

    if evaluate:
        # No ground truth available: return placeholder scores.
        return 0.0, 0.0, y_pred

    n_samples = len(y_pred)
    # Flatten so labels of shape (N, 1) do not broadcast against y_pred.
    expected = np.asarray(te_labels).ravel()[:n_samples]
    if expected.shape[0] != n_samples:
        raise ValueError("te_labels has fewer entries than testset")

    correct = int(np.count_nonzero(y_pred == expected))
    accuracy = float(correct) * 100 / n_samples
    loss = float(n_samples - correct) * 100 / n_samples

    return accuracy, loss, y_pred

## 3. PCA reduction with Random Forest classifier

In [0]:
def pca_randomforest(trainset, tr_labels, testset, te_labels, n_component=20, n_estimator=100, max_depth=2):
    """
    This function performs PCA reduction followed by Random Forest classification.

    PCA is fitted on the training set and applied to both sets; the forest
    is built with a fixed random_state of 0.

    Input: trainset - training samples, converted with np.array
           tr_labels - training labels
           testset - test/validation samples, converted with np.array
           te_labels - ground-truth labels for testset
           n_component - number of components to reduce to for PCA, integer
           n_estimator - number of trees in the forest, integer
           max_depth - maximum depth of each tree, integer

    Return: accuracy: percentage of testset classified correctly, float
            loss: percentage of testset misclassified, float
            y_pred: the prediction labels of our model
    """
    train_matrix = np.array(trainset)
    test_matrix = np.array(testset)

    # Fit the projection on the training data only, then apply it to both sets.
    projector = PCA(n_components=n_component, svd_solver='randomized')
    projector.fit(train_matrix)
    reduced_train = projector.transform(train_matrix)
    reduced_test = projector.transform(test_matrix)

    forest = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=0)
    forest.fit(reduced_train, tr_labels)

    y_pred = forest.predict(reduced_test[:len(testset)])

    # Count matching predictions, then derive the misses from the total.
    total = len(testset)
    hits = sum(1 for idx in range(total) if y_pred[idx] == te_labels[idx])
    misses = total - hits

    loss = float(misses)*100/(hits+misses)
    accuracy = float(hits)*100/(hits+misses)

    return accuracy, loss, y_pred
  

## 4. PCA reduction with Naive-Bayes classifier

In [0]:
def pca_NBGauss(trainset, tr_labels, testset, te_labels, n_component=20):
    """
    This function performs PCA reduction followed by Gaussian Naive-Bayes classification.

    PCA is fitted on the training set and applied to both sets before the
    classifier is trained and queried.

    Input: trainset - training samples, converted with np.array
           tr_labels - training labels
           testset - test/validation samples, converted with np.array
           te_labels - ground-truth labels for testset
           n_component - number of components to reduce to for PCA, integer

    Return: accuracy: percentage of testset classified correctly, float
            loss: percentage of testset misclassified, float
            y_pred: the prediction labels of our model
    """
    raw_train = np.array(trainset)
    raw_test = np.array(testset)

    reducer = PCA(n_components=n_component, svd_solver='randomized').fit(raw_train)
    features_train = reducer.transform(raw_train)
    features_test = reducer.transform(raw_test)

    model = GaussianNB()
    model.fit(features_train, tr_labels)

    y_pred = model.predict(features_test[:len(testset)])

    # One boolean per sample: True when the prediction equals the label.
    outcomes = [bool(y_pred[k] == te_labels[k]) for k in range(len(testset))]
    n_right = outcomes.count(True)
    n_wrong = outcomes.count(False)

    loss = float(n_wrong)*100/(n_right+n_wrong)
    accuracy = float(n_right)*100/(n_right+n_wrong)

    return accuracy, loss, y_pred

## 5. Classify validation set

In [0]:
def _pca_classify_split(train_x, train_y, valid_x, valid_y, classifier, neigh_num, n_components, rf_max_depth):
    """Run the selected PCA classifier on a single train/validation split."""
    train_mean, val_mean, train_std, val_std = get_mean_std(train_x, valid_x)

    k_train = CustomImageTensorDataset(train_x, train_y, transform=transformed(train_mean, train_std, choice=1))
    k_validate = CustomImageTensorDataset(valid_x, valid_y, transform=transformed(val_mean, val_std, choice=1))

    # Flatten every 28x28 image into one row: PCA needs a 2-D matrix.
    # NOTE(review): the pixels are read from `.data`; whether the dataset's
    # `transform` has been applied to them depends on
    # CustomImageTensorDataset - confirm.
    x_tr = k_train.data.view(len(k_train.data), 28*28)
    x_te = k_validate.data.view(len(k_validate.data), 28*28)

    if classifier == 0:
        return pca_kneighbours(x_tr, k_train.targets, x_te, k_validate.targets,
                               n_component=n_components, neigh_num=neigh_num)
    if classifier == 1:
        return pca_randomforest(x_tr, k_train.targets, x_te, k_validate.targets,
                                n_component=n_components, n_estimator=100, max_depth=rf_max_depth)
    return pca_NBGauss(x_tr, k_train.targets, x_te, k_validate.targets, n_component=n_components)


def test_pca_classifier(classifier=0, neigh_num=4, n_components=20, k_folds=1):
    """
    This function performs PCA reduction with the classifier defined by the user:
    classifier = 0 : KNN
    classifier = 1 : Random Forest
    classifier = 2 : Naive-Bayes (Gaussian)

    Input: classifier - type of classifier, integer 0, 1, 2
           neigh_num - number of neighbours for KNN classifier, integer
           n_components - number of components to reduce to for PCA, integer
           k_folds - number of folds for the K-fold cross validation, integer > 0

    Return: acc - validation accuracy in percent (mean over all folds when
                  k_folds > 1), float
            loss - validation loss, i.e. percentage misclassified (mean over
                   all folds when k_folds > 1), float
            y_pred - predictions of the classifier; for k_folds > 1 the
                     predictions of the fold with the highest accuracy

    Raises: ValueError if classifier is not 0, 1 or 2, or if k_folds < 1
    """
    if classifier not in (0, 1, 2):
        raise ValueError("classifier must be 0 (KNN), 1 (random forest) or 2 (Naive-Bayes)")
    if k_folds < 1:
        raise ValueError("k_folds must be a positive integer")

    # For each training set - get a value for the validation set
    trains, valids, tr_labels, val_labels = k_split(kmnist_data, kmnist_labels, splits=k_folds)

    if k_folds == 1:
        # The single-split run uses a random-forest depth of 25 and the
        # k-fold run below uses 30; both values are kept as they were.
        return _pca_classify_split(trains, tr_labels, valids, val_labels,
                                   classifier, neigh_num, n_components, rf_max_depth=25)

    # Scores of every fold
    acs = []
    losses = []

    # Best accuracy seen so far and the predictions that produced it
    max_acc = 0.0
    max_y_pred = None

    for i in range(k_folds):
        ac, loss, y_pred = _pca_classify_split(trains[i], tr_labels[i], valids[i], val_labels[i],
                                               classifier, neigh_num, n_components, rf_max_depth=30)
        acs.append(ac)
        losses.append(loss)

        if max_y_pred is None or ac > max_acc:
            max_acc = ac
            max_y_pred = y_pred

    # Final validation accuracy is the average of all the validation sets.
    # This return must sit outside the loop: inside it, only the first fold
    # would ever be evaluated.
    return np.asarray(acs).mean(), np.asarray(losses).mean(), max_y_pred


### Run classifying function

In [11]:
# Evaluate each classifier on a single train/validation split (k_folds=1).
# Every call returns (accuracy in percent, loss in percent, predictions).

# classifier=0: KNN with 4 neighbours on 80 principal components
print("PCA with KNN Classifier")
res_knn_acc, res_knn_loss, res_knn_preds = test_pca_classifier(classifier=0, neigh_num=4, n_components=80, k_folds=1)
print("Accuracy, Loss: ", res_knn_acc, res_knn_loss)

# classifier=1: random forest on 40 principal components
print("PCA with Random Forrest Classifier")
res_rf_acc, res_rf_loss, res_rf_preds = test_pca_classifier(classifier=1, n_components=40, k_folds=1)
print("Accuracy, Loss: ", res_rf_acc, res_rf_loss)

# classifier=2: Gaussian Naive-Bayes on 40 principal components
print("PCA with Naive-Bayes Classifier")
res_nb_acc, res_nb_loss, res_nb_preds = test_pca_classifier(classifier=2, n_components=40, k_folds=1)
print("Accuracy, Loss: ", res_nb_acc, res_nb_loss)

PCA with KNN Classifier
Accuracy, Loss:  97.66666666666667 2.3333333333333335
PCA with Random Forrest Classifier
Accuracy, Loss:  93.51666666666667 6.483333333333333
PCA with Naive-Bayes Classifier
Accuracy, Loss:  76.56666666666666 23.433333333333334


## 6. Predict the test set

In [0]:
# View this in 2d for pca 
# Flatten every 28x28 image into one row of 784 values, giving PCA a 2-D matrix
x_tr = kmnist_data.data.view(len(kmnist_data), 28*28)
x_te = kmnist_test.data.view(len(kmnist_test), 28*28)

# evaluate=True: no labels are passed for the test set, so only the
# predictions are kept and the placeholder accuracy/loss are discarded.
_, _, y_preds = pca_kneighbours(x_tr, kmnist_labels, x_te, n_component=80, neigh_num=4, evaluate=True)

## 7. Save  predictions to csv file

In [0]:
# Hand the KNN test-set predictions to save_predictions_ns (not defined in
# this notebook); presumably it writes them to a CSV named after `name`, as
# the section title suggests - confirm against Utils.ipynb.
save_predictions_ns(y_preds, name="pca_classifier_preds")