# COMP5318: Assignment 1
## By Jesse S. Narvasa (jnar3156)

In [1]:
# Library imports

import h5py
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats as st
from datetime import datetime

In [2]:
# Loading our training data

with h5py.File('./Input/train/images_training.h5','r') as H:
    data_train = np.copy(H['datatrain'])
with h5py.File('./Input/train/labels_training.h5','r') as H:
    label_train = np.copy(H['labeltrain'])
    
# Loading our testing data

with h5py.File('./Input/test/images_testing.h5','r') as H:
    data_test = np.copy(H['datatest'])
with h5py.File('./Input/test/labels_testing_2000.h5','r') as H:
    label_test = np.copy(H['labeltest'])

In [3]:
# Verifying our loaded training data
print(data_train.shape, label_train.shape)

# Verifying our loaded testing data
print(data_test.shape, label_test.shape)

(30000, 784) (30000,)
(5000, 784) (2000,)


In [4]:
# Class mappings
class_mappings = {
    0: 'T-shirt/Top',
    1: 'Trouser',
    2: 'Pullover',
    3: 'Dress',
    4: 'Coat',
    5: 'Sandal',
    6: 'Shirt',
    7: 'Sneaker',
    8: 'Bag',
    9: 'Ankle boot'
}

In [None]:
current_image = 1

In [None]:
# Testing loading an image

data_train = data_train.reshape((data_train.shape[0], 28, 28))
plt.imshow(data_train[current_image], cmap=plt.get_cmap('gray'))
plt.title("class " + str(label_train[current_image]) + ": " + class_mappings[label_train[current_image]] )
plt.show()

### Compression

#### SVD

In [None]:
# Run SVD on it
target_data = data_train[current_image]
U, s, Vt = np.linalg.svd(target_data, full_matrices=False)
S = np.diag(s)

# We then create a dynamic component_num in which we'll use to determine how many components we'll keep
# this is determine by round to 2 decimal places, and any value that is 0 will be removed
svd_components = np.count_nonzero(np.round(s, decimals=2) > 0)

target_data_reconstructed = U[:, :svd_components]\
    .dot(S[:svd_components, :svd_components])\
    .dot(Vt[:svd_components, :])

plt.imshow(target_data_reconstructed, cmap=plt.get_cmap('gray'))
plt.title(f"After SVD using only the top {svd_components} elements")
plt.show()

# Verifying that our reconstructed matrix is approximately equal to our original matrix (within tolerance)
print(f'Is the original and reconstructed matrix approximately equal? {np.allclose(target_data, target_data_reconstructed)}')

# Checking the compression ratio of this
comp_ratio = ((U.shape[0] * svd_components) + (svd_components) + (svd_components * Vt.shape[1])) / (target_data.shape[0]*target_data.shape[1])
print(f"Our compression ratio is: {comp_ratio}")

# SSE
svd_SSE = np.sum((target_data - target_data_reconstructed)**2)
print(f"SVD SSE is: {svd_SSE}")

#### PCA

In [None]:
# Applying mean-centering to the original dataset
target_data_mean = target_data.mean(axis=0)
target_data_mean_matrix = np.full((len(target_data_mean), len(target_data_mean)), target_data_mean)
target_data_centred = target_data - target_data_mean_matrix

# Now let's try PCA
XtX = (target_data_centred.T).dot(target_data_centred)
l, V = np.linalg.eig(XtX)
L = np.diag(l)

pca_components = np.count_nonzero(np.round(l, decimals=2) > 0)

pca_projection = target_data_centred.dot(V)
target_data_reconstructed = pca_projection[:, :pca_components]\
    .dot(V.T[:pca_components, :])\
    + target_data_mean_matrix

plt.imshow(target_data_reconstructed, cmap=plt.get_cmap('gray'))
plt.title(f"After PCA, using only {pca_components} components")
plt.show()

# SSE
pca_SSE = np.sum((target_data - target_data_reconstructed)**2)
print(f"PCA SSE is: {pca_SSE}")

### Pre-processing

#### PCA

In [5]:
def apply_pca(data_train, data_test, n_components=20):
    '''
    Apply PCA on the given dataset
    INPUT: 2D or 3D array dataset
    OUTPUT: 2D array of dataset reduced to n dimensions
    '''
    if len(data_train.shape) != 2:
        data_train = data_train.reshape((data_train.shape[0], data_train.shape[1]**2))
    
    # Need to get the mean of each feature, for mean normalisation/centreing
    data_train_mean = data_train.mean(axis=0)
    data_test_mean = data_test.mean(axis=0)
    # Feature means should now be zero, or approx. close to zero - and hence centred
    data_train_centred = np.subtract(data_train, data_train_mean)
    data_test_centred = np.subtract(data_test, data_test_mean)
    
    # Checking the following, we can see that the max and min value of the entire matrix is 0 and 1
    # hence scaling is not required
    print(data_train.min())
    print(data_train.max())
    print(data_test.min())
    print(data_test.max())
    
    covariance_matrix = (data_train_centred.T).dot(data_train_centred)
    l, V = np.linalg.eig(covariance_matrix)
    
    sorted_lambda_index =  l.argsort()[::-1] # sorting our lambda values from largest to smallest
    
    V_n = V[:,sorted_lambda_index[:n_components]]
    
    # Do the projection of the image matrix against our orthogonal eigenvector matrix reduced to n columns
    pca_data_train = data_train_centred.dot(V_n)
    pca_data_test = data_test_centred.dot(V_n)
    
    return (pca_data_train, pca_data_test)

#### SVD

### Classification Algorithms

#### k-Nearest Neighbours

In [14]:
def knn(data_train, label_train, data_test, K=3):
    '''
    k-Nearest Neighbour classifier
    INPUT: 2D/3D array of training dataset (data_train),
        1D array of label of training dataset (label_train),
        2D/3D array of the dataset to be predicted (data_test),
        (optional) K number of nearest neighbours
    OUTPUT: 1D array of predicted results with the same length as data_test.shape[0]
    '''
    
    # Reshaping our input data, to ensure it's 2D
    if len(data_train.shape) != 2:
        data_train = data_train.reshape((data_train.shape[0], data_train.shape[1]**2))
    if len(data_test.shape) != 2:
        data_test = data_test.reshape((data_test.shape[0], data_test.shape[1]**2))
        
    # Instantiating our empty array for predicted values
    pred_test = np.zeros(data_test.shape[0])
    
    for image_num in range(data_test.shape[0]):
        # Calculating the distance difference between the test subject and all our training points
        sum_sqrd_distances = np.sqrt((np.square(np.subtract(data_train, data_test[image_num]))).sum(axis=1))
        #sum_sqrd_distances = np.linalg.norm(data_train - data_test[image_num], axis=1)
    
        # Getting the k nearest neighbours
        k_nearest_neighbours = (np.argsort(sum_sqrd_distances))[:K]
    
        classes_dict = {}

        # Using weighted distance, instead of simply using count
        for neighbour_idx in k_nearest_neighbours:
            classification = label_train[neighbour_idx]
            if classification in classes_dict:
                classes_dict[classification] += 1/(sum_sqrd_distances[neighbour_idx]**2)
            else:
                classes_dict[classification] = 1/(sum_sqrd_distances[neighbour_idx]**2)
            
        pred_class = None
        for key in classes_dict:
            if pred_class == None:
                pred_class = key
                continue

            if classes_dict[key] > classes_dict[pred_class]:
                pred_class = key
                
        pred_test[image_num] = pred_class
            
    return pred_test

In [15]:
# k-Nearest Neighbours Classifier using raw data as input
print(f"Started at: {datetime.now()}")
knn_results = knn(data_train, label_train, data_test, K=5)
print(f"Finished at: {datetime.now()}")

# k-Nearest Neighbours Classifier with PCA
print(f"Started at: {datetime.now()}")
pca_data_train, pca_data_test = apply_pca(data_train, data_test, n_components=100)
knn_pca_results = knn(pca_data_train, label_train, pca_data_test, K=5)
print(f"Finished at: {datetime.now()}")

Started at: 2020-10-14 09:19:39.572564
Finished at: 2020-10-14 09:28:46.866907
Started at: 2020-10-14 09:28:46.866907
0.0
1.0
0.0
1.0
Finished at: 2020-10-14 09:30:02.559619


In [16]:
correct = 0

for n in range(label_test.shape[0]):
    if knn_results[n] == label_test[n]:
        correct += 1
        
print(f"Accuracy result for kNN (raw) is: {correct/label_test.shape[0]}")

correct = 0

for n in range(label_test.shape[0]):
    if knn_pca_results[n] == label_test[n]:
        correct += 1
        
print(f"Accuracy result for kNN (PCA) is: {correct/label_test.shape[0]}")

Accuracy result for kNN (raw) is: 0.8275
Accuracy result for kNN (PCA) is: 0.8375


#### Naive Bayes

In [7]:
def gaussian_naive_bayes(data_train, label_train, data_test):
    '''
    Gaussian Naive Bayes classifier
    INPUT: 2D/3D array of training dataset (data_train),
        1D array of label on training dataset (label_train),
        2D/3D array of test dataset (data_test)
    OUTPUT: 1D array of predicted classes on test dataset
    '''
    
    # Reshaping if it's not the expected shape (2D)
    if len(data_train.shape) != 2:
        data_train = data_train.reshape((data_train.shape[0], data_train.shape[1]**2))
    if len(data_test.shape) != 2:
        data_test = data_test.reshape((data_test.shape[0], data_test.shape[1]**2))

    # Obtaining the different classes that we have present in our training data and getting index positions of each one
    class_indices = {}
    for idx, image_class in enumerate(label_train):
        if image_class not in class_indices:
            class_indices[image_class] = [idx]
            continue
        else:
            class_indices[image_class].append(idx)
        
    class_mean = {}
    class_var = {}

    # Obtain the mean and std dev for each class of our training data
    for class_index in class_indices:
        class_mean[class_index] = data_train[class_indices[class_index], :].mean(axis=0)
        class_var[class_index] = data_train[class_indices[class_index], :].var(axis=0)

    pred_test = np.zeros(data_test.shape[0])

    for image_num in range(data_test.shape[0]):
        # In order to find the length of pred_class_scores, we need to get the max value of the keys
        # with the assumption that each number up to the max will be a class
        # we do this instead of length because our training data may not have an entry for a class, hence, it'll
        # result in out of range if a data exists for one higher
        pred_class_scores = np.zeros(max(class_indices, key=int)+1)
        
        for class_index in class_indices:
            
            # Calculating the logged prior probability
            class_prob = np.log(len(class_indices[class_index])/data_train.shape[0])

            # Calculating the sum of the logged conditional probability
            likelihood_array = st.norm.logpdf(x=data_test[image_num], loc=class_mean[class_index], scale=np.sqrt(class_var[class_index]))
            class_prob = class_prob + np.nansum(likelihood_array) # we use nansum to avoid nan likelihoods, because these are obtained from points with zero variance

            # Storing the result in our results array, so we can keep track of which class has the highest
            pred_class_scores[class_index] = class_prob

        # Class with the highest prob is the predicted class for the image, which is stored in our final pred_test array
        pred_test[image_num] = np.nanargmax(pred_class_scores)
        
    return pred_test

In [12]:
# Gaussian Naive Bayes using raw data as input
print(f"Started at: {datetime.now()}")
nb_results = gaussian_naive_bayes(data_train, label_train, data_test)
print(f"Finished at: {datetime.now()}")

# Gaussian Naive Bayes applied on principal components of dataset
print(f"Started at: {datetime.now()}")
pca_data_train, pca_data_test = apply_pca(data_train, data_test, n_components=80)
nb_pca_results = gaussian_naive_bayes(pca_data_train, label_train, pca_data_test)
print(f"Finished at: {datetime.now()}")

Started at: 2020-10-14 09:17:52.016334
Finished at: 2020-10-14 09:18:02.679172
Started at: 2020-10-14 09:18:02.679172
0.0
1.0
0.0
1.0
Finished at: 2020-10-14 09:18:11.127397


In [13]:
correct = 0

for n in range(label_test.shape[0]):
    if nb_results[n] == label_test[n]:
        correct += 1
        
print(f"Accuracy result for NB (raw) is: {correct/label_test.shape[0]}")

correct = 0

for n in range(label_test.shape[0]):
    if nb_pca_results[n] == label_test[n]:
        correct += 1
        
print(f"Accuracy result for NB (PCA) is: {correct/label_test.shape[0]}")

Accuracy result for NB (raw) is: 0.655
Accuracy result for NB (PCA) is: 0.757
