In [90]:
import cv2
import numpy as np
import os
import re

from collections import defaultdict
from sklearn.metrics import accuracy_score

In [91]:
# Distance Measurement
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# 1NN Implement
## INPUT: data: numpy array of shape (n_samples, n_features), training data
##       labels: list, training data labels
##        input: numpy array of shape (x_samples, n_features), unlabeled data
## OUTPUT: Predicted Lable
def OneNN(data, labels, input, distance=euclidean_distance):
    predicted_labels = []
    for inp in input:
        distances = np.array([distance(inp, d) for d in data])
        nearest_index = np.argmin(distances)
        predicted_labels.append(labels[nearest_index])
    
    return predicted_labels

# Stratified Kfold Implement: Ensure that the proportion of categories in each fold is equal to 
#                             the proportion of categories in the original data source set
## INPUT: X: a list of data
##        y: a list of labels
##        k: kfold
##        random_seed: fix randomness
## OUTPUT: yield a iterable generator, containing train indices and valid/test indices
def stratified_kfold(X, y, k=5, random_seed=None):
    if random_seed is not None:
        np.random.seed(random_seed)
    
    _, class_indices = np.unique(y, return_inverse=True)
    class_counts = defaultdict(list)
    
    for idx, class_idx in enumerate(class_indices):
        class_counts[class_idx].append(idx)

    folds = [[] for _ in range(k)]
    
    for class_idx, indices in class_counts.items():
        np.random.shuffle(indices)
        
        for i, idx in enumerate(indices):
            folds[i % k].append(idx)

    for i in range(k):
        test_indices = folds[i]
        train_indices = [idx for j in range(k) if j != i for idx in folds[j]]
        yield train_indices, test_indices

In [92]:
class PCA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.mean = None
        self.components = None

    # Reduce matrix size using A.T @ A instead of A @ A.T
    def fit(self, X):
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean

        cov = np.cov(X_centered, rowvar=True)
        eigen_values, eigen_vectors = np.linalg.eig(cov)

        sorted_indices = np.argsort(eigen_values)[::-1]
        eigen_vectors = eigen_vectors[:, sorted_indices]
        self.components = (X_centered.T @ eigen_vectors)[:, :self.n_components]
    def transform(self, X):
        if self.components is not None:
            X_centered = X - self.mean
            return X_centered @ self.components
        return None


In [93]:
# Read in Dataset and Labels
filename = "ATT"
files = [f for f in os.listdir(filename) if '.png' in f]
dataset = [cv2.imread(filename + "/" + f, cv2.IMREAD_GRAYSCALE) for f in files]
dataset = [np.array(data, dtype=np.float64).flatten(order="C") for data in dataset]
labels = [re.search(r'(\d+)_', f).group(1) for f in files]

In [94]:
# In each fold, let pca fit train set with components 60, then use pca to transform train and test set
# Then do 1NN on test set
acc = []
for i, (trainidx, testidx) in enumerate(stratified_kfold(dataset, labels, random_seed=42)):
    x = np.array([dataset[i] for i in trainidx])
    y = [labels[i] for i in trainidx]
    testdata = np.array([dataset[i] for i in testidx])
    
    pca = PCA(60)
    pca.fit(x)
    x = pca.transform(x)
    testdata = pca.transform(testdata)

    y_pred = OneNN(x, y, testdata)
    y_true = [labels[t] for t in testidx]
    acc.append(accuracy_score(y_true, y_pred))
    print(f'{i}th epoch gets accuracy {accuracy_score(y_true, y_pred)}')

0th epoch gets accuracy 0.9875
1th epoch gets accuracy 0.925
2th epoch gets accuracy 0.95
3th epoch gets accuracy 0.9375
4th epoch gets accuracy 0.9625


In [95]:
# Result
print('The Average Prediction Accuracy is ', np.mean(acc))

The Average Prediction Accuracy is  0.9525
