In [97]:
import cv2
import numpy as np
import os
import re

from collections import defaultdict
from sklearn.metrics import accuracy_score

In [98]:
# Distance Measurement
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# 1NN Implement
## INPUT: data: a list of image numpy array
##       labels: a list of labels
##        input: a list of unlabeled data
## OUTPUT: Predicted Lable
def OneNN(data, labels, input, distance=euclidean_distance):
    distances = [distance(input, train_point) for train_point in data]
    nearest_index = np.argmin(distances)
    return labels[nearest_index]

# Stratified Kfold Implement: Ensure that the proportion of categories in each fold is equal to 
#                             the proportion of categories in the original data source set
## INPUT: X: a list of data
##        y: a list of labels
##        k: kfold
##        random_seed: fix randomness
## OUTPUT: yield a iterable generator, containing train indices and valid/test indices
def stratified_kfold(X, y, k=5, random_seed=None):
    if random_seed is not None:
        np.random.seed(random_seed)
    
    _, class_indices = np.unique(y, return_inverse=True)
    class_counts = defaultdict(list)
    
    for idx, class_idx in enumerate(class_indices):
        class_counts[class_idx].append(idx)

    folds = [[] for _ in range(k)]
    
    for class_idx, indices in class_counts.items():
        np.random.shuffle(indices)
        
        for i, idx in enumerate(indices):
            folds[i % k].append(idx)

    for i in range(k):
        test_indices = folds[i]
        train_indices = [idx for j in range(k) if j != i for idx in folds[j]]
        yield train_indices, test_indices

In [99]:
# Read in Dataset and Labels
filename = "ATT"
files = [f for f in os.listdir(filename) if '.png' in f]
dataset = [cv2.imread(filename + "/" + f, cv2.IMREAD_GRAYSCALE) for f in files]
labels = [re.search(r'(\d+)_', f).group(1) for f in files]

In [None]:
# For test set in each fold, do 1NN with the rest train set
acc = []
for trainidx, testidx in stratified_kfold(dataset, labels, random_seed=42):
    x = [dataset[i] for i in trainidx]
    y = [labels[i] for i in trainidx]
    testdata = [dataset[i] for i in testidx]
    y_pred = [OneNN(x, y, t) for t in testdata]
    y_true = [labels[t] for t in testidx]
    acc.append(accuracy_score(y_true, y_pred))

The Average Prediction Accuracy is  0.99


In [102]:
# Result
print('The Average Prediction Accuracy is ', np.mean(acc))

The Average Prediction Accuracy is  0.99
