### Importing Libraries

In [8]:
import numpy as np
import pandas as pd
import cv2 as cv
import glob

In [9]:
# Glob pattern for the dataset folders when the notebook runs locally.
path = "./archive/s*"

# Alternative pattern for a Google Colab session; uncomment to use it instead.
# path = '/content/drive/MyDrive/Pattern/eigen-faces/archive/s*'

### 1. Download the Dataset and Understand the Format (10 Points)
### 2. Generate the Data Matrix and the Label vector (10 Points)

In [10]:
# Number of pixels per image; every image is flattened into one 1 x N row.
# NOTE(review): 10304 presumably corresponds to 92 x 112 images -- confirm
# against the dataset.
N = 10304

# glob returns matches in arbitrary, filesystem-dependent order. Both the label
# assigned to each folder and the odd/even train/test split further down depend
# on row order, so sort the listings to make every run reproducible.
dirs = sorted(glob.glob(path))

# Collect rows in plain lists and stack once at the end; calling np.append
# inside the loop re-copies the whole matrix for every image.
rows = []
labels = []
for i, subject_dir in enumerate(dirs, 1):  # labels start at 1, one per folder
    for f in sorted(glob.glob(subject_dir + "/*")):
        img = cv.imread(f, -1)  # -1: load the image unchanged
        if img is None:
            # cv.imread signals an unreadable file by returning None
            raise ValueError(f"could not read image: {f}")
        rows.append(np.reshape(img, (1, N)))
        labels.append(i)

# Data matrix: one flattened image per row, stored as float64.
D = np.vstack(rows).astype(np.float64) if rows else np.empty((0, N))
# Label vector aligned with the rows of D (float64, as before).
y = np.array(labels, dtype=np.float64)

print(D.shape)
print(y.shape)

(400, 10304)
(400,)


### 3. Split the Dataset into Training and Test sets (10 Points)

In [6]:
# Rows at odd indices (1, 3, 5, ...) form the training set.
d_train, y_train = D[1::2], y[1::2]

# Rows at even indices (0, 2, 4, ...) form the test set.
d_test, y_test = D[::2], y[::2]


### 4. Classification using PCA (30 points)

In [5]:
# Display the dimensions of the training matrix as a quick sanity check.
d_train.shape

(200, 10304)

In [6]:
# Column-wise mean of the training samples; keepdims leaves it as a 1 x N row
# so it broadcasts against every row of d_train.
mean = d_train.mean(axis=0, keepdims=True)
print("Mean shape:", mean.shape)

# Centre the training data around that mean.
Z = d_train - mean
print("Z shape:", Z.shape)

# Covariance between features. np.cov treats each row as a variable, hence the
# transpose; bias=True normalises by the number of samples.
cov = np.cov(Z.T, bias=True)

# Eigen-decomposition of the symmetric covariance matrix.
eival, eivec = np.linalg.eigh(cov)

# Reorder so the largest eigenvalue comes first, and permute the eigenvector
# columns the same way so they stay paired with their eigenvalues.
idx = np.argsort(eival)[::-1]
eival = eival[idx]
new_eivec = eivec[:, idx]

# Sum of all eigenvalues; the next cell divides by it to get the fraction of
# variance explained by each component.
eival_sum = eival.sum()

Mean shape: (1, 10304)
Z shape: (200, 10304)


In [7]:
# Target fractions of explained variance to retain.
alphas = [0.8, 0.85, 0.9, 0.95]

# Running fraction of variance explained by the first k components. It does not
# depend on alpha, so compute it once instead of re-accumulating per alpha.
cum_exp_var = np.cumsum(eival / eival_sum)

proj_matrices = []
for alpha in alphas:
    # Positions where the running fraction has reached alpha; the first one
    # gives the smallest number of components that is sufficient.
    reached = np.flatnonzero(cum_exp_var >= alpha)
    if reached.size:
        i = int(reached[0]) + 1
    else:
        # Rounding can keep the running sum just below alpha (e.g. alpha = 1.0).
        # Keep every component instead of indexing past the end of eival.
        i = len(eival)
    # Projection matrix: the i leading eigenvectors as columns.
    proj_matrix = new_eivec[:, :i]
    proj_matrices.append(proj_matrix)

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 1-nearest-neighbour on the raw, unprojected pixel vectors.
knn_org = KNeighborsClassifier(n_neighbors=1)
knn_org.fit(d_train, y_train)
score_org = knn_org.score(d_test, y_test)

print(score_org)

# Same classifier after projecting onto each PCA basis, one basis per alpha.
# NOTE: the projection is applied to the uncentred data. Centring would shift
# every projected point by the same vector, which leaves Euclidean
# nearest-neighbour results unchanged.
scores = []
for proj_matrix in proj_matrices:
    d_train_new = d_train.dot(proj_matrix)
    d_test_new = d_test.dot(proj_matrix)

    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(d_train_new, y_train)

    score = knn.score(d_test_new, y_test)
    scores.append(score)

print(scores)


0.965
[0.965, 0.965, 0.96, 0.965]


### 5. Classification using LDA (30 points)

In [11]:
# calculate class means
# Group the training rows by their label instead of assuming that every class
# occupies a contiguous block of exactly five rows; fixed slicing would silently
# mix classes if a folder held a different number of images.
class_labels = np.unique(y_train)  # sorted distinct labels

# One matrix per class, holding that class's training rows in original order.
class_matrices = [d_train[y_train == label] for label in class_labels]

# Row k is the mean sample of class_labels[k]; shape (number of classes, N).
# The comprehension keeps its loop variable local, so the `mean` computed in
# the PCA cell is no longer overwritten here.
means = np.vstack([class_matrix.mean(axis=0) for class_matrix in class_matrices])

# Mean over all training samples, kept as a 1 x N row.
overall_mean = np.mean(d_train, axis=0, keepdims=True)
#print(means)

In [12]:
# Between-class scatter matrix: for every class, nk times the outer product of
# (class mean - overall mean) with itself, summed over the classes.
nk = 5  # samples per class in the training set
Sb = np.zeros((N, N))

# Column-vector form of the overall mean; the same for every class, so it is
# reshaped once before the loop.
overall_mean = overall_mean.reshape(N, 1)

for mean in means:
    mean = mean.reshape(N, 1)
    diff = mean - overall_mean
    # (N, 1) times (1, N) gives the N x N outer product.
    Sb += nk * diff.dot(diff.T)

In [13]:
# Centre each class matrix by subtracting that class's mean from its rows.
Zs = []
for k in range(40):
    # means[k] has shape (N,) and broadcasts across the rows of class k.
    Z = class_matrices[k] - means[k]
    Zs += [Z]

In [None]:
# calculate class scatter matrices
Ss = []
for i in range(40):
    Si = np.dot(Z)