# SVM for Classification

Use the raw face images (vectorized) and the face vectors after PCA pre-processing (with di- mensionality of 80 and 200) as inputs to linear SVM. Try values of the penalty parameter C in {1 × 10<sup>-2</sup>, 1 × 10<sup>-1</sup>, 1}. Report the classification accuracy with different parameters and dimensions. Discuss the effect of data dimension and parameter C on the final classification accuracy.

### Read all the images

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits import mplot3d

In [2]:
def read_images():
    dir='PIE'
    PIE=os.listdir(dir)
    all_img = []
    img_folders=[dir+'/' + photo for photo in PIE]
    selected_subjects = np.random.randint(1,len(img_folders),20)
    
    selected_sub_folders = [img_folders[index] for index in selected_subjects]
    print('Selected Subjects:')
    print(selected_sub_folders)

    for img_dir in selected_sub_folders:
        img_path=os.listdir(img_dir)
        img_dir_index=[img_dir+'/' + photo for photo in img_path]
        all_img.append(img_dir_index)

    flat_list = []
    for sublist in all_img:
        for item in sublist:
            flat_list.append(item)
    
    images = np.array([plt.imread(image) for image in flat_list], dtype=np.int16)
    labels = np.zeros((len(images)))

    for i, path in enumerate(flat_list):
        labels[i] = path.split('/')[1]
    return images, labels

In [3]:
images, labels = read_images()
n_samples, image_size, _ = images.shape
n_features = image_size * image_size

Selected Subjects:
['PIE/36', 'PIE/4', 'PIE/40', 'PIE/57', 'PIE/49', 'PIE/20', 'PIE/41', 'PIE/45', 'PIE/14', 'PIE/55', 'PIE/54', 'PIE/29', 'PIE/59', 'PIE/12', 'PIE/1', 'PIE/10', 'PIE/67', 'PIE/8', 'PIE/59', 'PIE/68']


### Randomly select 500 images

In [4]:
def select_images(images, labels, select_num):
    num_images = len(images)
    np.random.seed(99)
    if select_num == len(images):
        return images.reshape([select_num, n_features]), labels
    rand_images_list = np.random.randint(0, num_images, select_num)
    X = images[rand_images_list].reshape([select_num, n_features])
    y = labels[rand_images_list]
    return X, y

In [11]:
X, y = select_images(images, labels, n_samples)

### PCA

In [23]:
def PCA(X):
    mean_data = np.mean(X,axis=0)
    centred_data = X - mean_data
    cov_matrix = np.cov(centred_data.T)
    [eig_val,eig_vec] = np.linalg.eig(cov_matrix)
    projected = np.dot(eig_vec, centred_data.T)
    return eig_val, eig_vec

In [24]:
def get_centred_data(X):
    mean_data = np.mean(X,axis=0)
    centred_data = X - mean_data
    return centred_data

In [25]:
def get_reduced_dim_data(eig_vec, X, dim):
    pca_vec = eig_vec[:,0:dim]
    return np.dot(X, pca_vec)

### Classifying the test images using the rule of nearest neighbor

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

In [27]:
X, y = select_images(images, labels, n_samples)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [37]:
dimensions = [40,80,200]
C = [pow(10,-2), pow(10,-1), 1]

In [None]:
for dim in dimensions:
    for c in C:
        print('Dimension: {}, C: {}'.format(dim, c))
        # reduce the dimensionality of training data
        eig_val, eig_vec = PCA(X_train)
        centred_data = get_centred_data(X_train)
        X_pca_train = get_reduced_dim_data(eig_vec, centred_data, dim).real

        # reduce the dimensionality of test data
        centred_data = get_centred_data(X_test)
        X_pca_test = get_reduced_dim_data(eig_vec, centred_data, dim).real

        # Create KNN classifier
        clf = LinearSVC(random_state=0, C=c, max_iter=10000)
        # Fit the classifier to the data
        clf.fit(X_pca_train, y_train)
        y_pred = clf.predict(X_pca_test)
        score = clf.score(X_pca_test, y_test)
        print(score)

Dimension: 40, C: 0.01
