# Q1 - PCA

## Import the libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
%matplotlib inline

## (A) Load the Data

In [None]:
def data_and_headers(filename):
    """Load a comma-separated file whose last column is the class label.

    The first non-blank line is the header row. Every other non-blank line
    is one record: all columns except the last are parsed as floats and the
    last column is kept as a string label.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    headers : numpy.ndarray
        1-D array with the column names from the header row.
    data_x : numpy.ndarray
        Float array of shape (n_records, n_columns - 1) with the features.
    data_y : numpy.ndarray
        String array of shape (n_records, 1) with the class labels.

    Raises
    ------
    IndexError
        If the file has no header line or a record has fewer columns than
        the header.
    ValueError
        If a feature value cannot be parsed as a float.
    """
    with open(filename) as fp:
        # Skip blank lines (for example a trailing newline at the end of the
        # file); they would otherwise be treated as a record and fail in
        # float('').
        rows = [line.strip().split(',') for line in fp if line.strip()]

    headers = np.asarray(rows[0])
    class_field = len(headers) - 1
    records = rows[1:]

    features = []
    labels = []
    for record in records:
        features.append([float(value) for value in record[:class_field]])
        # Indexing (not slicing) so a short record raises IndexError.
        labels.append([str(record[class_field])])

    # The explicit reshape keeps both results 2-D even when there are no
    # records, so callers can always rely on .shape[1].
    data_x = np.asarray(features, dtype=float).reshape(len(records), class_field)
    data_y = np.asarray(labels, dtype=str).reshape(len(records), 1)
    return headers, data_x, data_y

In [None]:
# Read the training and testing splits from the local "Data" directory.
# `headers` is assigned by both calls, so it ends up holding the header row
# of the test file.
headers, train_x, train_y = data_and_headers(os.path.join('Data', 'hw2q1_train.csv'))
headers, test_x, test_y = data_and_headers(os.path.join('Data', 'hw2q1_test.csv'))

In [None]:
# Print, for each split, the feature/target column counts, the number of
# observations and how many observations carry each class label ('R', 'M').
# The two reports are separated by one blank line.
reports = []
for heading, features, targets in (
    ('Training Data', train_x, train_y),
    ('Testing Data', test_x, test_y),
):
    lines = [
        heading,
        'Number of features - {}'.format(features.shape[1]),
        'Number of target features - {}'.format(targets.shape[1]),
        'Number of observations - {}'.format(features.shape[0]),
    ]
    for label in ('R', 'M'):
        lines.append(
            'Number of observations in category {} - {}'.format(
                label, np.count_nonzero(targets == label)
            )
        )
    reports.append('\n'.join(lines))
print('\n\n'.join(reports))

## (B) Normalization and PCA

In [None]:
def normalize(data, minima, maxima):
    """Min-max scale ``data`` with the supplied per-feature bounds.

    Computes ``(data - minima) / (maxima - minima)`` with NumPy broadcasting
    and returns a new array; ``data`` itself is not modified.

    Parameters
    ----------
    data : array_like
        Values to scale.
    minima : array_like
        Lower bound(s), broadcastable against ``data``.
    maxima : array_like
        Upper bound(s), broadcastable against ``data``.

    Returns
    -------
    numpy.ndarray
        The scaled values. Where ``maxima == minima`` the offset
        ``data - minima`` is returned unscaled (0 for values equal to the
        bound) instead of NaN/inf.
    """
    span = np.asarray(maxima) - np.asarray(minima)
    # A constant feature has a zero range; dividing by it would produce
    # NaN/inf. Divide by 1 there instead.
    safe_span = np.where(span == 0, 1, span)
    return (np.asarray(data) - minima) / safe_span

In [None]:
# Per-feature bounds come from the training set only and are reused for the
# test set, so both splits are scaled with the same transformation.
train_min = train_x.min(axis=0)
train_max = train_x.max(axis=0)
normal_train = normalize(train_x, train_min, train_max)
normal_test = normalize(test_x, train_min, train_max)

### (i) Covariance of Training Dataset

In [None]:
# Covariance between features: rowvar=False makes np.cov treat every column
# of normal_train as one variable and every row as one observation.
covariance = np.cov(normal_train, rowvar=False)

# Draw the matrix as a colour map on a single 15 x 12 inch axes, with a
# colour bar showing the value scale.
fig, axes = plt.subplots()
fig.set_size_inches(15, 12)
im = axes.pcolor(covariance, cmap='CMRmap')
fig.colorbar(im, ax=axes)
plt.show()

### (ii) Eigenvalues and Eigenvectors

In [None]:
print('Size of covariance matrix - ' + str(covariance.shape))
# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# in ascending order, whereas eig gives no ordering guarantee and may return
# a complex dtype.
w, v = np.linalg.eigh(covariance)
# Flip to descending order so w[0] is the largest eigenvalue; the columns of
# v are flipped the same way so that v[:, i] still belongs to w[i].
w, v = w[::-1], v[:, ::-1]
# The five largest eigenvalues, listed from smallest to largest.
print('Top 5 Eigenvalues - ' + ', '.join('{:.3f}'.format(x) for x in w[:5][::-1]))

### (iii) Plot of Eigenvalues

In [None]:
# Sort the eigenvalues from largest to smallest before plotting: the
# cumulative curve on the right is only a meaningful "fraction of variance
# explained by the first k components" when they are in that order.
# np.real drops the imaginary part in case the solver returned a complex dtype.
eigenvalues = np.sort(np.real(w))[::-1]
# Derive the x positions from the data instead of hard-coding the feature count.
positions = np.arange(eigenvalues.size)

fig, axes = plt.subplots(nrows=1, ncols=2)
fig.set_figheight(9)
fig.set_figwidth(24)
# Left: individual eigenvalues. Right: cumulative share of their total.
axes[0].bar(positions, eigenvalues)
axes[1].plot(positions, np.cumsum(eigenvalues) / np.sum(eigenvalues))
plt.show()