**Principal Component Analysis**

You will implement dimensionality reduction with PCA.  

1). Read iris_dataset.csv (4 features, hence 4 PCs)

2). Find the principal components

3). Recontruct the dataset (X_hat)

4). Determine the accuracy of X_hat for 1 PC and 4 PCs using LDA classifier (provided below)


In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import linalg as LA

df = pd.read_csv("iris_dataset.csv", header=None)
print(df.describe())
data = df.values

                0           1           2           3           4
count  150.000000  150.000000  150.000000  150.000000  150.000000
mean     5.843333    3.057333    3.758000    1.199333    2.000000
std      0.828066    0.435866    1.765298    0.762238    0.819232
min      4.300000    2.000000    1.000000    0.100000    1.000000
25%      5.100000    2.800000    1.600000    0.300000    1.000000
50%      5.800000    3.000000    4.350000    1.300000    2.000000
75%      6.400000    3.300000    5.100000    1.800000    3.000000
max      7.900000    4.400000    6.900000    2.500000    3.000000


In [15]:

shuffled_data = data;
np.random.shuffle(shuffled_data)
X = shuffled_data[:,0:4]  # 150x4


In [16]:
def evaluate_performance(Xhat, Num_PC, recon_error):
  from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
  from sklearn.model_selection import cross_val_score

  no_dim = Num_PC 
  X_train = Xhat[:,0:Num_PC]       
  y_train = data[:,4]

  model_mean_scores = []
  model = LinearDiscriminantAnalysis().fit(X_train, y_train)
  scores = cross_val_score(model, X_train, y_train, cv=10)
  model_mean_scores.append(np.mean(scores))

  print('Reconstruction error = {0:0.6f} with {1:1d} PCs, average accuracy = {2:0.4f}'
     .format(recon_error, Num_PC, model_mean_scores[0]))

In [17]:
def create_pc(X, Num_PC):
  u = [np.mean(X[:,0]), np.mean(X[:,1]), np.mean(X[:,2]), np.mean(X[:,3])]
  XM = np.subtract(X,u)
  cov = np.cov(XM.T)
  w, Phi = LA.eig(cov)
  index = np.argsort(w)[::-1]
  w = w[index]
  Phi = Phi[:,index]
  PC = Phi[:,0:Num_PC]
  Y = np.dot(XM,PC)
  Xhat = np.add(np.dot(Y,PC.T),u)
  recon_error = (np. linalg.norm(X)-np.linalg.norm(Xhat))/np.linalg.norm(X)

  return Phi, Xhat, recon_error

In [18]:
PC, X_hat, recon_err = create_pc(X,1)

In [19]:
PC, X_hat, recon_err = create_pc(X,1)
evaluate_performance(X_hat, 1, recon_err)

PC, X_hat, recon_err = create_pc(X,4)
evaluate_performance(X_hat, 4, recon_err)

Reconstruction error = 0.002696 with 1 PCs, average accuracy = 0.9333
Reconstruction error = 0.000000 with 4 PCs, average accuracy = 0.9800
