# IE7275 - PCA (eigen-decomposition approach)

### We demonstrate how to solve a variance maximization problem to find the principal components of the Iris data

By Yilin Yin and Chun-An Chou

In [1]:
from sklearn import datasets
from sklearn.decomposition import PCA
import numpy as np

# import the solver for the eigenvalue-eigenvector problem
from numpy.linalg import eig

In [4]:
# Load the Iris dataset that ships with scikit-learn and keep its four
# feature columns as the data matrix X (one row per sample).
iris = datasets.load_iris()
X = iris.data[:, :4]

In [7]:
# Center the data: subtract each variable's (column's) mean from that column
Mu = X.mean(axis=0)
C = X - Mu

In [9]:
# Covariance matrix of the centered data; rowvar=False tells np.cov that
# the variables are stored in columns and the observations in rows.
Cov = np.cov(C, rowvar=False)
print("\n Covariance:\n", Cov)


 Covariance:
 [[ 0.68569351 -0.042434    1.27431544  0.51627069]
 [-0.042434    0.18997942 -0.32965638 -0.12163937]
 [ 1.27431544 -0.32965638  3.11627785  1.2956094 ]
 [ 0.51627069 -0.12163937  1.2956094   0.58100626]]


In [13]:
# Solve the eigenvalue problem for the covariance matrix.
# Evalues[i] is the eigenvalue paired with the eigenvector in column Evector[:, i].
Evalues, Evector = np.linalg.eig(Cov)

In [14]:
# Display the eigenvalues of the covariance matrix Cov
Evalues

array([4.22824171, 0.24267075, 0.0782095 , 0.02383509])

In [15]:
# Display the eigenvectors of Cov; each column is one eigenvector
# (numpy.linalg.eig convention), in the same order as Evalues
Evector

array([[ 0.36138659, -0.65658877, -0.58202985,  0.31548719],
       [-0.08452251, -0.73016143,  0.59791083, -0.3197231 ],
       [ 0.85667061,  0.17337266,  0.07623608, -0.47983899],
       [ 0.3582892 ,  0.07548102,  0.54583143,  0.75365743]])

components: 
 [[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
 [ 0.65658877  0.73016143 -0.17337266 -0.07548102]
 [-0.58202985  0.59791083  0.07623608  0.54583143]
 [-0.31548719  0.3197231   0.47983899 -0.75365743]]

 variance:
 [0.92461872 0.05306648 0.01710261 0.00521218]


### We confirm that the total information (variance) in the data is the same before and after PCA

In [18]:
# Total variance (information) in the data = sum of the diagonal of the
# covariance matrix, i.e. its trace
np.trace(Cov)

4.572957046979869

In [20]:
# Sum of the eigenvalues of Cov, for comparison with the trace of the
# covariance matrix computed in the previous cell
sum(Evalues)

In [None]:
# Verify the eigen-decomposition results against scikit-learn's PCA implementation

In [21]:
# Fit scikit-learn's PCA on the raw data, keeping all four components, and
# project the samples onto the principal axes.
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X)

# NOTE: despite its name, this holds the explained variance *ratio*
# (pca.explained_variance_ratio_), not the raw explained variance.
explained_variance = pca.explained_variance_ratio_

# Each row of components_ is one principal axis.
print("components: \n", pca.components_)
print("\n variance:\n", explained_variance)

components: 
 [[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
 [ 0.65658877  0.73016143 -0.17337266 -0.07548102]
 [-0.58202985  0.59791083  0.07623608  0.54583143]
 [-0.31548719  0.3197231   0.47983899 -0.75365743]]

 variance:
 [0.92461872 0.05306648 0.01710261 0.00521218]
