# Principal Component Analysis

Reference:

_Zaki, Mohammed J., and Wagner Meira Jr. Data Mining and Analysis: Fundamental Concepts and Algorithms. Cambridge University Press, 2014._

Chapter 7 - Dimensionality Reduction

In [1]:
%matplotlib inline

import numpy as np
import scipy as sc
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Covariance matrix of the three-attribute example.
SIGMA = np.array(
    [
        [0.681, -0.039, 1.265],
        [-0.039, 0.187, -0.320],
        [1.265, -0.320, 3.092],
    ]
)

# Eigendecomposition: column k of `eigvecs` is the eigenvector paired with
# `eigvalues[k]`.
eigvalues, eigvecs = np.linalg.eig(SIGMA)

# Reorder both so the eigenvalues run from largest to smallest; the
# eigenvectors are permuted column-wise to stay aligned with them.
eigvalues_indsort = eigvalues.argsort()[::-1]
eigvalues = eigvalues[eigvalues_indsort]
eigvecs = eigvecs[:, eigvalues_indsort]

print(eigvalues)
print(eigvecs)

[3.66150223 0.23962849 0.05886927]
[[-0.39013364  0.63894319  0.66298367]
 [ 0.08878534  0.7427908  -0.66361058]
 [-0.91646763 -0.20003358 -0.34651646]]


## First principal component

In [3]:
# The leading eigenpair gives the first principal component: the eigenvalue is
# the variance along it, and the matching column of `eigvecs` is its direction.
firstpc_var = eigvalues[0]
firstpc = eigvecs[:, 0]

print("First eigenvalue: {}".format(firstpc_var))
print(firstpc)

# MSE of the one-dimensional projection: total variance (the trace of SIGMA)
# minus the variance retained by the first component.
mse_firstpc = np.trace(SIGMA) - firstpc_var

mse_firstpc

First eigenvalue: 3.6615022318061774
[-0.39013364  0.08878534 -0.91646763]


0.29849776819382257

## Second principal component

In [4]:
# The second eigenpair gives the second principal component: its variance and
# its direction (the second column of `eigvecs`).
secondpc_var = eigvalues[1]
secondpc = eigvecs[:, 1]

print("Second eigenvalue: {}".format(secondpc_var))
print(secondpc)

# MSE of the projection onto the first TWO principal components: total
# variance (the trace of SIGMA) minus the variance retained by both of them.
mse_secondpc = np.trace(SIGMA) - firstpc_var - secondpc_var

mse_secondpc

Second eigenvalue: 0.2396284937166691
[ 0.63894319  0.7427908  -0.20003358]


0.058869274477153455

## Third principal component

In [5]:
# The third eigenpair gives the last principal component: its variance and
# its direction (the third column of `eigvecs`).
thirdpc_var = eigvalues[2]
thirdpc = eigvecs[:, 2]

print("Third eigenvalue: {}".format(thirdpc_var))
print(thirdpc)

# MSE of the projection onto ALL THREE principal components. Nothing is
# discarded, so the result is zero up to floating-point round-off (it can come
# out as a tiny negative number).
mse_thirdpc = np.trace(SIGMA) - firstpc_var - secondpc_var - thirdpc_var

mse_thirdpc

Third eigenvalue: 0.05886927447715408
[ 0.66298367 -0.66361058 -0.34651646]


-6.245004513516506e-16

# Kernel Principal Component Analysis

In [68]:
from sklearn.datasets import load_iris

In [153]:
iris = load_iris()

# Columns 0 and 1 of the data matrix are used as sepal length and sepal width.
sepal_length, sepal_width = iris.data[:, 0], iris.data[:, 1]

In [154]:
# Two derived attributes: X1 is a quadratic (nonlinear) combination of the
# sepal measurements, X2 is the sepal width unchanged.
length_sq = sepal_length * sepal_length
width_sq = sepal_width * sepal_width
cross_term = 0.1 * sepal_length * sepal_width

X1 = 0.2 * length_sq + width_sq + cross_term
X2 = sepal_width

In [155]:
# Put the two derived attributes side by side as the columns of X.
X = np.array([X1, X2]).T

In [156]:
# Import the submodule explicitly: a bare `import sklearn as sk` does not
# guarantee that `sk.preprocessing` has been loaded on every scikit-learn
# version (it may only resolve because another import pulled it in).
from sklearn.preprocessing import scale

# Centre each column on zero mean; with_std=False leaves the variances as is.
X = scale(X, axis=0, with_std=False)

In [157]:
# Covariance of the columns of X, normalised by the number of rows n (not
# n - 1). This is a covariance only if X has already been mean-centred.
n_rows = X.shape[0]
covariance_matrix = np.dot(X.T, X) / n_rows

# Alternative: np.cov(X, rowvar=False) -- note it divides by n - 1 by default.

In [158]:
# Eigendecomposition of the covariance matrix: column k of `eigvecs` is the
# eigenvector paired with `eigvalues[k]`.
eigvalues, eigvecs = np.linalg.eig(covariance_matrix)

# Sort by decreasing eigenvalue, permuting the eigenvector columns to match.
eigvalues_indsort = eigvalues.argsort()[::-1]
eigvalues = eigvalues[eigvalues_indsort]
eigvecs = eigvecs[:, eigvalues_indsort]

print(eigvalues)

print(eigvecs)

[12.39104468  0.07302727]
[[ 0.99537316 -0.09608476]
 [ 0.09608476  0.99537316]]


In [143]:
# Cross-check of the manual eigendecomposition using scikit-learn's PCA.
from sklearn.decomposition import PCA

# Keep two components, i.e. as many as X has columns.
pca = PCA(n_components=2)

# Fit on the same X that was used for the manual covariance computation.
model = pca.fit(X)

# Displayed for comparison with the `eigvecs` printed by the manual computation:
# in the recorded outputs the rows here match the columns there, up to sign.
model.components_

array([[ 0.99537316,  0.09608476],
       [ 0.09608476, -0.99537316]])