In [1]:
import numpy as np

- The first step is to calculate the mean values of each column
  - M = mean(A)
- Center the values in each column by subtracting that column's mean
  - C=A−M
- Calculate the covariance matrix of the centered matrix C, which holds the covariance between every pair of columns
  - V = cov(C)
- calculate the eigendecomposition of the covariance matrix V
  - values, vectors = eig(V )
- select k eigenvectors, called principal components, that have the k largest eigenvalues.
  - B = select(values, vectors)
- Once chosen, the centered data can be projected into the subspace via matrix multiplication
  - P = B^T · C^T (each row of P is one principal component; transpose P to get one row per sample)

In [4]:
# principal component analysis, computed step by step with NumPy

# define matrix (3 rows, 2 columns)
A = np.array([
  [1, 2],
  [3, 4],
  [5, 6]])
print(A)
# column means
M = A.mean(axis=0)
# center columns by subtracting column means
C = A - M
# covariance matrix of the centered columns; rowvar=False tells np.cov that
# each column (rather than each row) is a variable
V = np.cov(C, rowvar=False)
# factorize covariance matrix; V is symmetric, so use eigh, which returns
# real eigenvalues in ascending order (eig makes no ordering guarantee)
values, vectors = np.linalg.eigh(V)
# reverse so the eigenvector with the largest eigenvalue comes first;
# eigenvectors are the columns of `vectors`
values, vectors = values[::-1], vectors[:, ::-1]
print(vectors)
print(values)
# project the centered data onto the eigenvectors; P has one component per
# row, so transpose it to print one row per sample
P = vectors.T.dot(C.T)
print(P.T)

# For this matrix only the first eigenvalue is non-zero, so the first
# eigenvector alone is required: we could project our 3 × 2 matrix onto a
# 3 × 1 matrix with little loss.

[[1 2]
 [3 4]
 [5 6]]
[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]
[8. 0.]
[[-2.82842712  0.        ]
 [ 0.          0.        ]
 [ 2.82842712  0.        ]]


In [5]:
# PCA again, this time delegating the work to scikit-learn
from sklearn.decomposition import PCA

# the 3 x 2 example matrix
A = np.array([[1, 2], [3, 4], [5, 6]])
print(A)

# build a PCA transform that keeps two components, then fit it to A
pca = PCA(n_components=2)
pca.fit(A)

# the fitted components and the variance explained by each of them
print(pca.components_)
print(pca.explained_variance_)

# apply the fitted transform to A
B = pca.transform(A)
print(B)

[[1 2]
 [3 4]
 [5 6]]
[[ 0.70710678  0.70710678]
 [ 0.70710678 -0.70710678]]
[8.00000000e+00 2.25080839e-33]
[[-2.82842712e+00  2.22044605e-16]
 [ 0.00000000e+00  0.00000000e+00]
 [ 2.82842712e+00 -2.22044605e-16]]
