# Let's compute the PCA step by step

Data preparation

In [None]:
import seaborn as sns 
import numpy as np
import pandas as pd

# Fetch the iris data set through seaborn
iris = sns.load_dataset("iris")

# Take the non-numerical "species" column out of the frame and keep it as
# the labels for later
labels = iris.pop("species")

iris

### (1) Standardize

In [None]:
iris_s = iris  # NOTE: same object as `iris` (no copy), so `iris` is standardized in place too
for name in iris_s.columns:
    column = iris_s[name]
    # z-score: subtract the column mean, then divide by the column standard deviation
    iris_s.loc[:, name] = (column - column.mean()) / column.std()

### (2) Compute the principal components

In [None]:
# Sample covariance matrix of the standardized (zero-mean) data.
# Dividing by n - 1 matches the ddof=1 convention of the pandas .std() used
# for the standardization, so the diagonal of `cov` is exactly 1.
n_samples = iris_s.shape[0]
cov = np.dot(iris_s.T, iris_s) / (n_samples - 1)

# `cov` is symmetric, so use eigh: it returns real eigenvalues and orthonormal
# eigenvectors. eigh yields the eigenvalues in ascending order, so re-sort them
# in descending order; the "first" eigenvectors are then the directions of
# largest variance. The sign of each eigenvector is arbitrary.
eigvals, eigvecs = np.linalg.eigh(cov)
order = np.argsort(eigvals)[::-1]
eig = (eigvals[order], eigvecs[:, order])  # (eigenvalues, matrix V with eigenvectors as columns)
eig # First array is the eigenvalues v1, v2, v3, v4. Second array is the matrix V

In [None]:
# Eigenvalues: the first entry of the `eig` result computed in the previous cell
eig[0]

In [None]:
# First eigenvector: column 0 of the matrix V (eigenvectors are stored column-wise),
# paired with the eigenvalue eig[0][0]
eig[1][:,0]

In [None]:
# Selecting the first two eigenvectors (columns 0 and 1 of V)
eig[1][:,0:2]

In [None]:
# Sanity check: V * V^T should be the identity matrix.
# The product is rounded to whole numbers; abs() presumably just turns any -0.0 into 0.0.
pd.DataFrame(data=eig[1] @ eig[1].T).round().abs()

### (3) Use the eigenvectors for change of basis of the data

In [None]:
# Change of basis: express the standardized data in the basis of all eigenvectors (columns of V)
pd.DataFrame(np.asarray(iris_s) @ eig[1])

### (4) Discard less significant PCs

In [None]:
# Keep only the first two eigenvectors: project the standardized data onto columns 0 and 1 of V
pd.DataFrame(np.asarray(iris_s) @ eig[1][:, :2])

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(5,4))

res = pd.DataFrame(np.dot(iris_s, eig[1][:,0:2]))
sns.scatterplot(res[0], res[1], hue=labels)
plt.legend(title="")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
#plt.savefig("example_iris_pca.pdf")

# Let's compare with ready-made software from sklearn

In [None]:
from sklearn.decomposition import PCA 

# Fit on the standardized data explicitly. `iris_s` is the frame that was
# standardized above; using it makes the comparison with the manual
# computation independent of `iris` having been modified in place.
n_components = 4
pca = PCA(n_components=n_components)
pcs = pca.fit_transform(iris_s)
pd.DataFrame(data=pcs) # Same results! (individual components may differ in sign)

In [None]:
# Explained variance of each component, normalized (fraction of the total variance)
pca.explained_variance_ratio_

In [None]:
# Keep only the two leading principal components. Fit on the standardized
# frame `iris_s` explicitly, so the comparison does not depend on `iris`
# having been modified in place.
n_components = 2
pca = PCA(n_components=n_components)
pcs = pca.fit_transform(iris_s)
pd.DataFrame(data=pcs) # Same results! (individual components may differ in sign)