## PCA using eigen decomposition:

In [1]:
import pandas as pd
import numpy as np

In [2]:
_d = [[0,0],[1,2], [2, 3], [3,6], [4,8], [5,9]]
columns = ['x', 'y']
data = pd.DataFrame(_d, columns=columns)
data

Unnamed: 0,x,y
0,0,0
1,1,2
2,2,3
3,3,6
4,4,8
5,5,9


### Calculate covariance matrix: 

In [3]:
covariance_matrix = np.cov(data.T)

In [4]:
covariance_matrix

array([[ 3.5       ,  6.6       ],
       [ 6.6       , 12.66666667]])

### Calculate eigen decomposition:

In [5]:
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)

In [6]:
print(eigen_values)
print(eigen_vectors)

[ 0.04797743 16.11868923]
[[-0.88611393 -0.46346747]
 [ 0.46346747 -0.88611393]]


In [7]:
# ===> sorting order based on eigen_values
_idx = eigen_values.argsort()[::-1]
eigen_values = eigen_values[_idx]
eigen_vectors = eigen_vectors[:, _idx]
eigen_vectors

array([[-0.46346747, -0.88611393],
       [-0.88611393,  0.46346747]])

In [8]:
new_data = np.linalg.inv(eigen_vectors) @ data.T
new_data = pd.DataFrame(new_data.T)
new_data.columns = ['x', 'y']
new_data

Unnamed: 0,x,y
0,0.0,0.0
1,-2.235695,0.040821
2,-3.585277,-0.381825
3,-6.707086,0.122463
4,-8.942781,0.163284
5,-10.292363,-0.259362


In [9]:
np.var(new_data)

x    13.432241
y     0.039981
dtype: float64

In [10]:
100 * np.var(new_data) / np.var(new_data).sum()

x    99.703232
y     0.296768
dtype: float64

### Check the values with PCA:

In [11]:
from sklearn.decomposition import PCA

In [12]:
pca = PCA(svd_solver='randomized', random_state=100)
pca.fit(data)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=100,
  svd_solver='randomized', tol=0.0, whiten=False)

In [13]:
pca.components_

array([[-0.46346747, -0.88611393],
       [ 0.88611393, -0.46346747]])

In [14]:
pca.explained_variance_

array([16.11868923,  0.04797743])

In [15]:
pca.explained_variance_ratio_

array([0.99703232, 0.00296768])

In [16]:
ratings = pd.read_csv("https://cdn.upgrad.com/UpGrad/temp/4be1d70d-c298-4cd3-a6b9-9e1d35e823e8/Ratings.csv")
ratings.head()

Unnamed: 0,B1,B2,B3,B4,B5
0,1,0,4,0,3
1,2,3,4,3,2
2,3,3,2,4,2
3,4,4,3,5,4
4,5,1,4,2,2


In [17]:
pca2 = PCA(svd_solver='randomized', random_state=10)
pca2.fit(ratings)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=10,
  svd_solver='randomized', tol=0.0, whiten=False)

In [18]:
pca2.components_

array([[-0.31118649, -0.48101502,  0.35622281, -0.69199674, -0.25696953],
       [ 0.90672615, -0.31113879,  0.25507174, -0.09225738,  0.08641369],
       [-0.12857449, -0.38163517, -0.34923551, -0.16484744,  0.82986931],
       [ 0.1452065 , -0.34290409, -0.79235694, -0.05662208, -0.47989194],
       [-0.20831306, -0.63920934,  0.24134777,  0.69444104, -0.08671783]])

In [19]:
pca2.explained_variance_ratio_.round(3)

array([0.652, 0.153, 0.131, 0.064, 0.   ])