### PCA IMPLEMENTATION FROM SCRATCH

In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.datasets import load_breast_cancer

In [6]:
cancerData = load_breast_cancer()
X= cancerData["data"]
X.shape

(569, 30)

## Step 1 : Column standarsaization

In [9]:
mu = X.mean(axis=0) # calculaye mean column wise
X_ = (X-mu)
X_.shape

(569, 30)

### Step 2: Covariance Matrix

In [15]:
S = np.dot(X_.T,X_)  # S = XT.X
S.shape              #It should be 30*30 matrix

(30, 30)

### Step 3: Eigen Values and Eigen Vector / Eigen Values Decomposition

In [54]:
lamdas, vectors = np.linalg.eig(S) 
vectors.shape
# returns w : eigen value's array and v: normalized unit eignen vectors,
# shape of w(lamda) is 30*1, all 30 eigen values are stored in 30 different rows
# shape of v(eigenVectors) is 30*30. If we want vector v1, then we need to consider column 1 since column wise vectors are stores. So there there are 30 features, then there will be 30*30 vectors
# v1 = vectors[:0]
# v2 = vectors[:1]

(30, 30)

In [None]:
v2 = vectors[:2]
v2

In [26]:
# To get top 2 eigen vectors are
V = vectors[:,:2]
V.shape

(30, 2)

## Step 4 : Projection

In [27]:
X_.shape

(569, 30)

In [28]:
V.shape

(30, 2)

In [44]:
X_new = np.dot(X_,V)  #projection will computed using dot product
X_new.shape           # now we will be having 2 features --> 2 columns and 569 samples --> 569 rows

(569, 2)

In [39]:
X_new 

array([[1160.1425737 , -293.91754364],
       [1269.12244319,   15.63018184],
       [ 995.79388896,   39.15674324],
       ...,
       [ 314.50175618,   47.55352518],
       [1124.85811531,   34.12922497],
       [-771.52762188,  -88.64310636]])

### PCA IMPLEMENTATION USING BUILT IN FUNCTION

In [35]:
from sklearn.decomposition import PCA

In [36]:
pca = PCA(n_components=2) 

In [43]:
pca.fit_transform(X) # this dataset obtained from sklearn and X_new dataset obtained in above approach are same

array([[1160.1425737 , -293.91754364],
       [1269.12244319,   15.63018184],
       [ 995.79388896,   39.15674324],
       ...,
       [ 314.50175618,   47.55352518],
       [1124.85811531,   34.12922497],
       [-771.52762188,  -88.64310636]])

### CHOOSING THE RIGHT DIMENSIONS

### Method 1 : using Sklearn

In [47]:
#sklearn provide built-in method called explained_variance_
pca.explained_variance_  # this will give lamda1 and lamda2 values. lamda1 =443782.6051466 and lamda2 = 7310.10006165 

array([443782.6051466 ,   7310.10006165])

In [50]:
pca.explained_variance_ratio_ # This will return how much variances are explained/covered by selected features (here 2 features)

# Here out of 30 features, first feature/lamda1 explains 98.2% variance and sencond feature explains 1.6% variance

array([0.98204467, 0.01617649])

In [53]:
# Total variance coverage considering first two features is : 99.8%
pca.explained_variance_ratio_.sum()

0.998221161374173

### Method 2 : using algorith built above from scratch

In [61]:
# we had stores all eigen values in the lamdas variable.
# explained variance by lamda1 given by : lamda1/Sum of all 30 lamdas values

total = sum(lamdas)
explained_variance = np.round(lamdas/total,2)  #This will give the array which has the explained varaince value for all the lamdas
explained_variance

# lamda1/feature1 --> 0.98 --> 98%
# lamda2 --> 0.02 --> 2%
# lamda3 --> 0 --> 0%

# so in this example considering first two features are enough, becuase rest of the features has 0 variance, so no use of considering them

array([0.98, 0.02, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [67]:
"""
consider below sample example 

lamda1/feature1 --> 0.8 --> 80%
lamda2 --> 0.07 --> 7%
lamda3 --> 0.05 --> 7%
lamda4 --> 0.05 --> 5%

In this example, if we consider only first two features, then the variance covered will be only 87%. 
Remaining 13% variance will be missed. So, to get to know the total number of features required to cover atleast 99%, we can use cummulative sum formula over the lamdas_variance values. 
i.e., Cummulative sum = lamda1_variance, lamda1_variance + lamda2_variance, lamda1_variance + lamda2_variance + lamda3_variance, lamda1_variance + lamda2_variance + lamda3_variance +lamda4_variance ,.... 

"""

cummulative_variance = np.cumsum(explained_variance)
cummulative_variance

# here at second position we are getting value 1. ==> 100%, so we should consider first two features to achieve 100% varinace coverage

array([0.98, 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  ,
       1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  ,
       1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  ])