Objective:

Program PCA from scratch using the iris data. May only use numpy, pandas, and matplotlib

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#load iris data
from sklearn.datasets import load_iris

In [4]:
dataset = load_iris()
dataset.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
#construct dataframe
data = pd.DataFrame(dataset.data, columns = dataset.feature_names)
data['target'] = dataset.target

In [6]:
print(data.shape)
data.head()

(150, 5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


#### General procedure

1. Subtract each variable's mean elementwise
2. Cacluate the covariance matrix
3. Calcualte eigenvalues and eigen vectors of the covariance matrix
4. Choose components
5. Transfrom data

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


# create new dataframe pca_df that houses the transformed data
#remove target data
pca_df = data.iloc[:, :-1].copy()

#pca_df = scaler.fit_transform(pca_df)

# subtract the mean from each feature row
pca_df = pd.DataFrame(pca_df).apply(lambda x : x - x.mean())
print(pca_df.shape)
pca_df.head()

(150, 4)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.743333,0.442667,-2.358,-0.999333
1,-0.943333,-0.057333,-2.358,-0.999333
2,-1.143333,0.142667,-2.458,-0.999333
3,-1.243333,0.042667,-2.258,-0.999333
4,-0.843333,0.542667,-2.358,-0.999333


In [8]:
#calcuate the covariance matrix
#transpose data into row format
covariance = np.cov(pca_df.T)
covariance

array([[ 0.68569351, -0.042434  ,  1.27431544,  0.51627069],
       [-0.042434  ,  0.18997942, -0.32965638, -0.12163937],
       [ 1.27431544, -0.32965638,  3.11627785,  1.2956094 ],
       [ 0.51627069, -0.12163937,  1.2956094 ,  0.58100626]])

In [9]:
#use covariance matrix to calcualte the eiggen vecors and eigenvalues
# to use np.linalg.eig matrix must be square (n,n)
covariance.shape

(4, 4)

In [10]:
eig_vals, eig_vects = np.linalg.eig(covariance)

eig_vals.shape, eig_vects.shape

((4,), (4, 4))

In [20]:
#for step 4 we arrange the eigen values. 
#while sorting put everything in a dataframe to keep things ordered
#transpose it so that the first column is the eigen values
eigen_soup = pd.DataFrame(eig_vects.T)
eigen_soup['values'] = eig_vals
eigen_soup.head(10)

Unnamed: 0,0,1,2,3,values
0,0.361387,-0.084523,0.856671,0.358289,4.228242
1,-0.656589,-0.730161,0.173373,0.075481,0.242671
2,-0.58203,0.597911,0.076236,0.545831,0.07821
3,0.315487,-0.319723,-0.479839,0.753657,0.023835


In [12]:
#eigen values are already in descending order.
#just for practice order the matrix
eigen_ordered = eigen_soup.sort_values('values', axis=0, ascending = False)


In [13]:
#look at the first few entries.
#the eigenvalues in the frist 2 rows are the most substancial 
#(15 times order of magnitue from the next largest)
#therefore use first 2 components to transform pca_df
#could also take only first two components
eigen_ordered['values'].head()

0    4.228242
1    0.242671
2    0.078210
3    0.023835
Name: values, dtype: float64

In [14]:
vector_transform = eigen_ordered.iloc[:2, :-1]
vector_transform.shape

(2, 4)

In [15]:
vector_transform.head()

Unnamed: 0,0,1,2,3
0,0.361387,-0.084523,0.856671,0.358289
1,-0.656589,-0.730161,0.173373,0.075481


In [16]:
print('Vector transform dimensions {}'.format(vector_transform.shape))
print('X adjusted dimensions {}'.format(pca_df.shape))

Vector transform dimensions (2, 4)
X adjusted dimensions (150, 4)


In [23]:
#to arrive at the pca we calculate vector transform * X.T
#expected dimensions (3, 150) transposed to (150,3)

PCA_result = np.dot(vector_transform, pca_df.T).T

ValueError: matrices are not aligned

In [18]:
PCA_result = pd.DataFrame(PCA_result)
PCA_result.shape

(150, 2)

In [160]:
PCA_result.head()

Unnamed: 0,0,1
0,0.497869,-1.350754
1,0.753886,-0.968768
2,0.608494,-1.157687
3,0.521608,-0.956637
4,0.396071,-1.415317


In [89]:
#check the results
from sklearn.decomposition import PCA

In [161]:
pca = PCA(n_components = 2)
data_check = data.iloc[:,:-1].copy()
pca.fit(data_check)
pca_check = pd.DataFrame(pca.transform(data_check))
pca_check.head()

Unnamed: 0,0,1
0,-2.684126,0.319397
1,-2.714142,-0.177001
2,-2.888991,-0.144949
3,-2.745343,-0.318299
4,-2.728717,0.326755
