# Principal Component Analysis (PCA)



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
import plotly.express as px
from sklearn.decomposition import PCA

In [3]:
class PCA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        # Mean centering
        self.mean = np.mean(X, axis=0)
        X = X - self.mean

        # covariance, function needs samples as columns
        cov = np.cov(X.T)

        # eigenvalues, eigenvectors
        eigenvalues, eigenvectors = np.linalg.eig(cov)

        # -> eigenvector v = [:,i] column vector, transpose for easier calculations
        # sort eigenvectors
        eigenvectors = eigenvectors.T
        idxs = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idxs]
        eigenvectors = eigenvectors[idxs]

        # store first n eigenvectors
        self.components = eigenvectors[0 : self.n_components]
        

    def transform(self, X):
        # project data
        #X = X - self.mean
        return np.dot(X, self.components.T)

# Iris Dataset

In [4]:
#2-D Plotly

# data = datasets.load_digits()
iris = datasets.load_iris()
X = iris.data
y = iris.target

y_label = []

for i in y:
  if i == 0:
    y_label.append('Setosa')
  elif i == 1:
    y_label.append('Versicolor')
  else:
    y_label.append('Virginica')

Species = pd.DataFrame(y_label,columns=['Species'])



# Project the data onto the 2 primary principal components
pca = PCA(2)
pca.fit(X)
X_projected = pca.transform(X)


print("Shape of X:", X.shape)
print("Shape of transformed X:", X_projected.shape)


X_df = pd.DataFrame(X_projected,columns=['PC1','PC2'])
X_y_df = df_scores = pd.concat([X_df, Species], axis=1)

Shape of X: (150, 4)
Shape of transformed X: (150, 2)


In [5]:
fig = px.scatter(X_y_df, x='PC1', y='PC2',color='Species')

fig.show()


In [6]:
# Testing 3-D Plotly

# data = datasets.load_digits()
iris = datasets.load_iris()
X = iris.data
y = iris.target

y_label = []

for i in y:
  if i == 0:
    y_label.append('Setosa')
  elif i == 1:
    y_label.append('Versicolor')
  else:
    y_label.append('Virginica')

Species = pd.DataFrame(y_label,columns=['Species'])

# Project the data onto the 2 primary principal components
pca = PCA(3)
pca.fit(X)
X_projected = pca.transform(X)

print("Shape of X:", X.shape)
print("Shape of transformed X:", X_projected.shape)


X_df = pd.DataFrame(X_projected,columns=['PC1','PC2','PC3'])
X_y_df = df_scores = pd.concat([X_df, Species], axis=1)


Shape of X: (150, 4)
Shape of transformed X: (150, 3)


In [7]:
fig = px.scatter_3d(X_y_df, x='PC1', y='PC2', z='PC3',color='Species')

fig.show()

# Wine Dataset

In [9]:
#2-D Plotly

# data = datasets.load_digits()
wine = datasets.load_wine()
X = wine.data
y = wine.target


y_label = []

for i in y:
  if i == 0:
    y_label.append('Wine Type 1')
  elif i == 1:
    y_label.append('Wine Type 2')
  else:
    y_label.append('Wine Type 3')

WineType = pd.DataFrame(y_label,columns=['WineType'])

# Project the data onto the 2 primary principal components
pca = PCA(2)
pca.fit(X)
X_projected = pca.transform(X)

print("Shape of X:", X.shape)
print("Shape of transformed X:", X_projected.shape)


X_df = pd.DataFrame(X_projected,columns=['PC1','PC2'])
X_y_df = df_scores = pd.concat([X_df, WineType], axis=1)

Shape of X: (178, 13)
Shape of transformed X: (178, 2)


In [10]:
fig = px.scatter(X_y_df, x='PC1', y='PC2',color='WineType')

fig.show()

In [11]:
# Testing 3-D Plotly

# data = datasets.load_digits()
wine = datasets.load_wine()
X = wine.data
y = wine.target

y_label = []

for i in y:
  if i == 0:
    y_label.append('Wine Type 1')
  elif i == 1:
    y_label.append('Wine Type 2')
  else:
    y_label.append('Wine Type 3')

WineType = pd.DataFrame(y_label,columns=['WineType'])

# Project the data onto the 2 primary principal components
pca = PCA(3)
pca.fit(X)
X_projected = pca.transform(X)

print("Shape of X:", X.shape)
print("Shape of transformed X:", X_projected.shape)


X_df = pd.DataFrame(X_projected,columns=['PC1','PC2','PC3'])
X_y_df = df_scores = pd.concat([X_df, WineType], axis=1)
print(X_y_df)

Shape of X: (178, 13)
Shape of transformed X: (178, 3)
             PC1         PC2        PC3     WineType
0   -1067.055687 -108.516254  17.171375  Wine Type 1
1   -1051.590128  -81.659406  13.479274  Wine Type 1
2   -1186.553841  -80.486814  21.415332  Wine Type 1
3   -1481.732848  -87.216853  21.219366  Wine Type 1
4    -736.921280 -105.514118  20.856532  Wine Type 1
..           ...         ...        ...          ...
173  -741.512497  -82.482987  22.776816  Wine Type 3
174  -751.624313  -89.359314  24.612040  Wine Type 3
175  -836.950782 -105.800408  22.539686  Wine Type 3
176  -841.948950 -105.694943  22.090501  Wine Type 3
177  -561.549518  -86.810793  25.932619  Wine Type 3

[178 rows x 4 columns]


In [12]:
fig = px.scatter_3d(X_y_df, x='PC1', y='PC2', z='PC3',color='WineType')

fig.show()

# Heart Disease Dataset

In [13]:
from google.colab import files
uploaded = files.upload()
heart_data= pd.read_csv('heart.csv',header=None,skiprows=1)


Saving heart.csv to heart (1).csv


In [15]:
#2-D Plotly

X,y = heart_data.values[:,:-1], \
heart_data.values[:,-1]


y_label = []

for i in y:
  if i == 0:
    y_label.append('Healthy')
  else:
    y_label.append('Heart Disease')

HealthStatus = pd.DataFrame(y_label,columns=['HealthStatus'])

# Project the data onto the 2 primary principal components
pca = PCA(2)
pca.fit(X)
X_projected = pca.transform(X)

print("Shape of X:", X.shape)
print("Shape of transformed X:", X_projected.shape)


X_df = pd.DataFrame(X_projected,columns=['PC1','PC2'])
X_y_df = df_scores = pd.concat([X_df, HealthStatus], axis=1)
print(X_y_df)


Shape of X: (303, 13)
Shape of transformed X: (303, 2)
            PC1         PC2   HealthStatus
0    240.826942 -124.566713  Heart Disease
1    255.784424 -167.311924  Heart Disease
2    210.144072 -151.077370  Heart Disease
3    242.149530 -155.878586  Heart Disease
4    360.073340 -143.315018  Heart Disease
..          ...         ...            ...
298  248.540166  -99.950381        Healthy
299  269.522294 -114.518834        Healthy
300  201.130475 -114.116752        Healthy
301  138.338306  -91.005366        Healthy
302  242.698145 -150.742951        Healthy

[303 rows x 3 columns]


In [16]:
fig = px.scatter(X_y_df, x='PC1', y='PC2',color='HealthStatus')

fig.show()

In [17]:
# Testing 3-D Plotly

# data = datasets.load_digits()
X,y = heart_data.values[:,:-1], \
heart_data.values[:,-1]


y_label = []

for i in y:
  if i == 0:
    y_label.append('Healthy')
  else:
    y_label.append('Heart Disease')

HealthStatus = pd.DataFrame(y_label,columns=['HealthStatus'])

# Project the data onto the 2 primary principal components
pca = PCA(3)
pca.fit(X)
X_projected = pca.transform(X)

print("Shape of X:", X.shape)
print("Shape of transformed X:", X_projected.shape)


X_df = pd.DataFrame(X_projected,columns=['PC1','PC2','PC3'])
X_y_df = df_scores = pd.concat([X_df, HealthStatus], axis=1)
print(X_y_df)

Shape of X: (303, 13)
Shape of transformed X: (303, 3)
            PC1         PC2         PC3   HealthStatus
0    240.826942 -124.566713 -157.868050  Heart Disease
1    255.784424 -167.311924 -143.776054  Heart Disease
2    210.144072 -151.077370 -144.657617  Heart Disease
3    242.149530 -155.878586 -135.853991  Heart Disease
4    360.073340 -143.315018 -128.036596  Heart Disease
..          ...         ...         ...            ...
298  248.540166  -99.950381 -148.272664        Healthy
299  269.522294 -114.518834 -117.315377        Healthy
300  201.130475 -114.116752 -158.385015        Healthy
301  138.338306  -91.005366 -143.025942        Healthy
302  242.698145 -150.742951 -145.289475        Healthy

[303 rows x 4 columns]


In [18]:
fig = px.scatter_3d(X_y_df, x='PC1', y='PC2', z='PC3',color='HealthStatus')

fig.show()

# Breast Cancer Dataset

In [19]:
#2-D Plotly

# data = datasets.load_digits()
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target



y_label = []

for i in y:
  if i == 0:
    y_label.append('Benign')
  else:
    y_label.append('Malignant')

CancerStatus = pd.DataFrame(y_label,columns=['CancerStatus'])

# Project the data onto the 2 primary principal components
pca = PCA(2)
pca.fit(X)
X_projected = pca.transform(X)

print("Shape of X:", X.shape)
print("Shape of transformed X:", X_projected.shape)


X_df = pd.DataFrame(X_projected,columns=['PC1','PC2'])
X_y_df = df_scores = pd.concat([X_df, CancerStatus], axis=1)

Shape of X: (569, 30)
Shape of transformed X: (569, 2)


In [20]:
fig = px.scatter(X_y_df, x='PC1', y='PC2',color='CancerStatus')

fig.show()

In [21]:
#3-D Plotly

# data = datasets.load_digits()
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target



y_label = []

for i in y:
  if i == 0:
    y_label.append('Benign')
  else:
    y_label.append('Malignant')

CancerStatus = pd.DataFrame(y_label,columns=['CancerStatus'])

# Project the data onto the 3 primary principal components
pca = PCA(3)
pca.fit(X)
X_projected = pca.transform(X)

print("Shape of X:", X.shape)
print("Shape of transformed X:", X_projected.shape)


X_df = pd.DataFrame(X_projected,columns=['PC1','PC2','PC3'])
X_y_df = df_scores = pd.concat([X_df, CancerStatus], axis=1)

Shape of X: (569, 30)
Shape of transformed X: (569, 3)


In [22]:
fig = px.scatter_3d(X_y_df, x='PC1', y='PC2', z='PC3',color='CancerStatus')

fig.show()