## PCA: Principal Component Analysis

### MSRI-UP 2023

Authors: Luis Scoccola and Jose Perea

Date: 6/14/2023

---

In [None]:
# from the datasets module of scikit learn, import the load_digits function
from sklearn.datasets import load_digits

# from scikit learn, import Principal Components Analysis
from sklearn.decomposition import PCA

import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib import pyplot as plt
%matplotlib inline


## Digits example

In [None]:
# Load the digits dataset bundled with scikit-learn (via the load_digits function imported above)
digits = load_digits()

In [None]:
# `digits` behaves like a dictionary, so iterating over it yields its keys; list them
print(list(digits))

In [None]:
# each key can be inspected on its own; the value stored under 'data'
# is a pointcloud of 1797 points in R^64
digits.data.shape

In [None]:
# `target` stores the label of each of the 1797 images
print(digits.target.shape)

# show the labels of the first 25 digits
print(digits.target[:25])

In [None]:
# the images themselves are the value of the "images" key;
# its shape shows 1797 elements, each consisting of an 8 by 8 matrix
digits.images.shape

In [None]:
# display the first 25 images in a 5 x 5 grid, each with its corresponding label

fig = plt.figure(figsize=(5, 5))

for i in range(25):
    # ticks carry no information for these small images, so hide them
    ax = fig.add_subplot(5, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary)
    # write the label on top of the image, at data coordinates (0, 7)
    ax.text(0, 7, str(digits.target[i]))

# render the grid; plt.show() is used rather than plt.figure(), since the
# latter would open a second, empty figure instead of displaying this one
plt.show()

In [None]:
# 3D scatter plot of the first three coordinates of the data,
# with every point colored by its label
fig = go.Figure(
    data=[
        go.Scatter3d(
            x=digits.data[:, 0],
            y=digits.data[:, 1],
            z=digits.data[:, 2],
            mode='markers',
            marker={'size': 3, 'color': digits.target},
        )
    ]
)

fig.show()


In [None]:
# project every element of the pointcloud onto its first two principal components

# PCA object that keeps 2 components
pca = PCA(n_components=2)

# fit on the digits data and obtain its 2D projection
proj2d = pca.fit_transform(digits.data)

# scatter plot of the projection (columns unpacked as x and y),
# coloring each point by its label
plt.scatter(*proj2d.T, c=digits.target, cmap="Paired")
plt.colorbar()

In [None]:
# repeat the 2D projection on just one of the classes, to try to interpret
# what the 2 principal components are recovering in that case

# indices of the images that contain a 1
(indices,) = np.where(digits.target == 1)

# fit a new 2-component PCA on the 1's only and project them to 2D
pca = PCA(n_components=2)
proj = pca.fit_transform(digits.data[indices])
plt.scatter(proj[:, 0], proj[:, 1])
plt.show()

In [None]:
# there seems to be a fair amount of structure in the plot above.
# for instance, there seem to be two distinct clusters, one with significantly fewer points.
# The larger cluster also seems to have nontrivial structure.
# let's now plot some of the digits on top of their corresponding point in the 2D embedding,
# to try to understand what the 2 principal components are capturing

from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# plotting all the images would be too much, so let us just plot (at most) 100.
# A seeded generator makes the figure reproducible across runs, and sampling
# without replacement guarantees that the selected images are all distinct.
rng = np.random.default_rng(0)
subsample = rng.choice(len(indices), size=min(100, len(indices)), replace=False)

# images of the selected class, in the same order as the rows of proj
class_images = digits["images"][indices]

fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(proj[subsample, 0], proj[subsample, 1])

# draw each sampled image centered at its 2D coordinates
for (x0, y0), im in zip(proj[subsample], class_images[subsample]):
    ab = AnnotationBbox(OffsetImage(im, cmap='binary', zoom=2), (x0, y0))
    ax.add_artist(ab)

## Mystery Data 3

In [None]:
# read the comma-separated values of the mystery dataset into an array
data = np.loadtxt('./data/Data_3.txt', delimiter=',')

# size of the loaded array
data.shape

In [None]:
# keep the first 5 principal components of the mystery data;
# fit_transform fits the model and projects the data in a single call,
# as done for the digits examples earlier in this notebook
pca = PCA(n_components=5)
data_pca = pca.fit_transform(data)

# fraction of the total variance explained by each of the kept components
print('Explained Variance', pca.explained_variance_ratio_)

# 3D scatter plot of the first three principal components
fig = go.Figure(data=[go.Scatter3d(
    x=data_pca[:, 0], y=data_pca[:, 1], z=data_pca[:, 2],
    mode='markers',
    marker=dict(size=1.5, color='grey')
)])

fig.show()