# PCA scattering of ORL Faces
stough 202-

A look at using PCA to scatter images from the [ORL faces database](http://cam-orl.co.uk/facedatabase.html) in a low-dimensional space.

In [None]:
%matplotlib widget
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.offsetbox import (OffsetImage,
                                  AnnotationBbox)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from torchvision import transforms
from torchvision.datasets import ImageFolder
# --- Display configuration -------------------------------------------------
# Shape used to turn a flattened face vector back into a 2-d image
# (112 * 92 = 10304 values per face).
imshape = (112, 92)

# How many randomly chosen faces appear in the first scatterplot.
NUMFACES = 20
# How many subjects appear in the subject-specific scatter.
NUMSUBJECTS = 3

In [None]:
# Load the ORL images through torchvision's ImageFolder.  Each image is
# converted to a single grayscale channel, turned into a tensor, and then
# flattened into a 1-d numpy vector, so that every face becomes one row of
# the data matrix built below.
orl_faces = ImageFolder('/home/dip365/data/ORL/',
                        transform=transforms.Compose([
                            transforms.Grayscale(),
                            transforms.ToTensor(),
                            transforms.Lambda(lambda x: np.array(x).ravel())
                        ]))

# Decode every image exactly once: one row per image, one column per pixel.
allfaces = np.stack([orl_faces[i][0] for i in range(len(orl_faces))])

# A random sample of NUMFACES distinct faces.  It is sliced out of the
# already-loaded matrix instead of reading the files from disk a second
# time, and drawn without replacement so no face appears twice (the same
# way `whichfaces` is drawn for the scatterplot later on).
faces = allfaces[np.random.choice(len(allfaces), NUMFACES, replace=False)]

In [None]:
# Sanity check on the two data matrices: the random sample, then the full set.
tuple(matrix.shape for matrix in (faces, allfaces))

In [None]:
# Fit PCA on all faces and keep the 10 most important directions in the
# space of faces.  Each direction is a 10304-d vector (one value per pixel).
pca = PCA(n_components=10)
# Xp has one row per image and one column per component: the coordinates of
# every face after projection into the 10-d PCA space.
Xp = pca.fit_transform(allfaces)
print(f'Explained variation per principal component: \n{pca.explained_variance_ratio_}')


f, ax = plt.subplots(1, 3, figsize=(8, 3), sharey=True, sharex=True)
# FigureCanvas.set_window_title was deprecated in Matplotlib 3.4 and later
# removed; the figure manager's method of the same name works on both old
# and new versions.  The manager can be None for manager-less figures.
if f.canvas.manager is not None:
    f.canvas.manager.set_window_title('Average Face and Two Principal Components')

# pca.components_ holds one row per component (10 x 10304 here); reshaping a
# row back to `imshape` lets it be viewed as an image next to the mean face.
panels = [
    ('Average Face', allfaces.mean(axis=0)),
    ('PCA 1', pca.components_[0, :]),
    ('PCA 2', pca.components_[1, :]),
]
for axis, (title, vector) in zip(ax, panels):
    axis.imshow(np.reshape(vector, imshape), cmap='gray')
    axis.set_title(title)

In [None]:
# Scatter some random faces at their coordinates in the first two PCA
# dimensions.  Xp (computed above) already holds the PCA-space projection
# of every image, one row per image.

# Reseed from fresh entropy so a different set of faces is shown each run.
# For a reproducible plot pass a fixed integer instead, e.g.
# np.random.seed(0); merely removing this line does not fix the selection.
np.random.seed()
whichfaces = np.random.choice(len(allfaces), NUMFACES, replace=False)
xys = Xp[whichfaces, :]


f2, ax2 = plt.subplots(figsize=(5, 5))
# FigureCanvas.set_window_title was deprecated in Matplotlib 3.4 and later
# removed; go through the figure manager, which works on old and new versions.
if f2.canvas.manager is not None:
    f2.canvas.manager.set_window_title('Face PCA Projection Scatter')

# How to scatter images in a plot, instead of points for example:
# https://matplotlib.org/examples/pylab_examples/demo_annotation_box.html
# Also useful:
# https://stackoverflow.com/questions/22566284/matplotlib-how-to-plot-images-instead-of-points?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
# and:
# https://stackoverflow.com/questions/48896088/matplotlib-plotting-images-instead-of-points-stretches-images
for i, xy in zip(whichfaces, xys):
    arr_img = np.reshape(allfaces[i, :], imshape)

    imagebox = OffsetImage(arr_img, zoom=0.4, cmap='gray')
    imagebox.image.axes = ax2

    # xy is 10-d here, only the first 2 coordinates are needed to plot.
    ab = AnnotationBbox(imagebox, xy[:2], pad=0.2)

    ax2.add_artist(ab)

# add_artist does not autoscale the axes, so fit the limits to the faces shown.
ax2.set_xlim(xys[:, 0].min(), xys[:, 0].max())
ax2.set_ylim(xys[:, 1].min(), xys[:, 1].max())
ax2.set_xlabel('PCA 1')
ax2.set_ylabel('PCA 2')
plt.show()

In [None]:
# Scatter every image of NUMSUBJECTS randomly chosen subjects in the first
# two PCA dimensions, to see whether each subject's images cluster nicely.

# ImageFolder labels are 0-based class indices (0 .. n_classes-1), so the
# subjects must be drawn from that same range.  Drawing from 1 .. n_classes
# could never select subject 0 and could select a label no image carries,
# which silently shows fewer subjects than requested.
whichsubjects = np.random.choice(len(orl_faces.classes),
                                 NUMSUBJECTS, replace=False)

# The dataset already stores the class index of every sample, in sample
# order, so there is no need to decode each image again to read its label.
alltargets = np.array(orl_faces.targets)

f3, ax3 = plt.subplots()

# Row indices of all images that belong to one of the chosen subjects.
for i in np.flatnonzero(np.isin(alltargets, whichsubjects)):
    arr_img = np.reshape(allfaces[i, :], imshape)

    imagebox = OffsetImage(arr_img, zoom=0.4, cmap='gray')
    imagebox.image.axes = ax3

    ab = AnnotationBbox(imagebox, Xp[i, :2], pad=0.2)

    ax3.add_artist(ab)

# Use the range of the whole dataset so the plot shows where the chosen
# subjects sit relative to all faces.
ax3.set_xlim(Xp[:, 0].min(), Xp[:, 0].max())
ax3.set_ylim(Xp[:, 1].min(), Xp[:, 1].max())
ax3.set_xlabel('PCA 1')
ax3.set_ylabel('PCA 2')
f3.suptitle(f'Face PCA Scatter for Subjects {whichsubjects}')
plt.show()

&nbsp;

### Let's add a t-SNE visualization.

In [None]:
# t-SNE embedding of the PCA-projected faces, one colour per subject.
# Adapted from: https://medium.com/@luckylwk/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b

# One random RGB colour per subject, shifted towards the bright end and
# clipped to the valid [0, 1] range.  The table is sized from the dataset
# (rather than a hard-coded 40) so mycolors[alltargets] is always in range.
n_subjects = len(orl_faces.classes)
mycolors = .2 + .9*np.random.rand(n_subjects, 3)
mycolors = np.clip(mycolors, 0, 1)

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and the
# old name was removed in 1.7; pick whichever the installed version accepts.
iter_kw = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne = TSNE(n_components=2, verbose=1, perplexity=40, **{iter_kw: 300})
tsne_results = tsne.fit_transform(Xp)

# Proxy artists so the legend shows one swatch per subject index.
# Legend: https://matplotlib.org/users/legend_guide.html#creating-artists-specifically-for-adding-to-the-legend-aka-proxy-artists
patch_list = [mpatches.Patch(color=mycolors[i], label=str(i)) for i in range(len(mycolors))]

f, ax = plt.subplots(1, 1, figsize=(8, 5))

# alltargets (built in the subject-scatter cell) gives each image's subject
# index, which selects that image's colour.
ax.scatter(tsne_results[:, 0], tsne_results[:, 1], c=mycolors[alltargets], alpha=.75)
ax.set_title('t-SNE of PCA-projected faces')
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')

# Shrink the axes to 60% width and park the legend in the freed space.
# Legend positioning: https://pythonspot.com/matplotlib-legend/
chartBox = ax.get_position()
ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
ax.legend(handles=patch_list, loc='upper center', bbox_to_anchor=(1.35, 1.1), shadow=True, ncol=2);