# Principal Component Analysis (PCA)

In [None]:
from tensorflow.keras.datasets import mnist
from matplotlib import pyplot as plt
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# Load MNIST as (images, labels) pairs for the train and test splits.
(xtrain, ytrain), (xtest, ytest) = mnist.load_data()

#### Let's write a function that takes a numpy array of images and plots the first 40 of them:

In [None]:
def draw_array(x):
    """Plot the leading images of ``x`` on a 5x8 grid of subplots.

    Args:
        x: Sequence or array of 2-D images, indexable as ``x[i]`` and
            supporting ``len``. At most the first 40 entries are drawn;
            shorter inputs leave the remaining grid slots empty.
    """
    n_rows, n_cols = 5, 8
    plt.figure(figsize=(12, 7))
    # Fill every slot of the grid, but never index past the end of x.
    for i in range(min(len(x), n_rows * n_cols)):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(x[i], cmap=plt.cm.Greys)
        plt.axis('off')

# Preview the raw training images before any reshaping.
draw_array(xtrain)

### Reshape data

In [None]:
# Flatten every 28x28 image into one row of 784 pixel features.
# -1 lets numpy infer the number of images instead of hard-coding it,
# while 28*28 still enforces the expected pixel count per image.
xtrain = xtrain.reshape((-1, 28*28))
xtrain.shape

### Select first 1000 images (to speed up the calculation)

In [None]:
# Work on the first 1000 images and their labels to keep the fit fast.
xsmall, ysmall = xtrain[:1000], ytrain[:1000]

In [None]:
# Sanity check: both slices should hold the same number of samples.
xsmall.shape, ysmall.shape

### Initialize PCA from Scikit-Learn and fit on X data
- By how many components would we like to decompose our data?
- Caveat: Data should be standard scaled first!

In [None]:
# Standardize each pixel column of the subset before PCA.
# The fitted scaler is kept so the same scaling can be reused later.
scaler = StandardScaler()
scaler.fit(xsmall)
xsmall = scaler.transform(xsmall)

In [None]:
# Decompose the scaled subset into 40 principal components.
# random_state pins the randomized SVD solver (which scikit-learn may pick
# automatically for data of this size) so reruns give identical components.
m = PCA(n_components=40, random_state=0)
m.fit(xsmall)

In [None]:
# One row per principal component; each component has 784 coefficients
# (corresponding to the 28x28 pixels of an image).
m.components_.shape 

### Percentage of variation in the data over principal components

In [None]:
# Fraction of the total variance explained by each fitted component.
m.explained_variance_ratio_

In [None]:
# Running total of explained variance as more components are included.
cumsum = np.cumsum(m.explained_variance_ratio_)
plt.plot(cumsum)

### Transform data to a lower number of features.

In [None]:
# Project the full training set onto the principal components.
# PCA was fit on standard-scaled pixels, so the same fitted scaler has to be
# applied first; raw pixel values would not match the space PCA was fit in.
# NOTE: `scaler` must still be the pixel scaler here; a later cell rebinds
# the name, so run this cell before that one.
xt = m.transform(scaler.transform(xtrain))
xt.shape  # 40 features per image (n_components) instead of the original 784

In [None]:
# Map the low-dimensional codes back to 784 features per image.
# NOTE(review): the reconstruction is expressed in the feature space PCA was
# fit on (standard-scaled pixels); to view it in original pixel units the
# pixel scaler's inverse_transform would also be needed - confirm intent.
xback = m.inverse_transform(xt)
print(xback.shape)
# -1 lets numpy infer the image count instead of hard-coding 60000.
xback = xback.reshape((-1, 28, 28))

In [None]:
# Show the images reconstructed from the reduced representation.
draw_array(xback)

### Can we actually see the components?

In [None]:
# Rows are components, columns are pixel coefficients.
m.components_.shape

In [None]:
# View each principal component (a 784-vector) as a 28x28 image.
# -1 infers the component count, so this stays in sync with n_components.
comps = m.components_.reshape((-1, 28, 28))
draw_array(comps)

### PCA as a preprocessing step in Supervised Learning

In [None]:
# Standardize the PCA features of the first 1000 images for the classifier.
# NOTE(review): this rebinds `scaler`, so the scaler fitted on raw pixels is
# no longer reachable under that name after this cell runs.
pca_subset = xt[:1000]
scaler = StandardScaler()
scaler.fit(pca_subset)
Xscaled = scaler.transform(pca_subset)

In [None]:
# Fit a classifier on the scaled PCA features of the first 1000 images.
train_labels = ytrain[:1000]
logreg = LogisticRegression()
logreg.fit(Xscaled, train_labels)

# Scored on the same samples used for fitting, so this is training accuracy,
# not an estimate of performance on unseen data.
train_score = logreg.score(Xscaled, train_labels)
print(f'Model Score: {train_score}')