<a href="https://colab.research.google.com/github/jackiekuen2/notes-handson-ml-tf/blob/master/ch8_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

In [0]:
# Build a random 3D dataset: m points scattered (with Gaussian noise) around
# a 2D curve, with the third coordinate a noisy linear mix of the first two.
np.random.seed(4)  # legacy global seed so the sample is reproducible
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
sin_a, cos_a = np.sin(angles), np.cos(angles)

# The three randn draws below must stay in this order (x0, x1, x2) to
# reproduce the same random stream.
x0 = cos_a + sin_a / 2 + noise * np.random.randn(m) / 2
x1 = sin_a * 0.7 + noise * np.random.randn(m) / 2
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

X = np.column_stack((x0, x1, x2))  # shape (m, 3)

In [0]:
# Peek at the first five rows of the generated dataset (one column per feature).
X[:5]

array([[-1.01570027, -0.55091331, -0.26132626],
       [-0.00771675,  0.59958572,  0.03507755],
       [-0.95317135, -0.46453691, -0.24920288],
       [-0.92012304,  0.21009593,  0.02182381],
       [-0.76309739,  0.158261  ,  0.19152496]])

# B1. PCA

In [0]:
from sklearn.decomposition import PCA

# Fit PCA on X keeping 2 components, and project X onto them in the same call.
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

In [0]:
# Show the first five rows of X2D, the 2D projection of the 3D dataset X
# produced by pca.fit_transform above.
X2D[:5]

array([[ 1.26203346,  0.42067648],
       [-0.08001485, -0.35272239],
       [ 1.17545763,  0.36085729],
       [ 0.89305601, -0.30862856],
       [ 0.73016287, -0.25404049]])

In [0]:
# Extract the principal components (PCs).
# Row i of components_ is the same vector as column i of its transpose,
# so the two PCs can be read straight off the rows.
c1, c2 = pca.components_[0], pca.components_[1]

report = "1st PC: {} \n2nd PC: {} \nThey are column vectors with shape: {}"
print(report.format(c1, c2, c1.shape))

1st PC: [-0.93636116 -0.29854881 -0.18465208] 
2nd PC: [ 0.34027485 -0.90119108 -0.2684542 ] 
They are column vectors with shape: (3,)


In [0]:
# Explained variance ratio: the fraction of the dataset's variance that lies
# along each of the retained principal components.
pca.explained_variance_ratio_

array([0.84248607, 0.14631839])

About 84.2% of the dataset's variance lies along the 1st principal axis and 14.6% along the 2nd; the remaining ~1.1% lies along the 3rd axis, so it carries little information.

## MNIST Compression

In [0]:
from six.moves import urllib
# Load MNIST into `mnist` (a Bunch exposing "data" and "target").
# Only the import is guarded: if the download itself fails, the error should
# surface as-is instead of being mistaken for "fetch_openml is unavailable".
try:
    from sklearn.datasets import fetch_openml
except ImportError:
    # Older scikit-learn without fetch_openml: use the legacy mldata loader.
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
else:
    mnist = fetch_openml('mnist_784', version=1)

# Cast the labels to int64 on both paths so downstream code sees the same
# label dtype regardless of which loader was used.
mnist.target = mnist.target.astype(np.int64)

In [0]:
from sklearn.model_selection import train_test_split

# Take features/labels as plain NumPy arrays. Newer scikit-learn versions may
# return pandas objects from fetch_openml; np.asarray is a no-op for arrays.
X = np.asarray(mnist["data"])
y = np.asarray(mnist["target"])

# Fix the shuffle seed so the split, and every number derived from it in the
# cells below, is reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Method 1. Find the smallest number of dimensions (d) that preserves the target variance (95%) --> then set n_components=d

In [0]:
# Method 1: fit a full PCA (no dimensionality reduction), then pick the
# smallest number of components whose cumulative explained variance is >= 95%.
pca = PCA().fit(X_train)  # fit() returns the estimator itself
cumsum = pca.explained_variance_ratio_.cumsum()

# argmax on the boolean mask gives the first index reaching the threshold;
# +1 converts that 0-based index into a component count.
d = 1 + np.argmax(cumsum >= 0.95)

In [0]:
# Compare the original feature count with the dimension d computed above.
print("Original dimensions: %s" % X.shape[1])
print("The optimal dimensions to reduce down to: %s" % d)

Original dimensions: 784
The optimal dimensions to reduce down to: 154


### Method 2. Set n_components to a float between 0.0 and 1.0 --> it is treated as the ratio of variance to preserve

In [0]:
# Ask PCA to preserve 95% of the variance and let it choose the number of
# components itself, then fit and project the training set in one call.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [0]:
# Number of dimensions PCA kept for the 95% variance target.
print("New dimensions: %s" % X_reduced.shape[1])

New dimensions: 154


## Recover compressed datasets

In [0]:
# Map the compressed training set back to the original feature space.
X_recovered = pca.inverse_transform(X_reduced)

In [0]:
# The recovered data has the original number of features again.
print("Recovered dimensions: %s" % X_recovered.shape[1])

Recovered dimensions: 784


## Incremental PCA

In [0]:
from sklearn.decomposition import IncrementalPCA

# Fit PCA one mini-batch at a time instead of on the whole training set at once.
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)  # 154 = dimension found above for 95% variance

mini_batches = np.array_split(X_train, n_batches)
for X_batch in mini_batches:
    inc_pca.partial_fit(X_batch)

# Once every batch has been seen, project the full training set.
X_reduced = inc_pca.transform(X_train)

In [0]:
# (number of training samples, number of retained components)
X_reduced.shape

(52500, 154)

# B2. Kernel PCA

In [0]:
from sklearn.datasets import make_swiss_roll

# Generate 1000 noisy swiss-roll points; t is the second array returned by
# make_swiss_roll. NOTE: this rebinds X and y, which held MNIST data above.
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
# Boolean labels for the classifier below: True where t exceeds 6.9.
y = t > 6.9

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import KernelPCA

# Two-step pipeline: reduce to 2D with Kernel PCA, then classify with
# logistic regression. The grid search tunes the kPCA kernel and gamma.
pipeline_steps = [
    ('kpca', KernelPCA(n_components=2)),
    ('log_reg', LogisticRegression(solver='liblinear')),
]
clf = Pipeline(pipeline_steps)

# Candidate values for the search: 10 evenly spaced gammas x 2 kernels.
gamma_candidates = np.linspace(0.03, 0.05, 10)
kernel_candidates = ['rbf', 'sigmoid']
param_grid = [{"kpca__gamma": gamma_candidates, "kpca__kernel": kernel_candidates}]

# 3-fold cross-validation over every (gamma, kernel) combination.
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('kpca',
                                        KernelPCA(alpha=1.0, coef0=1,
                                                  copy_X=True, degree=3,
                                                  eigen_solver='auto',
                                                  fit_inverse_transform=False,
                                                  gamma=None, kernel='linear',
                                                  kernel_params=None,
                                                  max_iter=None, n_components=2,
                                                  n_jobs=None,
                                                  random_state=None,
                                                  remove_zero_eig=False,
                                                  tol=0)),
                                       ('log_reg',
                                 

In [0]:
# Kernel and gamma combination that scored best in the grid search.
print(grid_search.best_params_)

{'kpca__gamma': 0.043333333333333335, 'kpca__kernel': 'rbf'}


## Reconstruction from Kernel PCA

In [0]:
# RBF Kernel PCA with gamma set to the (rounded) best value from the grid
# search above; fit_inverse_transform=True is set so inverse_transform can be used.
rbf_pca = KernelPCA(n_components=2, kernel='rbf', gamma=0.0433, fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(X)
# Map the 2D projection back to the original space (the "pre-image").
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [0]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the original points and their reconstructions.
mean_squared_error(X, X_preimage) # Pre-image reconstruction error

32.78630879576613

End