# SDA - lecture 11 - Dimensionality reduction

In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')

import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

%matplotlib widget
# %matplotlib inline

## PCA - Simple 2D --> 1D example

For the documentation of the scikit-learn (Python) implementation of PCA, see:  
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

In [None]:
# Synthetic 2-D sample: x1 is Gaussian and x2 is a linear function of x1
# with independent Gaussian noise subtracted, so the two coordinates are correlated.
K = 1000  # number of samples
x1 = np.random.normal(3, 4, K)
x2 = 0.5 * x1 - np.random.normal(1, 2, K)
# Stack the coordinates as columns: one sample per row, shape (K, 2)
X = np.column_stack((x1, x2))

def drawScatter(X, axis):
    """Draw the first two columns of X as a dot scatter on a symmetric square window.

    X is indexed as X[:, 0] (horizontal) and X[:, 1] (vertical); axis is the
    object the plot is drawn on (a matplotlib Axes in this notebook).
    Both axes get the same limits (-ext, ext), where ext is the largest
    absolute value found anywhere in X plus a margin of 1.
    """
    horizontal, vertical = X[:, 0], X[:, 1]
    axis.plot(horizontal, vertical, '.')
    axis.set_xlabel('x1')
    axis.set_ylabel('x2')
    # Largest magnitude over all entries, padded by one unit
    ext = np.abs(X).max() + 1
    limits = (-ext, ext)
    axis.set(xlim=limits, ylim=limits)
    
# Scatter plot of the raw 2-D samples on a single square figure
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
drawScatter(X, axis=ax)
#plt.savefig(os.path.join('figs','SDA11-OriginalData.png'))

In [None]:
# Fit a PCA that keeps every component and report its spectrum
pca = PCA()
pca.fit(X)
logging.info(f'Singular value 1: {pca.singular_values_[0]:.2f}\t'
             f'Singular value 2: {pca.singular_values_[1]:.2f}')
logging.info(f'Fraction of variance 1: {pca.explained_variance_ratio_[0]:.2f}\t'
             f'Fraction of variance 2: {pca.explained_variance_ratio_[1]:.2f}')
# Note: explained variance ratio == singular values squared divided by the sum of singular values squared

fig, ax = plt.subplots(figsize=(10,4), nrows=1, ncols=2)
drawScatter(X, ax[0])
# Overlay every principal axis as a segment through the data mean that extends
# one standard deviation (sqrt of the explained variance) in each direction.
for variance, direction in zip(pca.explained_variance_, pca.components_):
    half_axis = np.sqrt(variance) * direction
    start, end = pca.mean_ - half_axis, pca.mean_ + half_axis
    ax[0].plot([start[0], end[0]], [start[1], end[1]], 'k')

# All components are kept, so the ratios reported by scikit-learn already sum
# to 1; reuse them instead of renormalising explained_variance_ by hand.
v = pca.explained_variance_ratio_
cax = ax[1]
xv = np.arange(1, v.shape[0] + 1)  # 1-based component numbers for the x axis
cax.bar(xv, v)
cax.set_title('Variance fraction explained by each PC')
cax.set_xticks(xv);

In [None]:
# Keep only the leading principal component, then map the 1-D scores back
# into the original 2-D space to show what the projection retains.
pca = PCA(n_components=1).fit(X)
Y = pca.transform(X)
X_rec = pca.inverse_transform(Y)

# Left panel: raw samples; right panel: their reconstruction from the first PC
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
panel_contents = ((X, 'Original data'), (X_rec, 'Projection on 1st PC'))
for panel, (points, heading) in zip(ax, panel_contents):
    drawScatter(points, panel)
    panel.set_title(heading)