In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair rather than the raw version string:
# string comparison is lexicographic, so "0.9" >= "0.20" would wrongly pass.
# Taking the first two digit runs also tolerates suffixes such as "1.0rc1"
# or "0.24.dev0".
assert tuple(map(int, re.findall(r"\d+", sklearn.__version__)[:2])) >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "dim_reduction"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension) of the saved figure.
    tight_layout : bool, default True
        If true, call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, dpi=resolution, format=fig_extension)

In [2]:
# Many ML problems - thousands/millions of features for each training instance
# - makes training slow and also makes it harder to find a good solution - referred to as the curse of dimensionality

# Solution - to reduce the dimensionality of the data by removing less important parts
# - it does lose some information which results in slightly worse end result
# - also makes the pipeline more complex and harder to maintain
# - used when training is too slow
# - on the plus side: DR may filter out some noise and unnecessary details and result in higher performance (but not in general)
# - main purpose is to speed up training

# Additionally, DR can be useful for data visualization - reducing dimensions down to 2/3 makes it possible to plot a condensed view of a high-dimensional training set
# -> can help detect patterns, such as clusters
# - also DataViz - essential to communicate your conclusions to people who are not data scientists

In [4]:
# Curse of Dimensionality
# Behaviours are very different in high-dimensional space:
# e.g. if you pick two points randomly in a unit square, the distance between them will be, on average, roughly 0.52
# - if you do it in 3D - roughly 0.66
# - 1 000 000 dimensional cube -> average distance about 408.25
# -> how can two points be so far apart when they both lie within the same unit cube???
# --> implies that high-dimensional datasets are at risk of being very sparse - most training instances are likely to be far away from each other
#   - makes prediction less reliable than in lower dimensions, since they are based on much larger extrapolations

# The more dimensions the training set has -> the greater the risk of overfitting
# Could be solved with more training instances, but sadly the number of instances required to reach a given density grows exponentially with the number of dimensions

In [5]:
# 2 Main approaches for Dimensionality Reduction:
# 1) projections
# 2) Manifold Learning

In [6]:
# Projections
# in most real-world problems: training instances are not spread out uniformly across all dimensions
# many features are almost constant, while others are highly correlated (e.g. MNIST)
# -> all training instances actually lie within (or close to) a much lower-dimensional subspace of the high-D space

# Downsides:
# - if the subspace twists and turns (Swiss roll toy dataset)
# -> by projecting onto a plane -> squash different layers of the Swiss roll together
# -> you want to unroll the Swiss roll to obtain the 2D dataset that is representative of the actual data

In [None]:
# Manifold Learning
# - Swiss roll - an example of 2D manifold (= shape that can be bent and twisted in a higher-D space)
# d-dimensional manifold is part of an n-dimensional space (d < n) that locally resembles a d-dimensional hyperplane
# - in case of a Swiss roll, d=2 and n=3: it locally resembles a 2D plane, but it is rolled in the third dimension

# Many DR algorithms work by modelling the manifold on which the training instances lie - Manifold Learning
# - relies on manifold assumption (manifold hypothesis) - most real-world high-dimensional datasets lie close to a much lower-dimensional manifold
# (often empirically observed)
# E.g. for MNIST data set (connected lines, borders white, centered, etc.) -> if you randomly generate images, only a few would look like that
# -> DOF for creating a digit <<<< than DOF if you want to create any image - these constraints tend to squeeze the dataset into a lower-dimensional manifold

# Manifold assumption is often accompanied by another implicit assumption:
# - the task at hand will be simpler if expressed in the lower-dimensional space of the manifold
# * need to be careful since it does not always hold

# Reducing the dimensionality - usually speeds up the training but it may not always lead to better or simpler solutions - depends on the dataset