# Dimensionality Reduction Playground
No clear goal: just play around with different methods to see what each one produces and how long it takes to run.

## Imports

In [1]:
from sklearn.datasets import load_digits

import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

from sklearn.decomposition import PCA, KernelPCA 
from sklearn.manifold import Isomap, MDS, TSNE

from tqdm import tqdm
from time import time

## Constants

In [2]:
# Search spaces for each dimensionality-reduction method.
# Every `n_components` range lists candidate output dimensions; the PCA,
# KernelPCA, MDS and Isomap cells fit once with the maximum of their range.
PCA_COMPONENTS = range(1, 21)

KERNEL_PCA_PARAMS = dict(
    kernel=["poly", "rbf", "sigmoid"],
    n_components=range(1, 21),
)

MDS_PARAMS = dict(n_components=range(1, 21))

ISOMAP_PARAMS = dict(
    n_components=range(1, 21),
    n_neighbors=[2, 5, 10],
)

# t-SNE candidates are limited to 1 through 3 output dimensions.
TSNE_PARAMS = dict(n_components=range(1, 4))

## Code

### Data Loading

In [3]:
# Load the scikit-learn digits dataset and split it into the feature
# matrix (X) and the label vector (y).
digits = load_digits()
X, y = digits.data, digits.target

### Dimensionality Reduction

#### PCA

In [4]:
# PCA: fit once with the largest component count in PCA_COMPONENTS and
# report the wall-clock time of fit_transform in milliseconds.
pca = PCA(n_components=max(PCA_COMPONENTS))
t_start = time()
pca_digits_data = pca.fit_transform(X)
print(f"PCA took {1000*(time()-t_start)} ms")

# One column per principal component (PC1, PC2, ...) plus the label column.
pc_labels = [f'PC{i}' for i in PCA_COMPONENTS]
pca_digits_df = pd.DataFrame(data=pca_digits_data, columns=pc_labels)
pca_digits_df['target'] = y

PCA took 22.02773094177246 ms


In [5]:
# Left panel: scree-style plot that uses each component's explained-variance
# ratio instead of its eigenvalue, because the ratio is easier to interpret.
# Right panel: total explained variance as a function of the number of
# components kept.
variance_ratios = pca.explained_variance_ratio_
cumulative_variance = [sum(variance_ratios[:n_comp]) for n_comp in PCA_COMPONENTS]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Explained Variance per PC", "Total Variance per N_Comp"),
)

fig.add_trace(
    go.Scatter(x=[f'PC {i}' for i in PCA_COMPONENTS], y=variance_ratios),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=list(PCA_COMPONENTS), y=cumulative_variance),
    row=1, col=2,
)

# Axis titles: both panels share the y-axis title, only the x-axis differs.
for col, x_title in ((1, 'PC Number'), (2, 'Number of Components')):
    fig.update_xaxes(title_text=x_title, row=1, col=col)
    fig.update_yaxes(title_text='Explained Variance', row=1, col=col)

fig.show()

In [6]:
# 3-D scatter of the first three PCA columns, colored by the target column.
x_col, y_col, z_col = pca_digits_df.columns[:3]
fig = px.scatter_3d(pca_digits_df, x=x_col, y=y_col, z=z_col,
                    color=pca_digits_df.target)
fig.show()

In [15]:
# 2-D scatter of the first two PCA columns, colored by the target column.
x_col, y_col = pca_digits_df.columns[:2]
fig = px.scatter(pca_digits_df, x=x_col, y=y_col,
                 color=pca_digits_df.target)
fig.show()

#### KPCA

In [8]:
# Fit KernelPCA once per kernel, timing each fit, and collect:
#   results[kernel]   -> per-component share of the embedding's variance
#   kpca_digits_data  -> all embeddings side by side (columns PC<i>_<kernel>)
#                        followed by the label column 'target'
results = {}
kpca_frames = []  # one DataFrame per kernel; concatenated once after the loop

for k in KERNEL_PCA_PARAMS['kernel']:
    kpca = KernelPCA(n_components=max(KERNEL_PCA_PARAMS['n_components']), kernel=k)
    t_start = time()
    kpca_curr_digits_data = kpca.fit_transform(X)
    print(f"KPCA(kernel={k}) took {1000*(time()-t_start)} ms")

    # The variance of each projected component stands in for explained
    # variance. It is normalized over the retained components only, so the
    # ratios of one kernel always sum to 1.
    explained_variance = np.var(kpca_curr_digits_data, axis=0)
    explained_variance_ratio = explained_variance / np.sum(explained_variance)
    results[k] = explained_variance_ratio

    kpca_frames.append(
        pd.DataFrame(data=kpca_curr_digits_data,
                     columns=[f'PC{i}_{k}' for i in KERNEL_PCA_PARAMS['n_components']])
    )

# Concatenate once instead of growing a frame inside the loop, which would
# re-copy the accumulated columns on every iteration. An empty kernel list
# still yields an (empty) frame, as before.
kpca_digits_data = pd.concat(kpca_frames, axis=1) if kpca_frames else pd.DataFrame()
kpca_digits_data['target'] = y

KPCA(kernel=poly) took 567.741870880127 ms
KPCA(kernel=rbf) took 281.10241889953613 ms
KPCA(kernel=sigmoid) took 212.07261085510254 ms


In [9]:
# One subplot row per kernel in KERNEL_PCA_PARAMS['kernel']:
#   left  - scree-style plot using the per-component variance ratio stored in
#           results[kernel] (used instead of eigenvalues because it is easier
#           to interpret)
#   right - total of those ratios as a function of the number of components
kernels = list(KERNEL_PCA_PARAMS['kernel'])
component_counts = KERNEL_PCA_PARAMS['n_components']

# Titles are listed row by row (left panel, then right panel). They are derived
# from the kernel list so the figure stays in sync when kernels are added or
# removed.
subplot_titles = []
for kernel in kernels:
    subplot_titles.append(f"Explained Variance per PC (kernel={kernel})")
    subplot_titles.append(f"Total Variance per N_Comp (kernel={kernel})")

# NOTE(review): vertical_spacing=0.25 was tuned for three rows; plotly limits
# the spacing relative to the row count, so revisit it if many kernels are
# added - confirm against the make_subplots documentation.
fig = make_subplots(rows=len(kernels), cols=2,
                    horizontal_spacing=0.2, vertical_spacing=0.25,
                    subplot_titles=subplot_titles)

for row, kernel in enumerate(kernels, start=1):
    ratios = results[kernel]

    fig.add_trace(go.Scatter(x=[f'PC {i}' for i in component_counts],
                             y=ratios),
                  row=row, col=1)
    fig.add_trace(go.Scatter(x=list(component_counts),
                             y=[sum(ratios[:n_comp]) for n_comp in component_counts]),
                  row=row, col=2)

    fig.update_xaxes(title_text='PC Number', row=row, col=1)
    fig.update_yaxes(title_text='Explained Variance', row=row, col=1)
    fig.update_xaxes(title_text='Number of Components', row=row, col=2)
    fig.update_yaxes(title_text='Explained Variance', row=row, col=2)

fig.show()

In [10]:
# Using the best kernel (poly): keep only its component columns plus the label.
poly_columns = [f'PC{i}_poly' for i in KERNEL_PCA_PARAMS['n_components']]
filtered_df = kpca_digits_data[poly_columns + ['target']]

# 3-D scatter of the first three poly-kernel components, colored by target.
x_col, y_col, z_col = filtered_df.columns[:3]
fig = px.scatter_3d(filtered_df, x=x_col, y=y_col, z=z_col,
                    color=filtered_df.target)
fig.show()

In [14]:
# 2-D scatter of the first two columns of filtered_df (built in the previous
# cell), colored by the target column.
x_col, y_col = filtered_df.columns[:2]
fig = px.scatter(filtered_df, x=x_col, y=y_col,
                 color=filtered_df.target)
fig.show()

#### MDS

In [16]:
# MDS: fit once with the largest component count in MDS_PARAMS and report the
# wall-clock time of fit_transform in milliseconds.
mds = MDS(n_components=max(MDS_PARAMS['n_components']))
t_start = time()
mds_embedding = mds.fit_transform(X)
print(f"MDS took {1000*(time()-t_start)} ms")

# Columns are labelled with the integer component numbers (1, 2, ...),
# followed by the label column 'target'.
MDS_digits_data = pd.DataFrame(data=mds_embedding,
                               columns=list(MDS_PARAMS['n_components']))
MDS_digits_data['target'] = y

MDS took 71017.8325176239 ms


In [17]:
# 3-D scatter of the first three MDS columns, colored by the target column.
x_col, y_col, z_col = MDS_digits_data.columns[:3]
fig = px.scatter_3d(MDS_digits_data, x=x_col, y=y_col, z=z_col,
                    color=MDS_digits_data.target)
fig.show()

In [19]:
# 2-D scatter of the first two MDS columns, colored by the target column.
# This cell belongs to the MDS section, so it plots MDS_digits_data rather
# than the PCA frame.
fig = px.scatter(MDS_digits_data, x=MDS_digits_data.columns[0],
                 y=MDS_digits_data.columns[1],
                 color=MDS_digits_data.target)
fig.show()

#### Isomap

In [20]:
# Fit Isomap once per n_neighbors value, timing each fit. The embeddings are
# stored side by side in isomap_digits_data (columns <i>_<nn>neighbors),
# followed by the label column 'target'.
isomap_frames = []  # one DataFrame per n_neighbors; concatenated once after the loop

for nn in ISOMAP_PARAMS['n_neighbors']:
    iso = Isomap(n_components=max(ISOMAP_PARAMS['n_components']),
                 n_neighbors=nn)
    t_start = time()
    iso_curr_digits_data = iso.fit_transform(X)
    print(f"isomap(n_neighbors={nn}) took {1000*(time()-t_start)} ms")

    isomap_frames.append(
        pd.DataFrame(data=iso_curr_digits_data,
                     columns=[f'{i}_{nn}neighbors' for i in ISOMAP_PARAMS['n_components']])
    )

# Concatenate once instead of growing a frame inside the loop, which would
# re-copy the accumulated columns on every iteration. An empty n_neighbors
# list still yields an (empty) frame, as before.
isomap_digits_data = pd.concat(isomap_frames, axis=1) if isomap_frames else pd.DataFrame()
isomap_digits_data['target'] = y

isomap(n_neighbors=2) took 1496.7660903930664 ms
isomap(n_neighbors=5) took 2400.4080295562744 ms
isomap(n_neighbors=10) took 2917.093276977539 ms


In [21]:
# Isomap embedding fitted with n_neighbors=2: select its columns plus the
# label, then plot the first two components colored by target.
selected_columns = [f'{i}_2neighbors' for i in ISOMAP_PARAMS['n_components']]
filtered_df = isomap_digits_data[selected_columns + ['target']]

x_col, y_col = filtered_df.columns[:2]
fig = px.scatter(filtered_df, x=x_col, y=y_col,
                 color=filtered_df.target)
fig.show()

In [22]:
# Isomap embedding fitted with n_neighbors=5: select its columns plus the
# label, then plot the first two components colored by target.
selected_columns = [f'{i}_5neighbors' for i in ISOMAP_PARAMS['n_components']]
filtered_df = isomap_digits_data[selected_columns + ['target']]

x_col, y_col = filtered_df.columns[:2]
fig = px.scatter(filtered_df, x=x_col, y=y_col,
                 color=filtered_df.target)
fig.show()

In [23]:
# Isomap embedding fitted with n_neighbors=10: select its columns plus the
# label, then plot the first two components colored by target.
selected_columns = [f'{i}_10neighbors' for i in ISOMAP_PARAMS['n_components']]
filtered_df = isomap_digits_data[selected_columns + ['target']]

x_col, y_col = filtered_df.columns[:2]
fig = px.scatter(filtered_df, x=x_col, y=y_col,
                 color=filtered_df.target)
fig.show()

#### TSNE

In [24]:
# t-SNE with the estimator's default arguments. A non-default configuration
# is kept here for reference:
#   TSNE(n_components=max(TSNE_PARAMS['n_components']), learning_rate=1000, perplexity=100)
tsne = TSNE()
t_start = time()
tsne_embedding = tsne.fit_transform(X)
print(f"TSNE took {1000*(time()-t_start)} ms")

# Columns keep pandas' default integer labels; explicit names would be
#   columns=[f'{i}' for i in TSNE_PARAMS['n_components']]
tsne_digits_data = pd.DataFrame(data=tsne_embedding)
tsne_digits_data['target'] = y

TSNE took 6173.321962356567 ms


In [25]:
# 2-D scatter of the first two t-SNE columns, colored by the target column.
x_col, y_col = tsne_digits_data.columns[:2]
fig = px.scatter(tsne_digits_data, x=x_col, y=y_col,
                 color=tsne_digits_data.target)
fig.show()

### Nothing Important

In [22]:
# Display the third image in digits.images (index 2) on a grey color scale.
# The figure is the cell's last expression, so the notebook renders it.
sample_image = digits.images[2]
px.imshow(sample_image, color_continuous_scale='Greys')