In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()
alt.themes.enable('dark')

from sklearn import manifold
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
from umap import UMAP

from vega_datasets import data
import sklearn.datasets

In [2]:
# Load the iris measurements shipped with vega_datasets and preview the
# first rows to confirm the column names used throughout the notebook.
df_iris = data.iris()
df_iris.head(n=5)

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Scatter-plot matrix: every pairwise combination of the four measurements,
# one small panel per (row, column) pair, coloured by species.
iris_dimensions = ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']

scatter_matrix = (
    alt.Chart(df_iris)
    .mark_circle()
    .encode(
        alt.X(alt.repeat("column"), type='quantitative'),
        alt.Y(alt.repeat("row"), type='quantitative'),
        color='species:N',
    )
    .properties(width=150, height=150)
    .repeat(row=iris_dimensions, column=iris_dimensions)
    .interactive()
)
scatter_matrix

# Projections

## t-SNE

In [4]:
# Configure the openTSNE estimator only; the actual fit happens in a later cell.
# ErrorLogger prints the KL divergence while optimising, and the fixed
# random_state keeps the embedding repeatable between runs.
tsne_settings = dict(
    perplexity=30,
    metric="euclidean",
    n_jobs=8,
    random_state=42,
)
tsne = TSNE(callbacks=ErrorLogger(), **tsne_settings)



In [5]:
# Projection input: the measurement columns only (the species label is dropped).
features = df_iris.drop(columns='species')
features.head(n=5)

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Fit t-SNE on the feature table and report how long the fit takes.
# NOTE(review): this rebinds `tsne` from the estimator to the value returned
# by `fit` (the next cell wraps it in a DataFrame), so re-running this cell on
# its own will likely fail -- re-run the cell that builds the estimator first.
%time tsne = tsne.fit(features)

Iteration   50, KL divergence  1.1364, 50 iterations in 1.8654 sec
Iteration  100, KL divergence  1.1078, 50 iterations in 1.6942 sec
Iteration  150, KL divergence  1.0306, 50 iterations in 1.6101 sec
Iteration  200, KL divergence  1.0568, 50 iterations in 1.5931 sec
Iteration  250, KL divergence  1.0770, 50 iterations in 1.7046 sec
Iteration   50, KL divergence  0.1480, 50 iterations in 1.7024 sec
Iteration  100, KL divergence  0.1315, 50 iterations in 1.5991 sec
Iteration  150, KL divergence  0.1284, 50 iterations in 1.7121 sec
Iteration  200, KL divergence  0.1270, 50 iterations in 1.5942 sec
Iteration  250, KL divergence  0.1263, 50 iterations in 1.5983 sec
Iteration  300, KL divergence  0.1255, 50 iterations in 1.6648 sec
Iteration  350, KL divergence  0.1255, 50 iterations in 1.6731 sec
Iteration  400, KL divergence  0.1250, 50 iterations in 1.6908 sec
Iteration  450, KL divergence  0.1247, 50 iterations in 1.5671 sec
Iteration  500, KL divergence  0.1247, 50 iterations in 1.5705

In [7]:
# Wrap the fitted t-SNE result in a frame with named coordinate columns so it
# can be concatenated column-wise with the iris table.
tsne_axis_names = ['tsneX', 'tsneY']
df_tsne_coords = pd.DataFrame(data=tsne, columns=tsne_axis_names)

## UMAP

In [8]:
reducer = UMAP()
%time umap = reducer.fit_transform(features)

CPU times: user 15 s, sys: 683 ms, total: 15.7 s
Wall time: 14 s


In [9]:
# Name the UMAP coordinates, then glue the original table and both 2-D
# projections together column-wise (rows align on the shared default index).
df_umap_coords = pd.DataFrame(data=umap, columns=['umapX', 'umapY'])
projection_parts = [df_iris, df_umap_coords, df_tsne_coords]
df_iris_proj = pd.concat(projection_parts, axis=1)
df_iris_proj.tail()

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species,umapX,umapY,tsneX,tsneY
145,6.7,3.0,5.2,2.3,virginica,-2.555277,8.310998,-4.391177,-11.4656
146,6.3,2.5,5.0,1.9,virginica,-0.06224,8.967998,-1.166145,-10.687645
147,6.5,3.0,5.2,2.0,virginica,-2.350742,8.247149,-3.51885,-11.133073
148,6.2,3.4,5.4,2.3,virginica,-2.767353,8.711001,-4.634979,-12.404978
149,5.9,3.0,5.1,1.8,virginica,0.436351,9.433274,0.06925,-11.517223


## Results

In [10]:
def _projection_scatter(x_field, y_field, title):
    """Return an interactive, species-coloured scatter of one 2-D projection.

    x_field / y_field are column names in ``df_iris_proj``; title is the
    panel title.
    """
    return (
        alt.Chart(df_iris_proj)
        .mark_circle(opacity=0.6)
        .encode(x=x_field, y=y_field, color='species')
        .properties(width=500, height=400, title=title)
        .interactive()
    )


# UMAP on the left, t-SNE on the right.
umap_panel = _projection_scatter('umapX', 'umapY', "UMAP projected iris data")
tsne_panel = _projection_scatter('tsneX', 'tsneY', "TSNE projected iris data")
umap_panel | tsne_panel

# Centroid Position = mean of projected x/y coordinates

In [11]:
# Per-species centroid of every numeric column, including the projected
# umapX/umapY and tsneX/tsneY coordinates; species becomes a regular column.
species_groups = df_iris_proj.groupby('species')
proj_mean = species_groups.mean().reset_index()
proj_mean

Unnamed: 0,species,sepalLength,sepalWidth,petalLength,petalWidth,umapX,umapY,tsneX,tsneY
0,setosa,5.006,3.428,1.462,0.246,20.395897,-1.042336,0.497487,20.005132
1,versicolor,5.936,2.77,4.26,1.326,1.792062,7.286421,3.458663,-9.090078
2,virginica,6.588,2.974,5.552,2.026,-2.137335,8.676709,-3.95615,-10.915054


In [12]:
# Overall centroid of the projected data as a one-row frame.
# numeric_only=True is required: df_iris_proj also holds the string column
# `species`, and DataFrame.mean() over it raises TypeError on pandas >= 2.0
# (older versions dropped the column with a deprecation warning).
# It is computed once here instead of once per panel.
overall_mean = df_iris_proj.mean(numeric_only=True).to_frame().transpose()


def _centroid_panel(x_field, y_field, title):
    """Build one layered panel for a 2-D projection.

    Layers, bottom to top:
      1. every observation in ``df_iris_proj`` (faint, coloured by species),
      2. the per-species centroids from ``proj_mean`` (large, coloured by species),
      3. the overall centroid from ``overall_mean`` (large, black).

    x_field / y_field are the projected coordinate column names; title is the
    panel title.
    """
    points = (
        alt.Chart(df_iris_proj)
        .mark_circle(opacity=0.4)
        .encode(x=x_field, y=y_field, color='species')
        .properties(width=500, height=400, title=title)
        .interactive()
    )
    # opacity=0.9 is applied in both panels; originally only the t-SNE panel
    # set it, which made the UMAP centroids paler than their t-SNE twins.
    species_centroids = (
        alt.Chart(proj_mean)
        .mark_circle(size=150, opacity=0.9)
        .encode(x=x_field, y=y_field, color='species')
        .properties(width=500, height=400)
        .interactive()
    )
    # The ':Q' suffix states the quantitative type explicitly, as the
    # original encodings for this layer did.
    overall_centroid = (
        alt.Chart(overall_mean)
        .mark_circle(size=150, color='black', opacity=0.9)
        .encode(x=x_field + ':Q', y=y_field + ':Q')
        .properties(width=500, height=400)
        .interactive()
    )
    return points + species_centroids + overall_centroid


# UMAP on the left, t-SNE on the right.
_centroid_panel('umapX', 'umapY', "UMAP projected iris data") | _centroid_panel(
    'tsneX', 'tsneY', "TSNE projected iris data"
)

# Centroid Position = mean of high dimensional data

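Drawback: the high-dimensional means are new points that were not part of the original embedding, so placing them in the 2-D plot requires either an out-of-sample extension of the existing projection or a new projection computed with the means included (the approach used below).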

In [13]:
# Reminder of the raw columns before the mean rows are appended.
df_iris.head(n=5)

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [14]:
# Preview: the iris table with one extra row holding the overall mean of each
# measurement, labelled species='mean'.
# DataFrame.append / Series.append were removed in pandas 2.0, so the row is
# built as a one-row frame and attached with pd.concat. numeric_only=True
# keeps the string `species` column out of the mean (it raises TypeError on
# pandas >= 2.0 otherwise).
overall_mean_row = (
    df_iris.mean(numeric_only=True)
    .to_frame()
    .transpose()
    .assign(species='mean')
)
pd.concat([df_iris, overall_mean_row], ignore_index=True)

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.100000,3.500000,1.400,0.200000,setosa
1,4.900000,3.000000,1.400,0.200000,setosa
2,4.700000,3.200000,1.300,0.200000,setosa
3,4.600000,3.100000,1.500,0.200000,setosa
4,5.000000,3.600000,1.400,0.200000,setosa
...,...,...,...,...,...
146,6.300000,2.500000,5.000,1.900000,virginica
147,6.500000,3.000000,5.200,2.000000,virginica
148,6.200000,3.400000,5.400,2.300000,virginica
149,5.900000,3.000000,5.100,1.800000,virginica


In [15]:
# Per-species mean of the raw measurements, one row per species.
# The rows keep the plain species name (no "_mean" suffix).
grouped_by_species = df_iris.groupby('species')
species_means = grouped_by_species.mean().reset_index()

In [16]:
# Iris observations followed by the three per-species mean rows and finally
# one overall-mean row labelled species='mean'; the index is renumbered
# so the appended rows sit directly after the original observations.
# DataFrame.append / Series.append were removed in pandas 2.0, hence
# pd.concat; numeric_only=True keeps the string `species` column out of the
# overall mean (it raises TypeError on pandas >= 2.0 otherwise).
overall_mean_row = (
    df_iris.mean(numeric_only=True)
    .to_frame()
    .transpose()
    .assign(species='mean')
)
df_iris_w_means = pd.concat(
    [df_iris, species_means, overall_mean_row],
    ignore_index=True,
)
df_iris_w_means

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.100000,3.500000,1.400,0.200000,setosa
1,4.900000,3.000000,1.400,0.200000,setosa
2,4.700000,3.200000,1.300,0.200000,setosa
3,4.600000,3.100000,1.500,0.200000,setosa
4,5.000000,3.600000,1.400,0.200000,setosa
...,...,...,...,...,...
149,5.900000,3.000000,5.100,1.800000,virginica
150,5.006000,3.428000,1.462,0.246000,setosa
151,5.936000,2.770000,4.260,1.326000,versicolor
152,6.588000,2.974000,5.552,2.026000,virginica


In [17]:
# Re-run t-SNE on the table that now includes the mean rows, so the means get
# their own 2-D positions from a fresh projection.
features = df_iris_w_means.drop(columns='species')

tsne_estimator = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)

# As in the first projection, `tsne` ends up bound to the fitted result.
tsne = tsne_estimator.fit(features)
df_tsne_coords = pd.DataFrame(data=tsne, columns=['tsneX', 'tsneY'])



Iteration   50, KL divergence  1.1869, 50 iterations in 1.8048 sec
Iteration  100, KL divergence  1.1276, 50 iterations in 1.6217 sec
Iteration  150, KL divergence  1.0491, 50 iterations in 1.6027 sec
Iteration  200, KL divergence  1.0810, 50 iterations in 1.6147 sec
Iteration  250, KL divergence  1.1053, 50 iterations in 1.6122 sec
Iteration   50, KL divergence  0.1551, 50 iterations in 1.5981 sec
Iteration  100, KL divergence  0.1396, 50 iterations in 1.5473 sec
Iteration  150, KL divergence  0.1362, 50 iterations in 1.7085 sec
Iteration  200, KL divergence  0.1347, 50 iterations in 1.6110 sec
Iteration  250, KL divergence  0.1343, 50 iterations in 1.6561 sec
Iteration  300, KL divergence  0.1330, 50 iterations in 1.6148 sec
Iteration  350, KL divergence  0.1326, 50 iterations in 1.8830 sec
Iteration  400, KL divergence  0.1328, 50 iterations in 1.7355 sec
Iteration  450, KL divergence  0.1324, 50 iterations in 1.6282 sec
Iteration  500, KL divergence  0.1324, 50 iterations in 1.6890

In [18]:
# Re-run UMAP on the table that includes the mean rows.
# Seeded for reproducibility (UMAP is stochastic; an unseeded run gives a
# different layout on every execution) and to match the t-SNE random_state.
reducer = UMAP(random_state=42)
umap = reducer.fit_transform(features)
df_umap_coords = pd.DataFrame(umap, columns=['umapX', 'umapY'])

# Attach both projections column-wise; all three frames share the same
# default 0..n-1 index, so rows line up one-to-one.
df_iris_proj_means = pd.concat(
    [df_iris_w_means, df_umap_coords, df_tsne_coords],
    axis='columns',
)

In [19]:
# Rows from position 150 onward are the appended mean rows (three species
# means plus the overall mean), now with projected coordinates.
df_iris_proj_means.iloc[150:]

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species,umapX,umapY,tsneX,tsneY
150,5.006,3.428,1.462,0.246,setosa,20.746401,10.795159,18.360677,6.68434
151,5.936,2.77,4.26,1.326,versicolor,6.320678,4.433393,-9.979807,0.791729
152,6.588,2.974,5.552,2.026,virginica,0.81139,7.273107,-8.503479,-7.764326
153,5.843333,3.057333,3.758,1.199333,mean,7.023718,4.264944,-10.819586,2.151054


In [20]:
# The first 150 rows are the iris observations; everything after them is the
# appended mean rows that were projected together with the data.
observation_rows = df_iris_proj_means[:150]
mean_rows = df_iris_proj_means[150:]


def _panel_with_means(x_field, y_field, title):
    """Layer the projected mean rows (large dots) over the observations.

    x_field / y_field are the projected coordinate column names; title is the
    panel title.
    """
    observations_layer = (
        alt.Chart(observation_rows)
        .mark_circle(opacity=0.4)
        .encode(x=x_field, y=y_field, color='species')
        .properties(width=500, height=400, title=title)
        .interactive()
    )
    means_layer = (
        alt.Chart(mean_rows)
        .mark_circle(size=150, opacity=0.9)
        .encode(x=x_field, y=y_field, color='species')
        .properties(width=500, height=400)
        .interactive()
    )
    return observations_layer + means_layer


# UMAP on the left, t-SNE on the right.
umap_means_panel = _panel_with_means('umapX', 'umapY', "UMAP projected iris data")
tsne_means_panel = _panel_with_means('tsneX', 'tsneY', "TSNE projected iris data")
umap_means_panel | tsne_means_panel