In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()

from sklearn import manifold
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
from umap import UMAP

from vega_datasets import data
import sklearn.datasets

In [3]:
# Load the iris dataset bundled with vega_datasets and preview the first rows.
# The preview below shows four numeric measurement columns plus a `species` label.
df_iris = data.iris()
df_iris.head()

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Scatter-plot matrix of every pair of iris measurements, coloured by species.
# The same list drives both the rows and the columns of the grid.
iris_measurements = ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']

# One panel template; alt.repeat() substitutes the row/column field per cell.
scatter_panel = alt.Chart(df_iris).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
    color='species:N',
).properties(width=150, height=150)

scatter_panel.repeat(
    row=iris_measurements,
    column=iris_measurements,
).interactive()

# Projections

## t-SNE

In [5]:
# Configure the openTSNE estimator; nothing is computed here, the fit happens
# in a later cell.
# NOTE: that later cell rebinds the name `tsne` to the fitted embedding, so this
# cell must be re-run before fitting again.
tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    # ErrorLogger reports the KL divergence during optimisation (see the fit log below).
    callbacks=ErrorLogger(),
    # Hard-coded thread count — NOTE(review): confirm this suits the machine running the notebook.
    n_jobs=8,
    # Fixed seed so the embedding is repeatable between runs.
    random_state=42,
)



In [6]:
# Keep only the numeric measurements: the species label is the target we colour
# by, not an input to the projections.
features = df_iris.drop(columns='species')
features.head()

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
%time tsne = tsne.fit(features)

Iteration   50, KL divergence  1.1364, 50 iterations in 1.9022 sec
Iteration  100, KL divergence  1.1078, 50 iterations in 1.7765 sec
Iteration  150, KL divergence  1.0306, 50 iterations in 1.6731 sec
Iteration  200, KL divergence  1.0568, 50 iterations in 1.6121 sec
Iteration  250, KL divergence  1.0770, 50 iterations in 1.6388 sec
Iteration   50, KL divergence  0.1480, 50 iterations in 1.5358 sec
Iteration  100, KL divergence  0.1315, 50 iterations in 1.5423 sec
Iteration  150, KL divergence  0.1284, 50 iterations in 1.6647 sec
Iteration  200, KL divergence  0.1270, 50 iterations in 1.5258 sec
Iteration  250, KL divergence  0.1263, 50 iterations in 1.5431 sec
Iteration  300, KL divergence  0.1255, 50 iterations in 1.6342 sec
Iteration  350, KL divergence  0.1255, 50 iterations in 1.5381 sec
Iteration  400, KL divergence  0.1250, 50 iterations in 1.6032 sec
Iteration  450, KL divergence  0.1247, 50 iterations in 1.5122 sec
Iteration  500, KL divergence  0.1247, 50 iterations in 1.5849

In [8]:
# Wrap the fitted t-SNE embedding in a labelled frame so it can be joined
# column-wise with the iris data.
tsne_columns = ['tsneX', 'tsneY']
df_tsne_coords = pd.DataFrame(data=tsne, columns=tsne_columns)

## UMAP

In [9]:
reducer = UMAP()
%time umap = reducer.fit_transform(features)

CPU times: user 15.2 s, sys: 651 ms, total: 15.8 s
Wall time: 14.1 s


In [10]:
# Label the UMAP embedding columns so they can sit next to the t-SNE ones.
df_umap_coords = pd.DataFrame(umap, columns=['umapX','umapY'])

# pd.concat(axis='columns') aligns rows on index labels, not on position. If the
# indexes ever diverge (e.g. the iris frame gets filtered upstream) the join
# would silently pad with NaN instead of failing, so check explicitly.
assert df_iris.index.equals(df_umap_coords.index), "UMAP coords are not aligned with df_iris"
assert df_iris.index.equals(df_tsne_coords.index), "t-SNE coords are not aligned with df_iris"

# One wide frame: original measurements + species + both 2-D projections.
df_iris_proj = pd.concat([df_iris, df_umap_coords, df_tsne_coords], axis='columns')
df_iris_proj.tail()

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species,umapX,umapY,tsneX,tsneY
145,6.7,3.0,5.2,2.3,virginica,-6.661554,4.250466,-4.391177,-11.4656
146,6.3,2.5,5.0,1.9,virginica,-5.580895,6.313456,-1.166145,-10.687645
147,6.5,3.0,5.2,2.0,virginica,-6.243613,4.634127,-3.51885,-11.133073
148,6.2,3.4,5.4,2.3,virginica,-6.123681,3.986008,-4.634979,-12.404978
149,5.9,3.0,5.1,1.8,virginica,-5.836457,6.959194,0.06925,-11.517223


## Results

In [11]:
def projection_chart(df, x, y, title):
    """Build an interactive scatter plot of one 2-D projection, coloured by species.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the projection coordinates and a ``species`` column.
    x, y : str
        Names of the columns to place on the horizontal / vertical axis.
    title : str
        Chart title.

    Returns
    -------
    alt.Chart
        A 500x400 circle-mark chart with pan/zoom enabled.
    """
    return alt.Chart(df).mark_circle(
        opacity=0.6
    ).encode(
        x=x,
        y=y,
        color='species'
    ).properties(
        width=500,
        height=400,
        title=title
    ).interactive()


# Show both projections side by side (`|` is Altair's horizontal concatenation).
umap_chart = projection_chart(df_iris_proj, 'umapX', 'umapY', "UMAP projected iris data")
tsne_chart = projection_chart(df_iris_proj, 'tsneX', 'tsneY', "TSNE projected iris data")
umap_chart | tsne_chart