In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()
alt.themes.enable('dark')

from sklearn import manifold
from sklearn import preprocessing

from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
from umap import UMAP

from vega_datasets import data
import sklearn.datasets

In [2]:
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
alt.Chart(df).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
    color='species:N'
).properties(
    width=150,
    height=150
).repeat(
    row=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'],
    column=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
).interactive()

# Projections
Instead of dropping the categorical columns (species, island, sex) we one hot encode them for the projection.


In [4]:
features = pd.get_dummies(df).dropna()
features

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex_Female,sex_Male
0,39.1,18.7,181.0,3750.0,1,0,0,0,0,1,0,1
1,39.5,17.4,186.0,3800.0,1,0,0,0,0,1,1,0
2,40.3,18.0,195.0,3250.0,1,0,0,0,0,1,1,0
4,36.7,19.3,193.0,3450.0,1,0,0,0,0,1,1,0
5,39.3,20.6,190.0,3650.0,1,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
338,47.2,13.7,214.0,4925.0,0,0,1,1,0,0,1,0
340,46.8,14.3,215.0,4850.0,0,0,1,1,0,0,1,0
341,50.4,15.7,222.0,5750.0,0,0,1,1,0,0,0,1
342,45.2,14.8,212.0,5200.0,0,0,1,1,0,0,1,0


In [5]:
features = preprocessing.MinMaxScaler().fit_transform(features) # scale numerical to [0,1]


## TSNE

In [6]:
tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)



In [7]:
%time tsne = tsne.fit(features)

Iteration   50, KL divergence  1.1828, 50 iterations in 1.8355 sec
Iteration  100, KL divergence  0.8852, 50 iterations in 1.8627 sec
Iteration  150, KL divergence  0.8089, 50 iterations in 1.6227 sec
Iteration  200, KL divergence  0.7718, 50 iterations in 1.7032 sec
Iteration  250, KL divergence  0.7497, 50 iterations in 1.8321 sec
Iteration   50, KL divergence  0.1742, 50 iterations in 1.6782 sec
Iteration  100, KL divergence  0.1311, 50 iterations in 1.6060 sec
Iteration  150, KL divergence  0.1241, 50 iterations in 1.8318 sec
Iteration  200, KL divergence  0.1188, 50 iterations in 1.7708 sec
Iteration  250, KL divergence  0.1193, 50 iterations in 1.7273 sec
Iteration  300, KL divergence  0.1153, 50 iterations in 2.0725 sec
Iteration  350, KL divergence  0.1128, 50 iterations in 2.2267 sec
Iteration  400, KL divergence  0.1167, 50 iterations in 1.8615 sec
Iteration  450, KL divergence  0.1122, 50 iterations in 1.7270 sec
Iteration  500, KL divergence  0.1130, 50 iterations in 1.5875

In [8]:
df_tsne_coords = pd.DataFrame(tsne, columns=['tsneX','tsneY'])

## UMAP

In [9]:
reducer = UMAP()
%time umap = reducer.fit_transform(features)

CPU times: user 16 s, sys: 741 ms, total: 16.8 s
Wall time: 14 s


In [10]:
df_umap_coords = pd.DataFrame(umap, columns=['umapX','umapY'])
df_proj = pd.concat([df, df_umap_coords, df_tsne_coords], axis='columns')
df_proj.tail()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,umapX,umapY,tsneX,tsneY
339,Gentoo,Biscoe,,,,,,3.147047,18.836756,19.485827,3.586282
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,-7.418738,11.541223,12.175789,-17.806582
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,3.577934,19.644505,18.707702,1.846964
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,,,,
343,Gentoo,Biscoe,49.9,16.1,213.0,5400.0,Male,,,,


## Results w/ species

In [11]:
alt.Chart(df_proj).mark_point(
    opacity=0.6
).encode(
    x='umapX',
    y='umapY',
    color='species'
).properties(
    width=500,
    height=400,
    title="UMAP projected penguins data"
).interactive() | alt.Chart(df_proj).mark_point(
    opacity=0.6
).encode(
    x='tsneX',
    y='tsneY',
    color='species'
).properties(
    width=500,
    height=400,
    title="TSNE projected penguins data"
).interactive()

In [28]:
alt.Chart(df_proj).mark_circle(
    opacity=0.6
).encode(
    x='umapX',
    y='umapY',
    color=alt.Color('island', scale=alt.Scale(scheme='dark2'))
).properties(
    width=500,
    height=400,
    title="UMAP projected penguins data"
).interactive() | alt.Chart(df_proj).mark_circle(
    opacity=0.6
).encode(
    x='tsneX',
    y='tsneY',
    color=alt.Color('island', scale=alt.Scale(scheme='dark2'))
).properties(
    width=500,
    height=400,
    title="TSNE projected penguins data"
).interactive()

In [13]:
alt.Chart(df_proj).mark_circle(
    opacity=0.6
).encode(
    x='umapX',
    y='umapY',
    color=alt.Color('sex', scale=alt.Scale(scheme='set1'))
).properties(
    width=500,
    height=400,
    title="UMAP projected penguins data"
).interactive() | alt.Chart(df_proj).mark_circle(
    opacity=0.6
).encode(
    x='tsneX',
    y='tsneY',
    color=alt.Color('sex', scale=alt.Scale(scheme='set1'))
).properties(
    width=500,
    height=400,
    title="TSNE projected penguins data"
).interactive()

### without species

In [34]:
features = pd.get_dummies(df.drop('species', axis=1)).dropna()
features = preprocessing.MinMaxScaler().fit_transform(features) # scale numerical to [0,1]

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
).fit(features)
df_tsne_coords = pd.DataFrame(tsne, columns=['tsneX','tsneY'])

umap = reducer.fit_transform(features)
df_umap_coords = pd.DataFrame(umap, columns=['umapX','umapY'])

df_proj = pd.concat([df, df_umap_coords, df_tsne_coords], axis='columns')



Iteration   50, KL divergence  1.2065, 50 iterations in 1.7963 sec
Iteration  100, KL divergence  1.0106, 50 iterations in 1.8172 sec
Iteration  150, KL divergence  0.9516, 50 iterations in 1.5886 sec
Iteration  200, KL divergence  0.9225, 50 iterations in 1.8248 sec
Iteration  250, KL divergence  0.9056, 50 iterations in 1.6807 sec
Iteration   50, KL divergence  0.1899, 50 iterations in 1.5853 sec
Iteration  100, KL divergence  0.1484, 50 iterations in 1.6293 sec
Iteration  150, KL divergence  0.1394, 50 iterations in 1.8299 sec
Iteration  200, KL divergence  0.1376, 50 iterations in 1.7159 sec
Iteration  250, KL divergence  0.1346, 50 iterations in 1.7020 sec
Iteration  300, KL divergence  0.1338, 50 iterations in 1.7268 sec
Iteration  350, KL divergence  0.1319, 50 iterations in 1.6982 sec
Iteration  400, KL divergence  0.1299, 50 iterations in 1.7118 sec
Iteration  450, KL divergence  0.1315, 50 iterations in 1.6734 sec
Iteration  500, KL divergence  0.1305, 50 iterations in 1.8245

In [35]:

alt.Chart(df_proj).mark_point(
    opacity=0.6
).encode(
    x='umapX',
    y='umapY',
    color='species'
).properties(
    width=500,
    height=400,
    title="UMAP projected penguins data"
).interactive() | alt.Chart(df_proj).mark_point(
    opacity=0.6
).encode(
    x='tsneX',
    y='tsneY',
    color='species'
).properties(
    width=500,
    height=400,
    title="TSNE projected penguins data"
).interactive()

# Centroid Position = mean of projected x/y coordinates

In [14]:
proj_species_mean = df_proj.groupby(['species']).mean().reset_index()
proj_species_mean

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,umapX,umapY,tsneX,tsneY
0,Adelie,38.791391,18.346358,189.953642,3700.662252,0.909972,0.919238,-12.983432,0.22617
1,Chinstrap,48.833824,18.420588,195.823529,3733.088235,18.260567,7.359754,0.789377,11.447087
2,Gentoo,47.504878,14.982114,217.186992,5076.01626,-1.545835,15.15399,15.736099,-6.662129


In [15]:
proj_island_mean = df_proj.groupby(['island']).mean().reset_index()
proj_island_mean

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,umapX,umapY,tsneX,tsneY
0,Biscoe,45.257485,15.87485,209.706587,4716.017964,-2.121365,12.701043,8.115547,-4.856868
1,Dream,44.167742,18.344355,193.072581,3712.903226,13.868167,6.135224,-2.734716,5.744399
2,Torgersen,38.95098,18.429412,191.196078,3706.372549,-3.385842,-7.310898,-19.386078,1.806434


In [16]:
proj_sex_mean = df_proj.groupby(['sex']).mean().reset_index()
proj_sex_mean

Unnamed: 0,sex,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,umapX,umapY,tsneX,tsneY
0,Female,42.09697,16.425455,197.363636,3862.272727,6.214885,9.342233,1.627767,6.688615
1,Male,45.854762,17.891071,204.505952,4545.684524,1.070043,5.49071,-1.49769,-6.626365


In [27]:
alt.Chart(df_proj).mark_circle(
    opacity=0.4
).encode(
    x='umapX',
    y='umapY',
    color='species'
).properties(
    width=500,
    height=400,
    title="UMAP projected iris data"
).interactive() + alt.Chart(proj_species_mean).mark_point(
    size=300,
    opacity=0.7,
    shape='diamond'
).encode(
    x='umapX',
    y='umapY',
    color='species'
).properties(
    width=500,
    height=400,
).interactive() + alt.Chart(pd.DataFrame(df_proj.mean()).transpose()).mark_point(
    size=300,
    opacity=0.7,
    shape='diamond',
    color='black'
).encode(
    x='umapX:Q',
    y='umapY:Q'
).properties(
    width=500,
    height=400,
).interactive() | alt.Chart(df_proj).mark_circle(
    opacity=0.4
).encode(
    x='tsneX',
    y='tsneY',
    color='species'
).properties(
    width=500,
    height=400,
    title="TSNE projected iris data"
).interactive() + alt.Chart(proj_species_mean).mark_point(
    size=300,
    opacity=0.7,
    shape='diamond'
).encode(
    x='tsneX',
    y='tsneY',
    color='species'
).properties(
    width=500,
    height=400,
).interactive() + alt.Chart(pd.DataFrame(df_proj.mean()).transpose()).mark_point(
    size=300,
    opacity=0.7,
    shape='diamond',
    color='black'
).encode(
    x='tsneX:Q',
    y='tsneY:Q'
).properties(
    width=500,
    height=400,
).interactive()

# Centroid Position = mean of high dimensional data

Drawback: needs out of sample extension or a new projection

In [18]:
features = pd.get_dummies(df.drop('species', axis=1))
features = features.join(df['species'])
species_means = features.groupby('species').mean().reset_index()
features = pd.get_dummies(features)
total_mean = features.mean()


In [19]:
df_w_means = features \
    .append(pd.get_dummies(species_means)) \
    .append(total_mean, ignore_index=True) \
    .dropna().reset_index(drop=True)
df_w_means

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_Female,sex_Male,species_Adelie,species_Chinstrap,species_Gentoo
0,39.100000,18.700000,181.000000,3750.000000,0.000000,0.000000,1.000000,0.000000,1.000000,1.00000,0.000000,0.000000
1,39.500000,17.400000,186.000000,3800.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.00000,0.000000,0.000000
2,40.300000,18.000000,195.000000,3250.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.00000,0.000000,0.000000
3,36.700000,19.300000,193.000000,3450.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.00000,0.000000,0.000000
4,39.300000,20.600000,190.000000,3650.000000,0.000000,0.000000,1.000000,0.000000,1.000000,1.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
341,49.900000,16.100000,213.000000,5400.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.00000,0.000000,1.000000
342,38.791391,18.346358,189.953642,3700.662252,0.289474,0.368421,0.342105,0.480263,0.480263,1.00000,0.000000,0.000000
343,48.833824,18.420588,195.823529,3733.088235,0.000000,1.000000,0.000000,0.500000,0.500000,0.00000,1.000000,0.000000
344,47.504878,14.982114,217.186992,5076.016260,1.000000,0.000000,0.000000,0.467742,0.491935,0.00000,0.000000,1.000000


In [20]:
features_w_means = preprocessing.MinMaxScaler().fit_transform(df_w_means) # scale numerical to [0,1]

In [21]:
tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)

tsne = tsne.fit(features_w_means)
df_tsne_coords = pd.DataFrame(tsne, columns=['tsneX','tsneY'])



Iteration   50, KL divergence  1.2619, 50 iterations in 1.7116 sec
Iteration  100, KL divergence  0.9602, 50 iterations in 1.6830 sec
Iteration  150, KL divergence  0.8750, 50 iterations in 1.7171 sec
Iteration  200, KL divergence  0.8388, 50 iterations in 1.7671 sec
Iteration  250, KL divergence  0.8155, 50 iterations in 1.6482 sec
Iteration   50, KL divergence  0.1810, 50 iterations in 1.6655 sec
Iteration  100, KL divergence  0.1458, 50 iterations in 1.6777 sec
Iteration  150, KL divergence  0.1378, 50 iterations in 1.6866 sec
Iteration  200, KL divergence  0.1358, 50 iterations in 1.6298 sec
Iteration  250, KL divergence  0.1332, 50 iterations in 1.6480 sec
Iteration  300, KL divergence  0.1330, 50 iterations in 1.5934 sec
Iteration  350, KL divergence  0.1310, 50 iterations in 1.6027 sec
Iteration  400, KL divergence  0.1304, 50 iterations in 1.6342 sec
Iteration  450, KL divergence  0.1303, 50 iterations in 1.6881 sec
Iteration  500, KL divergence  0.1277, 50 iterations in 1.6216

In [22]:
reducer = UMAP()
umap = reducer.fit_transform(features_w_means)
df_umap_coords = pd.DataFrame(umap, columns=['umapX','umapY'])

In [23]:
df_proj_means = pd.concat([df_w_means, df_umap_coords, df_tsne_coords], axis='columns')
df_proj_means[342:]

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_Female,sex_Male,species_Adelie,species_Chinstrap,species_Gentoo,umapX,umapY,tsneX,tsneY
342,38.791391,18.346358,189.953642,3700.662252,0.289474,0.368421,0.342105,0.480263,0.480263,1.0,0.0,0.0,14.365026,15.831412,-10.065147,5.332857
343,48.833824,18.420588,195.823529,3733.088235,0.0,1.0,0.0,0.5,0.5,0.0,1.0,0.0,-10.784004,2.027154,4.787337,18.18896
344,47.504878,14.982114,217.186992,5076.01626,1.0,0.0,0.0,0.467742,0.491935,0.0,0.0,1.0,1.948084,8.001014,18.502996,-2.593374
345,43.92193,17.15117,200.915205,4201.754386,0.488372,0.360465,0.151163,0.479651,0.488372,0.44186,0.197674,0.360465,23.400063,10.023719,-10.050495,5.353776


In [24]:
def get_species(row):
    for c in ['species_Adelie', 'species_Chinstrap', 'species_Gentoo']:
        if row[c]==1:
            return c

In [25]:
df_proj_means['species'] = df_proj_means.apply(get_species, axis=1)
df_proj_means

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_Female,sex_Male,species_Adelie,species_Chinstrap,species_Gentoo,umapX,umapY,tsneX,tsneY,species
0,39.100000,18.700000,181.000000,3750.000000,0.000000,0.000000,1.000000,0.000000,1.000000,1.00000,0.000000,0.000000,16.416300,-0.858982,-2.241017,2.411507,species_Adelie
1,39.500000,17.400000,186.000000,3800.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.00000,0.000000,0.000000,6.864489,-3.594776,-6.154075,-1.769424,species_Adelie
2,40.300000,18.000000,195.000000,3250.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.00000,0.000000,0.000000,6.946867,-3.511586,-5.916976,-1.707740,species_Adelie
3,36.700000,19.300000,193.000000,3450.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.00000,0.000000,0.000000,6.663188,-3.287148,-5.676546,-1.822454,species_Adelie
4,39.300000,20.600000,190.000000,3650.000000,0.000000,0.000000,1.000000,0.000000,1.000000,1.00000,0.000000,0.000000,16.195524,-1.009139,-1.843254,2.444481,species_Adelie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,49.900000,16.100000,213.000000,5400.000000,1.000000,0.000000,0.000000,0.000000,1.000000,0.00000,0.000000,1.000000,3.005814,7.867781,20.526555,-1.130193,species_Gentoo
342,38.791391,18.346358,189.953642,3700.662252,0.289474,0.368421,0.342105,0.480263,0.480263,1.00000,0.000000,0.000000,14.365026,15.831412,-10.065147,5.332857,species_Adelie
343,48.833824,18.420588,195.823529,3733.088235,0.000000,1.000000,0.000000,0.500000,0.500000,0.00000,1.000000,0.000000,-10.784004,2.027154,4.787337,18.188960,species_Chinstrap
344,47.504878,14.982114,217.186992,5076.016260,1.000000,0.000000,0.000000,0.467742,0.491935,0.00000,0.000000,1.000000,1.948084,8.001014,18.502996,-2.593374,species_Gentoo


In [26]:
alt.Chart(df_proj_means[:342]).mark_circle(
    opacity=0.4
).encode(
    x='umapX',
    y='umapY',
    color='species'
).properties(
    width=500,
    height=400,
    title="UMAP projected iris data"
).interactive() + alt.Chart(df_proj_means[342:]).mark_point(
    size=300,
    opacity=0.7,
    shape='diamond'
).encode(
    x='umapX',
    y='umapY',
    color='species'
).properties(
    width=500,
    height=400,
).interactive() | alt.Chart(df_proj_means[:342]).mark_circle(
    opacity=0.4
).encode(
    x='tsneX',
    y='tsneY',
    color='species'
).properties(
    width=500,
    height=400,
    title="TSNE projected iris data"
).interactive() + alt.Chart(df_proj_means[342:]).mark_point(
    size=300,
    opacity=0.7,
    shape='diamond'
).encode(
    x='tsneX',
    y='tsneY',
    color='species'
).properties(
    width=500,
    height=400,
).interactive()