In [1]:
import pandas as pd
import pylab as pl
import numpy as np
import umap
from umap.umap_ import nearest_neighbors
import tsne

In [2]:
# Read the three input tables from ./data. Each CSV is expected to contain an
# 'Unnamed: 0' column, which is dropped right after loading.
tng_photo, sdss_photo, tweb_df = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in (
        './data/tng_features.csv',
        './data/stellar_sdss_phot.csv',
        './data/tweb_cw_grid256_rcellsize.csv',
    )
)

In [3]:
# Join the three tables on 'ID' (pandas' default inner join). tng_photo is
# combined with tweb_df first, then joined onto sdss_photo so that the
# sdss_photo columns come first in the result.
df = sdss_photo.merge(
    tng_photo.merge(tweb_df, on='ID'),
    on='ID',
)
df

Unnamed: 0,ID,sdss_Mu,sdss_Mg,sdss_Mr,sdss_Mi,sdss_Mz,galaxy_class,pos_x,pos_y,pos_z,...,tng_Mz,star_metallicity,veldisp,sfr,flag,lambda1,lambda2,lambda3,delta,Environment
0,0.0,-19.918682,-21.427780,-22.115793,-22.478964,-22.796307,1.0,20130.394531,47349.105469,48801.187500,...,-24.677780,0.013450,502.348389,14.369511,1.0,1.954429,2.364169,3.441389,18.383154,0.0
1,1.0,-20.797749,-22.592024,-23.387402,-23.785805,-24.137964,1.0,20396.642578,48362.144531,48838.035156,...,-25.406290,0.024715,439.413666,0.368309,1.0,0.789115,1.492771,2.197094,8.225636,0.0
2,2.0,-21.605118,-22.969065,-23.532661,-23.849113,-24.145828,0.0,20295.615234,46623.386719,49167.097656,...,-24.955791,0.026480,323.013428,16.710186,1.0,0.310030,1.676064,2.537851,8.691577,0.0
3,3.0,-20.388245,-21.703148,-22.265253,-22.584154,-22.873615,0.0,18736.601562,48797.214844,49059.656250,...,-23.318478,0.022510,164.921753,3.290016,1.0,0.073656,0.907090,1.220985,4.218541,0.0
4,4.0,-19.744356,-21.458693,-22.222290,-22.607536,-22.939856,1.0,19634.787109,46360.535156,48662.789062,...,-23.505751,0.022659,191.186371,0.001781,1.0,-0.204095,1.108912,1.627283,1.236518,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364511,4490245.0,-7.376702,-8.596647,-9.148641,-9.423900,-9.556947,0.0,44448.542969,34485.988281,30202.750000,...,-9.575764,0.000000,6.852405,0.000000,1.0,0.013480,0.265502,0.477688,0.304635,0.0
364512,4493007.0,-7.929919,-9.245331,-9.829266,-10.115916,-10.286570,0.0,52411.878906,71483.054688,17944.683594,...,-10.352871,0.000000,3.944225,0.000000,1.0,-0.236132,0.278761,0.338984,-0.161306,1.0
364513,4514472.0,-8.059139,-9.316813,-9.870451,-10.172428,-10.342423,0.0,68211.531250,23783.765625,24512.123047,...,-10.383102,0.000000,3.719493,0.000000,1.0,0.027842,0.042858,0.175058,-0.906812,0.0
364514,4520126.0,-7.635355,-9.033590,-9.685241,-10.004305,-10.200866,1.0,67657.234375,6622.445801,38314.156250,...,-10.267644,0.000000,9.043399,0.000000,1.0,-0.151007,0.164176,0.290989,-0.534059,1.0


In [4]:
# List the column labels available in the merged table.
df.columns

Index(['ID', 'sdss_Mu', 'sdss_Mg', 'sdss_Mr', 'sdss_Mi', 'sdss_Mz',
       'galaxy_class', 'pos_x', 'pos_y', 'pos_z', 'vel_x', 'vel_y', 'vel_z',
       'spin_x', 'spin_y', 'spin_z', 'tng_MU', 'tng_MB', 'tng_MV', 'tng_MK',
       'tng_Mg', 'tng_Mr', 'tng_Mi', 'tng_Mz', 'star_metallicity', 'veldisp',
       'sfr', 'flag', 'lambda1', 'lambda2', 'lambda3', 'delta', 'Environment'],
      dtype='object')

In [5]:
# Columns used as input features for the embeddings below.
predictors = [
    'tng_MU', 'tng_MB', 'tng_MV', 'tng_MK',
    'tng_Mg', 'tng_Mr', 'tng_Mi', 'tng_Mz',
]
data = df.loc[:, predictors]

In [6]:
# Hyper-parameter grid explored by the UMAP fits below.
n_neighbors = [5, 50, 100, 250]
min_dists = [0, 0.2, 0.5, 0.9]

# Storage for one 2-column embedding per grid point. The leading axes are
# ordered (n_neighbors, min_dists) to match how the arrays are filled and
# plotted (`[i, j]` with i over n_neighbors and j over min_dists). The
# previous (min_dists, n_neighbors) order only worked because both lists
# happen to have the same length.
grid_shape = (len(n_neighbors), len(min_dists), len(data), 2)
normal_embeddings = np.zeros(grid_shape)
precomputed_knn_embeddings = np.zeros(grid_shape)

In [None]:
%%time
# Single UMAP fit (5 neighbours, min_dist 0, Euclidean metric) on the
# predictor columns; `embedding` is what the scatter plots below draw.
test_model = umap.UMAP(
    n_neighbors=5,
    min_dist=0.0,
    metric='euclidean',
)
embedding = test_model.fit_transform(data)

In [None]:
# Scatter the two embedding coordinates, coloured by sdss_Mg - sdss_Mr.
g_minus_r = df['sdss_Mg'] - df['sdss_Mr']
pl.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=g_minus_r,
    s=0.1,
)
pl.colorbar()
pl.show()

In [None]:
# Embedding coloured by sdss_Mg - sdss_Mr, split into two side-by-side panels:
# galaxy_class == 0 on the left and galaxy_class == 1 on the right.
fig = pl.figure(figsize=(10, 4))
for panel, class_value in enumerate((0, 1), start=1):
    in_class = df['galaxy_class'] == class_value
    pl.subplot(1, 2, panel)
    pl.scatter(
        embedding[:, 0][in_class],
        embedding[:, 1][in_class],
        c=(df['sdss_Mg'] - df['sdss_Mr'])[in_class],
        s=0.1,
    )
    pl.colorbar()
pl.show()

In [None]:
%%time
# Compute the k-nearest-neighbour data once so it can be passed to every UMAP
# fit in the parameter grid via `precomputed_knn`. It has to cover the largest
# n_neighbors value in the grid, so the size is taken from the grid itself
# rather than hard-coded (currently max(n_neighbors) == 250).
knn = nearest_neighbors(
    data,
    n_neighbors=max(n_neighbors),
    metric="euclidean",
    metric_kwds=None,
    angular=False,
    random_state=None,
)

In [None]:
# Feature maps using the precomputed kNN: fit one UMAP per
# (n_neighbors, min_dist) pair and store the result at [row, col], where row
# indexes n_neighbors and col indexes min_dists.
for row, n_nbrs in enumerate(n_neighbors):
    for col, min_dist in enumerate(min_dists):
        reducer = umap.UMAP(
            n_neighbors=n_nbrs,
            min_dist=min_dist,
            precomputed_knn=knn,
        )
        precomputed_knn_embeddings[row, col] = reducer.fit_transform(data)

In [None]:
# Grid of precomputed-kNN embeddings: one row per n_neighbors value and one
# column per min_dist value. squeeze=False keeps `axs` two-dimensional even if
# one of the parameter lists has a single entry.
fig, axs = pl.subplots(len(n_neighbors), len(min_dists), figsize=(20, 20), squeeze=False)

# Colour the points by sdss_Mg - sdss_Mr, the same quantity used for the
# single-embedding scatter plots. The previous version referenced an undefined
# `labels` array (with a 10-class colormap), which raised a NameError.
point_colour = df['sdss_Mg'] - df['sdss_Mr']

for i, ax_row in enumerate(axs):
    for j, ax in enumerate(ax_row):
        ax.scatter(precomputed_knn_embeddings[i, j, :, 0],
                   precomputed_knn_embeddings[i, j, :, 1],
                   c=point_colour,
                   alpha=0.1,
                   s=1,
                   )
        # The embedding coordinates carry no units, so hide the ticks.
        ax.set_xticks([])
        ax.set_yticks([])
        if i == 0:
            ax.set_title("min_dist = {}".format(min_dists[j]), size=15)
        if j == 0:
            ax.set_ylabel("n_neighbors = {}".format(n_neighbors[i]), size=15)
fig.suptitle("UMAP embedding of TNG photometric features with grid of parameters", y=0.92, size=20)
pl.subplots_adjust(wspace=0.05, hspace=0.05)