In [33]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, show, push_notebook
from bokeh.layouts import column
from bokeh.models import ColumnDataSource
from ipywidgets import interact, widgets
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from bokeh.palettes import all_palettes as palette
import itertools  
output_notebook()

In [71]:
# Read the wholesale-customers table as integers; the first row of the file is
# skipped (presumably a header row -- confirm against the CSV).
complete_data = np.loadtxt(
    "Wholesale_customers_data.csv",
    delimiter=',',
    skiprows=1,
    dtype=int,
)

# One-hot encode the nominal columns (indices 0 and 1) into a dense array.
# NOTE(review): `categorical_features` only exists in older scikit-learn
# releases -- confirm the pinned version before upgrading.
enc = preprocessing.OneHotEncoder(sparse=False, categorical_features=[0, 1])
ed = enc.fit_transform(complete_data)

# Standardise the encoded matrix; every later clustering / embedding cell
# works on `scaled_data`.
scaled_data = preprocessing.scale(ed)

In [72]:
# Ten-entry 'Category10' palette from bokeh, held as a numpy array so it can be
# indexed directly with an array of integer cluster labels.
colours = np.array(palette['Category10'][10])
# Global upper bound for the cluster-count sliders: one palette colour per cluster.
max_clusters = len(colours)

In [73]:
# Initial slider state for the KMeans / t-SNE view. The update callback compares
# incoming slider values against these globals to decide what to recompute.
global_tsne_val_km = 10   # t-SNE perplexity
global_n_clus_km = 2      # number of KMeans clusters
global_tsne_lr_km = 200   # t-SNE learning rate

# 2-D embedding that provides the scatter positions.
tsn_xy = TSNE(
    perplexity=global_tsne_val_km,
    learning_rate=global_tsne_lr_km,
).fit_transform(scaled_data)

# Cluster on the scaled features (not on the embedding) and colour by label.
km_labels = KMeans(n_clusters=global_n_clus_km).fit_predict(scaled_data)
clrs = colours[km_labels]

kd_tsne = ColumnDataSource(data=dict(x=tsn_xy[:, 0], y=tsn_xy[:, 1], c=clrs))

p1 = figure(plot_width=400, plot_height=400, title="KMeans visualization using T-SNE")
# Keep the glyph renderer: the callback updates the plot through its data source.
kclus_tsne = p1.circle('x', 'y', source=kd_tsne, fill_color='c')

In [74]:
# The previously used t-SNE perplexity / learning rate and cluster count are kept
# in module-level globals. t-SNE is non-deterministic, so we don't want to run it
# again unless its parameters change; comparing against the globals ensures each
# calculation is only re-run when its own parameters change.
def update_kmeans_tsne(tsne_perplexity, tsne_learn_rate, n_clus):
    """Slider callback that refreshes the KMeans / t-SNE scatter plot.

    Parameters
    ----------
    tsne_perplexity : perplexity requested for the t-SNE embedding.
    tsne_learn_rate : learning rate requested for the t-SNE embedding.
    n_clus : number of KMeans clusters requested.

    The embedding ('x'/'y' columns) is recomputed only when the perplexity or
    learning rate differs from the stored values; the colours ('c' column) are
    recomputed only when the cluster count differs. The stored values are
    updated to match, and the notebook plot is pushed on every call.
    """
    global global_tsne_val_km, global_tsne_lr_km, global_n_clus_km

    source = kclus_tsne.data_source

    if (tsne_perplexity, tsne_learn_rate) != (global_tsne_val_km, global_tsne_lr_km):
        embedding = TSNE(perplexity=tsne_perplexity, learning_rate=tsne_learn_rate).fit_transform(scaled_data)
        source.data['x'] = embedding[:, 0]
        source.data['y'] = embedding[:, 1]
        global_tsne_val_km, global_tsne_lr_km = tsne_perplexity, tsne_learn_rate

    if n_clus != global_n_clus_km:
        labels = KMeans(n_clusters=n_clus).fit_predict(scaled_data)
        # Only the first n_clus palette entries are eligible colours.
        source.data['c'] = colours[:n_clus][labels]
        global_n_clus_km = n_clus

    # NOTE(review): no handle is passed, so this presumably targets the most
    # recently shown plot -- confirm when several plots are displayed.
    push_notebook()

In [75]:
# Sliders start at the stored initial state, so the first callback invocation
# finds nothing changed and does not recompute the embedding or the clusters.
km_tsne_slider = widgets.IntSlider(
    value=global_tsne_val_km, min=5, max=50, step=1,
)
km_cluster_slider = widgets.IntSlider(
    value=global_n_clus_km, min=2, max=max_clusters, step=1,
)
km_learn_rate_slider = widgets.FloatSlider(
    value=global_tsne_lr_km, min=10.0, max=1000.0, step=10.0,
)

# Show with a notebook handle so the callback's push_notebook() can update the plot in place.
show(p1, notebook_handle=True)
interact(
    update_kmeans_tsne,
    tsne_perplexity=km_tsne_slider,
    tsne_learn_rate=km_learn_rate_slider,
    n_clus=km_cluster_slider,
)

<function __main__.update_kmeans_tsne>

In [68]:
# Starting cluster count for the KMeans / PCA view (also seeds its slider).
init_clus = 3

# Two-component PCA projection of the scaled data, used as scatter positions.
pca_xy = PCA(n_components=2).fit_transform(scaled_data)

# Cluster on the scaled features and map each label to a palette colour.
km_labels = KMeans(n_clusters=init_clus).fit_predict(scaled_data)
clrs = colours[km_labels]

kd_pca = ColumnDataSource(data=dict(x=pca_xy[:, 0], y=pca_xy[:, 1], c=clrs))

p2 = figure(plot_width=400, plot_height=400, title="KMeans Visualization using PCA")
# Keep the glyph renderer: the callback recolours the plot through its data source.
kclus_pca = p2.circle('x', 'y', source=kd_pca, fill_color='c')

In [69]:
# PCA is deterministic and the projection stays 2-D, so the point positions
# never need updating; only the cluster colours are recomputed.
def update_kmeans_PCA(n_clus):
    """Slider callback that recolours the KMeans / PCA scatter plot.

    Parameters
    ----------
    n_clus : number of KMeans clusters to fit on the scaled data.

    KMeans is re-fitted on every call, the 'c' column of the plot's data source
    is replaced with the per-point colours, and the notebook plot is pushed.
    """
    labels = KMeans(n_clusters=n_clus).fit_predict(scaled_data)
    kclus_pca.data_source.data['c'] = colours[labels]
    # NOTE(review): no handle is passed, so this presumably targets the most
    # recently shown plot -- confirm when several plots are displayed.
    push_notebook()
    

In [70]:
# Cluster-count slider for the PCA view, starting at the initial cluster count
# and capped at the number of available palette colours.
km_pca_clus_slider = widgets.IntSlider(
    value=init_clus,
    min=2,
    max=max_clusters,
    step=1,
)
# Show with a notebook handle so the callback's push_notebook() can update the plot in place.
show(p2, notebook_handle=True)
interact(update_kmeans_PCA, n_clus=km_pca_clus_slider)

<function __main__.update_kmeans_PCA>

In [60]:
# Initial slider state for the agglomerative-clustering / t-SNE view. The update
# callback compares incoming slider values against these globals to decide what
# to recompute, so the initial plot must be built from exactly these values.
global_tsne_val_ac = 10   # t-SNE perplexity
global_n_clus_ac = 2      # number of agglomerative clusters
global_tsne_lr_ac = 200   # t-SNE learning rate

# Pass the learning rate explicitly rather than relying on the library default,
# so the stored "last used" value always matches what produced the embedding.
tsn_xy = TSNE(
    perplexity=global_tsne_val_ac,
    learning_rate=global_tsne_lr_ac,
).fit_transform(scaled_data)

# Use the stored cluster count instead of a separate hard-coded literal.
ac_labels = AgglomerativeClustering(n_clusters=global_n_clus_ac).fit_predict(scaled_data)
clrs = colours[ac_labels]
ac_tsne = ColumnDataSource(data={
    'x' : tsn_xy[:,0],
    'y' : tsn_xy[:,1],
    'c' : clrs
})

p3 = figure(plot_width=400, plot_height=400, title="Agglomerative clustering visualization using T-SNE")
# Keep the glyph renderer: the callback updates the plot through its data source.
ac = p3.circle('x', 'y', source=ac_tsne, fill_color='c')

In [65]:
# The previously used t-SNE perplexity / learning rate and cluster count are kept
# in module-level globals. t-SNE is non-deterministic, so we don't want to run it
# again unless its parameters change; comparing against the globals ensures each
# calculation is only re-run when its own parameters change.
def update_AC_tsne(tsne_perplexity, tsne_learn_rate, n_clus):
    """Slider callback that refreshes the agglomerative-clustering / t-SNE plot.

    Parameters
    ----------
    tsne_perplexity : perplexity requested for the t-SNE embedding.
    tsne_learn_rate : learning rate requested for the t-SNE embedding.
    n_clus : number of agglomerative clusters requested.

    The embedding ('x'/'y' columns) is recomputed only when the perplexity or
    learning rate differs from the stored values; the colours ('c' column) are
    recomputed only when the cluster count differs. The stored values are
    updated to match, and the notebook plot is pushed on every call.
    """
    global global_tsne_val_ac, global_tsne_lr_ac, global_n_clus_ac

    source = ac.data_source

    requested_tsne = (tsne_perplexity, tsne_learn_rate)
    if requested_tsne != (global_tsne_val_ac, global_tsne_lr_ac):
        embedding = TSNE(perplexity=tsne_perplexity, learning_rate=tsne_learn_rate).fit_transform(scaled_data)
        source.data['x'] = embedding[:, 0]
        source.data['y'] = embedding[:, 1]
        global_tsne_val_ac, global_tsne_lr_ac = requested_tsne

    if n_clus != global_n_clus_ac:
        labels = AgglomerativeClustering(n_clusters=n_clus).fit_predict(scaled_data)
        source.data['c'] = colours[labels]
        global_n_clus_ac = n_clus

    # NOTE(review): no handle is passed, so this presumably targets the most
    # recently shown plot -- confirm when several plots are displayed.
    push_notebook()

In [67]:
# Sliders start at the stored initial state, so the first callback invocation
# finds nothing changed and does not recompute the embedding or the clusters.
ac_tsne_slider = widgets.IntSlider(value=global_tsne_val_ac,min=5,max=50,step=1)
ac_cluster_slider = widgets.IntSlider(value=global_n_clus_ac,min=2,max=max_clusters,step=1)
# The learning-rate slider must start at the stored learning rate (not the
# cluster count): otherwise the first callback sees a spurious parameter change
# and re-runs t-SNE with a learning rate the user never chose.
ac_learn_rate_slider = widgets.FloatSlider(value=global_tsne_lr_ac,min=10.0,max=1000.0,step=10.0)
# Show with a notebook handle so the callback's push_notebook() can update the plot in place.
show(p3, notebook_handle=True)
interact(update_AC_tsne, tsne_perplexity=ac_tsne_slider,tsne_learn_rate=ac_learn_rate_slider, n_clus=ac_cluster_slider)

<function __main__.update_AC_tsne>