# Decide on a number of clusters dynamically and visualize that via Dash



In [1]:
import plotly.express as px
import plotly.io as pio
import pandas as pd

from jupyter_dash import JupyterDash
from jupyter_dash.comms import _send_jupyter_config_comm_request
import pickle

import plotly.graph_objects as go
import plotly.figure_factory as ff

import numpy as np
from scipy.spatial.distance import pdist, squareform

In [2]:
# Call jupyter_dash's private comm helper before the app is created.
# NOTE(review): presumably this asks the notebook front end for its server/proxy
# configuration so JupyterDash can build working URLs - confirm against jupyter_dash.
_send_jupyter_config_comm_request()

## Data import - CHECK PATH before running!

In [8]:
# Load the correlation matrix of the scenario.
# NOTE: pickle.load can execute arbitrary code - only load result files you trust.
with open('results/scenario_1/df_corr.pickle', 'rb') as corr_file:
    df_corr = pickle.load(corr_file)

# Load the MDS coordinates, drop the index column written by the CSV export
# and give the three coordinate columns readable names.
with open('results/scenario_1/mds_coordinates.csv', 'rb') as mds_file:
    mds = (
        pd.read_csv(mds_file)
        .drop(columns=["Unnamed: 0"])
        .rename(columns={"0":"Dim_1","1":"Dim_2","2":"Dim_3"})
    )

## Plot Dendrogram

In [9]:
# Turn the correlation values into dissimilarities: a correlation of 1 maps to distance 0.
df_dist = 1-df_corr

In [10]:
from scipy.cluster.hierarchy import linkage

# df_dist already holds pairwise distances. By default create_dendrogram would
# treat its rows as observation vectors (Euclidean pdist) and use complete
# linkage, which is a different hierarchy from the Ward linkage that is cut into
# clusters further down. Condense the matrix with squareform and use Ward
# linkage so the dendrogram shows the hierarchy that is actually clustered.
fig = ff.create_dendrogram(
    df_dist,
    labels=df_corr.columns,
    orientation="left",
    distfun=squareform,
    linkagefun=lambda condensed: linkage(condensed, method="ward"),
)
# Last expression of the cell: update_layout returns the figure, so it is displayed.
fig.update_layout(height=800)

## Visualize range of k in MDS space

### prepare data

In [11]:
# Candidate numbers of clusters offered to the user: 2 up to and including 14.
cluster_options = list(range(2, 15))

In [12]:
# The cluster assignments cannot be read directly from the plotly dendrogram.
# Plotly wraps SciPy, where the assignments are accessible, so we use SciPy directly for the data preparation.

from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import cut_tree
from scipy.cluster.hierarchy import linkage

def hierachy_k(dist_M, k):
    """Cut a Ward hierarchy built from a square distance matrix into k clusters.

    Parameters
    ----------
    dist_M : square distance matrix (n x n), e.g. 1 - Pearson correlation.
    k : number of clusters, forwarded unchanged to ``cut_tree``.

    Returns
    -------
    The array produced by ``scipy.cluster.hierarchy.cut_tree``: one row per
    observation, holding that observation's cluster id.
    """
    # linkage() takes the condensed (1-D) form of a precomputed distance matrix.
    pairwise = squareform(dist_M)
    # NOTE: per the SciPy docs, ``metric`` only matters when raw observation
    # vectors are passed; it is kept here to leave the call unchanged.
    tree = linkage(pairwise, method="ward", metric="euclidean")
    return cut_tree(tree, k)


# For every candidate k, store the cluster id of each observation as a string.
# Keys are the stringified k so they can serve as DataFrame column names.
clusterings = {
    str(n_clusters): [str(row[0]) for row in hierachy_k(df_dist, n_clusters)]
    for n_clusters in cluster_options
}

In [13]:
# One column per candidate k (named by the stringified k), one row per observation.
df_clusterings = pd.DataFrame(clusterings)

In [14]:
# Plotting table: keyword labels, the cluster id for every candidate k,
# and the MDS coordinates, joined side by side on the row position.
keyword_column = pd.DataFrame({"keywords":df_corr.index})
df_viz = pd.concat([keyword_column, df_clusterings, mds], axis="columns")

In [15]:
# Display the assembled table as the cell output for a visual sanity check.
df_viz

Unnamed: 0,keywords,2,3,4,5,6,7,8,9,10,11,12,13,14,Dim_1,Dim_2,Dim_3
0,bucharest,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.469808,0.383945,-0.336961
1,budapest,1,1,1,1,1,1,1,1,1,1,1,1,1,0.344556,-0.211076,0.519983
2,czech,1,1,1,1,1,1,1,1,1,2,2,2,2,0.370315,-0.331191,0.32247
3,hungary,0,0,0,2,2,2,2,2,2,3,3,3,3,-0.022742,0.564786,-0.293064
4,italy,0,2,2,3,3,3,3,3,3,4,4,4,4,-0.534108,-0.39312,0.202934
5,lisbon,0,2,2,3,4,4,4,4,4,5,5,5,5,-0.634678,0.021929,0.416707
6,portugal,0,0,3,4,5,5,5,5,5,6,6,6,6,0.318404,-0.101969,-0.595711
7,prague,0,0,0,2,2,2,6,6,6,7,7,7,7,0.112597,0.687948,0.131939
8,romania,1,1,1,1,1,6,7,7,7,8,8,8,8,0.510669,0.062654,0.330582
9,rome,0,0,3,4,5,5,5,5,8,9,9,9,9,0.269707,-0.179685,-0.433248


###  visualisation

In [16]:

import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output

import plotly.express as px
import pandas as pd

# Dash app with a 3D scatter plot of the MDS space and a slider that selects
# the number of clusters used to colour the points.
app = JupyterDash(__name__)

app.layout = html.Div([

    dcc.Graph(id='cluster-3d-graph',
             style={"width":'90vh',"height":"90vh",
                   "display":"block",
                   "margin-left":"auto",
                   "margin-right":"auto"}),
    dcc.Slider(
        id='cluster-slider',
        min=min(cluster_options),
        max=max(cluster_options),
        value=max(cluster_options),
        # With step=None the slider can only rest on its marks, so the marks must
        # be exactly the selectable cluster counts. Building them from
        # df_viz.columns also produced non-numeric marks ("keywords", "Dim_1", ...).
        marks={str(n): str(n) for n in cluster_options},
        step=None
    )

])


@app.callback(
    Output('cluster-3d-graph', 'figure'),
    Input('cluster-slider', 'value'))
def update_figure(selected_n):
    """Redraw the 3D scatter plot coloured by the clustering for the selected k.

    selected_n is the current slider value; its string form names the df_viz
    column that holds the cluster ids for that number of clusters.
    """
    cluster_column = str(selected_n)

    scatter = px.scatter_3d(
        df_viz,
        x='Dim_1',
        y='Dim_2',
        z='Dim_3',
        text="keywords",
        color=cluster_column,
        color_discrete_sequence=px.colors.qualitative.Dark24,
    )
    scatter.update_layout(transition_duration=500, legend_title_text="Cluster")

    return scatter


if __name__ == '__main__':
    # Start the app with mode='inline' (NOTE(review): presumably renders it in the
    # notebook cell output - confirm against jupyter_dash). A previously tried
    # alternative was run_server(debug=True).
    app.run_server(mode='inline')