In [20]:
import csv
import collections
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import entropy
import networkx as nx
from operator import itemgetter

from sklearn.metrics.pairwise import pairwise_distances

import matplotlib.pyplot as plt
from matplotlib import pyplot, patches

import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot, plot
import plotly.graph_objs as go
import plotly.offline as offline
from tqdm._tqdm_notebook import tqdm

In [21]:
# Initialise plotly's offline notebook mode (imported from plotly.offline above).
init_notebook_mode(connected=True)

In [22]:
# Load the table from a Feather file; the path is relative to the working directory.
# The preview further down shows one column per topic (Topic1 ... Topic18).
df = pd.read_feather('finaldoc2topic.feather')

In [23]:
def jensen_shannon(_P, _Q):
    """Jensen-Shannon divergence (natural logarithm) between two discrete distributions.

    Parameters
    ----------
    _P, _Q : array-like of non-negative weights with the same shape
        The weights do not have to sum to 1; each input is normalised along
        axis 0 before use (the same axis scipy's ``entropy`` works on).

    Returns
    -------
    float (or array for 2-D input)
        0.5 * KL(P || M) + 0.5 * KL(Q || M) with M = 0.5 * (P + Q).
        For normalised inputs this lies between 0 and ln(2).
    """
    # Accept lists/tuples as well as arrays: with plain lists `_P + _Q` would
    # concatenate instead of adding element-wise.
    _P = np.asarray(_P, dtype=float)
    _Q = np.asarray(_Q, dtype=float)

    # Normalise BEFORE building the mixture. entropy() rescales each of its
    # arguments independently, so averaging un-normalised inputs would let the
    # input with the larger total dominate M and give a value that is not the
    # Jensen-Shannon divergence (it can even exceed ln 2).
    _P = _P / _P.sum(axis=0)
    _Q = _Q / _Q.sum(axis=0)

    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

In [24]:
df.shape   # (rows, columns); the recorded output is (237, 18)

(237, 18)

In [25]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18
0,0.031864,0.001086,0.005289,0.007907,0.006254,0.015224,0.013545,0.003002,0.000764,0.619946,1.4e-05,1.4e-05,0.10289,0.090591,0.011609,0.054163,0.013609,0.022229
1,0.000827,0.081257,0.002911,0.053077,0.575448,0.005392,0.000505,0.007939,0.009347,0.003647,0.034276,0.013535,0.016207,0.006142,0.006792,0.173229,0.003961,0.005508
2,0.007088,0.222558,0.009247,0.10758,0.003468,0.009972,0.016713,0.016657,0.046364,0.008369,0.040571,0.10203,0.028241,0.012212,0.009585,0.318069,0.021889,0.019387
3,0.00486,0.000439,0.021188,0.011603,0.042555,0.007211,0.012455,1.2e-05,0.092979,1.2e-05,0.008487,1.2e-05,1.2e-05,0.041125,1.2e-05,0.03354,0.616419,0.10708
4,0.008916,0.428269,0.010909,0.016861,0.001839,0.003052,0.004282,0.003109,0.03695,0.013114,0.035827,0.016337,0.013026,0.085354,0.005334,0.101793,0.177481,0.037548


In [26]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0.
# `.values` yields the same NumPy array and exists in old and new pandas alike.
data = df.values

# Dash

In [27]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go

## Create Data for Dash

In [28]:
# Pairwise Jensen-Shannon divergence between every pair of rows of `data`,
# computed from the topic probabilities in each row (18 columns per the shape above).
# `jensen_shannon` is passed as a Python callable metric, so it is evaluated per pair of rows.
pairwise_dist = pairwise_distances(X=data, metric=jensen_shannon)

In [29]:
# Arbitrary cutoffs used to decide whether two observations count as 'similar' or not;
# one adjacency matrix is built per cutoff.
threshold_all = [0.52, 0.54, 0.56, 0.58, 0.60]

# Entry (i, j) is 1 where the pairwise distance exceeds the cutoff and 0 elsewhere.
# NOTE(review): with `>` an edge joins pairs whose distance is LARGE - confirm that
# this is the intended reading of 'similar'.
adjacency = [
    np.where(pairwise_dist > cutoff, 1, 0)
    for cutoff in threshold_all
]

In [30]:
# Lookup table: threshold value -> adjacency matrix built with that threshold  # save
thresh_to_adj = dict(zip(threshold_all, adjacency))

In [31]:
def create_graph(adj):
    """Build a graph from an adjacency matrix and remove its isolated nodes.

    Parameters
    ----------
    adj : square NumPy array
        Adjacency matrix; a non-zero entry (i, j) produces an edge between
        nodes i and j.

    Returns
    -------
    networkx.Graph
        The graph with every isolate (node without any edge) removed. The
        remaining nodes keep their row index in `adj` as their label.
    """
    # nx.from_numpy_matrix was removed in networkx 3.0; from_numpy_array is its
    # replacement. Fall back to the old name only on releases that lack the new one.
    build_graph = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix
    G = build_graph(adj)

    # nx.isolates() is lazy, so materialise it before mutating the graph.
    isolates = list(nx.isolates(G))
    G.remove_nodes_from(isolates)
    return G

In [32]:
# Lookup table: threshold value -> graph built by create_graph() from that
# threshold's adjacency matrix  # save
thresh_to_graph = dict(zip(threshold_all, map(create_graph, adjacency)))

In [33]:
# Settings handed to the Fruchterman-Reingold layout
fruchterman_k = 5          # value passed as `k`
fruchterman_iter = 1000    # value passed as `iterations`

# Lookup table: threshold value -> node positions returned by the layout
# for that threshold's graph  # save
thresh_to_pos = {
    cutoff: nx.fruchterman_reingold_layout(
        graph_at_cutoff,
        k=fruchterman_k,
        iterations=fruchterman_iter,
    )
    for cutoff, graph_at_cutoff in thresh_to_graph.items()
}


In [34]:
# Lookup table: threshold value -> (x coordinates, y coordinates) of the nodes,
# both listed in ascending node-key order  # save
thresh_to_XnYn = {}
for cutoff, layout in tqdm(thresh_to_pos.items()):
    ordered_nodes = sorted(layout)
    xs = [layout[node][0] for node in ordered_nodes]
    ys = [layout[node][1] for node in ordered_nodes]
    thresh_to_XnYn[cutoff] = (xs, ys)
    

100%|██████████| 5/5 [00:00<00:00, 11250.82it/s]


In [38]:
threshold_all = [0.52, 0.54, 0.56, 0.58, 0.60]


def th_mark(x):
    """Return the slider label for threshold `x`.

    'Low' for the smallest value in `threshold_all`, 'High' for the largest,
    and an empty string for every other value.
    """
    if x == min(threshold_all):
        return 'Low'
    if x == max(threshold_all):
        return 'High'
    return ''


# Slider marks: threshold rendered as a string -> label shown at that position
threshold_mark = dict((str(th), th_mark(th)) for th in threshold_all)


In [None]:
app = dash.Dash()

# Upper panel: the figure that the slider callback fills in.
_graph_panel = html.Div(
    children=[dcc.Graph(id='graph-with-slider')],
    style={'width': '60%', 'pad': 100},
)

# Slider over the precomputed thresholds, starting at the middle one.
_threshold_slider = dcc.Slider(
    id='threshold-slider',
    min=min(threshold_all),
    max=max(threshold_all),
    value=threshold_all[len(threshold_all) // 2],
    step=None,
    marks=threshold_mark,
)

# Lower panel: heading plus the slider.
_slider_panel = html.Div(
    children=[html.H2('Similarity Cutoff'), _threshold_slider],
    style={'width': '47%', 'marginBottom': 0, 'marginTop': 0, 'marginLeft': 135,
           'fontSize': 15, 'font-family': 'Arial'},
)

app.layout = html.Div(children=[_graph_panel, _slider_panel])

@app.callback(
    dash.dependencies.Output('graph-with-slider', 'figure'),
    [dash.dependencies.Input('threshold-slider', 'value')])
def update_figure(selected_threshold):
    """Build the network figure for the threshold chosen on the slider.

    Parameters
    ----------
    selected_threshold : float
        Current value of the 'threshold-slider' component.

    Returns
    -------
    dict
        Figure specification with the keys 'data' (edge trace first, then
        node trace) and 'layout'.
    """
    # The slider value reaches this callback after a round trip through the
    # browser, so it is snapped to the closest precomputed threshold rather than
    # used as an exact float dictionary key (any rounding difference would
    # otherwise raise KeyError).
    threshold = min(thresh_to_pos, key=lambda t: abs(t - selected_threshold))

    # Precomputed node coordinates, layout and graph for this threshold.
    Xn, Yn = thresh_to_XnYn[threshold]
    pos = thresh_to_pos[threshold]
    G = thresh_to_graph[threshold]

    # Node trace: one marker per node.
    # NOTE(review): 'dot' does not appear in plotly's documented marker symbol
    # names - confirm it renders as intended.
    trace_nodes = dict(type='scatter',
                       x=Xn,
                       y=Yn,
                       mode='markers',
                       marker=dict(symbol='dot',
                                   size=10),
                       showlegend=True,
                       visible=True)

    # Coordinates of the edge end points. Each edge contributes its two ends
    # followed by None, which separates one edge's entries from the next.
    Xe = []
    Ye = []
    for start, end in G.edges():
        Xe.extend([pos[start][0], pos[end][0], None])
        Ye.extend([pos[start][1], pos[end][1], None])

    # Edge trace: all edges drawn as one scatter trace in 'lines' mode.
    trace_edges = dict(type='scatter',
                       mode='lines',
                       x=Xe,
                       y=Ye,
                       line=dict(width=0.1, color='rgb(51, 51, 51)'),
                       hoverinfo='none', showlegend=False)

    # Shared axis settings: hide axis line, zero line, grid, tick labels and title.
    axis = dict(showline=False,
                zeroline=False,
                showgrid=False,
                showticklabels=False,
                title=''
                )
    layout = dict(title='Network of Restaurants based on User Reviews',
                  font=dict(family='Balto'),
                  width=1000,
                  height=800,
                  autosize=False,
                  showlegend=True,
                  xaxis=axis,
                  yaxis=axis,
                  margin=dict(
                      l=40,
                      r=40,
                      b=85,
                      t=100,
                      pad=0,
                  ),
                  hovermode='closest',
                  plot_bgcolor='#efecea',  # background colour of the plotting area
                  )

    return {
        'data': [trace_edges, trace_nodes],
        'layout': layout}


# Start the Dash server only when this code is executed as the main module.
# host='0.0.0.0' listens on every network interface, so other machines can reach
# the app (the request log recorded below shows a remote client address).
# NOTE(review): confirm that exposing the server beyond localhost is intended.
if __name__ == '__main__':
    app.run_server(host='0.0.0.0')

 * Running on http://0.0.0.0:8050/ (Press CTRL+C to quit)
69.181.104.47 - - [06/May/2018 22:53:14] "GET / HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 22:53:14] "GET /_dash-layout HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 22:53:14] "GET /_dash-dependencies HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 22:53:14] "POST /_dash-update-component HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 22:53:14] "GET /favicon.ico HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 22:53:22] "GET / HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 22:53:23] "GET /_dash-layout HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 22:53:45] "GET /_dash-dependencies HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 22:53:45] "POST /_dash-update-component HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 23:04:32] "GET / HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 23:04:32] "GET /_dash-dependencies HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2018 23:04:32] "GET /_dash-layout HTTP/1.1" 200 -
69.181.104.47 - - [06/May/2

# To do:

Find a good value of `k` (the Fruchterman-Reingold layout parameter) for each threshold value, and use that per-threshold `k` when computing the layout for the plot.