In [1]:
import dash
from dash.dependencies import Input, Output
import dash_html_components as html
import dash_core_components as dcc
import pandas as pd
import flask
from flask_cors import CORS
import os
import numpy as np

from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats import entropy
import networkx as nx

In [2]:
df = pd.read_csv('./small_molecule_drugbank.csv').drop(['Unnamed: 0'],axis=1)

In [3]:
df.head()

Unnamed: 0,NAME,PAGE,IMG_URL,LOGP,PKA,MW,FORM,SOL,DESC
0,Cyclacillin,https://www.drugbank.ca/drugs/DB01000,https://www.drugbank.ca/structures/DB01000/thu...,1.31,3.3,341.426,C15H23N3O4S,1.9,A cyclohexylamido analog of penicillanic acid....
1,Salbutamol,https://www.drugbank.ca/drugs/DB01001,https://www.drugbank.ca/structures/DB01001/thu...,1.4,10.12,239.3107,C13H21NO3,2.15,"Salbutamol is a short-acting, selective beta2-..."
2,Levobupivacaine,https://www.drugbank.ca/drugs/DB01002,https://www.drugbank.ca/structures/DB01002/thu...,3.6,13.62,288.4277,C18H28N2O,0.0977,Levobupivacaine is an amino-amide local anaest...
3,Cromoglicic acid,https://www.drugbank.ca/drugs/DB01003,https://www.drugbank.ca/structures/DB01003/thu...,1.92,1.77,468.3665,C23H16O11,0.0358,A chromone complex that acts by inhibiting the...
4,Ganciclovir,https://www.drugbank.ca/drugs/DB01004,https://www.drugbank.ca/structures/DB01004/thu...,-1.66,10.16,255.2306,C9H13N5O4,11.5,An acyclovir analog that is a potent inhibitor...


In [4]:
df_yelp = pd.read_feather('./df_final_doc2topics.feather')
df_yelp.drop(['business_id'], axis=1, inplace=True)
df = df.iloc[:df_yelp.shape[0]]  # reduce num of rows to be same as df_yelp


In [5]:
data = df_yelp.drop(['name', 'is_strip', 'stars'], axis=1).as_matrix()

In [6]:
def jensen_shannon(_P, _Q):
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

# Pairwise Jensen-Shannon distance between each pair of observations based on the 18 topic-probabilities
pairwise_dist = pairwise_distances(X=data, metric=jensen_shannon)

In [7]:
# find the topic most closely related to each restaurant
topic_closest_ind = np.argmax(data, axis=1)

topic_names_ord = ['Cost & Quality', 'Bars', 'Casino Hotel', 'Fine Dining', 'Asian', 'Pizza', 'Steakhouse', 
                   'Italian', 'Coffee Shop', 'High Customer Satisfaction', 'Night Club', 'Wait Time', 'Mexican', 
                   'Lunch', 'Sushi', 'Fast Food', 'Breakfast', 'Low Customer Satisfaction']

# names of topics most closely related to each restaurant (ordered by the order of restaurants in df)
topic_closest = [topic_names_ord[ind] for ind in topic_closest_ind]

In [8]:
# find the second most closely related topic for each restaurant
topic_closest_ind = np.argsort(data, axis=1)[::-1][:, :3]

In [9]:
topic_closest_ind

array([[ 0,  5, 13],
       [ 7, 11,  3],
       [ 1,  3,  7],
       [ 9,  4,  3],
       [ 9,  0,  7],
       [ 8,  2,  7],
       [14,  6,  8],
       [ 4,  0,  7],
       [ 6, 11,  2],
       [ 8,  2,  4],
       [ 0,  5,  7],
       [15,  6,  7],
       [ 9,  7,  5],
       [ 0,  1,  3],
       [17,  7,  9],
       [ 9,  5,  2],
       [ 4,  7, 12],
       [ 6, 12, 16],
       [11,  5,  8],
       [ 2, 11,  7],
       [11,  3, 17],
       [ 0, 14,  7],
       [ 7,  4, 13],
       [ 4,  6,  5],
       [ 6,  8, 10],
       [17,  2,  9],
       [ 1,  5,  6],
       [14,  7,  9],
       [ 7,  9,  2],
       [ 6,  7,  5],
       [ 0, 14,  3],
       [ 2,  7,  0],
       [ 6,  4,  5],
       [ 9,  4, 17],
       [ 4, 14,  3],
       [16,  3,  4],
       [ 7,  3,  5],
       [ 4, 12, 14],
       [ 7,  6,  5],
       [ 6,  1,  5],
       [ 5,  6,  2],
       [ 9, 14,  0],
       [ 6,  5,  1],
       [ 4,  5,  6],
       [ 6,  7,  3],
       [ 2,  9,  6],
       [12,  7,  2],
       [13,  

In [10]:
topic_names_ord = ['Cost & Quality', 'Bars', 'Casino Hotel', 'Fine Dining', 'Asian', 'Pizza', 'Steakhouse', 
                   'Italian', 'Coffee Shop', 'High Customer Satisfaction', 'Night Club', 'Wait Time', 'Mexican', 
                   'Lunch', 'Sushi Restaurant', 'Fast Food', 'Breakfast', 'Low Customer Satisfaction']

# names of topics most closely related to each restaurant (ordered by the order of restaurants in df)
topic_closest = [[topic_names_ord[ind] for ind in rest] for rest in topic_closest_ind]
# topic_closest = [topic_names_ord[ind] for ind in topic_closest_ind]

In [11]:
topic_closest

[['Cost & Quality', 'Pizza', 'Lunch'],
 ['Italian', 'Wait Time', 'Fine Dining'],
 ['Bars', 'Fine Dining', 'Italian'],
 ['High Customer Satisfaction', 'Asian', 'Fine Dining'],
 ['High Customer Satisfaction', 'Cost & Quality', 'Italian'],
 ['Coffee Shop', 'Casino Hotel', 'Italian'],
 ['Sushi Restaurant', 'Steakhouse', 'Coffee Shop'],
 ['Asian', 'Cost & Quality', 'Italian'],
 ['Steakhouse', 'Wait Time', 'Casino Hotel'],
 ['Coffee Shop', 'Casino Hotel', 'Asian'],
 ['Cost & Quality', 'Pizza', 'Italian'],
 ['Fast Food', 'Steakhouse', 'Italian'],
 ['High Customer Satisfaction', 'Italian', 'Pizza'],
 ['Cost & Quality', 'Bars', 'Fine Dining'],
 ['Low Customer Satisfaction', 'Italian', 'High Customer Satisfaction'],
 ['High Customer Satisfaction', 'Pizza', 'Casino Hotel'],
 ['Asian', 'Italian', 'Mexican'],
 ['Steakhouse', 'Mexican', 'Breakfast'],
 ['Wait Time', 'Pizza', 'Coffee Shop'],
 ['Casino Hotel', 'Wait Time', 'Italian'],
 ['Wait Time', 'Fine Dining', 'Low Customer Satisfaction'],
 ['Cost 

In [12]:
vis = pd.read_pickle('vis.pkl')

In [13]:
def get_relevant_words(vis,lam=0.3,topn=10):
    a = vis.topic_info
    a['finalscore'] = a['logprob']*lam+(1-lam)*a['loglift']
    a = a.loc[:,['Category','Term','finalscore']].groupby(['Category'])\
    .apply(lambda x: x.sort_values(by='finalscore',ascending=False).head(topn))
    a = a.loc[:,'Term'].reset_index().loc[:,['Category','Term']]
    a = a[a['Category']!='Default']
    a = a.to_dict('split')['data']
    d ={}
    for k,v in a: 
        if k not in d.keys():
            d[k] =set()
            d[k].add(v)
        else:
            d[k].add(v)
    finalData = pd.DataFrame([],columns=['Topic','words with Relevance'])
    finalData['Topic']=d.keys()
    finalData['words with Relevance']=d.values()
    return finalData

In [14]:
def get_top_n_words_list(num_topics, vis, lam=0.6, topn=5):
    """returns a sorted list of top n words, where the list follows the order Topic 1, ..., Topic n.
    Each element of the list is a string composed of a list of the top n words
    num_topics: number of topics
    vis: pyLDAvis object
    lam: relevance value
    topn: number of topics
    """
    topic_ids_ordered = ['Topic' + str(num) for num in range(1, num_topics + 1)]
    top_topic_words_df = get_relevant_words(vis, lam, topn)
    top_topic_words_df.set_index('Topic', drop=True, inplace=True)
    top_topic_words = [top_topic_words_df.loc[topic_id]['words with Relevance'] for topic_id in topic_ids_ordered]
    top_topic_words_display = [', '.join(words) for words in top_topic_words]
    return top_topic_words_display

In [15]:
# sample usage and output
len(get_top_n_words_list(num_topics=18, vis=vis, lam=0.6, topn=3))

18

In [16]:
top3words = get_top_n_words_list(num_topics=18, vis=vis, lam=0.6, topn=3)

In [17]:
top3words

['food, good, price',
 'drink, beer, bar',
 'hotel, stay, buffet',
 'restaurant, dinner, dessert',
 'noodle, thai, rice',
 'slice, wing, pizza',
 'steakhouse, cook, steak',
 'salad, pasta, bread',
 'coffee, tea, chocolate',
 'food, service, great',
 'business, door, walk',
 'minute, table, wait',
 'taco, burrito, mexican',
 'lunch, location, place',
 'roll, fish, sushi',
 'burger, fry, shake',
 'egg, chicken, sandwich',
 'tell, order, bad']

In [18]:
topic_names_ord

['Cost & Quality',
 'Bars',
 'Casino Hotel',
 'Fine Dining',
 'Asian',
 'Pizza',
 'Steakhouse',
 'Italian',
 'Coffee Shop',
 'High Customer Satisfaction',
 'Night Club',
 'Wait Time',
 'Mexican',
 'Lunch',
 'Sushi Restaurant',
 'Fast Food',
 'Breakfast',
 'Low Customer Satisfaction']

In [19]:
topic_relwords_mapper = {}
for i in range(len(topic_names_ord)):
    topic_relwords_mapper[topic_names_ord[i]] = top3words[i]


In [20]:
topic_closest[0]

['Cost & Quality', 'Pizza', 'Lunch']

In [21]:
topic1_for_disp = []
topic2_for_disp = []
topic3_for_disp = []
for review in topic_closest:
    str1 = review[0] + ': ' + topic_relwords_mapper[review[0]]
    topic1_for_disp.append(str1)
    str2 = review[1] + ': ' + topic_relwords_mapper[review[1]]
    topic2_for_disp.append(str2)
    str3 = review[2] + ': ' + topic_relwords_mapper[review[2]]
    topic3_for_disp.append(str3)

In [22]:
topic2_for_disp

['Pizza: slice, wing, pizza',
 'Wait Time: minute, table, wait',
 'Fine Dining: restaurant, dinner, dessert',
 'Asian: noodle, thai, rice',
 'Cost & Quality: food, good, price',
 'Casino Hotel: hotel, stay, buffet',
 'Steakhouse: steakhouse, cook, steak',
 'Cost & Quality: food, good, price',
 'Wait Time: minute, table, wait',
 'Casino Hotel: hotel, stay, buffet',
 'Pizza: slice, wing, pizza',
 'Steakhouse: steakhouse, cook, steak',
 'Italian: salad, pasta, bread',
 'Bars: drink, beer, bar',
 'Italian: salad, pasta, bread',
 'Pizza: slice, wing, pizza',
 'Italian: salad, pasta, bread',
 'Mexican: taco, burrito, mexican',
 'Pizza: slice, wing, pizza',
 'Wait Time: minute, table, wait',
 'Fine Dining: restaurant, dinner, dessert',
 'Sushi Restaurant: roll, fish, sushi',
 'Asian: noodle, thai, rice',
 'Steakhouse: steakhouse, cook, steak',
 'Coffee Shop: coffee, tea, chocolate',
 'Casino Hotel: hotel, stay, buffet',
 'Pizza: slice, wing, pizza',
 'Italian: salad, pasta, bread',
 'High Cus

In [23]:
len(topic1_for_disp)

237

In [24]:
topic_relwords_mapper['Italian']

'salad, pasta, bread'

In [25]:
threshold2k ={
    0.55: 0.7,
    0.56: 0.9,
    0.57: 0.3,
    0.58: 5,
    0.59: 2,
    0.6: 5,
    0.61: 5,
    0.62: 5
}

In [26]:
# arbitrary threshold for deciding whether 2 observations are 'similar' or not
threshold_all = [0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62]
def th_mark(x):
    if x==np.min(threshold_all):
        return 'Low'
    elif x==np.max(threshold_all):
        return 'High'
    else:
        return ''
    
threshold_mark = {str(th):th_mark(th) for th in threshold_all}
adjacency = [np.where(pairwise_dist > threshold, 1, 0) for threshold in threshold_all]

In [27]:
# map threshold value to adjacency matrix
thresh_to_adj = {thresh: adj for thresh, adj in zip(threshold_all, adjacency)}

In [28]:
def create_graph(adj):
    # input: adjaccency matrix
    # returns a graph with the isolates removed
    G = nx.from_numpy_matrix(adj)
    isolates = list(nx.isolates(G))
    G.remove_nodes_from(isolates)
    return G

In [29]:
# map threshold value to graph
thresh_to_graph = {thresh: create_graph(adj) for thresh, adj in zip(threshold_all, adjacency)}

In [30]:
# extract node positions
fruchterman_iter = 1000

# map threshold values to positions of nodes
thresh_to_pos = {}

for thresh in thresh_to_graph:
    graph = nx.fruchterman_reingold_layout(thresh_to_graph[thresh], k = threshold2k[thresh], iterations=fruchterman_iter)
    thresh_to_pos[thresh] = graph

In [31]:
thresh_to_XnYn = {}
for thresh in thresh_to_pos:
    pos = thresh_to_pos[thresh]
    # define lists of node coordinates
    Xn_strip = [pos[k][0] for k in sorted(pos.keys()) if k in df_yelp.index[df_yelp.is_strip == True]]
    Yn_strip = [pos[k][1] for k in sorted(pos.keys()) if k in df_yelp.index[df_yelp.is_strip == True]]
    Xn_notstrip = [pos[k][0] for k in sorted(pos.keys()) if k in df_yelp.index[df_yelp.is_strip == False]]
    Yn_notstrip = [pos[k][1] for k in sorted(pos.keys()) if k in df_yelp.index[df_yelp.is_strip == False]]
    thresh_to_XnYn[thresh] = (Xn_strip, Yn_strip, Xn_notstrip, Yn_notstrip)

In [32]:
# concatenating the 2 dataframes
df = pd.concat([df, df_yelp], axis=1)

In [33]:
df.stars = df.stars.astype(str)
df.stars = df.stars + ' stars'

In [34]:
df['temp_id'] = df.index
df['topic1_for_disp'] = topic1_for_disp
df['topic2_for_disp'] = topic2_for_disp
df['topic3_for_disp'] = topic3_for_disp

# map threshold value to a list of nodes left over in the graph after isolate removal
thresh_to_nodenums = {}
for thresh in thresh_to_graph:
    graph = thresh_to_graph[thresh]
    nodenums = list(graph.nodes())
    thresh_to_nodenums[thresh] = nodenums
    
# format of thresh_to_XnYn's output:
# thresh_to_XnYn[thresh] = (Xn_strip, Yn_strip, Xn_notstrip, Yn_notstrip)

stacked_df = []
for thresh in thresh_to_XnYn:
    # for each threshold, create a copy of df
    df_temp = df.copy()
    
    # positions of nodes for the graph
    Xn_strip, Yn_strip, Xn_notstrip, Yn_notstrip = thresh_to_XnYn[thresh]
    # nodes left after removing isolates
    nodenums = thresh_to_nodenums[thresh]
    
    df_temp['threshold'] = thresh
    # initialize Xn and Yn
    df_temp['Xn'] = np.nan
    df_temp['Yn'] = np.nan
    # fill in X and Y positions of nodes for non-isolate nodes
    df_temp.loc[(df_temp.is_strip == True) & (df_temp.temp_id.isin(nodenums)), 'Xn'] = Xn_strip
    df_temp.loc[(df_temp.is_strip == True) & (df_temp.temp_id.isin(nodenums)), 'Yn'] = Yn_strip
    df_temp.loc[(df_temp.is_strip == False) & (df_temp.temp_id.isin(nodenums)), 'Xn'] = Xn_notstrip
    df_temp.loc[(df_temp.is_strip == False) & (df_temp.temp_id.isin(nodenums)), 'Yn'] = Yn_notstrip
    
    stacked_df.append(df_temp)
    
# vertically stack the dataframes in stacked_df
df_final = pd.concat(stacked_df, axis=0, ignore_index=True)

In [35]:
df_final.shape

(1896, 37)

In [36]:
df_final.dtypes

NAME                object
PAGE                object
IMG_URL             object
LOGP               float64
PKA                float64
MW                 float64
FORM                object
SOL                float64
DESC                object
name                object
is_strip            object
stars               object
Topic1             float64
Topic2             float64
Topic3             float64
Topic4             float64
Topic5             float64
Topic6             float64
Topic7             float64
Topic8             float64
Topic9             float64
Topic10            float64
Topic11            float64
Topic12            float64
Topic13            float64
Topic14            float64
Topic15            float64
Topic16            float64
Topic17            float64
Topic18            float64
temp_id              int64
topic1_for_disp     object
topic2_for_disp     object
topic3_for_disp     object
threshold          float64
Xn                 float64
Yn                 float64
d

In [37]:
df_final.drop(['NAME', 'FORM', 'SOL', 'DESC', 'LOGP', 'PKA', 'MW'], axis=1, inplace=True)

In [38]:
df_final.rename(columns={'name': 'NAME'}, inplace=True)

In [39]:
df_final.dtypes

PAGE                object
IMG_URL             object
NAME                object
is_strip            object
stars               object
Topic1             float64
Topic2             float64
Topic3             float64
Topic4             float64
Topic5             float64
Topic6             float64
Topic7             float64
Topic8             float64
Topic9             float64
Topic10            float64
Topic11            float64
Topic12            float64
Topic13            float64
Topic14            float64
Topic15            float64
Topic16            float64
Topic17            float64
Topic18            float64
temp_id              int64
topic1_for_disp     object
topic2_for_disp     object
topic3_for_disp     object
threshold          float64
Xn                 float64
Yn                 float64
dtype: object