# TA Dataset Interactive Visualization (Bokeh)

In [None]:
#! /usr/bin/env python3
# coding: utf-8

import logging
import pandas as pd
import numpy as np

#For Bokeh plotting
from bokeh.io import output_file, show, curdoc, output_notebook, reset_output
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, CategoricalColorMapper, HoverTool, WMTSTileSource
from bokeh.models.widgets import CheckboxGroup, CheckboxButtonGroup, Button, Select, Paragraph
from bokeh.layouts import widgetbox, row, column #,gridplot, Tabs, Panel
from bokeh.transform import jitter
from bokeh.palettes import Spectral11
from bokeh import tile_providers

#Bokeh server
from bokeh.server.server import Server
from bokeh.application import Application
from bokeh.application.handlers.function import FunctionHandler

output_notebook()

In [None]:
!pip list --format=legacy > requirements.txt

In [None]:
#Enable display of log messages of level INFO and above (this includes the
#INFO messages emitted by the Bokeh server used later in this notebook)
logging.basicConfig(level=logging.INFO)

In [None]:
#Raises the notebook IOPub data rate limit so that large plot payloads are not dropped.
#NOTE(review): this looks like a reminder of the command line to use when launching
#Jupyter; executed from a cell it would presumably start a second notebook server
#rather than reconfigure the running one - confirm before running.
!jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000

In [None]:
#Clear Bokeh's current output settings (presumably to undo the output_notebook()
#call made in the import cell - confirm)
reset_output()

## Loading of the Dataset

In [None]:
#Load the dataset; the first CSV column is used as the index
dataset = pd.read_csv('TA_restaurants_curated.csv', encoding='utf8', index_col=0)
print(dataset.head(), '\n')

#Keep 'Cuisine Style' as a generic object column. `list` is not a dtype: casting
#to it is at best a plain object cast and is rejected by recent pandas versions.
dataset['Cuisine Style'] = dataset['Cuisine Style'].astype(object)

#Replace missing values by an explicit marker so that they form their own category
dataset['Rating'] = dataset['Rating'].fillna('?').astype(str)
dataset['Price Range'] = dataset['Price Range'].fillna('Unknown').astype(str)
#A missing number of reviews is counted as zero reviews
dataset['Number of Reviews'] = dataset['Number of Reviews'].fillna(0)

print('\n'+'Dataset info after fillna()')
#info() prints by itself and returns None, so it must not be wrapped in print()
dataset.info()

## Interactive visualization of the dataset
Interactive graphs (built with Bokeh) that show restaurant data, such as the number of restaurants, the number of reviews and the rates, at two levels:
- Global: all restaurants individual properties
- Aggregated by cities : restaurants properties aggregated to the scale of the city

### Global
- Plot: display x:rank, y: number of reviews, color:price range
- Bokeh Server (interactive app) : 
     - choose price range (legend), 
     - rate and city (widgets), 
     - buttons to reselect all, unselect all, refresh plot
     - automatic resizing of the plot according to min and max values
- 2 categorical scatter plots with linked axes (x: number of reviews, y: price range or rate)
     
#### Simple Plot

In [None]:
#Simple Bokeh plot: rank of the restaurant vs. number of reviews
plot = figure(plot_width=800, plot_height=600, x_axis_label='Rank of the restaurant',
              y_axis_label='Number of reviews', title='Global restaurants overview',
              tools='pan,box_zoom,wheel_zoom,tap,save,lasso_select')

#One data source per price range category, in order to have separate legend entries
#that can be hidden individually. The categories are listed explicitly: the order
#returned by unique() is the order of appearance in the data, which does not have
#to match the order of the sources and would attach the wrong label to a glyph.
price_ranges = ['$', '$$ - $$$', '$$$$', 'Unknown']
sources = [ColumnDataSource(dataset[dataset['Price Range'] == pr]) for pr in price_ranges]
source_low, source_mid, source_high, source_unknown = sources

#Separate glyphs per price range in order to use the legend hide function
for pr, color, source in zip(price_ranges, ['yellow', 'red', 'green', 'gray'], sources):
    plot.circle('Ranking', 'Number of Reviews', source=source, color=color, size=4, alpha=0.5, legend=pr)

#Hover tool that displays name, city, rate and rank over a point. It is added once
#for the whole plot instead of once per glyph.
hover = HoverTool(tooltips=[('Restaurant Name', '@Name'), ('City', '@City'),
                            ('Rate', '@Rating'), ('Rank', '@Ranking')])
plot.add_tools(hover)

#Legend customization: clicking a legend entry hides the matching glyph
plot.legend.location = 'top_right'
plot.legend.click_policy = "hide"

output_notebook()
show(plot, notebook_handle=True)

#### Bokeh App (Bokeh Server)
*The server disconnects when the entire dataset is used for the plot, but not when the dataset is sliced first, for example with `iloc[:50000]`. The message logged on disconnection is:*

INFO:bokeh.server.views.ws:WebSocket connection closed: code=None, reason=None

In [None]:
#Bokeh server app with interactive widgets to choose the cities and rates (checkboxes)
def make_document(document):
    """Populate *document* with the interactive global restaurants overview.

    The app plots the rank of each restaurant against its number of reviews,
    with one glyph per price range (a legend entry can be clicked to hide its
    glyph), two checkbox groups (cities and rates) and three buttons: select
    all, select none and refresh plot.

    Args:
        document: Bokeh document that receives the layout and the title; it is
            the argument passed to this function by the FunctionHandler.
    """
    #Maximum number of rows plotted: the Bokeh server disconnects when the entire
    #dataset is sent to the browser
    max_rows = 50000
    #Price range categories together with the color of their glyph
    price_colors = [('$', 'yellow'), ('$$ - $$$', 'red'), ('$$$$', 'green'), ('Unknown', 'gray')]

    #Load dataset (same relative path as the loading cell of this notebook, so the
    #app does not depend on one machine's absolute path)
    dataset = pd.read_csv('TA_restaurants_curated.csv', encoding='utf8', index_col=0)
    dataset['Rating'] = dataset['Rating'].fillna('?').astype(str)
    #Missing price ranges must be labelled 'Unknown': this is the value used below
    #to build the fourth data source
    dataset['Price Range'] = dataset['Price Range'].fillna('Unknown').astype(str)
    dataset['Number of Reviews'] = dataset['Number of Reviews'].fillna(0)

    #iloc returns a new frame, so the result has to be assigned to take effect
    dataset = dataset.iloc[:max_rows]

    #info() prints by itself and returns None
    dataset.info()

    def upper_bound(series, padding=0):
        """Return int(max of *series*) + *padding*, or None when there is no value."""
        top = series.max()
        if pd.isnull(top):
            return None
        return int(top) + padding

    #Plot figure (the `or 1` fallback only matters for an empty or all-NaN column)
    plot = figure(plot_width=900, plot_height=600, x_axis_label='Rank of the restaurant',
                  y_axis_label='Number of reviews',
                  x_range=(0, upper_bound(dataset['Ranking']) or 1),
                  y_range=(0, upper_bound(dataset['Number of Reviews'], 500) or 1),
                  title='Global restaurants overview',
                  tools='pan,box_zoom,wheel_zoom,tap,save,lasso_select')

    #One data source and one glyph per price range category, in order to have
    #separate legend entries and to be able to hide them
    sources = []
    for price_range, color in price_colors:
        source = ColumnDataSource(dataset[dataset['Price Range'] == price_range])
        sources.append((price_range, source))
        plot.circle('Ranking', 'Number of Reviews', source=source, color=color, size=4, alpha=0.5,
                    legend='Price Range')

    #Hover tool that displays name, city, rate and rank over a point (added once)
    hover = HoverTool(tooltips=[('Restaurant Name', '@Name'), ('City', '@City'),
                                ('Rate', '@Rating'), ('Rank', '@Ranking')])
    plot.add_tools(hover)

    #Legend customization
    plot.legend.location = 'top_right'
    plot.legend.orientation = "horizontal"
    plot.legend.click_policy = "hide"

    #Checkbox objects added to the layout (nothing is selected at start)
    cities_list = dataset['City'].unique().tolist()
    rates_list = dataset['Rating'].unique().astype(str).tolist()
    checkbox_cities = CheckboxGroup(labels=cities_list, active=[])
    checkbox_rates = CheckboxGroup(labels=rates_list, active=[])
    #Select all, select none and refresh buttons
    button_all = Button(label='Select all parameters')
    button_refresh = Button(label='Refresh plot')
    button_none = Button(label='Select none')

    #Callback functions
    def select_all():
        """Tick every city and every rate checkbox."""
        checkbox_cities.active = list(range(len(cities_list)))
        checkbox_rates.active = list(range(len(rates_list)))
    button_all.on_click(select_all)

    def select_none():
        """Untick every city and every rate checkbox."""
        checkbox_cities.active = []
        checkbox_rates.active = []
    button_none.on_click(select_none)

    def refresh():
        """Redraw the glyphs with the restaurants matching the ticked checkboxes."""
        #Get choices from checkboxes (`active` holds the indexes of the ticked boxes)
        cities_choice = [cities_list[i] for i in checkbox_cities.active]
        rates_choice = [rates_list[i] for i in checkbox_rates.active]
        logging.info('Refreshing plot for cities %s and rates %s', cities_choice, rates_choice)

        #Update the sources of the glyphs by filtering the dataset according to the
        #cities and rates selected; the glyphs follow their source, so the figure
        #itself does not have to be rebuilt
        selected = dataset[dataset['City'].isin(cities_choice) & dataset['Rating'].isin(rates_choice)]
        for price_range, source in sources:
            subset = selected[selected['Price Range'] == price_range]
            source.data = dict(ColumnDataSource(subset).data)

        #Resize the axes to the selection; the current ranges are kept when the
        #selection is empty
        x_end = upper_bound(selected['Ranking'])
        y_end = upper_bound(selected['Number of Reviews'], 500)
        if x_end:
            plot.x_range.start = 0
            plot.x_range.end = x_end
        if y_end:
            plot.y_range.start = 0
            plot.y_range.end = y_end
    button_refresh.on_click(refresh)

    #Layout: cities | rates | (buttons above the plot)
    layout = row(widgetbox(checkbox_cities, width=100), widgetbox(checkbox_rates, width=60),
                 column(row(widgetbox(button_all), widgetbox(button_none), widgetbox(button_refresh)), plot))
    document.add_root(layout)
    document.title = "Global Restaurants Information Visualization"

#Serve the overview application at the root URL
apps = {}
apps['/'] = Application(FunctionHandler(make_document))
server = Server(apps, port=5001)
#Once started, open a browser at address "localhost:50xx"
server.start()

#### 2 Categorical scatter plots with axes linked:
- x: number of reviews, y: price range
- x: number of reviews, y: rate

In [None]:
#Both plots read the same source
global_source = ColumnDataSource(dataset)
#Tooltip definition shared by the hover tools of both plots
tooltips = [('Restaurant Name', '@Name'), ('City', '@City'),
            ('Rate', '@Rating'), ('Rank', '@Ranking')]

#First plot: number of reviews per price range. The categories are sorted so that
#the axis order does not depend on the order of appearance in the dataset.
price_ranges = sorted(dataset['Price Range'].unique().tolist())
plot2 = figure(plot_width=800, plot_height=400, x_axis_label='Number of Reviews',
               y_axis_label='Price Range', y_range=price_ranges,
               title='Global restaurants overview by Number of reviews and Price Range',
               tools='pan,box_zoom,wheel_zoom,tap,save,lasso_select')
#jitter spreads the points around their category line to limit overplotting
plot2.circle(x='Number of Reviews', y=jitter('Price Range', width=0.6, range=plot2.y_range),
             source=global_source, size=4, alpha=0.5)
#Each figure gets its own hover tool instance instead of sharing a single one
hover = HoverTool(tooltips=tooltips)
plot2.add_tools(hover)

#Second plot: number of reviews per rate (sorted categories, as for the bar charts)
rates = sorted(dataset['Rating'].unique().astype(str).tolist())
plot3 = figure(plot_width=800, plot_height=400, x_axis_label='Number of Reviews',
               y_axis_label='Rate', y_range=rates,
               title='Global restaurants overview by Number of Reviews and Rate',
               tools='pan,box_zoom,wheel_zoom,tap,save,lasso_select')
plot3.circle(x='Number of Reviews', y=jitter('Rating', width=0.6, range=plot3.y_range),
             source=global_source, size=4, alpha=0.5)
plot3.add_tools(HoverTool(tooltips=tooltips))

#Linking of the x axes: both plots use the same range object, so panning or
#zooming one plot moves the other one as well
plot2.x_range = plot3.x_range
layout2 = column(plot2, plot3)

show(layout2)

### Aggregated by city
- aggregate the dataset with a different method for each column (count, sum, average, etc.)
    - name: count 
    - number of reviews: sum, average, max (min is 0)
    - cuisine_styles: custom_count (dict object), 
    - price range & rate as index (categories)
    - rank, cuisine styles: no aggregation
    
        
- Interactive plot:
    - widget to choose the city to display
    - repartitions according to price range & rate
- Interactive map: for each city: display info by hovering
    http://nbviewer.jupyter.org/github/bokeh/bokeh-notebooks/blob/master/tutorial/09%20-%20Geographic%20Plots.ipynb
    
#### Aggregation of the dataset per city


In [None]:
#Aggregated dataset creation using a dict aggregation rule:
#number of rows per city, and sum / mean / max of the number of reviews
agg_dict = {'City': 'count', 'Number of Reviews': ['sum', 'mean', 'max']}
agg_dataset = dataset.groupby('City').agg(agg_dict)

#Number of restaurants per (city, price range) and per (city, rate), one column per
#category. Only the 'Name' column is counted instead of counting every column and
#keeping one; fill_value=0 gives a city without restaurant in a category a count
#of 0 instead of NaN.
pr_df = pd.DataFrame(
    {'Price Range': dataset.groupby(['City', 'Price Range'])['Name'].count()}
).unstack(fill_value=0)
rate_df = pd.DataFrame(
    {'Rate': dataset.groupby(['City', 'Rating'])['Name'].count()}
).unstack(fill_value=0)

#Side by side concatenation: all three frames are indexed by city
agg_dataset = pd.concat([agg_dataset, pr_df, rate_df], axis=1)

print(agg_dataset.head())
#info() prints by itself and returns None, so it must not be wrapped in print()
agg_dataset.info()

#### Visualization of aggregated data for each city (Bokeh app)
- total number of restaurants and reviews
- bar chart for price range repartition
- bar chart for rate repartition
- dropdown menu for city selection

In [None]:
#Price range categories, in the order of the aggregated dataset columns
price_ranges_bars = ['$', '$$ - $$$', '$$$$', 'Unknown']
#Rate categories in sorted order. '?' (missing rate) sorts after the digits and
#the minus sign, so it ends up last without assuming where it sits in unique().
rates = sorted(dataset['Rating'].unique().tolist())
#Flat (non multi-index) labels for the columns of the aggregated dataset
flat_column_labels = ['Total_Restaurants', 'Sum_Reviews', 'Mean_Reviews', 'Max_Reviews']+price_ranges_bars+rates

def make_bars(bars):
    """Populate *bars* with the per-city bar charts application.

    The app shows, for the city chosen in a dropdown menu, the total number of
    restaurants and of reviews (text) and two bar charts: the number of
    restaurants per price range and per rate.

    Args:
        bars: Bokeh document that receives the layout and the title; it is the
            argument passed to this function by the FunctionHandler.
    """
    cities = agg_dataset.index.tolist()
    #Default city is Krakow; fall back to the first city when it is not in the data
    default_city = 'Krakow' if 'Krakow' in cities else cities[0]

    def city_frame(city):
        """Return the one-row aggregated frame of *city* with flat column labels."""
        frame = agg_dataset.loc[[city]].copy()
        frame.columns = flat_column_labels
        return frame

    def restaurants_text(frame):
        """Return the sentence giving the total number of restaurants of *frame*."""
        return 'Total number of restaurants: {}'.format(frame['Total_Restaurants'].values[0])

    def reviews_text(frame):
        """Return the sentence giving the total number of reviews of *frame*."""
        return 'Total number of reviews: {}'.format(frame['Sum_Reviews'].values[0])

    source_df = city_frame(default_city)
    source_city = ColumnDataSource(source_df)

    #Bar chart with Price Range vbar glyphs; bars are centered at 0.5, 1.5, ...
    plot_pr = figure(plot_width=700, plot_height=300, title="Restaurants repartition per price range",
                     x_range=price_ranges_bars, x_axis_label="Price Range")
    for position, (pr, color) in enumerate(zip(price_ranges_bars, ['yellow', 'red', 'green', 'gray'])):
        plot_pr.vbar(x=position + 0.5, top=pr, bottom=0, source=source_city, width=1,
                     fill_color=color, legend=pr)

    #Bar chart with Rate vbar glyphs, one palette color per rate
    plot_rates = figure(plot_width=700, plot_height=300, title="Restaurants repartition per rate",
                        x_range=rates, x_axis_label="Rate")
    for position, (rate, color) in enumerate(zip(rates, Spectral11)):
        plot_rates.vbar(x=position + 0.5, top=rate, source=source_city, width=1, legend=rate, color=color)
    plot_rates.legend.orientation = 'horizontal'
    plot_rates.legend.location = 'top_left'

    #Text widgets with total number of restaurants & number of reviews
    text_info = Paragraph(text=restaurants_text(source_df), width=200, height=150)
    text_info2 = Paragraph(text=reviews_text(source_df), width=200, height=150)

    #Dropdown menu for city choice
    menu = Select(options=cities, value=default_city,
                  title="City to display", width=200)

    #Callback function that updates the bars and the texts with the city of the menu
    def update_city(attr, old, new):
        frame = city_frame(new)
        source_city.data = dict(ColumnDataSource(frame).data)
        #The totals have to follow the selected city as well
        text_info.text = restaurants_text(frame)
        text_info2.text = reviews_text(frame)
    menu.on_change('value', update_city)

    #Global layout: menu and texts on the left, bar charts on the right
    layout = row(column(widgetbox(menu), text_info, text_info2),
                 column(plot_pr, plot_rates))

    bars.add_root(layout)
    bars.title = "Restaurants repartition by price range and rates per city"

#Serve the per-city bar charts application at the root URL, on its own port
apps = {}
apps['/'] = Application(FunctionHandler(make_bars))
server = Server(apps, port=5005)
server.start()

#### Visualization on map
http://nbviewer.jupyter.org/github/bokeh/bokeh-notebooks/blob/master/tutorial/09%20-%20Geographic%20Plots.ipynb

In [None]:
#Set map background, in web mercator coordinates.
#Lisbon: -1.010781e+06, 4.640350e+06  | Helsinki: 2.734619e+06, 8.422571e+06
x_range, y_range = (-1100000, 2700000), (4600000, 8500000)
Europe = (x_range, y_range)
#The figure is not named `map`, which would shadow the Python builtin
europe_map = figure(tools='pan,wheel_zoom', x_range=x_range, y_range=y_range)
europe_map.axis.visible = False
url = 'http://a.basemaps.cartocdn.com/pitney-bowes-streets/{Z}/{X}/{Y}.png'
attribution = "Tiles by Carto, under CC BY 3.0. Data by OSM, under ODbL"
europe_map.add_tile(WMTSTileSource(url=url, attribution=attribution))

#Add longitude/latitude of each city to the dataframe (Wikipedia). The values are
#listed in the order of the cities of the aggregated dataset index.
#NOTE(review): several values look like degrees.minutes(seconds) written with a
#decimal point (e.g. 50.50, 53.33) rather than decimal degrees - verify against
#the source, as this would shift the points slightly on the map.
coord_df = pd.DataFrame(dict(lat=[52.23, 37.58, 41.2257, 52.5178,48.0841, 50.50,47.2954, 55.4124,53.2036,
                      55.5717,46.1200,53.33,60.1024,50.04,38.43,46.0305,51.3026,49.3636, 45.4535,
                      40.26,45.28,48.09,41.08,59.5440,48.5124,50.0516,41.5319,59.1946,48.1230,
                      52.1356,47.2240],
                  lon=[4.54, 23.43, 2.1037, 13.4056,17.0646,4.21, 19.0227,12.3509,-6.1603,
                      -3.1206,6.09,10,24.5655,19.57,-9.08,14.3021,-0.0739,6.08,4.5032,
                      -3.41,9.10,11.3430,-8.4000,10.4510,2.2107,14.2514,12.2912,18.0407,16.2223,
                      21.0003,8.3228]),
                        index=agg_dataset.index)
map_df = pd.concat([agg_dataset, coord_df], axis=1)
map_df.columns = flat_column_labels + ['lat', 'lon']  #non multi-index for map hover tool
#The '@City' tooltip reads the column built from the index, so the index is named
#explicitly instead of relying on the name kept (or dropped) by concat
map_df.index.name = 'City'

#Convert decimal coordinates into web mercator system (earth radius in meters)
earth_radius = 6378137
map_df["x"] = map_df['lon'] * (earth_radius * np.pi/180.0)
map_df["y"] = np.log(np.tan((90 + map_df['lat']) * np.pi/360.0)) * earth_radius

#Add one glyph per city and a hover tool with the aggregated figures
source_map = ColumnDataSource(map_df)
europe_map.circle('x', 'y', source=source_map, color='green', size=10)
hover = HoverTool(tooltips=[('City', '@City' ), ('Number of restaurants:', '@Total_Restaurants'),
                            ('Total Reviews', '@Sum_Reviews')])
europe_map.add_tools(hover)

show(europe_map)