# Setup and load data

In [5]:
# %% libraries
import pandas as pd
import numpy as np

from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, CategoricalColorMapper
from bokeh.palettes import Category10_5, Category20_16

In [7]:
# %% load data
# Both CSV files are read directly from their raw GitHub URLs.
url_map = "https://raw.githubusercontent.com/WillKoehrsen/Bokeh-Python-Visualization/master/bokeh_app/data/flights_map.csv"
url_flight = "https://raw.githubusercontent.com/WillKoehrsen/Bokeh-Python-Visualization/master/bokeh_app/data/flights.csv"

# Read in the same order as before: url_map first, then url_flight
carrier_names, flights = (pd.read_csv(url) for url in (url_map, url_flight))

In [8]:
# %% look at a single variable: arrival delay
# Summary statistics (count, mean, std, min, quartiles, max) for arr_delay
flights.loc[:, 'arr_delay'].describe()

count    327346.000000
mean          6.895377
std          44.633292
min         -86.000000
25%         -17.000000
50%          -5.000000
75%          14.000000
max        1272.000000
Name: arr_delay, dtype: float64

# Histogram
- The first graph we will make is a simple histogram of the arrival delay. We will consider all airlines on the same plot.

## Data for plotting

In [9]:
# Histogram of arrival delays limited to [-120, 120] minutes, i.e. [-2, +2] hours,
# split into 240 / 5 = 48 bins of five minutes each
arr_hist, edges = np.histogram(
    flights['arr_delay'],
    bins=240 // 5,
    range=(-120, 120),
)

In [14]:
# Number of flights counted in each five-minute bin (one entry per bin)
arr_hist

array([    0,     0,     0,     0,     0,     0,     1,     0,     1,
           7,    48,   142,   276,   636,  1394,  2820,  5339,  9420,
       15551, 23546, 31319, 34857, 33790, 29786, 24391, 19028, 14894,
       11324,  8815,  7159,  5908,  4896,  4130,  3630,  3139,  2782,
        2455,  2179,  1973,  1805,  1646,  1454,  1360,  1286,  1167,
        1013,   945,  1000], dtype=int64)

In [15]:
# Bin boundaries in minutes; there is one more edge than there are bins
edges

array([-120., -115., -110., -105., -100.,  -95.,  -90.,  -85.,  -80.,
        -75.,  -70.,  -65.,  -60.,  -55.,  -50.,  -45.,  -40.,  -35.,
        -30.,  -25.,  -20.,  -15.,  -10.,   -5.,    0.,    5.,   10.,
         15.,   20.,   25.,   30.,   35.,   40.,   45.,   50.,   55.,
         60.,   65.,   70.,   75.,   80.,   85.,   90.,   95.,  100.,
        105.,  110.,  115.,  120.])

In [17]:
# Set up the figure.
# `width`/`height` are used instead of `plot_width`/`plot_height`: the latter
# were deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width = 500, height = 500, title = 'Histogram of Arrival Delays',
           x_axis_label = 'Minutes', y_axis_label = 'Count')

# Add a quad glyph: one rectangle per bin, from its left edge to its right edge,
# rising from 0 to the bin's count
p.quad(bottom=0, top=arr_hist, left=edges[:-1], right=edges[1:], fill_color='red', line_color='black')

# To show in notebook
output_notebook()

# Show the plot
show(p)

# Add basic style

In [18]:
def style(p, title_font_size='18pt', label_font_size='12pt'):
    """Apply the notebook's basic styling to a figure.

    Centers the title and sets the font size of the title, the axis labels
    and the major tick labels on both axes. The figure is modified in place.

    Parameters
    ----------
    p
        Object exposing ``title``, ``xaxis`` and ``yaxis`` attributes
        (a Bokeh figure in this notebook).
    title_font_size : str, default '18pt'
        Font size assigned to the title text.
    label_font_size : str, default '12pt'
        Font size assigned to axis labels and major tick labels on both axes.

    Returns
    -------
    The same object ``p`` that was passed in.
    """
    p.title.align = 'center'
    p.title.text_font_size = title_font_size

    # Both axes get identical label and tick-label sizes
    for axis in (p.xaxis, p.yaxis):
        axis.axis_label_text_font_size = label_font_size
        axis.major_label_text_font_size = label_font_size

    return p

In [19]:
# style() modifies the figure in place and returns it, so styled_p is the same object as p
styled_p = style(p)

# Render the figure again, now with the styling applied
show(styled_p)

# Column data source

In [20]:
# Build the plotting table: one row per bin holding its count and edges,
# plus pre-formatted text versions of the count and the bin interval
arr_df = pd.DataFrame(
    {'count': arr_hist, 'left': edges[:-1], 'right': edges[1:]}
).assign(
    f_count=lambda bins: ['%d flights' % n for n in bins['count']],
    f_interval=lambda bins: [
        '%d to %d minutes' % bounds for bounds in zip(bins['left'], bins['right'])
    ],
)

arr_df.head()

Unnamed: 0,count,left,right,f_count,f_interval
0,0,-120.0,-115.0,0 flights,-120 to -115 minutes
1,0,-115.0,-110.0,0 flights,-115 to -110 minutes
2,0,-110.0,-105.0,0 flights,-110 to -105 minutes
3,0,-105.0,-100.0,0 flights,-105 to -100 minutes
4,0,-100.0,-95.0,0 flights,-100 to -95 minutes


In [23]:
# Create a ColumnDataSource object from the histogram DataFrame
arr_src = ColumnDataSource(arr_df)
# List the columns held by the source; the DataFrame index shows up as an 'index' column
arr_src.data.keys()

dict_keys(['index', 'count', 'left', 'right', 'f_count', 'f_interval'])

ColumnDataSource is the object where the data of a Bokeh graph is stored. 

You can choose not to use a ColumnDataSource and feed your graph directly with Python dictionaries, pandas DataFrames, etc. However, certain features require one: for example, a popup window that shows data when the user hovers the mouse over a glyph can only get its data from a ColumnDataSource. Another use case is streaming data.

You can create a ColumnDataSource from dictionaries and pandas dataframes and then use the ColumnDataSource to create the glyphs.

# Add in tooltips on Hover

In [24]:
# Set up the figure same as before.
# `width`/`height` are used instead of `plot_width`/`plot_height`: the latter
# were deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width = 500, height = 500, title = 'Histogram of Arrival Delays',
           x_axis_label = 'Minutes', y_axis_label = 'Count')

# Add a quad glyph with source this time: the string arguments are column
# names looked up in the ColumnDataSource (CDS)
p.quad(bottom=0, top='count', left='left', right='right', source=arr_src,
       fill_color='red', line_color='black')

# Add style to the plot (style() returns the same figure it was given)
styled_p = style(p)

# Add a hover tool referring to the formatted columns:
# each tooltip is a (label, '@column_name') pair
hover = HoverTool(tooltips = [('Delay', '@f_interval'),
                              ('Count', '@f_count')])

# Add the hover tool to the graph
styled_p.add_tools(hover)

# Show the plot
show(styled_p)


# Some data cleaning which is not important for Bokeh visualisation

In [26]:
# NOTE: this cell overwrites `flights`, so it is not idempotent; re-run the
# load-data cell before running it a second time.

# Count flights per carrier, most common first
carrier_nums = flights.groupby('carrier')['year'].count().sort_values(ascending=False)

# Subset to the 5 most common carriers
flights = flights[flights['carrier'].isin(carrier_nums.index[:5])]

# Subset to only [-2, +2] hour delays (both bounds inclusive; rows with a
# missing arr_delay are dropped because NaN never satisfies the comparison)
flights = flights[flights['arr_delay'].between(-120, 120)]

# Attach the actual carrier names. `carrier_names` was already read from
# url_map in the load-data cell, so it is not downloaded a second time here.
# NOTE(review): the recorded output of this cell shows suffixed columns such as
# `arr_delay_x` and `Unnamed: 0_x`, which means the frame read from url_map
# shares column names with `flights` -- confirm url_map points at the
# carrier-name lookup table.
flights = flights.merge(carrier_names, how = 'left', on = 'carrier')
flights.head()

Unnamed: 0,Unnamed: 0_x,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay_x,...,distance.2,distance.3,air_speed (mph),air_speed (mph).1,air_speed (mph).2,air_speed (mph).3,start_long,start_lati,end_long,end_lati
0,0,2013,1,1,517.0,515,2.0,830.0,819,11.0,...,,,,,,,,,,
1,1,2013,1,1,533.0,529,4.0,850.0,830,20.0,...,,,,,,,,,,
2,2,2013,1,1,542.0,540,2.0,923.0,850,33.0,...,,,,,,,,,,
3,3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,...,,,,,,,,,,
4,4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,...,,,,,,,,,,
