In [45]:
import numpy as np
import pandas as pd

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import Arrow, NormalHead, Label, HoverTool, CustomJS, ColumnDataSource, VBar

# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

**Grab and format data**

In [2]:
# Load the benchmark table and order it chronologically.
datapd = pd.read_csv("master_df_bloom_figure.csv")

datapd = datapd.sort_values(by=["year"])
datapd = datapd.dropna()  # discard rows with any missing value

# Drop the incomplete 2016 data by value rather than by position: slicing off
# only the last row (`[:-1]`) leaves any other 2016 rows in place, and removes
# a valid row when the file holds no 2016 entry at all.
datapd = datapd[datapd["year"] != 2016]

**Group data and grab boxplot quantities**

In [3]:
# Group the rows by (year, type); the following cells derive every per-group
# statistic (quantiles, mean, std, outliers) from this GroupBy object.
gb = datapd.groupby(["year", "type"])

In [4]:
# Quartiles of every (year, type) group: q1 = 25%, q2 = median, q3 = 75%.
q1, q2, q3 = (gb.quantile(q=level) for level in (0.25, 0.5, 0.75))

# Fences sit 1.5 interquartile ranges beyond the box edges.
iqr = q3 - q1
reach = 1.5 * iqr
upper = q3 + reach
lower = q1 - reach

In [5]:
# finding outliers

def outliers(group):
    """Return the `gpw` values of one group that fall outside its fences.

    `group.name` is the group's key; it selects that group's limits from the
    module-level `upper` and `lower` tables. A value is kept when it is
    strictly above the upper limit or strictly below the lower limit.
    """
    key = group.name
    high = upper.loc[key]['gpw']
    low = lower.loc[key]['gpw']
    gpw = group.gpw
    beyond = (gpw > high) | (gpw < low)
    return group[beyond]['gpw']

out = gb.apply(outliers).dropna()

# Prepare outlier data for plotting: we need one (x, y) coordinate for
# every outlier.

# Sort explicitly: set iteration order is an implementation detail, and the
# plotting cell pairs `years` positionally with per-year group statistics.
years = sorted(set(datapd.year))

# Always define the coordinate lists, so they exist even when there are no
# outliers at all.
outx_sc, outy_sc = [], []
outx_nv, outy_nv = [], []
if out.size > 0:
    # The first two index levels of `out` are the group keys (year, type).
    # Walking the entries directly only ever touches outliers that exist;
    # looking up `out.loc[year][type]` raises KeyError for any year/type
    # combination that produced no outliers.
    for key, value in out.items():
        year, kind = key[0], key[1]
        if kind == "supercomputer":
            outx_sc.append(year)
            outy_sc.append(value)
        elif kind == "NVIDIA GPU":
            outx_nv.append(year)
            outy_nv.append(value)

**Setting up Bokeh plot**

In [99]:
# Create figure: save tool only, log-scaled y axis, fixed axis ranges.
p = figure(
    tools="save",
    title="",
    background_fill_color="#EFE8E2",
    x_range=(2006, 2016),
    y_range=(0.01, 15**5),
    y_axis_type="log",
    plot_width=800,
    plot_height=500,
)

# The two datasets share each year's slot: one is shifted left, one right.
years1 = [yr - .25 for yr in years]
years2 = [yr + .25 for yr in years]

# Clamp each fence to the group's observed extremes, so that a stem is never
# drawn past the group's actual minimum or maximum.
qmin = gb.quantile(q=0.00)
qmax = gb.quantile(q=1.00)
upper.gpw = [min(peak, fence)
             for peak, fence in zip(qmax.loc[:, 'gpw'], upper.gpw)]
lower.gpw = [max(trough, fence)
             for trough, fence in zip(qmin.loc[:, 'gpw'], lower.gpw)]

# stems: vertical lines from each box edge (q3 / q1) out to the matching fence
def _draw_stems(xs, label):
    """Draw the upper and then the lower stems of dataset `label` at `xs`."""
    for fence, quartile in ((upper, q3), (lower, q1)):
        p.segment(xs, fence.gpw[:, label],
                  xs, quartile.gpw[:, label],
                  line_color="black", legend=label)


_draw_stems(years1, "NVIDIA GPU")
_draw_stems(years2, "supercomputer")

# boxes
# Per-group mean and standard deviation feed the hover columns `m` and `s`.
group_mean = gb.mean().gpw
group_std = gb.std().gpw

source1 = ColumnDataSource(data={
    "m": group_mean[:, "NVIDIA GPU"],
    "s": group_std[:, "NVIDIA GPU"],
    "x1": years1,
    "bottom1": q1.gpw[:, "NVIDIA GPU"],
    "top1": q3.gpw[:, "NVIDIA GPU"],
})
source2 = ColumnDataSource(data={
    "m": group_mean[:, "supercomputer"],
    "s": group_std[:, "supercomputer"],
    "x2": years2,
    "bottom2": q1.gpw[:, "supercomputer"],
    "top2": q3.gpw[:, "supercomputer"],
})

# Each box spans q1..q3; the median line across it is 0.35 wide like the box
# (0.175 on either side of the box centre).
b1 = p.vbar(x="x1", bottom="bottom1", top="top1", width=0.35,
            source=source1, fill_color="#E08E79", line_color="black",
            hover_alpha=0.2, legend="NVIDIA GPU")
median_nv = q2.gpw[:, "NVIDIA GPU"]
p.segment([x - .175 for x in years1], median_nv,
          [x + .175 for x in years1], median_nv,
          line_color="black", legend="NVIDIA GPU")

b2 = p.vbar(x="x2", bottom="bottom2", top="top2", width=0.35,
            source=source2, fill_color="#3B8686", line_color="black",
            hover_alpha=0.2, legend="supercomputer")
median_sc = q2.gpw[:, "supercomputer"]
p.segment([x - .175 for x in years2], median_sc,
          [x + .175 for x in years2], median_sc,
          line_color="black", legend="supercomputer")



# whiskers: short horizontal caps drawn at both fences of every box
wyears11 = [x - .125 for x in years1]
wyears12 = [x + .125 for x in years1]
wyears21 = [x - .125 for x in years2]
wyears22 = [x + .125 for x in years2]


def _draw_caps(left, right, label):
    """Draw the lower and then the upper cap of dataset `label`."""
    for fence in (lower, upper):
        level = fence.gpw[:, label]
        p.segment(left, level, right, level,
                  line_color="black", legend=label)


_draw_caps(wyears11, wyears12, "NVIDIA GPU")
_draw_caps(wyears21, wyears22, "supercomputer")


# outliers: drawn as black diamonds, shifted sideways by a quarter year
# (right for supercomputers, left for GPUs)
if out.size > 0:
    outlier_sets = (
        (outx_sc, outy_sc, .25, "supercomputer"),
        (outx_nv, outy_nv, -.25, "NVIDIA GPU"),
    )
    for xs, ys, shift, label in outlier_sets:
        p.diamond([x + shift for x in xs], ys, size=6, color="black",
                  fill_alpha=1, line_alpha=0, legend=label)

# add human line segment: a dashed horizontal reference across the x range
human_level = 10**5
p.segment([2006], [human_level], [2016], [human_level],
          line_dash="dashed", line_color="black", legend="human")

# Configure the legend: fixed position, no border, transparent background.
# (A bare `p.legend` expression statement has no effect, so it was removed.)
p.legend.location = (50,225)
p.legend.border_line_width = 0
p.legend.background_fill_alpha = 0

# add TrueNorth, GTPU points
p.circle(2015.5, 657.14, color="red", size=10)
p.diamond(2015.75, 100, color="blue", size=15)


def _add_arrow(start, end):
    """Add a semi-transparent arrow running from `start` to `end` (x, y)."""
    p.add_layout(Arrow(end=NormalHead(line_width=2, size=5),
                       x_start=start[0], y_start=start[1],
                       x_end=end[0], y_end=end[1],
                       line_width=2, line_alpha=0.5))


# Each arrow starts near one of the text labels below and ends next to one of
# the points above.
_add_arrow((2013.5, 6e3), (2015.4, 690))
_add_arrow((2014, 1.2e2), (2015.65, 1e2))

TNlabel = Label(x=2012.5, y=1e4, text="IBM TrueNorth")
GTPUlabel = Label(x=2012, y=1.5e2, text="Google Tensor Processing Unit")
for annotation in (TNlabel, GTPUlabel):
    p.add_layout(annotation)

In [101]:
# set up global figure attributes

# Grid: horizontal white lines only.
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = "white"
p.grid.grid_line_width = 2

# One x tick per plotted year.
p.xaxis.ticker = list(range(2007, 2016))
p.xaxis.major_label_text_font_size = "12pt"

# Both axis titles share the same font settings.
for axis, title in ((p.xaxis, "Year"), (p.yaxis, "GFlops per Watt")):
    axis.axis_label = title
    axis.axis_label_text_font_size = "15pt"
    axis.axis_label_text_font_style = "bold"

# add a little bit of dynamic interaction

# Clicking a legend entry hides or shows the glyphs of that dataset.
p.legend.click_policy = "hide"

# Hovering over a box shows the mean and standard deviation of its
# distribution (the `m` and `s` columns of the box data sources).
hover = HoverTool(tooltips=[("μ", "@m"), ("σ", "@s")],
                  renderers=[b1, b2])
p.add_tools(hover)

show(p)

The dynamic interactions add some utility to the plot. Being able to hide the *supercomputer* or *GPU* boxplots on demand lets you focus on each trend individually. The hover tool's display of the mean and standard deviation of each boxplot's distribution provides additional information, which is particularly helpful for boxplots where a large number of outliers obscures the underlying information being presented.

In truth, for this particular plot, these dynamic interactions don't add a great deal to the presentation. However, they are powerful tools, and I think this example is at least illustrative of how they might be used for other figures where they are more natural (e.g., denser data sets).