YouTube video for this notebook:

https://youtu.be/AmHz31GrvkA

In [None]:
import numpy as np
import pandas as pd

import holoviews as hv
import hvplot.pandas

# Fixed colour per species, reused by the scatter and histogram plots below.
colors = dict([
    ('Adelie Penguin', '#1f77b4'),
    ('Gentoo penguin', '#ff7f0e'),
    ('Chinstrap penguin', '#2ca02c'),
])

# Scalable Cross-filtering Dashboards with Panel, HoloViews, and hvPlot

### PyData Global 2020
### Philipp Rudiger and James A. Bednar
### Anaconda Inc.

## What is cross-filtering?

- Selecting data in one plot and having the selection reflected in other plots
- Also known as "linked brushing"
- Helps you understand multidimensional datasets by showing how different views of the same data relate to one another

<h1><b style="font-size: 0.6em">Scalable</b> Cross-filtering <b style="font-size: 0.6em;">Dashboards</b> with <b style="font-size: 0.6em">Panel,</b> HoloViews and hvPlot</h1>

In this first example for building interactive plots we start with 3 main libraries:
    
1. **Pandas**: To load and manipulate the data
2. **HoloViews**: To link selections between plots automatically
3. **hvPlot**: To quickly generate plots using a simple and familiar API

<div style="display: table; vertical-align: middle;">
<img src="./pandas-logo-300.png" style="float:left; white-space: nowrap;" width="200px"></img>
<img src="./holoviews.jpg"  style="float:left; white-space: nowrap;" width="210px"></img>
<img src="./hvplot-wm.png"  style="float:left; white-space: nowrap;" width="120px"></img>
</div>

## Building some plots

Let us first load the Palmer penguin dataset ([Gorman et al.](https://allisonhorst.github.io/palmerpenguins/)), which contains measurements about a number of penguin species:

In [None]:
# Read the penguin measurements from the relative ../data directory and
# preview the first three rows.
penguins = pd.read_csv(filepath_or_buffer='../data/penguins.csv')
penguins.head(n=3)

## Building some plots

In [None]:
# Culmen length against culmen depth, coloured per species using `colors`.
scatter = penguins.hvplot.scatter(
    x='Culmen Length (mm)',
    y='Culmen Depth (mm)',
    c='Species',
    cmap=colors,
    frame_width=400,
)
scatter

## Building some plots

In [None]:
# Body-mass histogram split by species; the legend is hidden and each species
# is coloured through the same `colors` mapping as the scatter plot.
histogram = penguins.hvplot.hist(
    y='Body Mass (g)',
    by='Species',
    color=hv.dim('Species').categorize(colors),
    legend=False,
    alpha=0.5,
    frame_width=400,
)
histogram

## Link selections

In [None]:
# Keep a handle on the link_selections instance so the shared selection state
# stays reachable after the layout has been displayed.
ls1 = hv.link_selections.instance()

# Apply the instance created above; calling hv.link_selections(...) directly
# would build a second, anonymous instance and leave `ls1` unused.
ls1(histogram + scatter)

## Other plot types

In [None]:
# Disabled bar chart of individuals per species, kept for reference:
# bars = (penguins.hvplot.bar('Species', 'Individual ID', c='Species', cmap=colors)
#         .aggregate(function=np.count_nonzero))

# Flipper-length violins grouped by species and sex, split on sex.
violin = penguins.hvplot.violin(
    'Flipper Length (mm)', by=['Species', 'Sex'], cmap='Category20'
).opts(split='Sex')

# NOTE(review): `histogram` appears twice in this layout, presumably filling
# the slot of the disabled bar chart above -- confirm this is intended.
(
    hv.link_selections(scatter.opts(show_legend=False) + histogram + histogram + violin)
    .cols(2)
)

## Other data types

In [None]:
import xarray as xr

# Load the dataset in this cell so it can be displayed on a fresh kernel;
# previously `temp` was only defined by a later cell, so Restart & Run All
# raised NameError here.
temp = xr.tutorial.load_dataset('air_temperature')
temp

In [None]:
import hvplot.xarray
import xarray as xr
# Load the xarray tutorial air-temperature dataset.
temp = xr.tutorial.load_dataset('air_temperature')

# Rasterized lon/lat quadmesh. An alternative points view is kept for reference:
# qmesh = temp.hvplot.points(x='time', y='lat')
qmesh = temp.hvplot.quadmesh(x='lon', y='lat', rasterize=True)

# Histogram of the `air` variable, with a widget over the `time` dimension.
hist = temp.air.hvplot.hist(y='air', by=[], groupby='time')

# Stack the two linked views in a single column.
hv.link_selections(qmesh + hist).cols(1)

<h1><i>Scalable</i> Cross-filtering <b style="font-size: 0.6em;">Dashboards</b> with <b style="font-size: 0.6em">Panel,</b> HoloViews and hvPlot</h1>

* **Dask**: Scale your pandas DataFrame up and out to multiple cores or a whole cluster
* **RAPIDS cuDF**: Move the computation to a GPU
* **Datashader**: Render large datasets outside the browser

<img src="diagram.png" width="80%"></img>

In [None]:
import dask.dataframe as ddf
import panel as pn

# Backend selector: 'dask' | 'pandas' | 'cudf'
library = 'pandas'


def load_data(path, library):
    """Load a parquet dataset with dask and return it for the chosen backend.

    Parameters
    ----------
    path : path-like
        Location handed to ``dask.dataframe.read_parquet`` (fastparquet engine).
    library : str
        ``'dask'`` returns the persisted dask dataframe unchanged; any other
        value computes it into an in-memory dataframe; ``'cudf'`` additionally
        casts three integer columns to int32 and converts the result with
        ``cudf.from_pandas``.

    Returns
    -------
    The dataframe in the representation selected by ``library``.
    """
    frame = ddf.read_parquet(path, engine='fastparquet').persist()

    if library == 'dask':
        return frame

    # Every non-dask backend starts from the fully computed dataframe.
    frame = frame.compute()
    if library != 'cudf':
        return frame

    # GPU-only dependencies are imported lazily so the other backends do not need them.
    import cudf
    import hvplot.cudf

    # NOTE(review): presumably the int32 casts are needed before handing the
    # frame to cuDF -- confirm.
    for column in ('passenger_count', 'pickup_hour', 'dropoff_hour'):
        frame[column] = frame[column].astype('int32')
    return cudf.from_pandas(frame)

# Default sizing mode for every Panel component created from here on.
pn.config.sizing_mode = 'stretch_both'

# Keyword arguments shared by the pickup and dropoff scatter plots.
opts = dict(
    datashade=True,
    cmap='viridis',
    xaxis=None,
    yaxis=None,
    responsive=True,
    min_height=500,
    shared_axes=False,
)

## Loading data

In [None]:
from pathlib import Path

# `engine` is a parquet-reader option, not a pathlib argument. Older Pythons
# silently ignored extra keywords passed to Path, 3.12 deprecated them and
# 3.14 rejects them, so the path is built from the location alone.
path = Path('../data/nyc_taxi_wide.parq')

# The stray keyword never reached pandas, so the default engine selection is
# kept exactly as it was.
df = pd.read_parquet(path)
df.head(3)

## Building the cross-filtering views

In [None]:
# A single link_selections instance is shared by all views of the taxi data.
ls3 = hv.link_selections.instance()

# Dark map tiles used as the backdrop of both scatter plots.
carto = hv.element.tiles.CartoDark().opts(axiswise=True)

pickup = carto * ls3(
    df.hvplot.scatter(x='pickup_x', y='pickup_y', title='Pickup', **opts)
)
dropoff = carto * ls3(
    df.hvplot.scatter(x='dropoff_x', y='dropoff_y', title='Dropoff', **opts)
)

pickup + dropoff

## Building the cross-filtering views

In [None]:
# Column name -> bin range of the histogram drawn for it.
hist_ranges = dict(
    trip_distance=(0, 10),
    fare_amount=(0, 10),
    pickup_hour=(0, 24),
    dropoff_hour=(0, 24),
)

# One normalised histogram per column, each registered with `ls3` so that it
# takes part in the linked selection.
hists = pn.Column(*[
    ls3(df.hvplot.hist(column, normed=True, bin_range=limits, yaxis=None,
                       height=200, responsive=True))
    for column, limits in hist_ranges.items()
])
hists

<h1>Scalable Cross-filtering <i>Dashboards</i> with <i>Panel,</i> HoloViews and hvPlot</h1>

<br><br>

<div style="display: flex">
<img src="panel_diagram.png" width="80%"></img>
</div>

## Build the dashboard

In [None]:
# Total number of trips, used as the denominator in the summary text.
df_N = len(df)


def count(data):
    """Build a Markdown heading summarising the selected trips.

    ``data`` is the selected subset of the taxi dataframe; the heading shows
    its row count next to the total ``df_N`` and the mean of its
    ``trip_distance`` column.
    """
    summary = '## %d/%d Trips selected - Avg. Trip Distance: %.2f mi' % (
        len(data), df_N, data.trip_distance.mean()
    )
    return pn.panel(summary, width=600)


# Bind `count` to the ls3 selection of `df` and display the result.
pn.panel(pn.bind(count, ls3.selection_param(df)))

## Build the dashboard

In [None]:
def selected(data):
    """Return a Progress indicator with the percentage of rows selected.

    Parameters
    ----------
    data : sized collection
        The selected rows; only ``len(data)`` is used.

    Returns
    -------
    ``pn.indicators.Progress`` whose value is ``len(data)`` as a whole
    percentage of the module-level ``df_N`` (rounded down), or 0 when
    ``df_N`` is 0.
    """
    # Integer arithmetic avoids float truncation (int(29 / 100 * 100) == 28),
    # and the guard avoids ZeroDivisionError for an empty dataset.
    percent = 100 * len(data) // df_N if df_N else 0
    return pn.indicators.Progress(value=percent)

# Bind `selected` to the ls3 selection of `df` and display the resulting indicator.
pn.panel(pn.bind(selected, ls3.selection_param(df)))

## Build the dashboard

In [None]:
# Total number of trips, used as the denominator by `count` and `selected`.
df_N = len(df)

# Fresh link_selections instance for the dashboard, plus the parameter that
# exposes its current selection of `df`.
ls4 = hv.link_selections.instance()
sel_param = ls4.selection_param(df)

def count(data):
    """Build a fixed-width HTML pane summarising the selected trips.

    ``data`` is the selected subset of the taxi dataframe; the text shows its
    row count next to the total ``df_N`` and the mean of its
    ``trip_distance`` column.
    """
    template = '<span style="font-size: 1.5em">%d/%d Trips selected - Avg. Trip Distance: %.2f mi</span>'
    text = template % (len(data), df_N, data.trip_distance.mean())
    return pn.pane.HTML(text, sizing_mode='fixed', width=600, align='center')

def selected(data):
    """Return a Progress indicator with the percentage of rows selected.

    Parameters
    ----------
    data : sized collection
        The selected rows; only ``len(data)`` is used.

    Returns
    -------
    ``pn.indicators.Progress`` whose value is ``len(data)`` as a whole
    percentage of the module-level ``df_N`` (rounded down), or 0 when
    ``df_N`` is 0.
    """
    # Integer arithmetic avoids float truncation (int(29 / 100 * 100) == 28),
    # and the guard avoids ZeroDivisionError for an empty dataset.
    percent = 100 * len(data) // df_N if df_N else 0
    return pn.indicators.Progress(value=percent)

# Dark map tiles used as the backdrop of both scatter plots.
carto = hv.element.tiles.CartoDark().opts(axiswise=True)

# Pickup and dropoff locations, both registered with the `ls4` instance.
pickup = carto * ls4(
    df.hvplot.scatter(x='pickup_x', y='pickup_y', title='Pickup', **opts)
)
dropoff = carto * ls4(
    df.hvplot.scatter(x='dropoff_x', y='dropoff_y', title='Dropoff', **opts)
)

# Column name -> bin range of the histogram drawn for it.
hist_ranges = dict(
    trip_distance=(0, 10),
    fare_amount=(0, 10),
    pickup_hour=(0, 24),
    dropoff_hour=(0, 24),
)

# One normalised histogram per column, each linked through `ls4`.
hists = pn.Column(*[
    ls4(df.hvplot.hist(column, normed=True, bin_range=limits, yaxis=None,
                       height=200, responsive=True))
    for column, limits in hist_ranges.items()
])

In [None]:
# Dark-themed React grid template that hosts the dashboard.
tmpl = pn.template.ReactTemplate(
    title="NYC Taxi - Linked Brushing Demo",
    logo="taxi_logo.png",
    theme=pn.template.DarkTheme,
)

# Header row: a fixed spacer followed by the selection summary bound to `sel_param`.
header = pn.Row(
    pn.Spacer(width=200, sizing_mode='fixed'),
    pn.bind(count, sel_param),
    sizing_mode='stretch_width',
)

# NOTE(review): `plots` is not referenced again in this cell. It is kept
# because the .opts call may apply options in place -- confirm before removing.
plots = pn.Column((pickup + dropoff).opts(shared_axes=False))

tmpl.header.append(header)

# Maps fill the top four grid rows, six columns each.
tmpl.main[:4, :6] = pickup
tmpl.main[:4, 6:] = dropoff

# Histograms sit side by side in rows 4-5, three grid columns apiece.
for i, hist in enumerate(hists):
    tmpl.main[4:6, 3 * i:3 * (i + 1)] = hist

tmpl.show(title='NYC Taxi - Linked Brushing')

# Thank you!

### Special Acknowledgements to Jon Mease for implementing the first version of linked selections!

### Looking forward to your questions.