In [1]:
# NumPy supplies the random draws used below; seed the global RNG up front.
import numpy as np
np.random.seed(42)

# Load HoloViews and activate its notebook integration with the 'matplotlib' backend.
import holoviews as hv
hv.notebook_extension('matplotlib')

# Notebook-wide display defaults, one %opts line magic per element type.
# NOTE(review): in the %opts syntax [...] holds plot options, (...) style options
# and {...} normalization options -- confirm against the installed HoloViews version.
%opts Points [color_index=2] (cmap="bwr" edgecolors='k' s=50 alpha=1.0)
%opts Scatter3D [color_index=3 fig_size=250] (cmap='bwr' edgecolor='k' s=50 alpha=1.0)
%opts Image (cmap="gray_r") {+axiswise}
%opts RGB [bgcolor="black" show_grid=False]

# Class-level defaults for every matplotlib-backed plot: figure alpha of 0
# (presumably a transparent figure background -- confirm) and white axes background.
import holoviews.plotting.mpl
holoviews.plotting.mpl.MPLPlot.fig_alpha = 0
holoviews.plotting.mpl.ElementPlot.bgcolor = 'white'

# Make datashade use colorcet's `fire` colormap by default, skipping its first 50 entries.
from holoviews.operation.datashader import datashade
from colorcet import fire
datashade.cmap=fire[50:]

# Relative directory prefix for data files.
data_dir = "data/"

## What is Datashader?
- Makes pictures of large datasets, fast!
- Preserves distribution and outliers in the visualization
- Uses Numba and Dask for scale
- Not exactly a plotting library but plays well with HoloViews and Bokeh
- http://datashader.org

## When would I want to use it?
- When you have a LOT of data to plot: tens of thousands of points or more.
- Focus is on ensuring the *distribution* is clear

# Synthetic Example

- A bunch of points drawn from 5 different Gaussian distributions
- 4 clusters of different sizes, and one big cluster that overlaps all of them

In [5]:
def gaussians(specs=((1.5,0,1.0),(-1.5,0,1.0)), num=100, seed=1):
    """
    A concatenated set of points taken from 2D Gaussian distributions.

    Each distribution is specified as a tuple (x,y,s), where x,y is the mean
    and s is the standard deviation (used for both axes).  Defaults to two
    horizontally offset unit-variance Gaussians centered at x=+1.5 and x=-1.5.

    Parameters
    ----------
    specs : iterable of (x, y, s) tuples
        One entry per distribution.
    num : int
        Number of points drawn from each distribution.
    seed : int or None
        Seed for the private random generator used for the draws.  The
        default of 1 yields the same values as calling ``np.random.seed(1)``
        followed by the global ``np.random.normal``; ``None`` gives a
        different sample on every call.

    Returns
    -------
    (xs, ys) : tuple of two 1D arrays
        x and y coordinates, each of length ``num * len(specs)``, with the
        points of each distribution stored contiguously in ``specs`` order.
        Two empty arrays are returned when ``specs`` is empty.
    """
    # A private generator keeps the output reproducible without resetting the
    # global NumPy RNG state that the rest of the notebook may rely on.
    rng = np.random.RandomState(seed)

    # For every spec, draw all x values first and then all y values.
    dists = [(rng.normal(x,s,num), rng.normal(y,s,num)) for x,y,s in specs]

    # np.hstack refuses an empty sequence, so handle "no distributions" explicitly.
    if not dists:
        return np.array([]), np.array([])

    return np.hstack([d[0] for d in dists]), np.hstack([d[1] for d in dists])


# (x, y, s) per cluster: four clusters around (+-2, +-2) with increasing spread,
# plus one wide cluster centred on the origin.
cluster_specs = [
    (2, 2, 0.02),
    (2, -2, 0.1),
    (-2, -2, 0.5),
    (-2, 2, 1.0),
    (0, 0, 3),
]
dist = gaussians(specs=cluster_specs, num=10000)

In [6]:
# The same data drawn three times: notebook-default markers, small markers,
# and very small, mostly transparent markers.
default_pts = hv.Points(dist)
small_pts = hv.Points(dist)(style=dict(s=0.1))
faint_pts = hv.Points(dist)(style=dict(s=0.01, alpha=0.05))

default_pts + small_pts + faint_pts

With traditional approaches to plotting, you see a very different picture depending on which settings you pick.

## Big Data Plotting Pitfalls
- Overplotting (Image A)
- Oversaturation
- Undersampling
- Undersaturation
- Underutilized Range
- Nonuniform colormapping

Datashader avoids these pitfalls automatically, producing a faithful image without hand-tuning marker size or transparency.

In [4]:
%output size=200

datashade(hv.Points(dist))

Real Example:
https://github.com/pyviz/datashader/blob/5a38ed6c1f615d86a79e301279c558670fdd5058/examples/topics/gerrymandering.ipynb

**TODO:** show something with an interactive Bokeh view, perhaps GIS.


This example loads 300 million data points of US Census data.

It is based on the Datashader gerrymandering example: https://github.com/pyviz/datashader/blob/5a38ed6c1f615d86a79e301279c558670fdd5058/examples/topics/gerrymandering.ipynb

In [1]:
# Directory prefix (relative to this notebook) that is prepended to the data
# file names used in the cells below.
data_dir = "../datashader-examples/data/"

In [2]:
import holoviews as hv
import geoviews as gv
import datashader as ds
import dask.dataframe as dd
from cartopy import crs

from holoviews.operation.datashader import datashade

# Activate the HoloViews 'bokeh' backend for this notebook.
# NOTE(review): width=95 presumably sets the notebook width in percent -- confirm.
hv.extension('bokeh', width=95)

# Notebook-wide display defaults per element type (HoloViews %opts line magic).
# The RGB size here matches the width/height passed to datashade further down.
%opts RGB     [width=1200 height=682 xaxis=None yaxis=None show_grid=False] 
%opts Shape (fill_alpha=0 line_width=1.5) [apply_ranges=False tools=['tap']] 
%opts Points [apply_ranges=False] WMTS (alpha=0.5)

In [3]:
# Single-letter race code -> plotting colour, and -> human-readable label.
color_key = dict(w='blue', b='green', a='red', h='orange', o='saddlebrown')
races = dict(w='White', b='Black', a='Asian', h='Hispanic', o='Other')

# One placeholder point at (0, 0) per category, styled in that category's colour
# and keyed by its readable label.
# NOTE(review): presumably these exist only to give the final overlay a legend -- confirm.
legend_entries = {}
for code, colour in color_key.items():
    placeholder = gv.Points([0,0], crs=crs.PlateCarree())
    legend_entries[races[code]] = placeholder(style=dict(color=colour))

color_points = hv.NdOverlay(legend_entries)

In [4]:
# Open the census parquet file through dask's public reader (`dd.read_parquet`)
# rather than the internal `dd.io.parquet` module path.
# NOTE: dask needs a parquet engine (fastparquet or pyarrow) to be installed;
# without one this call fails with "RuntimeError: Please install either
# fastparquet or pyarrow".
df = dd.read_parquet(data_dir + 'census.snappy.parq')

# Ask dask to persist the frame so later cells reuse it.
# NOTE(review): presumably this keeps the data in memory -- confirm it fits on the target machine.
df = df.persist()

# Points keyed on the easting/northing columns, carrying 'race' as the value dimension.
census_points = hv.Points(df, kdims=['easting', 'northing'], vdims=['race'])

RuntimeError: Please install either fastparquet or pyarrow

In [None]:
# Bounding box of the continental USA, as (min, max) along each axis.
x_range = (-13884029.0, -7453303.5)
y_range = (2818291.5, 6335972.0)

# Keyword arguments shared by every datashade call on the census points:
# per-category counts of 'race', coloured through color_key.
shade_defaults = {
    'x_range': x_range,
    'y_range': y_range,
    'x_sampling': 10,
    'y_sampling': 10,
    'width': 1200,
    'height': 682,
    'color_key': color_key,
    'aggregator': ds.count_cat('race'),
}

shaded = datashade(census_points, **shade_defaults)
shaded

In [None]:
# Read the district shapefile (coordinates declared as PlateCarree) and pass
# the resulting shapes through geoviews' project_shape operation.
shape_path = data_dir + 'cb_2015_us_cd114_5m.shp'
districts = gv.operation.project_shape(
    gv.Shape.from_shapefile(shape_path, crs=crs.PlateCarree())
)

In [None]:
# URL template of the ArcGIS World Imagery tile service; {Z}/{Y}/{X} are left as placeholders.
world_imagery_url = 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{Z}/{Y}/{X}.jpg'
tiles = gv.WMTS(world_imagery_url)

In [None]:
# Build a fresh datashaded layer for this cell, then stack the layers:
# map tiles, shaded census points, per-category placeholder points, district outlines.
shaded = datashade(census_points, **shade_defaults)
census_map = tiles * shaded * color_points * districts
census_map