Install instructions:
```
conda install numpy pandas xarray dask colorcet datashader streamz
```

It is assumed the NYC taxi data from the notebook examples is available.

In [None]:
from collections import deque
from functools import partial
from itertools import cycle
import datetime

import numpy as np
import pandas as pd
import xarray as xr

import dask.dataframe as dd
import datashader as ds
import datashader.transfer_functions as tf
from datashader.colors import viridis

from colorcet import fire
from streamz import Stream

In [None]:
def taxi_trips_stream(source='data/nyc_taxi.csv', frequency='T'):
    """Endlessly yield the taxi-trip data as dataframes grouped by ``frequency``.

    The CSV at ``source`` is read in full, indexed and sorted by
    ``tpep_pickup_datetime``, and resampled at ``frequency``.  Each resample
    bin becomes one dataframe; after the last bin has been yielded the
    generator starts over from the first one.

    Parameters
    ----------
    source : str
        Path of the CSV file.  It must contain a ``tpep_pickup_datetime``
        column, which is parsed as datetimes and used as the index.
    frequency : str
        pandas offset alias handed to ``DataFrame.resample``.

    Yields
    ------
    pandas.DataFrame
        The rows of one bin, in chronological bin order, still indexed by
        pickup time.  A bin whose group lookup raises ``KeyError`` is
        replaced by an empty, column-less dataframe.  If there are no bins
        at all the generator simply finishes.
    """
    def get_group(resampler, key):
        # Rows keep their pickup-time index; no reset_index is applied.
        try:
            return resampler.get_group(key)
        except KeyError:
            return pd.DataFrame()

    # Only the pickup timestamp is parsed; it is the one column used below.
    # NOTE(review): the drop-off timestamp may have been intended here as
    # well -- confirm against the CSV schema before adding it.
    df = pd.read_csv(source, parse_dates=['tpep_pickup_datetime'])
    df = df.set_index('tpep_pickup_datetime', drop=True).sort_index()

    r = df.resample(frequency)
    chunks = [get_group(r, g) for g in sorted(r.groups)]

    # cycle() replays the chunks forever and ends cleanly when there are none.
    yield from cycle(chunks)

### Create streaming pipeline

Given a stream of dataframes representing NYC taxi data, we create a pipeline with two streams. For each stream, the general steps are 1) aggregate each dataframe using a Datashader reduction, 2) keep a sliding window of aggregations, and 3) combine the sliding-window collection into an image. The first stream creates a two-day sliding-window aggregation, while the second stream creates a one-week sliding-window aggregation. The pipeline visualization below shows the steps that make up each stream.

We use the primitives provided by the `streamz` library to accomplish this. `aggregated_sliding_window_image_queue` creates each distinct pipeline, but this will likely be supplanted by a native `streamz.StreamingDataFrame` container when it is ready. Each stream places its final image into a double-ended queue, which keeps a history of previous results. By default (`history=1`), only the most recent one is kept.

In [None]:
def fork_stream(source):
    """Create a new ``Stream``, connect ``source`` to it and return it.

    ``source`` only needs a ``connect`` method accepting the new stream;
    each call produces an independent branch object.
    """
    branch = Stream()
    source.connect(branch)
    return branch

In [None]:
def aggregate_df(df, x, y, plot_width=800, plot_height=600, agg=None,
                 x_range=None, y_range=None):
    """Aggregate the points of ``df`` onto a Datashader canvas.

    Parameters
    ----------
    df : dataframe
        Data handed to ``Canvas.points``.
    x, y : str
        Names of the columns holding the point coordinates.
    plot_width, plot_height : int
        Size of the canvas in pixels.
    agg : datashader reduction, optional
        Reduction passed through to ``Canvas.points``; ``None`` leaves the
        choice to Datashader.
    x_range, y_range : tuple, optional
        Extent of the canvas along each axis, passed through to
        ``ds.Canvas``.  ``None`` (the default) leaves the extent to
        Datashader, exactly as before these parameters existed.  Supply
        fixed ranges when aggregates of different dataframes are going to
        be combined, so that they all describe the same region.

    Returns
    -------
    The aggregate returned by ``Canvas.points``.
    """
    cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height,
                    x_range=x_range, y_range=y_range)
    return cvs.points(df, x, y, agg)

def aggregate_images(iterable, cmap):
    """Sum a collection of aggregates and shade the total with ``cmap``.

    The aggregates in ``iterable`` are concatenated along a new ``'cols'``
    dimension, summed over that dimension, and the result is passed to
    ``tf.shade``.
    """
    stack_dim = 'cols'
    stacked = xr.concat(iterable, dim=stack_dim)
    return tf.shade(stacked.sum(dim=stack_dim), cmap=cmap)

In [None]:
def aggregated_sliding_window_image_queue(source, agg1, agg2, window=1, history=1):
    """Attach an aggregate -> window -> combine branch to ``source``.

    A fork of ``source`` maps every element through ``agg1``, groups the
    results with ``sliding_window(window)``, maps each window through
    ``agg2`` and appends the outcome to a deque.

    Returns the deque, which holds at most ``history`` entries with the
    newest one last.
    """
    results = deque(maxlen=history)
    windowed = fork_stream(source).map(agg1).sliding_window(window)
    combined = windowed.map(agg2)
    combined.sink(results.append)
    return results

In [None]:
# Ready-made per-dataframe aggregations over the pickup coordinates.  They all
# share the same x/y columns and differ only in the Datashader reduction.
_pickup_points = partial(aggregate_df, x='pickup_x', y='pickup_y')

min_amount = partial(_pickup_points, agg=ds.min('total_amount'))
max_amount = partial(_pickup_points, agg=ds.max('total_amount'))
mean_amount = partial(_pickup_points, agg=ds.mean('total_amount'))
sum_amount = partial(_pickup_points, agg=ds.sum('total_amount'))
max_passengers = partial(_pickup_points, agg=ds.max('passenger_count'))
sum_passengers = partial(_pickup_points, agg=ds.sum('passenger_count'))
sum_pickups = partial(_pickup_points, agg=ds.count())

# Window reducers: combine a window of aggregates and shade with one colormap.
reduce_fire = partial(aggregate_images, cmap=fire)
reduce_viridis = partial(aggregate_images, cmap=viridis)

In [None]:
# Root stream into which the raw dataframes are emitted.
source = Stream()
# Branch with a 2-element sliding window; the queue keeps the last 10 images.
q_days = aggregated_sliding_window_image_queue(source, window=2, history=10, agg1=max_amount, agg2=reduce_viridis)
# Branch with a 7-element sliding window; the queue keeps only the latest
# image (history defaults to 1).
q_week = aggregated_sliding_window_image_queue(source, window=7, agg1=max_amount, agg2=reduce_viridis)

In [None]:
# Show the pipeline built above, starting from the root stream.
source.visualize()

### Push data through pipeline

We initially push 7 days' worth of dataframes through the pipeline, since a sliding window must be full before it emits its first window of data.

In [None]:
# One dataframe per calendar day ('D' resampling frequency).
trips_per_day = taxi_trips_stream(frequency='D')
# Emit seven dataframes, matching the size of the larger (7-element) window.
for i in range(7):
    source.emit(next(trips_per_day))

In [None]:
q_days[-1]  # most recent 2-day aggregation (last entry appended to the queue)

In [None]:
q_week[-1]  # most recent 1-week aggregation (last entry appended to the queue)

In [None]:
# Push one more daily dataframe so both sliding windows advance by one element.
source.emit(next(trips_per_day))

In [None]:
q_days[-1]  # 2-day aggregation after the extra emit

In [None]:
q_week[-1]  # 1-week aggregation after the extra emit