In [None]:
from collections import deque
from functools import partial
from itertools import cycle

import pandas as pd

import datashader as ds
import datashader.transfer_functions as tf
from datashader.colors import viridis

from streamz import Stream

In [None]:
def taxi_trips_stream(source='data/nyc_taxi.csv', frequency='T'):
    """Endlessly yield taxi-trip dataframes, one per resampling period.

    The CSV at ``source`` is read once, indexed and sorted by
    ``tpep_pickup_datetime`` and split into one dataframe per ``frequency``
    bin.  The generator then cycles over those dataframes forever,
    starting again from the earliest bin after the last one.

    Parameters
    ----------
    source : str
        Path of the CSV file.  It must contain the columns
        ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``; both are
        parsed as datetimes.
    frequency : str
        pandas offset alias handed to ``DataFrame.resample``.

    Yields
    ------
    pandas.DataFrame
        The rows of one bin, still indexed by pickup time.  A bin whose key
        cannot be looked up yields an empty dataframe instead.
    """
    def get_group(resampler, key):
        # The pickup-time index is kept on purpose: the stream consumers in
        # this notebook read ``df.index`` to label each period.
        try:
            return resampler.get_group(key)
        except KeyError:
            return pd.DataFrame()

    # Parse both timestamp columns (the pickup column used to be listed
    # twice, leaving the drop-off times as plain strings).  The deprecated
    # ``infer_datetime_format`` flag is no longer passed.
    df = pd.read_csv(source,
                     parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
    df = df.set_index('tpep_pickup_datetime', drop=True)
    df = df.sort_index()
    r = df.resample(frequency)
    chunks = [get_group(r, g) for g in sorted(r.groups)]
    # Cycle over the chunks themselves instead of over their positions.
    for chunk in cycle(chunks):
        yield chunk

### Create streams

Given a stream of dataframes representing NYC taxi data, we create four streams: two streams are sliding window aggregations over some time period, while two other streams track the cumulative average for a particular value. The pipeline visualization below shows each step that makes up each stream.

For each aggregation stream, the steps are 1) aggregate each dataframe using a Datashader reduction, 2) keep a sliding window of those aggregations, and 3) combine the sliding window collection into an image. The first stream creates a 2-day sliding window aggregation, while the second stream creates a 1-week sliding window aggregation.

For each cumulative average stream, we track the cumulative sum of each value along with the number of cumulative data points.

We use the primitives given in the `streamz` library to accomplish this. `aggregated_sliding_window_image_queue` creates each aggregation stream. `cumulative_mean_queue` creates each cumulative average stream, but this will likely be replaced by a native `streamz.StreamingDataFrame` container when ready. Each stream will place its final result into a double-ended queue, which is used to keep a history of previous results. By default, only the most recent result is kept.

In [None]:
def aggregate_df(df, cvs, x, y, agg=None):
    """Aggregate one dataframe and tag the result with its index bounds.

    Returns a ``(first, last, aggregate)`` tuple holding the smallest and
    largest index values of ``df`` followed by whatever
    ``cvs.points(df, x, y, agg)`` returns.
    """
    index = df.index
    first = index.min()
    last = index.max()
    aggregate = cvs.points(df, x, y, agg)
    return first, last, aggregate

In [None]:
def aggregate_images(iterable, cmap):
    """Sum the aggregates of a window and shade the total as one image.

    ``iterable`` is an indexable sequence of ``(start, end, aggregate)``
    tuples.  The image name joins the first ten characters of the first
    entry's start and of the last entry's end.
    """
    window_start = str(iterable[0][0])
    window_end = str(iterable[-1][1])
    name = "{:.10} - {:.10}".format(window_start, window_end)
    total = sum(entry[2] for entry in iterable)
    return tf.shade(total, cmap=cmap, name=name)

In [None]:
def aggregated_sliding_window_image_queue(source, agg1, agg2, window=1, history=1):
    """Build a map -> sliding_window -> map pipeline ending in a deque.

    ``agg1`` is mapped over every item emitted by ``source``, the results
    are grouped with ``sliding_window(window)``, ``agg2`` is mapped over
    each group, and every outcome is appended to the returned deque, which
    holds at most ``history`` entries.
    """
    results = deque(maxlen=history)
    per_item = source.map(agg1)
    windowed = per_item.sliding_window(window)
    combined = windowed.map(agg2)
    combined.sink(results.append)
    return results

In [None]:
def cumulative_mean_queue(source, column, history=1):
    """Attach a running-mean pipeline to ``source`` and return its result queue.

    For every dataframe emitted by ``source`` the pipeline adds the sum of
    ``df[column]`` to a running total and divides that total by the number
    of dataframes seen so far.  Each result is appended to the returned
    deque as ``(oldest, latest, mean)``: ``oldest`` is the smallest index
    value of the first dataframe, ``latest`` the largest index value of the
    most recent one.

    Parameters
    ----------
    source : stream
        Object providing ``accumulate``, ``map`` and ``sink``.
    column : str
        Name of the dataframe column to average.
    history : int
        Maximum number of results kept in the returned deque.
    """
    def accumulator(acc, df):
        # The state has the same four fields as the value returned below, so
        # it can be unpacked again on the next dataframe; the previous
        # ``latest`` is simply superseded.
        n, total, oldest, _ = acc
        # Compare against the sentinel itself so a falsy first index value
        # (such as 0) is not mistaken for "not set yet".
        if oldest is None:
            oldest = df.index.min()
        return n + 1, total + df[column].sum(), oldest, df.index.max()

    def merge(value):
        n, total, oldest, latest = value
        return oldest, latest, total / n

    q = deque(maxlen=history)
    source.accumulate(accumulator, start=(0, 0, None, None)).map(merge).sink(q.append)
    return q

In [None]:
def show_queue(q, column):
    """Return the queued ``(start, end, value)`` results as a dataframe.

    The third column is labelled ``column``.  As a side effect the pandas
    float display format is set to two decimal places.
    """
    pd.set_option('display.float_format', '{:.2f}'.format)
    rows = list(q)
    headers = ['start', 'end', column]
    return pd.DataFrame(rows, columns=headers)

In [None]:
# Extent of the plot, passed to the canvas as its x and y ranges.
# NOTE(review): presumably in the same units as the pickup_x / pickup_y columns - confirm.
x_range = (-8243204.0, -8226511.0)
y_range = (4968192.0, 4982886.0)
# 800x600 canvas limited to that extent.
cvs = ds.Canvas(plot_width=800, plot_height=600, x_range=x_range, y_range=y_range)

In [None]:
# Helper functions for useful aggregations.
# The canvas is bound by keyword: each dataframe arrives as the first
# positional argument of ``aggregate_df(df, cvs, ...)``, so a positionally
# bound canvas would end up in the ``df`` slot.
min_amount     = partial(aggregate_df, cvs=cvs, x='pickup_x', y='pickup_y', agg=ds.min('total_amount'))
max_amount     = partial(aggregate_df, cvs=cvs, x='pickup_x', y='pickup_y', agg=ds.max('total_amount'))
mean_amount    = partial(aggregate_df, cvs=cvs, x='pickup_x', y='pickup_y', agg=ds.mean('total_amount'))
sum_amount     = partial(aggregate_df, cvs=cvs, x='pickup_x', y='pickup_y', agg=ds.sum('total_amount'))
max_passengers = partial(aggregate_df, cvs=cvs, x='pickup_x', y='pickup_y', agg=ds.max('passenger_count'))
sum_passengers = partial(aggregate_df, cvs=cvs, x='pickup_x', y='pickup_y', agg=ds.sum('passenger_count'))
sum_pickups    = partial(aggregate_df, cvs=cvs, x='pickup_x', y='pickup_y', agg=ds.count())

# Window combiner that shades with the viridis colormap.
reduce_viridis = partial(aggregate_images, cmap=viridis)

In [None]:
# One input stream feeds all four pipelines below.
source = Stream()

# Image queues: a window of 2 keeping six results, and a window of 7 keeping
# only the latest result (the default history).
q_days = aggregated_sliding_window_image_queue(
    source, agg1=max_amount, agg2=reduce_viridis, window=2, history=6)
q_week = aggregated_sliding_window_image_queue(
    source, agg1=max_amount, agg2=reduce_viridis, window=7)

# Running-mean queues, each keeping the seven latest results.
q_avg_passengers = cumulative_mean_queue(source, 'passenger_count', history=7)
q_avg_amount = cumulative_mean_queue(source, 'total_amount', history=7)

In [None]:
# Draw the pipeline diagram for everything attached to ``source``.
source.visualize()

### Simplifying stream creation

As you can see in the previous section, there are a few areas to improve upon:

- less code/boilerplate
- hide individual steps seen in stream diagram
- encapsulate separate stream construction methods into helper classes
- separate stream creation and stream sink
- allow for partial results from sliding windows (not currently supported by `streamz`)
- output results into other collections besides queues

By subclassing `streamz.Stream`, we've accomplished the above without sacrificing readability.

In [None]:
class SlidingWindowImageAggregate(Stream):
    """Stream node that shades the sum of the last ``n`` dataframe aggregates.

    Every incoming dataframe is aggregated with ``canvas.points``; the most
    recent ``n`` aggregates are summed, shaded and placed on a background
    colour, and the resulting image is emitted downstream.  An image is
    emitted for every dataframe, so the first results cover fewer than
    ``n`` dataframes.

    Parameters
    ----------
    source : Stream
        Upstream node supplying dataframes.
    canvas : object providing ``points(df, x, y, agg)``
    x, y : column names passed to ``canvas.points``
    agg : reduction passed to ``canvas.points``
    n : int
        Maximum number of aggregates kept in the window.
    cmap : colormap passed to ``tf.shade``
    bgcolor : str
        Background colour passed to ``tf.set_background``.
    """
    def __init__(self, source, canvas, x, y, agg, n=7, cmap=None, bgcolor='black'):
        # Set internal streamz instance variables to control names in diagram
        # (same convention as CumulativeMean: list the attribute to show).
        self.str_list = ['n']
        self.n = n

        def aggregate_df(df):
            return df.index.min(), df.index.max(), canvas.points(df, x, y, agg)

        def aggregate_images(iterable):
            name = "{:.10} - {:.10}".format(str(iterable[0][0]), str(iterable[-1][1]))
            total = sum(item[2] for item in iterable)
            return tf.set_background(tf.shade(total, cmap, name=name), color=bgcolor)

        # Bounded deque: appending the (n+1)-th aggregate drops the oldest.
        self.cache = deque(maxlen=n)
        self.agg1 = aggregate_df
        self.agg2 = aggregate_images

        Stream.__init__(self, source)

    def update(self, x, who=None, metadata=None):
        # ``metadata`` is accepted and ignored: streamz releases with
        # metadata support pass it to every downstream ``update`` call.
        self.cache.append(self.agg1(x))
        return self.emit(self.agg2(tuple(self.cache)))

In [None]:
class CumulativeMean(Stream):
    """Stream node emitting a running mean of per-dataframe column sums.

    For each incoming dataframe the sum of ``column`` is added to a running
    total, which is divided by the number of dataframes received so far.
    The node emits ``(oldest, latest, mean)``: ``oldest`` is the smallest
    index value of the first dataframe, ``latest`` the largest index value
    of the current one.

    Parameters
    ----------
    source : Stream
        Upstream node supplying dataframes.
    column : str
        Name of the column to average.
    """
    def __init__(self, source, column):
        # Set internal streamz instance variables to control names in diagram
        self.str_list = ['column']
        self.column = column

        self.count = 0
        self.total = 0
        self.oldest = None

        Stream.__init__(self, source)

    def update(self, x, who=None, metadata=None):
        # ``metadata`` is accepted and ignored: streamz releases with
        # metadata support pass it to every downstream ``update`` call.
        # Compare against the sentinel itself so a falsy first index value
        # (such as 0) is not mistaken for "not set yet".
        if self.oldest is None:
            self.oldest = x.index.min()
        self.count, self.total = self.count + 1, self.total + x[self.column].sum()
        return self.emit((self.oldest, x.index.max(), self.total / self.count))

In [None]:
source = Stream()

cvs = ds.Canvas(plot_width=800, plot_height=600, x_range=x_range, y_range=y_range)

# Window of 2, keeping the six latest images.
q_days = deque(maxlen=6)
s_days = SlidingWindowImageAggregate(
    source, cvs, 'pickup_x', 'pickup_y', ds.max('total_amount'),
    n=2, cmap=viridis)
s_days.sink(q_days.append)

# Window of 7, keeping only the latest image.
q_week = deque(maxlen=1)
s_week = SlidingWindowImageAggregate(
    source, cvs, 'pickup_x', 'pickup_y', ds.max('total_amount'),
    n=7, cmap=viridis)
s_week.sink(q_week.append)

# Running mean of the passenger counts, keeping the seven latest results.
q_avg_passengers = deque(maxlen=7)
s_avg_passengers = CumulativeMean(source, 'passenger_count')
s_avg_passengers.sink(q_avg_passengers.append)

# Running mean of the total amounts, keeping the seven latest results.
q_avg_amount = deque(maxlen=7)
s_avg_amount = CumulativeMean(source, 'total_amount')
s_avg_amount.sink(q_avg_amount.append)

In [None]:
# Draw the pipeline diagram for the subclass-based streams.
source.visualize()

### Push data through streams

We initially push 3 days' worth of dataframes through the streams to view partial results.

In [None]:
trips_per_day = taxi_trips_stream(frequency='D')
# Feed the first three daily dataframes into the pipelines.
for _ in range(3):
    daily_trips = next(trips_per_day)
    source.emit(daily_trips)

In [None]:
# Show every image currently held in the week queue.
tf.Images(*q_week)

In [None]:
# Feed four more daily dataframes into the pipelines.
for _ in range(4):
    daily_trips = next(trips_per_day)
    source.emit(daily_trips)

In [None]:
# Show every image currently held in the week queue.
tf.Images(*q_week)

#### Cumulative average of passengers (ordered by oldest first)

In [None]:
# Tabulate the queued (start, end, mean) results for passenger_count.
show_queue(q_avg_passengers, 'cumulative average passengers')

#### Cumulative average of total fare (ordered by oldest first)

In [None]:
# Tabulate the queued (start, end, mean) results for total_amount.
show_queue(q_avg_amount, 'cumulative average total fare')

#### History of 2-day aggregations (ordered by oldest first)

In [None]:
# Show every image currently held in the days queue.
tf.Images(*q_days)

#### Current 1-week aggregation

In [None]:
# Show every image currently held in the week queue.
tf.Images(*q_week)

Now we get the next day's worth of data and see how the streams have updated.

In [None]:
# Push one more dataframe from the generator into the streams.
source.emit(next(trips_per_day))

#### Cumulative average of passengers (ordered by oldest first)

In [None]:
# Tabulate the queued (start, end, mean) results for passenger_count.
show_queue(q_avg_passengers, 'cumulative average passengers')

#### Cumulative average of total fare (ordered by oldest first)

In [None]:
# Tabulate the queued (start, end, mean) results for total_amount.
show_queue(q_avg_amount, 'cumulative average total fare')

#### History of 2-day aggregations (ordered by oldest first)

In [None]:
# Show every image currently held in the days queue.
tf.Images(*q_days)

#### Current 1-week aggregation

In [None]:
# Show every image currently held in the week queue.
tf.Images(*q_week)