# Setup

- Install Pandas, Bokeh (for visualizations), Selenium (used by Bokeh to export SVG files) and Kaskada.
- Configure Bokeh to render visualizations inline.
- Configure auto-reloading.
- Load timeline visualization function.
- Initialize the Kaskada session.

In [None]:
# Install the notebook's dependencies into the current kernel environment.
# NOTE(review): selenium is never imported in this notebook; it is presumably
# needed by Bokeh's export_svg, which the plotting cells call — confirm.
%pip install pandas bokeh kaskada selenium

In [None]:
# Configure reloading of loaded modules (useful for the visualization function)
# `%reload_ext` is used so that re-running this cell does not fail when the
# extension is already loaded.
%reload_ext autoreload
# NOTE(review): mode 2 presumably reloads all modules before each execution —
# confirm against the IPython autoreload documentation.
%autoreload 2

In [None]:
# Import Pandas
import pandas as pd

# Configure bokeh for local output.
# show, save, export_svg and CDN are used by the plotting cells further down.
from bokeh.io import output_notebook, show, export_svg, save
from bokeh.resources import CDN
output_notebook()

# Import visualization helpers
# NOTE(review): the star import hides which names come from `visualize`; the
# plotting cells use plot_timelines and Timeline, which are presumably defined
# there — confirm.
from visualize import *

In [None]:
# Build the local Kaskada session used by the rest of the notebook.
import gc

from kaskada.api.session import LocalBuilder

# Workaround for https://github.com/kaskada-ai/kaskada/issues/267:
# release the name bound to any earlier session and run a garbage-collection
# pass before a new session is built.
session = None
gc.collect()

session = LocalBuilder().build()

In [None]:
# Load (or reload) the fenlmagic extension; presumably this is what registers
# the %%fenl cell magic used by the query cells below — confirm.
%reload_ext fenlmagic

# Sample Data

In [None]:
# Sample purchase events, one row per purchase: (time, user, amount, item).
# Rows that are commented out are excluded from the sample data.
Purchases = pd.DataFrame(
    data=[
        # ('2022-10-27', 'Ben',   5,  'widget'),
        # ('2022-10-27', 'Davor', 8,  'gear'),
        ('2022-11-02', 'Ben',   11, 'widget'),
        ('2022-11-15', 'Davor', 3,  'widget'),
        ('2022-11-27', 'Davor', 4,  'gear'),
        ('2022-12-12', 'Ben',   18, 'gear'),
        ('2023-01-01', 'Ben',   12, 'widget'),
        ('2023-01-02', 'Ben',   13, 'widget'),
        ('2023-01-09', 'Davor', 6,  'widget'),
        # ('2023-02-08', 'Ben',   7,  'gear'),
        # ('2023-02-28', 'Davor', 9,  'gear'),
        # ('2023-03-03', 'Ben',   3,  'gear'),
        # ('2023-03-06', 'Davor', 8,  'widget'),
    ],
    columns=['time', 'user', 'amount', 'item'],
)

# The time column stays as ISO date strings; the conversion below is disabled.
# Purchases['time'] = pd.to_datetime(Purchases['time'])

# Display the frame as the cell output.
Purchases

In [None]:
# Sample page-view events, one row per view: the view time (ISO date string),
# the viewing user and the item viewed. Rows are listed in chronological order;
# rows that are commented out are excluded from the sample data.
PageViews = pd.DataFrame.from_dict([
    # { 'time': '2022-10-25', 'user': 'Davor', 'item': 'gear'   },
    # { 'time': '2022-10-26', 'user': 'Ben',   'item': 'widget' },
    # { 'time': '2022-10-28', 'user': 'Ben',   'item': 'widget' },
    { 'time': '2022-11-01', 'user': 'Ben',   'item': 'widget' },
    { 'time': '2022-11-01', 'user': 'Davor', 'item': 'widget' },
    { 'time': '2022-11-24', 'user': 'Davor', 'item': 'gear' },
    { 'time': '2022-11-25', 'user': 'Davor', 'item': 'gear' },
    { 'time': '2022-11-26', 'user': 'Davor', 'item': 'gear' },
    { 'time': '2022-12-10', 'user': 'Ben',   'item': 'gear' },
    # Year corrected from 2023 to 2022: this view belongs between the
    # 2022-12-10 and 2023-01-01 rows, one day before Ben's 2023-01-01 purchase.
    { 'time': '2022-12-31', 'user': 'Ben',   'item': 'widget' },
    { 'time': '2023-01-01', 'user': 'Davor', 'item': 'widget' },
    # { 'time': '2023-02-07', 'user': 'Ben',   'item': 'gear' },
    # { 'time': '2023-02-26', 'user': 'Davor', 'item': 'gear' },
    # { 'time': '2023-03-02', 'user': 'Ben',   'item': 'gear' },
    # { 'time': '2023-03-04', 'user': 'Davor', 'item': 'widget' },
])
# The time column stays as ISO date strings; the conversion below is disabled.
# PageViews['time'] = pd.to_datetime(PageViews['time'])

# Display the frame as the cell output.
PageViews

In [None]:
# Sample rating events, one row per rating: (time, user, item, score).
# Rows that are commented out are excluded from the sample data.
Ratings = pd.DataFrame(
    data=[
        # ('2022-09-25', 'Joe',   'gear',   5),
        # ('2022-09-22', 'Ben',   'widget', 5),
        # ('2022-10-25', 'Davor', 'gear',   2),
        ('2022-11-25', 'Ben',   'gear',   3),
        ('2022-11-28', 'Ben',   'gear',   2),
        ('2022-12-27', 'Davor', 'gear',   5),
        ('2022-12-30', 'Joe',   'widget', 1),
        ('2023-01-25', 'Joe',   'gear',   5),
    ],
    columns=['time', 'user', 'item', 'score'],
)

# The time column stays as ISO date strings; the conversion below is disabled.
# Ratings['time'] = pd.to_datetime(Ratings['time'])

# Display the frame as the cell output.
Ratings

In [None]:
# `view` is imported here as well as `table`: this cell calls view.delete_view,
# and on a fresh kernel the name would otherwise be undefined at this point.
from kaskada import table, view


def _recreate_table(table_name, dataframe):
    """Drop `table_name` if it can be deleted, create it again and load `dataframe`.

    Every sample table uses `time` as the time column and `user` as both the
    entity key column and the grouping id.

    Returns whatever `table.load_dataframe` returns, so that the last call in
    the cell still produces the cell output.
    """
    try:
        table.delete_table(table_name)
    except Exception:
        # Best effort: the delete is presumably expected to fail when the table
        # does not exist yet (first run). The exception type raised by the
        # client is not visible here, so Exception is the narrowest safe catch;
        # unlike a bare `except:` it lets KeyboardInterrupt through.
        pass

    table.create_table(
        table_name=table_name,
        time_column_name='time',
        entity_key_column_name='user',
        grouping_id='user',
    )
    return table.load_dataframe(
        table_name=table_name,
        dataframe=dataframe,
    )


# Remove the ReviewsByItem view (created in section 4) before the tables are
# recreated; it is defined over Ratings, so it presumably has to go first.
try:
    view.delete_view('ReviewsByItem')
except Exception:
    # Best effort: the view does not exist on the first run.
    pass

# Each table is loaded from the dataframe of the same name.
_recreate_table('Purchases', Purchases)
_recreate_table('PageViews', PageViews)
_recreate_table('Ratings', Ratings)

# 1 - Aggregation
How much did each user’s spending change over time?

In [None]:
%%fenl --var=aggregate
{ 
    amount: Purchases.amount,
    sum_amount: sum(Purchases.amount)
}

In [None]:
# Plot the `amount` and `sum_amount` columns of the aggregation query result.
plot = plot_timelines(
    timelines=[
        Timeline('amount', label='Purchases.amount'),
        Timeline('sum_amount', label='sum(Purchases.amount)', continuous=True),
    ],
    data=aggregate.dataframe,
)

# Render inline in the notebook.
show(plot)

# HTML copy (for the blog posts).
save(plot, filename='aggregation.html', resources=CDN, title='Aggregation')

# SVG copy (for conversion to PNG in slides).
export_svg(plot, filename='aggregation.svg')

In [None]:
# Two separate plots over all entities in the aggregation result:
# one for the `amount` column and one for the `sum_amount` column.
discrete_plot = plot_timelines(
    timelines=[
        Timeline('amount', label='Purchases.amount'),
    ],
    data=aggregate.dataframe,
)

continuous_plot = plot_timelines(
    timelines=[
        Timeline('sum_amount', label='sum(Purchases.amount)', continuous=True),
    ],
    data=aggregate.dataframe,
)

# Show each plot inline, then write it out as SVG (for conversion to PNG in slides).
show(discrete_plot)
export_svg(discrete_plot, filename='discrete.svg')
show(continuous_plot)
export_svg(continuous_plot, filename='continuous.svg')

In [None]:
# Same pair of plots as above, restricted to the rows whose `_key` is 'Ben'.
df = aggregate.dataframe
just_ben = df.loc[df['_key'] == 'Ben']

discrete_plot = plot_timelines(
    timelines=[
        Timeline('amount', label='Purchases.amount'),
    ],
    data=just_ben,
)

continuous_plot = plot_timelines(
    timelines=[
        Timeline('sum_amount', label='sum(Purchases.amount)', continuous=True),
    ],
    data=just_ben,
)

# Show each plot inline, then write it out as SVG (for conversion to PNG in slides).
show(discrete_plot)
export_svg(discrete_plot, filename='discrete_single.svg')
show(continuous_plot)
export_svg(continuous_plot, filename='continuous_single.svg')

# 2 - Windowed Aggregation
How much does each user’s spending change within each month?

In [None]:
%%fenl --var=windowed_aggregate
{ 
    amount: Purchases.amount,
    sum_amount: sum(Purchases.amount, window=since(monthly()))
}

In [None]:
# Plot the `amount` and `sum_amount` columns of the windowed aggregation result.
# TODO: Add vertical lines for the month boundaries?
plot = plot_timelines(
    data=windowed_aggregate.dataframe,
    timelines=[
        Timeline('amount', label='Purchases.amount'),
        Timeline(
            'sum_amount',
            label='sum(Purchases.amount, window=since(monthly()))',
            continuous=True,
        ),
    ],
)

# Render inline in the notebook.
show(plot)

# HTML copy (for the blog posts).
save(plot, filename='windowed.html', resources=CDN, title='Windowed Aggregation')

# SVG copy (for conversion to PNG in slides).
export_svg(plot, filename='windowed.svg')

# 3 - Data-Defined Windowed Aggregation
For each user, what is the average number of page views between purchases?

In [None]:
%%fenl --var=data_windowed_aggregate
let page_views_since_purchase = count(PageViews, window=since(is_valid(Purchases)))
in {
  page_views_since_purchase,
  filtered: page_views_since_purchase | when(is_valid(Purchases)),
  result:  page_views_since_purchase | when(is_valid(Purchases)) | mean()
}

In [None]:
%%fenl --var=purchases
Purchases

In [None]:
%%fenl --var=page_views
PageViews

In [None]:
# Three plots for the data-defined window example. Each one is shown inline,
# saved as HTML (for the blog posts) and exported as SVG (for conversion to
# PNG in slides).

# Plot 1: page-view items, purchase items and the page-view count column.
plot1 = plot_timelines(
    [
        Timeline('item', data=page_views.dataframe, label='PageViews.item'),
        Timeline('item', data=purchases.dataframe, label='Purchases.item'),
        Timeline(
            'page_views_since_purchase',
            label='count(PageViews, window=since(is_valid(Purchases)))',
            continuous=True,
        ),
    ],
    data=data_windowed_aggregate.dataframe,
)
show(plot1)
save(plot1, filename='data_windows_1.html', resources=CDN, title='Data Defined Windows')
export_svg(plot1, filename='data_windows_1.svg')

# Plot 2: the page-view count column, purchase items and the `filtered` column.
plot2 = plot_timelines(
    [
        Timeline(
            'page_views_since_purchase',
            label='count(PageViews, window=since(is_valid(Purchases)))',
            continuous=True,
        ),
        Timeline('item', data=purchases.dataframe, label='Purchases.item'),
        Timeline('filtered', label='... when(is_valid(Purchases))'),
    ],
    data=data_windowed_aggregate.dataframe,
)
show(plot2)
save(plot2, filename='data_windows_2.html', resources=CDN, title='Data Defined Windows')
export_svg(plot2, filename='data_windows_2.svg')

# Plot 3: the `filtered` column and the `result` column.
plot3 = plot_timelines(
    [
        Timeline('filtered', label='PageViews between purchases'),
        Timeline('result', label='Average PageViews between purchases', continuous=True),
    ],
    data=data_windowed_aggregate.dataframe,
)
show(plot3)
save(plot3, filename='data_windows_3.html', resources=CDN, title='Data Defined Windows')
export_svg(plot3, filename='data_windows_3.svg')

# 4 - Temporal Join
What is the average product review (score) at time of purchase?


In [None]:
%%fenl --var=reviews_by_item
let reviews_by_item = Ratings | with_key($input.item)
in { score: reviews_by_item.score,
     average_score: reviews_by_item.score | mean() }

In [None]:
from kaskada import view

# Drop any existing ReviewsByItem view so this cell can be re-run.
try:
    view.delete_view('ReviewsByItem')
except Exception:
    # Best effort: the delete is presumably expected to fail when the view does
    # not exist yet (first run). The exception type raised by the client is not
    # visible here, so Exception is the narrowest safe catch; unlike a bare
    # `except:` it lets KeyboardInterrupt through.
    pass

# Register the expression of the `reviews_by_item` query as the ReviewsByItem
# view, which the temporal-join query below refers to by name.
view.create_view(
    view_name='ReviewsByItem',
    expression=reviews_by_item.expression,
)

In [None]:
%%fenl --var=temporal_join
{
  item: Purchases.item,
  average_review: lookup(Purchases.item, ReviewsByItem.average_score)
}

In [None]:
# Two plots for the temporal join example. Each one is shown inline, saved as
# HTML (for the blog posts) and exported as SVG (for conversion to PNG in slides).

# Plot 1: the `score` and `average_score` columns of the reviews-by-item query.
plot1 = plot_timelines(
    [
        Timeline(
            'score',
            data=reviews_by_item.dataframe,
            label='ReviewsByItem.score',
        ),
        Timeline(
            'average_score',
            data=reviews_by_item.dataframe,
            label='ReviewsByItem.average_score',
            continuous=True,
        ),
    ],
    data=temporal_join.dataframe,
)
show(plot1)
save(plot1, filename='temporal_join_1.html', resources=CDN, title='Temporal Join')
export_svg(plot1, filename='temporal_join_1.svg')

# Plot 2: `average_score` next to the purchased item and the looked-up review.
plot2 = plot_timelines(
    [
        Timeline(
            'average_score',
            data=reviews_by_item.dataframe,
            label='ReviewsByItem.average_score',
            continuous=True,
        ),
        Timeline('item', label='Purchases.item'),
        Timeline(
            'average_review',
            label='lookup(Purchases.item, ReviewsByItem.average_score)',
        ),
    ],
    data=temporal_join.dataframe,
)
show(plot2)
save(plot2, filename='temporal_join_2.html', resources=CDN, title='Temporal Join')
export_svg(plot2, filename='temporal_join_2.svg')