# Setup

- Install Pandas, Bokeh (for visualizations), Kaskada, and Selenium (needed by Bokeh's SVG export).
- Configure Bokeh to render visualizations inline.
- Configure auto-reloading.
- Load timeline visualization function.
- Initialize the Kaskada session.

In [None]:
%pip install pandas bokeh kaskada selenium

In [None]:
# Configure reloading of loaded modules (useful for the visualization function)
%reload_ext autoreload
%autoreload 2

In [None]:
# Import Pandas
import pandas as pd

# Configure bokeh for local output.
from bokeh.io import output_notebook, show, export_svg, save
from bokeh.resources import CDN
output_notebook()

# Import visualization helpers
from visualize import *

In [None]:
# Create the Kaskada Session
from kaskada.api.session import LocalBuilder

# Work around https://github.com/kaskada-ai/kaskada/issues/267
# Clear the `session` name and force a garbage-collection pass before building
# a new session, so re-running this cell does not keep an earlier session
# object referenced.
# NOTE(review): presumably the previous local session has to be released
# before a new one can be started -- confirm against the linked issue.
session = None
import gc
gc.collect()

# Build a session with LocalBuilder, without any extra configuration.
session = LocalBuilder().build()

In [None]:
%reload_ext fenlmagic

# Sample Data

In [None]:
# Sample purchase events, one row per purchase.
# `time` is kept as an ISO date string.
Purchases = pd.DataFrame(
    data=[
        ('2022-11-02', 'Ben', 11, 'widget'),
        ('2022-11-15', 'Davor', 3, 'widget'),
        ('2022-11-27', 'Davor', 4, 'gear'),
        ('2022-12-12', 'Ben', 18, 'gear'),
        ('2023-01-01', 'Ben', 12, 'widget'),
        ('2023-01-02', 'Ben', 13, 'widget'),
        ('2023-01-09', 'Davor', 6, 'widget'),
    ],
    columns=['time', 'user', 'amount', 'item'],
)
# Display the frame as the cell output.
Purchases

In [None]:
# Sample page-view events, one row per view.
# `time` is kept as an ISO date string.
PageViews = pd.DataFrame(
    data=[
        ('2022-11-18', 'Davor', 'widget'),
        ('2022-11-21', 'Davor', 'gear'),
        ('2022-11-24', 'Davor', 'widget'),
        ('2022-12-02', 'Ben', 'widget'),
        ('2022-12-06', 'Davor', 'widget'),
        ('2022-12-10', 'Davor', 'gear'),
        ('2022-12-20', 'Ben', 'widget'),
        ('2022-12-25', 'Ben', 'gear'),
        ('2022-12-30', 'Ben', 'widget'),
        ('2022-12-30', 'Davor', 'gear'),
    ],
    columns=['time', 'user', 'item'],
)
# Display the frame as the cell output.
PageViews

In [None]:
# Sample rating events, one row per rating a user gave an item.
# `time` is kept as an ISO date string.
Ratings = pd.DataFrame(
    data=[
        ('2022-11-25', 'Ben', 'gear', 3),
        ('2022-11-28', 'Ben', 'gear', 2),
        ('2022-12-27', 'Davor', 'gear', 5),
        ('2022-12-30', 'Joe', 'widget', 1),
        ('2023-01-25', 'Joe', 'gear', 5),
    ],
    columns=['time', 'user', 'item', 'score'],
)
# Display the frame as the cell output.
Ratings

In [None]:
# (Re)create the Kaskada tables for the three sample data sets and load them.
#
# `view` is imported here as well as `table`: this cell calls
# `view.delete_view` below, and without the import the call raised a NameError
# on a fresh kernel that the surrounding `try` silently swallowed, so the view
# was never actually deleted.
from kaskada import table, view

# Best-effort cleanup of the view defined later in this notebook.
# NOTE(review): presumably the view must be removed before the tables it
# queries can be deleted -- confirm against the Kaskada docs.
try:
    view.delete_view('ReviewsByItem')
except Exception:
    # Deliberately ignored so the cell can be re-run from any state
    # (e.g. when the view has not been created yet).
    pass

# --- Purchases ---------------------------------------------------------------
# Drop any table left over from a previous run, then create and load it.
try:
    table.delete_table('Purchases')
except Exception:
    pass

table.create_table(
    table_name = 'Purchases',
    time_column_name = 'time',
    entity_key_column_name = 'user',
    grouping_id = 'user',
)

table.load_dataframe(
    table_name = 'Purchases',
    dataframe = Purchases,
)

# --- PageViews ---------------------------------------------------------------
try:
    table.delete_table('PageViews')
except Exception:
    pass

table.create_table(
    table_name = 'PageViews',
    time_column_name = 'time',
    entity_key_column_name = 'user',
    grouping_id = 'user',
)

table.load_dataframe(
    table_name = 'PageViews',
    dataframe = PageViews,
)

# --- Ratings -----------------------------------------------------------------
try:
    table.delete_table('Ratings')
except Exception:
    pass

table.create_table(
    table_name = 'Ratings',
    time_column_name = 'time',
    entity_key_column_name = 'user',
    grouping_id = 'user',
)

table.load_dataframe(
    table_name = 'Ratings',
    dataframe = Ratings,
)

# 1 - Aggregation
How much did each user’s spending change over time?

In [None]:
%%fenl --var=aggregate
# Each purchase amount alongside the sum of the purchase amounts.
# The query result is stored in the Python variable `aggregate` (--var).
{ 
    amount: Purchases.amount,
    sum_amount: sum(Purchases.amount)
}

In [None]:
# Plot the individual purchase amounts together with their sum, which is
# drawn as a continuous timeline.
plot = plot_timelines(
    data=aggregate.dataframe,
    timelines=[
        Timeline('amount', label='Purchase amounts'),
        Timeline('sum_amount', label='Sum of purchase amounts', continuous=True),
    ],
)

# Render the figure inline.
show(plot)

# Save the plot as HTML (for the blog posts)
# save(plot, filename = 'aggregation.html', resources = CDN, title = 'Aggregation')

# Export the figure as SVG (for conversion to PNG in slides).
export_svg(plot, filename='aggregation.svg')

In [None]:
# Build two separate figures over the full result (all entities): one with
# the discrete purchase amounts, one with the continuous sum.
discrete_plot = plot_timelines(
    data=aggregate.dataframe,
    timelines=[Timeline('amount', label='Purchase amounts')],
)

continuous_plot = plot_timelines(
    data=aggregate.dataframe,
    timelines=[
        Timeline('sum_amount', label='Sum of purchase amounts', continuous=True),
    ],
)

# Show each figure, then export it as SVG (for conversion to PNG in slides).
show(discrete_plot)
export_svg(discrete_plot, filename='discrete.svg')
show(continuous_plot)
export_svg(continuous_plot, filename='continuous.svg')

In [None]:
# Same discrete / continuous pair of figures, restricted to the rows whose
# `_key` is 'Ben'.
df = aggregate.dataframe
just_ben = df.loc[df['_key'] == 'Ben']

discrete_plot = plot_timelines(
    data=just_ben,
    timelines=[Timeline('amount', label='Purchase amounts')],
)

continuous_plot = plot_timelines(
    data=just_ben,
    timelines=[
        Timeline('sum_amount', label='Sum of purchase amounts', continuous=True),
    ],
)

# Show each figure, then export it as SVG (for conversion to PNG in slides).
show(discrete_plot)
export_svg(discrete_plot, filename='discrete_single.svg')
show(continuous_plot)
export_svg(continuous_plot, filename='continuous_single.svg')

In [None]:
# Reshape the aggregation result into a (user, time, value) history table,
# parsing the `_time` column into datetimes.
df = aggregate.dataframe
history = pd.DataFrame(
    {
        'user': df['_key'],
        'time': pd.to_datetime(df['_time']),
        'value': df['sum_amount'],
    }
)
# Display the frame as the cell output.
history

# History
```
{ user: 'Ben',   time: '2022-11-02', value: 11 }
{ user: 'Davor', time: '2022-11-15', value: 3 }
{ user: 'Davor', time: '2022-11-27', value: 7 }
{ user: 'Ben',   time: '2022-12-12', value: 29 }
{ user: 'Ben',   time: '2023-01-01', value: 41 }
{ user: 'Ben',   time: '2023-01-02', value: 54 }
{ user: 'Davor', time: '2023-01-09', value: 13 }
```

# 2 - Windowed Aggregation
How much does each user’s spending change within each month?

In [None]:
%%fenl --var=windowed_aggregate
# Each purchase amount alongside the sum of purchase amounts since the start
# of the month. The result is stored in `windowed_aggregate` (--var).
{ 
    amount: Purchases.amount,
    # Sum over a window that restarts with `monthly()`.
    sum_amount: sum(Purchases.amount, window=since(monthly()))
        # Hack to work-around https://github.com/kaskada-ai/kaskada/issues/297
        # NOTE(review): the condition below is false whenever `monthly()` is
        # true, so this presumably suppresses the value produced at the month
        # boundary itself -- confirm against the linked issue.
        | if({ tick: monthly(), input: is_valid($input)} | not($input.tick | else(false)))
}

In [None]:
# TODO: Add vertical lines for the month boundaries?
# Plot the purchase amounts together with the month-to-date sum, which is
# drawn as a continuous timeline.
plot = plot_timelines(
    timelines=[
        Timeline('amount', label='Purchase amounts'),
        Timeline(
            'sum_amount',
            label='Sum of purchase amounts since start of month',
            continuous=True,
        ),
    ],
    data=windowed_aggregate.dataframe,
)

# Render the figure inline.
show(plot)

# Save the plot as HTML (for the blog posts)
# save(plot, filename = 'windowed.html', resources = CDN, title = 'Windowed Aggregation')

# Export the figure as SVG (for conversion to PNG in slides).
export_svg(plot, filename='windowed.svg')

# 3 - Data-Defined Windowed Aggregation
For each user, what is the average number of page views between purchases?

In [None]:
%%fenl --var=page_views_since_purchase
# Count of page views since the last purchase, both raw and with the
# work-around applied. The result is stored in `page_views_since_purchase`.
# Raw count: the window restarts whenever a purchase is present.
let page_views_since_purchase_raw = count(PageViews, window=since(is_valid(Purchases)))
let page_views_since_purchase = page_views_since_purchase_raw 
      # Hack to work-around https://github.com/kaskada-ai/kaskada/issues/297
      # NOTE(review): the condition is false at the times a purchase is
      # present, and the trailing `else(0)` substitutes 0 for missing values,
      # so this presumably makes the count read 0 at each purchase -- confirm.
      | if({ predicate: is_valid(Purchases), input: is_valid($input)} | not($input.predicate | else(false)))
      | else(0)
in {
  page_views_since_purchase,
  raw: page_views_since_purchase_raw
}

In [None]:
%%fenl --var=data_windowed_aggregate
# Count of page views between purchases and the mean of those counts.
# The result is stored in `data_windowed_aggregate` (--var).
# The count uses a window restarting at each purchase, and `when(...)` keeps
# it only at the times a purchase is present.
let page_views_since_purchase_raw = count(PageViews, window=since(is_valid(Purchases)))
  | when(is_valid(Purchases))
in {
  filtered: page_views_since_purchase_raw,
  result:  page_views_since_purchase_raw | mean()
}

In [None]:
%%fenl --var=lag_aggregate
# Alternative formulation using `lag`: take the overall page-view count at
# each purchase and subtract its previous value (lag of 1).
# NOTE(review): `lag_aggregate` is not used by any plot visible in this
# notebook -- confirm whether it is still needed.
let page_views_at_purchase = count(PageViews) | when(is_valid(Purchases))
in {
    page_views_since_last_purchase: page_views_at_purchase - lag(1, page_views_at_purchase)
}

In [None]:
%%fenl --var=purchases
# Query the Purchases table as-is; the result (`purchases`) is plotted below.
Purchases

In [None]:
%%fenl --var=page_views
# Query the PageViews table as-is; the result (`page_views`) is plotted below.
PageViews

In [None]:
# Figure 1: the raw page views and purchases, plus the count of page views
# since the last purchase (continuous).
plot1 = plot_timelines(
    [
        Timeline('item', label='Page views', data=page_views.dataframe),
        Timeline('item', label='Purchases', data=purchases.dataframe),
        Timeline(
            'page_views_since_purchase',
            label='Count of page views since last purchase',
            data=page_views_since_purchase.dataframe,
            continuous=True,
        ),
    ],
    data=data_windowed_aggregate.dataframe,
)

# Render the figure inline.
show(plot1)

# Save the plot as HTML (for the blog posts)
# save(plot1, filename = 'data_windows_1.html', resources = CDN, title = 'Data Defined Windows')

# Export the figure as SVG (for conversion to PNG in slides).
export_svg(plot1, filename='data_windows_1.svg')

# Figure 2: the continuous count, the purchases, and the count filtered to
# the purchase times.
plot2 = plot_timelines(
    [
        Timeline(
            'page_views_since_purchase',
            data=page_views_since_purchase.dataframe,
            label='Count of page views since last purchase',
            continuous=True,
        ),
        Timeline('item', label='Purchases', data=purchases.dataframe),
        Timeline('filtered', label='Count of page views between purchases'),
    ],
    data=data_windowed_aggregate.dataframe,
)

# Render the figure inline.
show(plot2)

# Save the plot as HTML (for the blog posts)
# save(plot2, filename = 'data_windows_2.html', resources = CDN, title = 'Data Defined Windows')

# Export the figure as SVG (for conversion to PNG in slides).
export_svg(plot2, filename='data_windows_2.svg')

# Figure 3: the filtered counts and their average (continuous).
plot3 = plot_timelines(
    [
        Timeline('filtered', label='Count of page views between purchases'),
        Timeline(
            'result',
            label='Average count of page views between purchases',
            continuous=True,
        ),
    ],
    data=data_windowed_aggregate.dataframe,
)

# Render the figure inline.
show(plot3)

# Save the plot as HTML (for the blog posts)
# save(plot3, filename = 'data_windows_3.html', resources = CDN, title = 'Data Defined Windows')

# Export the figure as SVG (for conversion to PNG in slides).
export_svg(plot3, filename='data_windows_3.svg')

# 4 - Temporal Join
What is the average product review (score) at time of purchase?


In [None]:
%%fenl --var=reviews_by_item
# Re-key the ratings by `item` (instead of `user`) and report each score
# alongside the mean score for that key.
# This query's expression is reused below to define the `ReviewsByItem` view.
let reviews_by_item = Ratings | with_key($input.item)
in { score: reviews_by_item.score,
     average_score: reviews_by_item.score | mean() }

In [None]:
# (Re)create the `ReviewsByItem` view from the `reviews_by_item` query.
from kaskada import view

# Best-effort removal of a view left over from a previous run. Only ordinary
# errors are ignored (e.g. the view not existing yet); KeyboardInterrupt and
# SystemExit are no longer swallowed as they were by the bare `except:`.
try:
    view.delete_view('ReviewsByItem')
except Exception:
    pass

# Define the view using the expression text of the query run above.
view.create_view(
    view_name = 'ReviewsByItem',
    expression = reviews_by_item.expression,
)

In [None]:
%%fenl --var=temporal_join
# For each purchase, the purchased item and the average review score looked
# up from the `ReviewsByItem` view using that item as the key.
{
  item: Purchases.item,
  average_review: lookup(Purchases.item, ReviewsByItem.average_score)
}

In [None]:
# Figure 1: review scores and their average, both taken from the
# `reviews_by_item` result.
# NOTE(review): both timelines pass their own `data`, so the `data` argument
# of plot_timelines is presumably only a default here -- confirm in visualize.
plot1 = plot_timelines([
    Timeline('score', data = reviews_by_item.dataframe, label = 'Review scores by item',
      shift_palette = 2),
    Timeline('average_score', data = reviews_by_item.dataframe, label = 'Average review scores by item', continuous=True,
      shift_palette = 2),
], data = temporal_join.dataframe)

# Show the plot
show(plot1)

# Save the plot as HTML (for the blog posts)
# save(plot1, filename = 'temporal_join_1.html', resources = CDN, title = 'Temporal Join')

# Save the plot as SVG (for conversion to PNG in slides).
export_svg(plot1, filename = 'temporal_join_1.svg')

# Figure 2: average review scores by item, the purchased items, and the
# average review score looked up for each purchase.
plot2 = plot_timelines([
    Timeline('average_score', data = reviews_by_item.dataframe, label = 'Average review scores by item', continuous=True),
    Timeline('item', label='Purchased item'),
    # Label fixed: the original text ended with a stray, unbalanced ')'.
    Timeline('average_review', label = 'Average review score for purchased item'),
], data = temporal_join.dataframe)

# Show the plot
show(plot2)

# Save the plot as HTML (for the blog posts)
# save(plot2, filename = 'temporal_join_2.html', resources = CDN, title = 'Temporal Join')

# Save the plot as SVG (for conversion to PNG in slides).
export_svg(plot2, filename = 'temporal_join_2.svg')