In [1]:
import pandas as pd
import altair as alt
from IPython.display import display

In [2]:
# Path to the full dataset.
sales_path = "./data/m5-forecasting-accuracy/sales-joined.parq.brotli"
# if your system has less RAM, the alternate download is just one year of data
# sales_path = "./data/m5-forecasting-accuracy/sales-joined-2015.parq.brotli"
sales_joined = pd.read_parquet(sales_path)

In [3]:
# fyi resource usage
# memory_usage() is called without deep=True, so this is the shallow per-column total.
bytes_in_use = sales_joined.memory_usage().sum()
rows_in_millions = len(sales_joined) / 1e6
mem_gb = bytes_in_use / 1024 ** 3
print(f"{rows_in_millions:.1f} million rows using {mem_gb:.2f} GB of RAM")

59.2 million rows using 6.17 GB of RAM


## Bar Charts

In [4]:
# Sum the "count" column per store, then turn the result back into a flat
# two-column frame (store_id, count) that Altair can consume directly.
store_totals = sales_joined.groupby("store_id")["count"].sum()
total_sales_by_store = store_totals.reset_index()
total_sales_by_store

Unnamed: 0,store_id,count
0,CA_1,7832248
1,CA_2,5818395
2,CA_3,11363540
3,CA_4,4182534
4,TX_1,5692823
5,TX_2,7329642
6,TX_3,6205940
7,WI_1,5261506
8,WI_2,6697988
9,WI_3,6542557


In [5]:
# Minimal bar chart: one bar per store, bar height = summed "count".
alt.Chart(total_sales_by_store).mark_bar().encode(
    alt.X('store_id'),
    alt.Y('count'),
)

In [6]:
# Start from a titled chart bound to the per-store totals...
store_sales_base = alt.Chart(
    total_sales_by_store,
    title="Total Sales (by store)",
)
# ...then pick the mark, the encodings and the size.
bar_chart = store_sales_base.mark_bar(
    tooltip=True,  # altair supports interactivity
).encode(
    x=alt.X('store_id', title='Store ID'),  # label the X axis
    y=alt.Y('count', title='Total Number of Items Sold'),  # label the Y axis
).properties(
    height=150,  # size / dimensions are set through properties()
    width=360,
)
bar_chart # the chart must be the result of the last line or it will not display

In [8]:
# the real magic of Altair is the interactivity;
# the default interaction added by .interactive() is panning & zooming
interactive_bar_chart = bar_chart.interactive()
# IPython.display.display forces display of a chart that is not the cell's return value
display(interactive_bar_chart);

In [9]:
# note: bar charts are great for plotting frequency too
# One row per distinct (item, department, category) combination.
item_columns = ["item_id", "dept_id", "cat_id"]
item_info = sales_joined[item_columns].drop_duplicates()
# Count how many of those rows fall in each category.
items_per_category = item_info["cat_id"].value_counts()
category_counts = items_per_category.reset_index()
category_counts.columns = ["category", "count"]

display(category_counts)
category_chart = alt.Chart(category_counts, title="Items per Category")
# putting the category on y and the count on x gives a horizontal bar chart
category_chart.mark_bar(tooltip=True).encode(
    x=alt.X("count"),
    y=alt.Y("category"),
).properties(
    height=150,
    width=360,
)


## Histograms

In [None]:
# simple histogram call...
# (passing the full frame is deliberate: it demonstrates Altair's row-limit error)
full_price_chart = alt.Chart(sales_joined, title="Distribution of sell_price")
full_price_chart.mark_bar().encode(
    x=alt.X("sell_price:Q", bin=True),
    y=alt.Y('count()'),
)

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.Chart(...)

In [None]:
# For large datasets, you can lift the row limit with:
#   alt.data_transformers.disable_max_rows()
# See the docs here: https://altair-viz.github.io/user_guide/large_datasets.html
# Basically, the whole dataset is embedded in the visualization,
#  which gives you great interactivity
#  but can be inefficient.
# We'll try to avoid that here.

In [None]:
# histogram call: select only the data you want, and sample it
# - 5_000 rows stays within Altair's default row limit (see the MaxRowsError above)
# - a fixed random_state makes the sample, and therefore the plot,
#   identical on every "Restart & Run All"
hist = alt.Chart(
    sales_joined[['sell_price']].sample(5_000, random_state=42), title="Distribution of sell_price",
).mark_bar(tooltip=True).encode(
    alt.X("sell_price:Q", bin=True),  # bin=True lets Altair choose the bins
    y='count()',  # bar height = number of sampled rows in each bin
)
hist

In [None]:
# increase the number of bins for a more fine-grained plot
# (a fixed random_state keeps the sampled rows, and so the plot, reproducible on re-run)
alt.Chart(
    sales_joined[['sell_price']].sample(5_000, random_state=42), title="Distribution of sell_price",
).mark_bar(tooltip=True).encode(
    alt.X("sell_price:Q", bin=alt.Bin(maxbins=100)),  # allow up to 100 bins
    y='count()',
)

## Scatterplots

In [None]:
# Total "count" per (store, month).
monthly_totals = sales_joined.groupby(["store_id", "month"])["count"].sum().reset_index()

# Parse the "month" key (formatted as %Y%m) into a timestamp and swap it in for the raw column.
month_start = pd.to_datetime(monthly_totals["month"].astype(str), format="%Y%m")
monthly_totals = monthly_totals.drop(columns=["month"])
monthly_totals["month_date"] = month_start
# Timestamp of the preceding month; this is the join key for the self-merge below.
monthly_totals["month_prev"] = month_start - pd.DateOffset(months=1)

# Self-join: pair each (store, month) row with the same store's previous-month row.
# Columns coming from the right-hand side get the "_prev" suffix.
sales_by_store_by_month = monthly_totals.merge(
    monthly_totals,
    left_on=["store_id", "month_prev"],
    right_on=["store_id", "month_date"],
    suffixes=('', '_prev'),
)

In [None]:
display(sales_by_store_by_month.head())

# Scatter of this month's count against the previous month's, one colour per store.
scatter_base = alt.Chart(sales_by_store_by_month, title="N Sales (curr vs prev month)")
scatter_base.mark_circle(size=60).encode(
    alt.X('count_prev'),
    alt.Y('count'),
    alt.Color('store_id'),
    tooltip=['store_id', 'count', 'count_prev', 'month_date'],
).interactive()

Unnamed: 0,store_id,count,month_date,month_prev,count_prev,month_date_prev,month_prev_prev
0,CA_1,89434,2011-02-01,2011-01-01,11308,2011-01-01,2010-12-01
1,CA_1,93468,2011-03-01,2011-02-01,89434,2011-02-01,2011-01-01
2,CA_1,91553,2011-04-01,2011-03-01,93468,2011-03-01,2011-02-01
3,CA_1,87988,2011-05-01,2011-04-01,91553,2011-04-01,2011-03-01
4,CA_1,90376,2011-06-01,2011-05-01,87988,2011-05-01,2011-04-01


## Timeseries Plot

In [None]:
# One line per store: month on the x axis, summed "count" on the y axis.
timeseries_base = alt.Chart(sales_by_store_by_month, title="N Sales over time")
timeseries_chart = timeseries_base.mark_line(tooltip=True).encode(
    alt.X('month_date'),
    alt.Y('count'),
    alt.Color('store_id'),
)
timeseries_chart

## Multiple Charts

In [None]:
# "display" renders each object it is given, so two charts produce two outputs
display(bar_chart, hist)

In [None]:
# horizontal concat (alt.hconcat is the function form of `bar_chart | hist`)
side_by_side = alt.hconcat(bar_chart, hist)
display(side_by_side)

In [None]:
# vertical concat (alt.vconcat is the function form of `bar_chart & hist`)
stacked = alt.vconcat(bar_chart, hist)
display(stacked)

## See Also
 - [Altair Examples Gallery](https://altair-viz.github.io/gallery/index.html)