In [1]:
import os

import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import polars.selectors as cs
import tensorflow as tf
from plotly.subplots import make_subplots

# Show the TensorFlow version so the environment that produced the outputs
# below is recorded in the notebook.
print("TensorFlow version:", tf.version.VERSION)


# Enable plotly rendering inside the notebook and make "plotly_dark" the
# default template for every figure created below.
plotly.offline.init_notebook_mode(connected=True)
plotly.io.templates.default = "plotly_dark"

2024-10-18 15:39:31.782930: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.16.2


In [2]:
# TODO remove

# # reload a module (top-level only)
# import importlib
# import test

# importlib.reload(test)
# test.func() # updated

In [3]:
# Absolute path of the "input" directory, resolved against the working
# directory at the time this cell runs; all CSV files are read from here.
INPUT_DIR = os.path.abspath("input")  # directory with input data

# Number of trailing time-steps that form the target; the same constant is used
# further down to cut the target rows off the end of the data.
TARGET_STEPS = 16  # length of the target sequences

# Read and prepare the data

In [36]:
# NOTE We ignore the 'holidays_events.csv' dataframe

In [54]:
# NOTE The training dataframe contains 1684 unique dates
# NOTE from 2013-01-01 to 2017-08-31, missing all four Dec. 25.

# Load the training data.
df = pl.read_csv(os.path.join(INPUT_DIR, "train.csv"), try_parse_dates=True)
df = df.drop("id")  # the id does nothing for us

# Read the target dataframe and add the missing 'sales' column.
# The all-null column is created with the dtype of the training 'sales' column,
# so both frames have identical schemas and the vertical concat below does not
# depend on how a Null-typed column is reconciled with a float column.
target = (
    pl.read_csv(os.path.join(INPUT_DIR, "test.csv"), try_parse_dates=True)
    .with_columns(sales=pl.lit(None, dtype=df.schema["sales"]))
    .select(df.columns)  # reorder the columns to match the training dataframe
)

# Append the target rows below the training rows.
df = pl.concat([df, target])

# Encode the 'family' as integers in an integer column 'family_nbr'.
# The Enum categories are the families in order of first appearance, so the
# integer codes follow that same order (alphabetical, according to the
# original author's note — not verifiable from this notebook alone).
df = df.with_columns(
    pl.col("family")
    .cast(pl.Enum(df["family"].unique(maintain_order=True)))
    .to_physical()  # cast Enum values to u32 (start = 0)
    .alias("family_nbr")
)

## Add shared covariates from input files

In [55]:
# NOTE Add the oil prices to the dataframe
# NOTE We have the oil prices for the target time-steps

# Daily oil prices: read the file, insert a row for every missing calendar day,
# then fill the gaps by interpolation; whatever is still null afterwards is
# filled backwards from the next available value.
oil = (
    pl.read_csv(os.path.join(INPUT_DIR, "oil.csv"), try_parse_dates=True)
    .set_sorted("date")
    .upsample("date", every="1d")  # add missing dates
    .interpolate()
    .fill_null(strategy="backward")  # fill missing values
)

# Attach the (shared) oil price to every row of the main dataframe by date.
df = df.join(oil, on="date")

In [56]:
# NOTE Include the store type and the cluster from 'stores.csv'

# Raw store metadata; only the store number, type and cluster are used.
stores_raw = pl.read_csv(os.path.join(INPUT_DIR, "stores.csv"))

# Store types in order of first appearance; they define the integer codes.
store_types = stores_raw["type"].unique(maintain_order=True)

stores = stores_raw.select(
    "store_nbr",
    # encode the store type as integers (physical representation of the Enum)
    pl.col("type").cast(pl.Enum(store_types)).to_physical().alias("store_type"),
    pl.col("cluster").alias("store_cluster"),
)

# Attach the store metadata to every row of the main dataframe.
df = df.join(stores, on="store_nbr")

In [57]:
# NOTE Include the daily transactions per store from 'transactions.csv'
# NOTE This file is missing about 250k values (of about 3mil), which we will need to fill.
# NOTE It is questionable whether a column with this many missing values should be included.


# add the 'transactions' column to the dataframe
# A left join keeps exactly the rows already present in `df`: (date, store)
# pairs that only exist in 'transactions.csv' can no longer add rows whose
# remaining columns (family, sales, ...) would all be null.
transactions = pl.read_csv(
    os.path.join(INPUT_DIR, "transactions.csv"), try_parse_dates=True
)
df = df.join(transactions, on=["date", "store_nbr"], how="left")

# fill missing values by interpolation (when possible) or backwards (not filling the target)
# The fill is windowed per (store_nbr, family_nbr) key so that a gap is filled
# from the same store's neighbouring days. Without the window the fill runs down
# the whole column and, with rows ordered by date and then store, mixes in the
# values of other stores on the same date.
# Assumes the rows of each key appear in ascending date order — TODO confirm.
df = df.with_columns(
    pl.col("transactions")
    .interpolate()
    .backward_fill()
    .over("store_nbr", "family_nbr")
)

## Time signals

In [62]:
# NOTE We compute the DFT of the total sales to get an idea of what time signals are important.
# NOTE We do not consider periods greater than one year, because the dataset covers only a small number of years.

In [63]:
# aggregate the 'sales' values to obtain the total sales per day
# (one row per date, in order of first appearance); the last TARGET_STEPS
# dates are dropped because they belong to the target and carry no sales.
total_sales = (
    df.select("date", "sales")
    .group_by("date", maintain_order=True)
    .sum()
    .head(-TARGET_STEPS)
)

# compute the DFT of the sales and the corresponding frequencies
# Bin k of a length-N real FFT corresponds to k / N cycles per sample; with one
# sample per day, multiplying by 365.25 expresses it in cycles per year.
dft_df = (
    pl.DataFrame({"DFT": tf.abs(tf.signal.rfft(total_sales["sales"])).numpy()})
    .with_row_index()  # add index column
    .with_columns(  # scale the index to have 1 = one cycle per year
        ((pl.col("index") * 365.25) / len(total_sales)).alias("freq")
    )
    # drop frequencies below one cycle per year, i.e. periods longer than a year
    .filter(pl.col("freq") >= 1.0)
)

# The x axis shows a frequency (cycles per year), not an inverse frequency:
# e.g. a weekly pattern shows up near 52.
px.line(dft_df, x="freq", y="DFT").update_layout(
    height=350,
    xaxis=dict(title="frequency (cycles per year)"),
    yaxis=dict(visible=False),
    title="Discrete Fourier Transform of total sales",
)

In [64]:
# NOTE We see four spikes, in order of magnitude:
# NOTE - 1/52 -> period of one week
# NOTE - 1/104 -> period of a half-week
# NOTE - 1/24 -> period of one month
# NOTE - 1 -> period of one year

In [65]:
# Total sales grouped by calendar position: once by day of the week and once by
# day of the month. The group key keeps the name "date", so each result is
# sorted (and later plotted) by that column.
weekday_key = pl.col("date").dt.weekday()
monthday_key = pl.col("date").dt.day()
wd_sales, md_sales = (
    total_sales.group_by(key).sum().sort(by="date")
    for key in (weekday_key, monthday_key)
)

# One bar chart per grouping, side by side.
fig = make_subplots(cols=2, subplot_titles=["day of week", "day of month"])
for col_idx, grouped in enumerate((wd_sales, md_sales), start=1):
    fig.add_trace(go.Bar(x=grouped["date"], y=grouped["sales"]), row=1, col=col_idx)

fig.update_layout(
    height=350,
    showlegend=False,
    title="Total sales per day of week and day of month",
)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig  # last expression of the cell: display the figure

In [66]:
# NOTE We can see that the half-week element stems from the fact that the weekend differs
# NOTE from the rest of the week, giving a two-days block which repeats weekly.

# NOTE We add signals corresponding to the frequencies identified above, that is:
# NOTE half-week, week, month, year

# Period (in days) of each sinusoidal time signal, keyed by the column suffix:
# half-week, week, month, year.
periods = {"hwk": 3.5, "wk": 7, "mth": 30.5, "yr": 365.25}

# Millisecond timestamp of the date divided by the milliseconds per day, i.e.
# the date expressed in days; stored in the temporary column 'ts'.
days = pl.col("date").dt.timestamp(time_unit="ms") / (24 * 60 * 60 * 1000)

# One sine/cosine pair per period, in the order sin_<tag>, cos_<tag>.
signals = []
for tag, period in periods.items():
    phase = 2 * np.pi * pl.col("ts") / period
    signals.append(phase.sin().alias(f"sin_{tag}"))
    signals.append(phase.cos().alias(f"cos_{tag}"))

df = df.with_columns(ts=days).with_columns(signals).drop("ts")

## Summary

In [67]:
# NOTE We reorder the columns, choosing to have the 'sales' first,
# NOTE followed by the 'transactions', as these two columns have no values on the target.

# Final column layout; columns that are not listed here are dropped.
column_order = [
    "date",
    "sales",  # target variable
    "transactions",  # no target values
    "onpromotion",
    cs.contains("nbr"),  # key-defining variables
    "store_type",
    "store_cluster",
    "dcoilwtico",  # oil prices (shared)
    cs.contains("sin", "cos"),  # time signals (shared)
]

df = df.select(column_order)

In [69]:
# NOTE Display the schema to check the column names and their dtypes.
# NOTE The integer columns have different integer dtypes (see the output);
# NOTE presumably this does not matter downstream, since the numeric data is
# NOTE cast to float32 later in the notebook — confirm.

df.schema

Schema([('date', Date),
        ('sales', Float64),
        ('transactions', Float64),
        ('onpromotion', Int64),
        ('store_nbr', Int64),
        ('family_nbr', UInt32),
        ('store_type', UInt32),
        ('store_cluster', Int64),
        ('dcoilwtico', Float64),
        ('sin_hwk', Float64),
        ('cos_hwk', Float64),
        ('sin_wk', Float64),
        ('cos_wk', Float64),
        ('sin_mth', Float64),
        ('cos_mth', Float64),
        ('sin_yr', Float64),
        ('cos_yr', Float64)])

---

In [None]:
# TODO Put this into class

In [81]:
# Names of all numeric columns of `df`, in dataframe order (this excludes the
# non-numeric 'date' column).
features = df.select(cs.numeric()).columns

In [82]:
# NOTE partition the numeric columns of the dataframe into 'key dataframes'
# NOTE which each contain the data of a single key (= (store_nbr, family_nbr) pair)
# NOTE `maintain_order` ensures that the list follows the order of the original dataframe
# NOTE i.e. by key = (store_nbr, family_nbr) in ascending order
# NOTE NOTE The 'store_nbr' in the original dataframes are ordered wrong,
# NOTE NOTE they are ordered by first digit, i.e. 1, 10, 11, ..., 9

# Keep only the numeric columns, then split the frame into one dataframe per
# (store_nbr, family_nbr) key. The key columns stay in each part and the parts
# are returned in order of first appearance of their key.
numeric_df = df.select(cs.numeric())
kdfs = numeric_df.partition_by(
    "store_nbr", "family_nbr", maintain_order=True, include_key=True
)

In [83]:
# NOTE The original dataframe is missing entries for Dec. 25 (4 rows per key)
# NOTE This indicates that the stores are likely closed on this date.
# NOTE We choose to ignore these as it is unlikely to have impact,
# NOTE as our target does not contain this date (target is in August).
# NOTE We could also add the date and interpolate to smoothen the training data.

In [84]:
# Convert every key dataframe to a float32 tensor, then stack them along a new
# leading axis, giving axes = (key, time, feature).
key_tensors = [tf.constant(kdf, dtype=tf.float32) for kdf in kdfs]
data = tf.stack(key_tensors, axis=0)

data.shape  # print shape

TensorShape([1782, 1700, 16])

---

In [4]:
# NOTE pivot to have columns for each key and a single row per time-step

# Columns that identify a key; they are both pivoted on and carried as values.
key_cols = ["store_nbr", "family_nbr"]

# NOTE Pivot so that every key gets its own set of columns and every
# NOTE time-step is a single row.
pdf = df.pivot(
    on=key_cols,
    index="date",
    values=["sales", "onpromotion", *key_cols],
)

# NOTE The training dataframe has no entries for Dec. 25 (every year),
# NOTE which likely means that the stores are closed on that date.
# NOTE The missing rows are inserted and their values are filled by
# NOTE interpolation rather than set to zero: this keeps the training data
# NOTE smoother, which should help training (albeit very slightly), and it
# NOTE cannot cause issues on the target, which is in August.
pdf = pdf.set_sorted("date").upsample("date", every="1d").interpolate()

# TODO add shared covariates (metadata)

# NOTE Normalize the 'store_nbr' and 'family_nbr' columns.
# NOTE All values of these columns are known, so the statistics are taken
# NOTE directly from the long-format dataframe.
key_stats = {col: (df[col].mean(), df[col].std()) for col in key_cols}
pdf = pdf.with_columns(
    (cs.contains(col) - mu) / sigma for col, (mu, sigma) in key_stats.items()
)

# cast all numeric columns to float32 in preparation for the models
pdf = pdf.with_columns(cs.numeric().cast(pl.Float32))

In [250]:
class DataContainer:
    """Chronological train / valid / test / target split with standardization.

    The last ``TARGET_STEPS`` rows of ``pdf`` form the target. The remaining
    rows are cut, in row order, into a training, a validation and a test set.
    Every column whose name contains "sales" or "onpromotion" is then
    standardized in all four frames with the mean and standard deviation
    computed on the training set only.

    Args:
        pdf: Dataframe with one row per time-step, in chronological order.
        split: Fractions ``(a, b)`` of the non-target rows at which the
            validation set and the test set start.

    Raises:
        ValueError: If ``split`` does not satisfy ``0 <= a <= b <= 1``.

    Attributes:
        train, valid, test, target: The scaled dataframes.
        mean: One-row dataframe with the training mean of each scaled column.
        std: One-row dataframe with the training standard deviation of each
            scaled column; values below 0.1 are replaced by 1.0.
    """

    def __init__(self, pdf: pl.DataFrame, split: tuple[float, float] = (0.7, 0.9)):
        lower, upper = split
        if not 0.0 <= lower <= upper <= 1.0:
            raise ValueError(
                f"split must satisfy 0 <= split[0] <= split[1] <= 1, got {split!r}"
            )

        # number of rows excluding the target
        steps = max(pdf.height - TARGET_STEPS, 0)
        # row indices at which the validation and test sets start
        train_end, valid_end = (int(steps * frac) for frac in (lower, upper))

        # Explicit (offset, length) slices are used instead of head(-n)/tail(-n):
        # those treat n == 0 as "take zero rows" rather than "drop zero rows".
        self.train = pdf.slice(0, train_end)
        self.valid = pdf.slice(train_end, valid_end - train_end)
        self.test = pdf.slice(valid_end, steps - valid_end)
        self.target = pdf.tail(TARGET_STEPS)

        # compute the mean and standard deviation on the training set
        scaled_cols = cs.contains("sales", "onpromotion")
        self.mean = self.train.select(scaled_cols).mean()
        self.std = self.train.select(scaled_cols).std()

        # replace small values of std by 1.0 to avoid division by values close to zero
        self.std = self.std.with_columns(
            pl.when(pl.col(col) < 0.1).then(1.0).otherwise(pl.col(col)).alias(col)
            for col in self.std.columns
        )

        # Pull the statistics out as plain Python scalars so the scaling does not
        # rely on broadcasting length-1 Series, and build the scaling expressions
        # once instead of once per dataframe.
        mean = self.mean.row(0, named=True)
        std = self.std.row(0, named=True)
        scaling = [(pl.col(col) - mean[col]) / std[col] for col in self.mean.columns]

        # scale all dataframes using the training mean and std ('sales' and 'onpromotion' only)
        (self.train, self.valid, self.test, self.target) = (
            part.with_columns(scaling)
            for part in (self.train, self.valid, self.test, self.target)
        )


# Build the scaled train/valid/test/target splits from the pivoted dataframe.
# NOTE(review): this rebinds `data`, which an earlier cell uses for the stacked
# (key, time, feature) tensor — confirm that the shadowing is intended.
data = DataContainer(pdf)