In [2]:
import os
import warnings
from collections.abc import Callable

import keras
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import polars.selectors as cs
import tensorflow as tf
from keras.optimizers import Adam
from plotly.subplots import make_subplots
from tensorflow.keras.losses import MSLE, Loss

# record the TensorFlow version this notebook was executed with
print("TensorFlow version:", tf.version.VERSION)

# initialise plotly's notebook mode and make the dark template the default
plotly.offline.init_notebook_mode(connected=True)  # for nbviewer
plotly.io.templates.default = "plotly_dark"

TensorFlow version: 2.16.2


In [3]:
INPUT_STEPS = 32  # length of input sequences (chosen)
TARGET_STEPS = 16  # length of the target sequences (from competition)
SEED = 42  # seed shared by every source of randomness in this notebook

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# dataset shuffling are repeatable after "Restart Kernel -> Run All".
keras.utils.set_random_seed(SEED)

In [4]:
# load the dataframe produced by the previous notebook (dates are parsed on read)
df = pl.read_csv(os.path.join("input", "df.csv"), try_parse_dates=True)

# turn the key columns into `pl.Categorical`; `store_nbr` and `store_cluster` are
# cast to `pl.String` first (any warning raised by the casts is silenced)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df = df.with_columns(
        pl.col("store_nbr", "store_cluster").cast(pl.String).cast(pl.Categorical),
        pl.col("family", "store_type").cast(pl.Categorical),
    )

In [5]:
# drop columns that we will not use
# NOTE(review): `df` is rebound to the reduced frame, so this cell presumably fails
# when re-run on its own (the columns are already gone) — confirm / re-run from the top.
df = df.drop("transactions", "store_type", "store_cluster")

In [6]:
# display the column names and dtypes that remain after the drop
df.schema

Schema([('date', Date),
        ('sales', Float64),
        ('onpromotion', Int64),
        ('dcoilwtico', Float64),
        ('sin_wk', Float64),
        ('cos_wk', Float64),
        ('sin_mth', Float64),
        ('cos_mth', Float64),
        ('sin_yr', Float64),
        ('cos_yr', Float64),
        ('store_nbr', Categorical(ordering='physical')),
        ('family', Categorical(ordering='physical'))])

# Prepare the data for training
---

In [7]:
# NOTE Pivot to a wide frame: one row per date, and one "sales" plus one
# NOTE "onpromotion" column for every (store_nbr, family) key.
# NOTE With 1782 = (33 * 54) keys and 8 shared covariates this gives
# NOTE 1782 * 2 + 8 = 3572 columns.

pdf = df.pivot(
    on=["store_nbr", "family"],
    index=["date", "dcoilwtico", cs.contains("cos", "sin")],
    values=["sales", "onpromotion"],
)

# positions of the "sales" columns (used to extract labels / predictions)
sales_idx = [pos for pos, name in enumerate(pdf.columns) if "sales" in name]

# positions of the shared covariates, i.e. every column that is neither a
# "sales" nor an "onpromotion" column
shared_idx = [
    pos
    for pos, name in enumerate(pdf.columns)
    if "sales" not in name and "onpromotion" not in name
]

pdf.shape  # -> [time_steps, features]

(1700, 3572)

## Data splitting and scaling

In [8]:
class DataTensor:
    """Pivoted data as a float32 tensor, split along the time axis.

    The last `TARGET_STEPS` rows form the "target" subset. The rows before them
    are split, in chronological order, into "train", "valid" and "test".

    Attributes:
        data: the whole frame as a `tf.float32` tensor, [time_steps, features].
        mean: per-feature mean, shape [1, features].
        std: per-feature standard deviation, shape [1, features]; values below
            0.1 are replaced by 1 so that (near-)constant features are not
            blown up by the normalization.
        subsets: mapping from subset name to the corresponding slice of `data`.
    """

    def __init__(self, pdf: pl.DataFrame, split: tuple[float, float] = (0.7, 0.2)):
        """Convert `pdf` to a tensor, compute the statistics and split the rows.

        Args:
            pdf: the pivoted dataframe, one row per time-step.
            split: fractions (train, valid) of the non-target rows; the test
                subset receives whatever remains.

        Raises:
            ValueError: if `split` does not hold exactly two non-negative
                fractions, or if they leave a negative number of test rows.
        """
        # a wrong length would otherwise silently mislabel the subsets (zip truncates)
        if len(split) != 2 or min(split) < 0:
            raise ValueError(
                f"`split` must hold two non-negative fractions (train, valid), got {split!r}"
            )

        self.data = tf.constant(pdf, dtype=tf.float32)

        # every row except the trailing target rows
        known_data, _ = tf.split(self.data, [-1, TARGET_STEPS], axis=0)

        # compute mean and std for normalization
        # NOTE(review): the statistics cover train + valid + test rows, not only
        # the "train" subset, so validation/test information leaks into the
        # scaling. Restricting them to the train rows would change every
        # reported loss — confirm which behaviour is intended.
        self.mean = tf.reduce_mean(known_data, axis=0, keepdims=True)
        self.std = tf.math.reduce_std(known_data, axis=0, keepdims=True)
        self.std = tf.where(self.std < 0.1, tf.ones_like(self.std), self.std)

        # compute the number of time-steps in train/valid/test/target sets
        known_steps = len(known_data)
        split_steps = [int(known_steps * spl) for spl in split]
        test_steps = known_steps - sum(split_steps)
        if test_steps < 0:
            raise ValueError(f"the fractions in `split` exceed 1, got {split!r}")
        split_steps += [test_steps, TARGET_STEPS]

        subset_name = ["train", "valid", "test", "target"]
        subset_data = tf.split(self.data, split_steps, axis=0)
        self.subsets = dict(zip(subset_name, subset_data))

    def get(self, subset: str, norm: bool) -> tf.Tensor:
        """Return the rows of `subset`, normalized with `mean`/`std` when `norm` is true.

        Raises:
            KeyError: if `subset` is not one of "train", "valid", "test", "target".
        """
        data = self.subsets[subset]
        return (data - self.mean) / self.std if norm else data


In [9]:
# build the tensor, statistics and subsets with the default (0.7, 0.2) split
dt = DataTensor(pdf)

## Datasets

In [10]:
class WindowDatasets:
    """Build batched datasets of sliding windows from the subsets of a `DataTensor`.

    Each window spans `input_steps + TARGET_STEPS` consecutive time-steps and is
    handed to `split`, which turns a batch of windows into `(inputs, labels)`.
    """

    def __init__(
        self,
        dt: DataTensor,
        input_steps: int,
        split: Callable,
        batch_size: int = 32,
    ):
        """Store the configuration; no dataset is created here.

        Args:
            dt: source of the (normalized) subsets.
            input_steps: number of time-steps in the head of each window.
            split: called as `split(batch_of_windows, input_steps)`.
            batch_size: number of windows per batch.

        Raises:
            ValueError: if `input_steps` or `batch_size` is not positive.
        """
        if input_steps < 1 or batch_size < 1:
            raise ValueError("`input_steps` and `batch_size` must be positive")

        self._dt = dt
        self.input_steps = input_steps
        self.window_steps = input_steps + TARGET_STEPS
        self._split = split
        self.batch_size = batch_size

    def make(self, subset: str) -> tf.data.Dataset:
        """Return the shuffled, batched and split window dataset of `subset`.

        Raises:
            ValueError: if `subset` is too short to contain a single window.
        """
        batch_count = self.length(subset)
        if batch_count == 0:
            raise ValueError(
                f"subset {subset!r} has fewer than {self.window_steps} time-steps: "
                "cannot build a single window"
            )

        # build a dataset from the selected subset (normalized)
        # -> card = time-steps, spec = [features]
        ds = tf.data.Dataset.from_tensor_slices(self._dt.get(subset, norm=True))

        # window the dataset along the time axis
        # -> card = windows, spec = [window_steps, features]
        ds = ds.window(size=self.window_steps, shift=1, drop_remainder=True)
        ds = ds.flat_map(lambda window: window.batch(self.window_steps))

        ds = ds.shuffle(1000).batch(self.batch_size)
        ds = ds.map(
            lambda xs: self._split(xs, self.input_steps),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        # `flat_map` loses the cardinality; repeat + take restores a known one
        ds = ds.repeat().take(batch_count)

        return ds.prefetch(tf.data.AUTOTUNE)

    def length(self, subset: str) -> int:
        """Return the number of batches in `subset` (0 if it holds no full window)."""
        # only the row count is needed, so skip the normalization
        time_steps = len(self._dt.get(subset, norm=False))
        window_count = max(time_steps - self.window_steps + 1, 0)

        return int(np.ceil(window_count / self.batch_size))

# Simple models
---

In [11]:
# NOTE We split a window along the time axis into two parts:
# NOTE head (past time-steps) and tail (target time-steps).
# NOTE The 'simple' models use only values from the head to predict the tail "sales" values.

In [12]:
# NOTE The number of input steps is passed explicitly to the split function (instead
# NOTE of using -1 for the head size) so that the dataset's element_spec reports the
# NOTE number of time-steps of both parts.


@tf.function
def spl_split(xs: tf.Tensor, input_steps: int) -> tuple[tf.Tensor, tf.Tensor]:
    """Cut a batch of windows into model inputs and labels.

    Args:
        xs: batch of windows, split along axis 1 (time).
        input_steps: number of leading time-steps that form the inputs.

    Returns:
        `(head, tail_sales)`: the first `input_steps` steps with all features,
        and the "sales" features of the last `TARGET_STEPS` steps.
    """
    # explicit sizes for both parts along the time axis
    past, future = tf.split(
        xs, num_or_size_splits=[input_steps, TARGET_STEPS], axis=1
    )

    # the labels are the "sales" columns of the future part
    labels = tf.gather(future, indices=sales_idx, axis=-1)

    return past, labels

In [13]:
# dataset factory for the 'simple' models (inputs: head, labels: tail "sales")
spl_wds = WindowDatasets(dt, input_steps=INPUT_STEPS, split=spl_split)

# number of batches per subset
tuple(spl_wds.length(name) for name in ("train", "valid", "test"))

(36, 10, 4)

In [14]:
# check the shapes produced by the pipeline: (inputs, labels)
spl_wds.make("train").element_spec

(TensorSpec(shape=(None, 32, 3572), dtype=tf.float32, name=None),
 TensorSpec(shape=(None, 16, 1782), dtype=tf.float32, name=None))

## Baseline

In [15]:
# NOTE Use the most recent values of the input while matching the weekdays.


class Baseline(keras.Model):
    """Parameter-free reference model: repeat recent "sales" values.

    The prediction for each target step is the input value observed a whole
    number of weeks earlier, so that weekdays match (assuming one time-step
    per day and a 7-step week).
    """

    def __init__(self):
        super().__init__()

    def call(self, xs: tf.Tensor) -> tf.Tensor:
        """Return the "sales" features of a `TARGET_STEPS`-long slice of `xs`.

        Args:
            xs: batch of input windows, sliced along axis 1 (time).
        """
        # Skip just enough of the most recent steps for the slice to end a
        # whole number of weeks before the end of the targets
        # (5 steps when TARGET_STEPS = 16: 16 + 5 = 21 = 3 weeks).
        week_steps = 7
        skip_steps = -TARGET_STEPS % week_steps

        _, recent, _ = tf.split(xs, [-1, TARGET_STEPS, skip_steps], axis=1)

        return tf.gather(recent, indices=sales_idx, axis=-1)

In [16]:
# the baseline has nothing to train: compile it only to attach the loss,
# then measure that loss (on normalized values) over the test windows
baseline = Baseline()
baseline.compile(loss="mse")
baseline.evaluate(spl_wds.make("test"), verbose=0)

1.647246241569519

## Simple dense model

In [18]:
# Two dense layers applied to every time-step, then one dense layer over the
# flattened sequence that emits all target values at once.
# The output width follows `sales_idx` (instead of a hard-coded 1782) so that it
# stays in sync with the label tensor built by the split function.
spl_dense = keras.Sequential(
    [
        keras.layers.Dense(512, activation="relu"),
        keras.layers.Dense(512, activation="relu"),
        keras.layers.Flatten(),
        keras.layers.Dense(TARGET_STEPS * len(sales_idx)),
        keras.layers.Reshape([TARGET_STEPS, len(sales_idx)]),
    ]
)

In [19]:
# loss of the freshly initialized model on the test windows (reference point
# for the value obtained after training)
spl_dense.compile(optimizer=Adam(learning_rate=1e-4), loss="mse")
spl_dense.evaluate(spl_wds.make("test"), verbose=0)

2.1824588775634766

In [20]:
# make a copy (including weights) to compare before/after training:
# `clone_model` re-creates the architecture, `set_weights` then transfers
# the current (still untrained) weights of `spl_dense`
spl_dense_unt = keras.models.clone_model(spl_dense)
spl_dense_unt.set_weights(spl_dense.get_weights())

In [None]:
# train for 5 epochs, monitoring the loss on the validation windows
spl_dense.fit(x=spl_wds.make("train"), validation_data=spl_wds.make("valid"), epochs=5)

In [None]:
# loss on the test windows after training
spl_dense.evaluate(spl_wds.make("test"), verbose=0)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 1.8754


1.8732141256332397

## Simple LSTM model

In [40]:
# Two stacked LSTM layers (the second one returns only its last output),
# then one dense layer that emits all target values at once.
# The output width follows `sales_idx` (instead of a hard-coded 1782) so that it
# stays in sync with the label tensor built by the split function.
spl_lstm = keras.Sequential(
    [
        keras.layers.LSTM(256, activation="relu", return_sequences=True),
        keras.layers.LSTM(256, activation="relu", return_sequences=False),
        keras.layers.Flatten(),
        keras.layers.Dense(TARGET_STEPS * len(sales_idx)),
        keras.layers.Reshape([TARGET_STEPS, len(sales_idx)]),
    ]
)

In [None]:
# loss of the freshly initialized model on the test windows (reference point
# for the value obtained after training)
spl_lstm.compile(optimizer=Adam(learning_rate=1e-5), loss="mse")
spl_lstm.evaluate(spl_wds.make("test"), verbose=0)

In [None]:
# train for 5 epochs, monitoring the loss on the validation windows
spl_lstm.fit(spl_wds.make("train"), validation_data=spl_wds.make("valid"), epochs=5)

In [None]:
# make a copy (including weights) to compare before/after training
# NOTE(review): this cell is placed AFTER the `spl_lstm.fit` cell, so when the
# notebook is run top to bottom the copy holds the trained weights. The dense
# model takes its snapshot before fitting — presumably this cell should move
# above the fit cell; confirm.
spl_lstm_unt = keras.models.clone_model(spl_lstm)
spl_lstm_unt.set_weights(spl_lstm.get_weights())

In [42]:
# loss on the test windows after training
spl_lstm.evaluate(spl_wds.make("test"))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 1.5825


1.5910042524337769

# Coupling models
---

In [59]:
# NOTE "Coupling" models add an additional (dense) network whose role it is to couple the
# NOTE output of a simple model with the known shared covariates on the window tail.
# NOTE These models have two inputs and two (sequential) models.

In [17]:
# NOTE Define a new split function adapted from the simple split function `spl_split`.
# NOTE The difference is that we now return the shared covariates on the tail as part of the input.


@tf.function
def cpl_split(
    xs: tf.Tensor, input_steps: int
) -> tuple[tuple[tf.Tensor, tf.Tensor], tf.Tensor]:
    """Cut a batch of windows into the two coupling-model inputs and the labels.

    Args:
        xs: batch of windows, split along axis 1 (time).
        input_steps: number of leading time-steps that form the head.

    Returns:
        `((head, tail_shared), tail_sales)`: the head with all features and the
        shared covariates of the tail as inputs, the "sales" features of the
        tail as labels.
    """
    # explicit sizes for both parts along the time axis
    past, future = tf.split(
        xs, num_or_size_splits=[input_steps, TARGET_STEPS], axis=1
    )

    # labels: the "sales" columns of the future part
    labels = tf.gather(future, indices=sales_idx, axis=-1)

    # second input: the shared covariates of the future part
    future_shared = tf.gather(future, indices=shared_idx, axis=-1)

    return (past, future_shared), labels

In [18]:
# dataset factory for the coupling models (inputs: (head, tail shared covariates))
cpl_wds = WindowDatasets(dt, input_steps=INPUT_STEPS, split=cpl_split)

In [19]:
# NOTE The `spl_model` must be built BEFORE being passed to this function


def make_cpl_model(spl_model: keras.Model) -> keras.Model:
    """Wrap a simple model into a two-input coupling model.

    The returned model takes `(head, tail_shared)`: the head goes through a
    clone of `spl_model`, its predictions are concatenated with the shared
    covariates of the tail along the feature axis, and a single linear dense
    layer maps the result to one value per "sales" column.

    Args:
        spl_model: an already built simple model.

    Returns:
        The (uncompiled) coupling model.
    """
    # Work on a clone so that the original model is left untouched.
    # NOTE(review): `clone_model` re-creates the layers and, per the Keras
    # documentation, does not carry over weights (elsewhere in this notebook they
    # are copied explicitly with `set_weights`) — confirm that starting from
    # fresh weights is intended here.
    base_model = keras.models.clone_model(spl_model)

    # coupling network: one linear layer over [predictions, shared covariates]
    coupnet = keras.Sequential([keras.layers.Dense(len(sales_idx))])

    # the two inputs: head (all features) and tail (shared features)
    head = keras.Input(shape=[None, len(pdf.columns)], dtype=tf.float32)
    tail = keras.Input(shape=[TARGET_STEPS, len(shared_idx)], dtype=tf.float32)

    base_preds = base_model(head)
    coupled = keras.layers.Concatenate(axis=-1)([base_preds, tail])

    return keras.Model(inputs=(head, tail), outputs=coupnet(coupled))

## Coupling dense model

In [32]:
# coupling model built around a clone of the simple dense model
cpl_dense = make_cpl_model(spl_dense)

In [34]:
# same optimizer settings, loss and number of epochs as the simple dense model
cpl_dense.compile(optimizer=Adam(learning_rate=1e-4), loss="mse")
cpl_dense.fit(cpl_wds.make("train"), validation_data=cpl_wds.make("valid"), epochs=5)

In [35]:
# loss on the test windows after training
cpl_dense.evaluate(cpl_wds.make("test"))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 1.7099


1.700713038444519

# Evaluation
---

In [45]:
# NOTE Compute the loss per key for a given model on the last window of the test set (most recent data).


def evaluate(
    model: keras.Model, input_steps: int, split_fn: Callable, loss: Callable = MSLE
) -> np.ndarray:
    """Compute the loss of `model` per key on the last window of the test set.

    The model is fed the normalized head of the last test window; its
    predictions are brought back to the original scale and compared with the
    unscaled "sales" values of the last `TARGET_STEPS` test steps.

    Args:
        model: model to evaluate; called on the inputs returned by `split_fn`.
        input_steps: number of time-steps in the head of the window.
        split_fn: split function matching the model (e.g. `spl_split`).
        loss: callable `loss(y_true, y_pred)` reducing over the last axis.

    Returns:
        The value returned by `loss`, as a NumPy array; with the default
        `MSLE` this holds one value per "sales" column.
    """
    window_steps = input_steps + TARGET_STEPS

    # get scaled and unscaled data of the last window of test set
    _, inputs = tf.split(dt.get("test", norm=True), [-1, window_steps], axis=0)
    _, target = tf.split(dt.get("test", norm=False), [-1, TARGET_STEPS], axis=0)

    # select only the features corresponding to "sales" columns
    target = tf.gather(target, indices=sales_idx, axis=-1)

    # add batch axis and split
    inputs, _ = split_fn(tf.expand_dims(inputs, axis=0), input_steps)

    # predict and remove the batch axis only -> [TARGET_STEPS, keys]
    preds = tf.squeeze(model(inputs), axis=0)

    # unscale the predictions
    mean = tf.gather(dt.mean, indices=sales_idx, axis=-1)
    std = tf.gather(dt.std, indices=sales_idx, axis=-1)
    preds = preds * std + mean

    # transpose to have [keys, TARGET_STEPS] and compute the loss;
    # Keras losses expect the ground truth first: loss(y_true, y_pred)
    return loss(tf.transpose(target), tf.transpose(preds)).numpy()

In [80]:
# per-key loss of every model on the last test window, keyed by model name
losses = {
    name: evaluate(model, input_steps=INPUT_STEPS, split_fn=split_fn)
    for name, model, split_fn in [
        ("baseline", baseline, spl_split),
        ("spl_dense_unt", spl_dense_unt, spl_split),
        ("spl_dense", spl_dense, spl_split),
        # ("spl_lstm", spl_lstm, spl_split),
        # ("cpl_dense", cpl_dense, cpl_split),
    ]
}

# summary statistics of the per-key losses (the first two rows are dropped)
pl.DataFrame(losses).describe([0.5, 0.9])[2:]


statistic,baseline,spl_dense_unt,spl_dense
str,f64,f64,f64
"""mean""",0.400675,1.152542,1.166112
"""std""",1.474968,2.603318,4.145085
"""min""",0.0,0.001854,0.000294
"""50%""",0.164079,0.441517,0.320601
"""90%""",0.740266,2.504551,2.082433
"""max""",24.951469,41.689857,71.937698
