# 2 - Training

In [14]:
import os
from collections.abc import Callable

import keras
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import tensorflow as tf
from plotly.subplots import make_subplots

print("TensorFlow version:", tf.version.VERSION)

plotly.offline.init_notebook_mode(connected=True)  # for nbviewer
plotly.io.templates.default = "plotly_dark"

TensorFlow version: 2.16.2


In [18]:
TARGET_STEPS = 16  # length of the target sequences

BATCH_SIZE = 64
BUFFER_SIZE = 1000  # for dataset shuffling

LOSS = keras.losses.MeanSquaredError()  # for training

In [4]:
# read the dataframe created in the previous notebook
df = pl.read_csv(os.path.join("input", "df.csv"), try_parse_dates=True)

# drop 'family' as 'family_nbr' contains the same information
df = df.drop("family")

# cast 'onpromotion' to float to distinguish it from the categorical columns
df = df.cast({"onpromotion": pl.Float64})

# convert the 'date' column to a timestamp
df = df.cast({"date": pl.Float64})

In [5]:
# NOTE Print the dataframe schema
# NOTE The last four columns (integer) are the categorical columns which
# NOTE depend on the key only.
# NOTE Recall that we call 'key' the pairs (store_nbr, family_nbr).

df.schema

Schema([('date', Float64),
        ('sales', Float64),
        ('transactions', Float64),
        ('onpromotion', Float64),
        ('dcoilwtico', Float64),
        ('sin_hwk', Float64),
        ('cos_hwk', Float64),
        ('sin_wk', Float64),
        ('cos_wk', Float64),
        ('sin_mth', Float64),
        ('cos_mth', Float64),
        ('sin_yr', Float64),
        ('cos_yr', Float64),
        ('store_nbr', Int64),
        ('family_nbr', Int64),
        ('store_type', Int64),
        ('store_cluster', Int64)])

In [6]:
# NOTE Store some values of the dataframe as constants

FEATURES = len(df.columns)
SALES_IDX = df.columns.index("sales")

# Split and normalize the data

In [7]:
# NOTE give names to the axes to avoid having numbers in the code


class Ax:
    TIME, KEY, FEATURE = 0, 1, 2

In [8]:
# TODO Move `_norm` method outside of class?


class DataTensor:
    def __init__(self, df: pl.DataFrame, split: tuple[float, float] = (0.7, 0.2)):
        # partition the dataframe by key (include_key=True by default)
        # and stack the resulting dataframes into a tensor with axes determined by Ax
        kdfs = df.partition_by(by=["store_nbr", "family_nbr"], maintain_order=True)
        kdfs = [tf.constant(kdf, dtype=tf.float32) for kdf in kdfs]
        data = tf.stack(kdfs, axis=Ax.KEY)

        # normalize the time- and key-features
        time_data, keys_data = tf.split(data, [-1, 4], axis=Ax.FEATURE)
        keys_data, _ = self._norm(keys_data, axis=Ax.KEY)
        time_data, (mean, std) = self._norm(time_data, axis=Ax.TIME, tail=TARGET_STEPS)
        data = tf.concat([time_data, keys_data], axis=Ax.FEATURE)

        # store the mean and std of the 'sales' column for unscaling
        sales_ind = df.columns.index("sales")
        self.mean = tf.gather(mean, [sales_ind], axis=Ax.FEATURE)
        self.std = tf.gather(std, [sales_ind], axis=Ax.FEATURE)

        # compute the number of time-steps in each subset
        tts = data.shape[Ax.TIME] - TARGET_STEPS  # time-steps without target
        steps = [int(tts * rt) for rt in split]  # train + valid
        steps += [tts - sum(steps), TARGET_STEPS]  # test + target

        # split the data into subsets and store those in `self._data`
        train, valid, test, target = tf.split(data, steps, axis=Ax.TIME)
        self._data = dict(train=train, valid=valid, test=test, target=target)

    @classmethod
    def _norm(cls, xs: tf.Tensor, axis: int, tail: int = 0):
        head, _ = tf.split(xs, [-1, tail], axis=axis)

        mean = tf.reduce_mean(head, axis=axis, keepdims=True)
        std = tf.math.reduce_std(head, axis=axis, keepdims=True)
        std = tf.where(std < 0.1, tf.ones_like(std), std)  # to avoid div. by ~0

        return (xs - mean) / std, (mean, std)

    def __getitem__(self, subset: str) -> tf.Tensor:
        return self._data[subset]

In [9]:
data = DataTensor(df)

# Create windowed datasets

In [10]:
# NOTE Class that creates windowed datasets for the subsets.
# NOTE Pass a split function which takes a batch of windows and splits them
# NOTE into (inputs, label) pairs to match whatever our model expects.
# NOTE Need to compute the length of each dataset manually since TensorFlow
# NOTE cannot compute the cardinality due to `widnow`, and not knowing the
# NOTE cardinality resluts in warnings when using `model.fit` and `model.evaluate`.

In [32]:
class WindowDatasets:
    def __init__(self, data: DataTensor, input_steps: int, split_fn: Callable):
        self._data = data
        self.input_steps = input_steps
        self.window_steps = input_steps + TARGET_STEPS
        self._split = split_fn

    def make(self, subset: str) -> tf.data.Dataset:
        # card = time-steps, spec = [keys, features]
        ds = tf.data.Dataset.from_tensor_slices(self._data[subset])

        # card = windows, spec = [window_steps, keys, features]
        ds = ds.window(size=self.window_steps, shift=1, drop_remainder=True)
        ds = ds.flat_map(lambda window: window.batch(self.window_steps))

        # card = windows, spec = [keys, window_steps, features]
        ds = ds.map(lambda xs: tf.transpose(xs, perm=[Ax.KEY, Ax.TIME, Ax.FEATURE]))

        # card = windows * keys, spec = [window_steps, features]
        ds = ds.flat_map(tf.data.Dataset.from_tensor_slices)

        ds = ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
        ds = ds.map(
            lambda xs: self._split(xs, self.input_steps),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        ds = ds.repeat().take(self.length(subset))  # set the cardinality

        return ds.prefetch(tf.data.AUTOTUNE)

    def length(self, subset: str) -> int:
        shape = self._data[subset].shape
        windows_per_key = shape[Ax.TIME] - self.window_steps + 1
        example_count = shape[Ax.KEY] * windows_per_key

        return int(np.ceil(example_count / BATCH_SIZE))

# Simple models

In [33]:
# NOTE simple models = models using only head values as input

In [34]:
# NOTE Function that splits a batch of windows into (inputs, label) pairs.
# NOTE use the terminology: window = head + tail
# NOTE We pass the number of input steps as parameter (instead of using -1 in `split`)
# NOTE so that the resulting tensor shape is known,
# NOTE i.e. [input_steps, features] instead of [None, features] when passing -1.
# NOTE This makes the input shape known to our models, which can initialize dense
# NOTE layers without having to have the input shape specified.

# TODO test with @tf.function -> no difference?


@tf.function
def split_windows(xs: tf.Tensor, input_steps: int) -> tuple[tf.Tensor, tf.Tensor]:
    # split the windows along the time axis into (head, tail)
    head, tail = tf.split(xs, [input_steps, TARGET_STEPS], axis=1)

    # extract the tail 'sales' values to use as label
    tail_sales = tf.gather(tail, indices=[SALES_IDX], axis=-1)

    return head, tail_sales

In [35]:
wds = WindowDatasets(data, input_steps=30, split_fn=split_windows)

## Baseline

In [36]:
# NOTE Baseline model which used the past values as prediction
# NOTE Match the day of the week since we know that there is a srong weekly periodicity.


class Baseline(keras.Model):
    def __init__(self):
        super().__init__(name="baseline")

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        head_sales = tf.gather(inputs, [0], axis=-1)

        # split along time axis with shift to match the weekdays
        _, pred, _ = tf.split(head_sales, [-1, TARGET_STEPS, 5], axis=1)

        return pred

In [37]:
baseline = Baseline()
baseline.compile(loss=LOSS)


_ = baseline.evaluate(wds.make("test"))

[1m3481/3481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 629us/step - loss: 2.4790


## Simple dense model

In [47]:
simple_dense_model = keras.Sequential(
    [
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Flatten(),
        # keras.layers.Reshape([-1]),  # flatten
        keras.layers.Dense(TARGET_STEPS),
        keras.layers.Reshape([TARGET_STEPS, 1]),
    ]
)

In [48]:
simple_dense_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4), loss=LOSS
)

In [52]:
simple_dense_model.evaluate(wds.make("test"))

[1m3481/3481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 1.3036


1.2517348527908325

In [51]:
simple_dense_model.fit(x=wds.make("train"), validation_data=wds.make("valid"), epochs=5)

Epoch 1/5
[1m31547/31547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 3ms/step - loss: 0.3413 - val_loss: 0.9319
Epoch 2/5
[1m31547/31547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 3ms/step - loss: 0.3200 - val_loss: 0.8388
Epoch 3/5
[1m31547/31547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 3ms/step - loss: 0.3134 - val_loss: 0.8775
Epoch 4/5
[1m31547/31547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 3ms/step - loss: 0.3096 - val_loss: 0.8899
Epoch 5/5
[1m31547/31547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 3ms/step - loss: 0.3078 - val_loss: 0.9064


<keras.src.callbacks.history.History at 0x18528eb70>