In [None]:
import os
import warnings
from collections.abc import Callable

import keras
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import polars.selectors as cs
import tensorflow as tf
from plotly.subplots import make_subplots
from keras.optimizers import Adam
from tensorflow.keras.losses import Loss, MSE, MSLE

print("TensorFlow version:", tf.version.VERSION)

plotly.offline.init_notebook_mode(connected=True)  # for nbviewer
plotly.io.templates.default = "plotly_dark"

TensorFlow version: 2.16.2


In [None]:
TARGET_STEPS = 16  # length of the target sequences

BATCH_SIZE = 64
BUFFER_SIZE = 1000  # for dataset shuffling

# make a directory to store the trained models
os.makedirs("models", exist_ok=True)


In [57]:
# read the dataframe created in the previous notebook
df = pl.read_csv(os.path.join("input", "df.csv"), try_parse_dates=True)

df = df.cast({"date": pl.Float64, "onpromotion": pl.Float64})

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df = df.with_columns(
        pl.col("store_nbr").cast(pl.String).cast(pl.Categorical),
        pl.col("family").cast(pl.Categorical),
        pl.col("store_type").cast(pl.Categorical),
        pl.col("store_cluster").cast(pl.String).cast(pl.Categorical),
    )

In [58]:
df = df.drop("store_type", "store_cluster")

In [59]:
df.schema

Schema([('date', Float64),
        ('sales', Float64),
        ('onpromotion', Float64),
        ('dcoilwtico', Float64),
        ('transactions', Float64),
        ('sin_wk', Float64),
        ('cos_wk', Float64),
        ('sin_mth', Float64),
        ('cos_mth', Float64),
        ('sin_yr', Float64),
        ('cos_yr', Float64),
        ('store_nbr', Categorical(ordering='physical')),
        ('family', Categorical(ordering='physical'))])

----

In [68]:
class DataTensor:
    def __init__(self, df: pl.DataFrame, split: tuple[float, float] = (0.7, 0.2)):
        # one-hot encode categorical variables
        self.df = df.to_dummies(cs.categorical())

        dfs = self.df.partition_by(
            cs.contains("store_nbr", "family"), maintain_order=True
        )

        # stack into a tensor with axes (time, key, feature)
        self.data = tf.stack([tf.constant(df, dtype=tf.float32) for df in dfs], axis=1)

        self._compute_mean_std()
        self._split(split)

    def _split(self, split: tuple[float, float]):
        train_ts = len(self.data) - TARGET_STEPS

        # compute the number of time-steps in train/valid/test/target sets
        split_steps = [int(train_ts * spl) for spl in split]
        split_steps += [train_ts - sum(split_steps), TARGET_STEPS]

        subset_name = ["train", "valid", "test", "target"]
        subset_data = tf.split(self.data, split_steps, axis=0)
        self.subsets = dict(zip(subset_name, subset_data))

    def _compute_mean_std(self):
        tmp_count = len(df.select(cs.float()).columns)  # number of temporal features
        data, _ = tf.split(self.data, [-1, TARGET_STEPS], axis=0)  # remove target

        # replace the categorical features with zeros to compute mean and std
        tmp_data, cat_data = tf.split(data, [tmp_count, -1], axis=-1)
        tmp_data = tf.concat([tmp_data, tf.zeros_like(cat_data)], axis=-1)

        self.mean = tf.reduce_mean(tmp_data, axis=0, keepdims=True)
        self.std = tf.math.reduce_std(tmp_data, axis=0, keepdims=True)
        self.std = tf.where(self.std < 0.1, tf.ones_like(self.std), self.std)

    def get(self, subset: str, norm: bool) -> tf.Tensor:
        data = self.subsets[subset]
        return (data - self.mean) / self.std if norm else data

In [69]:
dt = DataTensor(df)

In [None]:
tts = 1684

idx = [0, int(0.7 * tts), int(0.9 * tts), tts, tts + TARGET_STEPS]
idx
d = dict(train=0, valid=1, test=2, target=3)
{s: (idx[v], idx[v + 1]) for s, v in d.items()}

{'train': (0, 1178),
 'valid': (1178, 1515),
 'test': (1515, 1684),
 'target': (1684, 1700)}

# Datasets

In [119]:
@tf.function
def spl_split(xs: tf.Tensor, input_steps: int) -> tuple[tf.Tensor, tf.Tensor]:
    # split the windows along the time axis into (head, tail)
    head, tail = tf.split(xs, [input_steps, TARGET_STEPS], axis=1)

    # TODO replace hard-coded value with variable
    # extract the tail 'sales' values to use as label
    tail_sales = tf.gather(tail, indices=[1], axis=-1)

    return head, tail_sales

In [88]:
class WindowDatasets:
    def __init__(
        self,
        data: DataTensor,
        input_steps: int,
        split_fn: Callable,
        batch_size: int = BATCH_SIZE,
    ):
        self.data = data
        self.input_steps = input_steps
        self.window_steps = input_steps + TARGET_STEPS
        self.split_fn = split_fn
        self.batch_size = batch_size

    def make(self, subset: str) -> tf.data.Dataset:
        # card = time-steps, spec = [keys, features]
        ds = tf.data.Dataset.from_tensor_slices(self.data.get(subset, norm=True))

        # card = windows, spec = [window_steps, keys, features]
        ds = ds.window(size=self.window_steps, shift=1, drop_remainder=True)
        ds = ds.flat_map(lambda window: window.batch(self.window_steps))

        # card = windows, spec = [keys, window_steps, features]
        ds = ds.map(lambda xs: tf.transpose(xs, perm=[1, 0, 2]))

        # card = windows * keys, spec = [window_steps, features]
        ds = ds.flat_map(tf.data.Dataset.from_tensor_slices)

        ds = ds.shuffle(BUFFER_SIZE).batch(self.batch_size)
        ds = ds.map(
            lambda xs: self.split_fn(xs, self.input_steps),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        ds = ds.repeat().take(self.length(subset))  # set the cardinality

        return ds.prefetch(tf.data.AUTOTUNE)

    def length(self, subset: str) -> int:
        shape = self.data.get(subset, norm=False).shape
        windows_per_key = shape[0] - self.window_steps + 1
        example_count = shape[1] * windows_per_key

        return int(np.ceil(example_count / BATCH_SIZE))

In [None]:
wds = WindowDatasets(dt, input_steps=32, split_fn=spl_split)

In [92]:
wds.make("train").element_spec

(TensorSpec(shape=(None, 32, 98), dtype=tf.float32, name=None),
 TensorSpec(shape=(None, 16, 1), dtype=tf.float32, name=None))

## Baseline

In [93]:
# NOTE Baseline model which used the past values as prediction
# NOTE Match the day of the week since we know that there is a srong weekly periodicity.


class Baseline(keras.Model):
    def __init__(self):
        super().__init__(name="baseline")

    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        head_sales = tf.gather(inputs, [1], axis=-1)

        # split along time axis with shift to match the weekdays
        _, pred, _ = tf.split(head_sales, [-1, TARGET_STEPS, 5], axis=1)

        return pred

In [None]:
baseline = Baseline()
baseline.compile(loss="mse")

baseline.evaluate(wds.make("test"))

[1m3425/3425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 940us/step - loss: 1.7852


## Simple dense model

In [156]:
spl_dense = keras.Sequential(
    [
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Flatten(),
        keras.layers.Dense(TARGET_STEPS),
        keras.layers.Reshape([TARGET_STEPS, 1]),
    ]
)

In [157]:
spl_dense.compile(optimizer=Adam(learning_rate=1e-5), loss="mse")
spl_dense.evaluate(wds.make("test"))

[1m3425/3425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 13ms/step - loss: 1.7162


1.6424674987792969

In [158]:
# make a copy (including weights) to compare before/after training
spl_dense_unt = keras.models.clone_model(spl_dense)
spl_dense_unt.set_weights(spl_dense.get_weights())

In [None]:
spl_dense.fit(x=wds.make("train"), validation_data=wds.make("valid"), epochs=10)

In [104]:
spl_dense.evaluate(wds.make("test"))

[1m3425/3425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 1.2538


1.2257907390594482

In [30]:
# simple_dense_model.save("byKey_simple_dense.keras")

## Simple LSTM model

In [None]:
simple_lstm_model = keras.Sequential(
    [
        keras.layers.LSTM(512, activation="relu", return_sequences=False),
        keras.layers.Flatten(),
        keras.layers.Dense(TARGET_STEPS),
        keras.layers.Reshape([TARGET_STEPS, 1]),
    ]
)

# Evaluate
---

In [151]:
def evaluate(
    model: keras.Model, input_steps: int, split_fn: Callable, loss: Loss = MSLE
) -> np.ndarray:
    window_steps = input_steps + TARGET_STEPS

    _, inputs = tf.split(dt.get("test", norm=True), [-1, window_steps], axis=0)
    _, target = tf.split(dt.get("test", norm=False), [-1, TARGET_STEPS], axis=0)

    # use key axis as batch -> (key, time, feature)
    inputs = tf.transpose(inputs, [1, 0, 2])
    target = tf.transpose(target, [1, 0, 2])

    target = tf.gather(target, indices=[1], axis=-1)  # select "sales" values
    inputs, _ = spl_split(inputs, input_steps=input_steps)  # remove labels

    preds = model(inputs)

    sales_mean = tf.transpose(tf.gather(dt.mean, [1], axis=-1), [1, 0, 2])
    sales_std = tf.transpose(tf.gather(dt.std, [1], axis=-1), [1, 0, 2])
    preds = preds * sales_std + sales_mean

    target = tf.squeeze(target)  # remove feature axis -> [TARGET_STEPS, keys]
    preds = tf.squeeze(preds)

    return loss(target, preds).numpy()

In [155]:
pl.DataFrame(
    {
        "baseline": evaluate(baseline, input_steps=32, split_fn=spl_split),
        "spl_dense": evaluate(spl_dense, input_steps=32, split_fn=spl_split),
    }
).describe()

statistic,baseline,spl_dense
str,f64,f64
"""count""",1782.0,1782.0
"""null_count""",0.0,0.0
"""mean""",0.400675,0.81705
"""std""",1.474968,2.200993
"""min""",0.0,0.0
"""25%""",0.054581,0.080969
"""50%""",0.164079,0.258017
"""75%""",0.421484,0.551733
"""max""",24.951469,24.066505
