In [None]:
from pytorch_lightning.callbacks import EarlyStopping
import torch

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
import torch
from sklearn.preprocessing import LabelEncoder
from pytorch_forecasting import Baseline, DeepAR, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.data.examples import generate_ar_data
from pytorch_forecasting.metrics import SMAPE, MultivariateNormalDistributionLoss


In [None]:
import setup_notebook

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error

from grandexchange.preprocess import load_preprocessed_data

# Size (in time steps) of the training window used by the first
# cross-validation fold; later folds grow from here.
MIN_CV_SIZE = 500

# Forecast horizon: number of future time steps predicted per fold.
N_PREDICT_STEPS = 7

### Load data

In [None]:
# Load the preprocessed price history used by every model below.
# NOTE(review): the meaning of the argument 24 is not visible in this notebook —
# confirm against grandexchange.preprocess.load_preprocessed_data.
data = load_preprocessed_data(24)

# Optional: restrict to a random sample of 50 item ids for quick experiments.
# NOTE(review): this samples rows of the item_id column, so it may yield fewer
# than 50 distinct items.
# data = data[
#     data["item_id"].isin(
#         data["item_id"].sample(50, random_state=42)
#     )
# ]

### Cross-validation

Logic:
- Forward rolling cross validation with MIN_CV_SIZE as the first fold
- The first fold trains on the first MIN_CV_SIZE steps, predicts the next N_PREDICT_STEPS steps (the test set), and stores:
    1. The predicted and actual prices
    2. The number of days ahead the prediction was for
- The new training set will be the previous fold plus the previous test set

In [None]:
# Unique timestamps of the dataset.
# NOTE(review): assumes the rows of `data` are already in chronological order,
# so that `unique()` returns timestamps oldest-first — confirm.
steps = data.datetime.unique()

# Maps fold number -> {"train": timestamps, "test": timestamps}
cv_folds = {}

fold = 0
train_size = MIN_CV_SIZE

# Expanding-window cross-validation: each fold trains on the first `train_size`
# timestamps and tests on the following N_PREDICT_STEPS. The windows are
# recomputed on every iteration so the training set really grows by the
# previous test window.
while train_size + N_PREDICT_STEPS <= len(steps):
    train_fold = steps[:train_size]
    test_fold = steps[train_size:(train_size + N_PREDICT_STEPS)]
    cv_folds[fold] = {
        "train": train_fold,
        "test": test_fold
    }
    fold += 1
    train_size += N_PREDICT_STEPS

### Baseline model

Predict the next days as the average of the previous 7 days

In [None]:
def run_baseline_model():
    """Evaluate a moving-average baseline on every cross-validation fold.

    For each fold in the module-level ``cv_folds``, the forecast for an item is
    the mean of its ``price`` over the last 7 training timestamps of that fold.
    That single value is used as the prediction for every timestamp in the
    fold's test window. Items with no rows in those 7 timestamps get no
    prediction and are dropped by the inner merge.

    Returns:
        tuple: ``(cv_results, cv_eval)``.
            ``cv_results`` has one row per (fold, item_id, test datetime) with
            columns ``item_id``, ``predicted``, ``fold``, ``datetime``,
            ``actual`` and ``step`` (0-based position of the datetime inside
            its fold's test window).
            ``cv_eval`` has columns ``step`` and ``MAPE``: the mean absolute
            percentage error over all folds and items for each step.

    Raises:
        ValueError: If ``cv_folds`` is empty.
    """
    if not cv_folds:
        raise ValueError("cv_folds is empty: not enough time steps for a single fold")

    fold_results = []

    for fold, split in cv_folds.items():
        train = data[data["datetime"].isin(split["train"][-7:])]
        test = data[data["datetime"].isin(split["test"])]

        # One prediction per item, tagged with the fold that produced it
        pred = (
            train.groupby("item_id")["price"]
            .mean()
            .reset_index()
            .rename({"price": "predicted"}, axis=1)
            .assign(fold=fold)
        )
        actual = (
            test[["item_id", "datetime", "price"]]
            .rename({"price": "actual"}, axis=1)
        )
        # Horizon of each test timestamp, counted within this fold only
        step_key = pd.DataFrame({
            "datetime": split["test"],
            "step": range(len(split["test"])),
        })

        fold_results.append(
            pred
            .merge(actual, on="item_id")
            .merge(step_key, on="datetime")
        )

    cv_results = pd.concat(fold_results, axis=0).reset_index(drop=True)
    cv_eval = (
        cv_results
        .groupby("step")[["actual", "predicted"]]
        .apply(lambda x: mean_absolute_percentage_error(x["actual"], x["predicted"]))
        .reset_index(name="MAPE")
    )
    return cv_results, cv_eval

baseline_results = run_baseline_model()

In [None]:
# Baseline error as a function of the forecast horizon.
# run_baseline_model returns (per-row results, per-step MAPE); plot the latter.
_, baseline_eval = baseline_results
sns.set(rc={"figure.figsize": (10, 6)})
sns.lineplot(x="step", y="MAPE", data=baseline_eval)

### DeepAR

Multivariate deep learning algorithm

**Load data**

In [None]:
from pytorch_lightning.callbacks import EarlyStopping
import torch

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
import torch
from sklearn.preprocessing import LabelEncoder
from pytorch_forecasting import Baseline, DeepAR, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.data.examples import generate_ar_data
from pytorch_forecasting.metrics import SMAPE, MultivariateNormalDistributionLoss

# Model input for DeepAR: per-item standardised price plus an integer time index.
# `transform` returns a Series on the original row index, so `assign` aligns it
# correctly on every pandas version (`groupby.apply` prepends the group key to
# the index on pandas >= 2.0, which breaks the alignment).
# NOTE(review): mean/std are computed over each item's full history, including
# the period later held out for validation — confirm this leakage is acceptable.
# NOTE(review): an item with constant price has std 0 and yields NaN/inf here.
data_ar = (
    data
    .assign(
        price=data.groupby("item_id")["price"].transform(
            lambda x: (x - x.mean()) / x.std()
        )
    )
    # Rank of each timestamp among the sorted unique timestamps
    .assign(time_idx=LabelEncoder().fit_transform(data["datetime"]))
    [["item_id", "price", "time_idx"]]
)

In [None]:
# Window sizes: the model reads up to `max_encoder_length` past steps and
# forecasts up to `max_prediction_length` steps ahead.
max_encoder_length = 48
max_prediction_length = 16

# Everything up to the cutoff is used for training; the final
# `max_prediction_length` steps are held out for validation.
training_cutoff = data_ar["time_idx"].max() - max_prediction_length
training_frame = data_ar[data_ar["time_idx"] <= training_cutoff]

training = TimeSeriesDataSet(
    training_frame,
    time_idx="time_idx",
    target="price",
    group_ids=["item_id"],
    time_varying_unknown_reals=["price"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
)

# Validation reuses the training dataset's configuration on the full frame
validation = TimeSeriesDataSet.from_dataset(
    training,
    data_ar,
    min_prediction_idx=training_cutoff + 1,
)

batch_size = 128
loader_kwargs = {"batch_size": batch_size, "num_workers": 0}

train_dataloader = training.to_dataloader(train=True, **loader_kwargs)
val_dataloader = validation.to_dataloader(train=False, **loader_kwargs)

**Calculate baseline error**

In [None]:
# Each batch is (x, y); the first element of y is taken as the target tensor.
actuals = torch.cat([targets[0] for _, targets in val_dataloader])

# Reference score from pytorch-forecasting's Baseline model
baseline_predictions = Baseline().predict(val_dataloader)
baseline_smape = SMAPE()(baseline_predictions, actuals)
print(baseline_smape)

**Train network**

In [None]:
%%capture

# Fix all random seeds so the run is reproducible
pl.seed_everything(42)
import pytorch_forecasting as ptf

# Trainer and network used only for the learning-rate search in the next cell
trainer = pl.Trainer(gradient_clip_val=1e-1, gpus=0)
net = DeepAR.from_dataset(
    training,
    loss=MultivariateNormalDistributionLoss(rank=30),
    rnn_layers=2,
    hidden_size=16,
    learning_rate=3e-2,
)

In [None]:
# Sweep learning rates between min_lr and max_lr and take the suggested value
lr_search_kwargs = dict(
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    min_lr=1e-5,
    max_lr=5e0,
    early_stop_threshold=100,
)
res = trainer.tuner.lr_find(net, **lr_search_kwargs)

suggested_lr = res.suggestion()
print(f"suggested learning rate: {suggested_lr}")

fig = res.plot(show=True, suggest=True)
fig.show()

# Store the suggestion on the network's hyperparameters
net.hparams.learning_rate = suggested_lr



In [None]:
# Stop once val_loss has not improved by at least 1e-4 for 10 validation checks
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    min_delta=1e-4,
    verbose=False,
)

trainer = pl.Trainer(
    gpus=0,
    max_epochs=30,
    limit_train_batches=50,
    gradient_clip_val=0.01,
    callbacks=[early_stop_callback],
)

# Fresh network that starts from the learning rate found by the LR search
net = DeepAR.from_dataset(
    training,
    learning_rate=res.suggestion(),
    hidden_size=16,
    rnn_layers=2,
    loss=MultivariateNormalDistributionLoss(rank=30),
    log_interval=10,
    log_val_interval=1,
)

trainer.fit(net, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

In [None]:
# Reload the checkpoint the trainer recorded as best
best_model_path = trainer.checkpoint_callback.best_model_path
best_model = DeepAR.load_from_checkpoint(best_model_path)

# Mean absolute error of the best checkpoint on the validation set.
# The value is printed explicitly: a bare expression in the middle of a cell
# is evaluated and then silently discarded.
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
predictions = best_model.predict(val_dataloader)
mae = (actuals - predictions).abs().mean()
print(f"Validation MAE (best checkpoint): {mae.item():.4f}")

# Raw predictions for plotting come from the same best checkpoint that is
# scored above and plotted below, not from the last-epoch `net`.
raw_predictions, x = best_model.predict(val_dataloader, mode="raw", return_x=True, n_samples=100)

In [None]:
def get_item_name(item_id):
    """Return the name(s) stored in ``data`` for ``item_id``.

    Args:
        item_id: Value to match against the ``item_id`` column of ``data``.

    Returns:
        Array of matching ``name`` values from the de-duplicated
        (item_id, name) pairs; empty when the id does not occur in ``data``.
    """
    key = data[["item_id", "name"]].drop_duplicates()
    return key[key["item_id"] == item_id]["name"].values

# Item id of every sample in the validation batch output
series = validation.x_to_index(x)["item_id"]

# Plot at most 300 samples, never more than the validation set contains
# (indexing past the end would raise an IndexError).
n_examples = min(300, len(series))
for idx in range(n_examples):
    print(idx)
    best_model.plot_prediction(x, raw_predictions, idx=idx, add_loss_to_title=True)
    plt.suptitle(f"Series: {get_item_name(series.iloc[idx])[0]}")
    plt.show()

### Temporal Fusion Transformer

In [None]:
# Model input for the Temporal Fusion Transformer: per-item standardised price,
# an integer time index, and every other column of `data` except the raw
# datetime and the item name.
# `transform` returns a Series on the original row index, so `assign` aligns it
# correctly on every pandas version (`groupby.apply` prepends the group key to
# the index on pandas >= 2.0, which breaks the alignment).
# NOTE(review): mean/std are computed over each item's full history, including
# the period later held out for validation — confirm this leakage is acceptable.
data_ar = (
    data
    .assign(
        price=data.groupby("item_id")["price"].transform(
            lambda x: (x - x.mean()) / x.std()
        )
    )
    # Rank of each timestamp among the sorted unique timestamps
    .assign(time_idx=LabelEncoder().fit_transform(data["datetime"]))
    .drop(["datetime", "name"], axis=1)
)
data_ar.head(10)

In [None]:
# Window sizes for the TFT: read up to 30 past steps, forecast up to 14 ahead
max_encoder_length = 30
max_prediction_length = 14

# Hold out the final `max_prediction_length` steps from training
training_cutoff = data_ar["time_idx"].max() - max_prediction_length
training_frame = data_ar[data_ar["time_idx"] <= training_cutoff]

training = TimeSeriesDataSet(
    training_frame,
    time_idx="time_idx",
    target="price",
    group_ids=["item_id"],
    # encoder / decoder window limits
    min_encoder_length=0,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    # covariates
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_reals=["price", "margin", "volume"],
    # extra features generated by the dataset
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
)

In [None]:
# Validation set built from the training dataset's configuration over the full frame
validation = TimeSeriesDataSet.from_dataset(
    training,
    data_ar,
    predict=True,
    stop_randomization=True,
)

batch_size = 256
val_batch_size = batch_size * 10  # validation batches are 10x the training batch size

train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=val_batch_size, num_workers=0)

In [None]:
from pytorch_forecasting.metrics import RMSE
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet

# Seed, then build the trainer and network used for the learning-rate search
pl.seed_everything(42)

# Gradient clipping is a tunable hyperparameter; it guards against
# diverging gradients in recurrent networks.
trainer = pl.Trainer(gradient_clip_val=0.1, gpus=0)

tft_hparams = dict(
    learning_rate=0.03,            # placeholder: the LR finder sweeps its own range
    hidden_size=16,                # main capacity hyperparameter besides the learning rate
    attention_head_size=1,         # number of attention heads
    dropout=0.1,
    hidden_continuous_size=8,      # kept <= hidden_size
    output_size=1,                 # a single point forecast, paired with the RMSE loss
    loss=RMSE(),
    reduce_on_plateau_patience=4,  # patience passed to the reduce-on-plateau LR schedule
)
tft = TemporalFusionTransformer.from_dataset(training, **tft_hparams)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Sweep learning rates from 1e-7 to 0.1 and report the suggested value
lr_search_kwargs = dict(
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=0.1,
    min_lr=1e-7,
)
res = trainer.tuner.lr_find(tft, **lr_search_kwargs)

suggested_lr = res.suggestion()
print(f"suggested learning rate: {suggested_lr}")

fig = res.plot(show=True, suggest=True)
fig.show()



In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger


# Callbacks and logger for the real training run
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    min_delta=1e-7,
    verbose=False,
)
lr_logger = LearningRateMonitor()
logger = TensorBoardLogger("lightning_logs")

trainer = pl.Trainer(
    gpus=0,
    max_epochs=100,
    limit_train_batches=30,
    gradient_clip_val=0.1,
    logger=logger,
    callbacks=[lr_logger, early_stop_callback],
)

# NOTE(review): learning_rate=0.00724 is hard-coded — presumably copied from an
# earlier learning-rate finder run; confirm it still matches the current data.
tft_hparams = dict(
    learning_rate=0.00724,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=1,
    loss=RMSE(),
    log_interval=10,
    reduce_on_plateau_patience=4,
)
tft = TemporalFusionTransformer.from_dataset(training, **tft_hparams)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")


In [None]:
# Train the TFT; the trainer's callbacks (early stopping, LR monitor) apply here
fit_loaders = {
    "train_dataloaders": train_dataloader,
    "val_dataloaders": val_dataloader,
}
trainer.fit(tft, **fit_loaders)

In [None]:
# Reload the checkpoint the trainer recorded as best
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

# Mean absolute error of the best checkpoint on the validation set.
# The value is printed explicitly: a bare expression in the middle of a cell
# is evaluated and then silently discarded.
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
predictions = best_tft.predict(val_dataloader)
mae = (actuals - predictions).abs().mean()
print(f"Validation MAE (best checkpoint): {mae.item():.4f}")

# Raw network output and the matching inputs, used by the plotting cells
raw_predictions, x = best_tft.predict(val_dataloader, mode="raw", return_x=True)

In [None]:
# Item id of every sample in the TFT validation output. Recomputed here because
# the `series` left over from the DeepAR section refers to a different
# validation dataset.
series = validation.x_to_index(x)["item_id"]

# Plot at most 400 samples, never more than the validation set contains
# (indexing past the end would raise an IndexError).
n_examples = min(400, len(series))
for idx in range(n_examples):
    best_tft.plot_prediction(x, raw_predictions, idx=idx, add_loss_to_title=True)
    plt.title(f"Series: {get_item_name(series.iloc[idx])[0]}\n")
    plt.show()

In [None]:
# NOTE(review): this re-defines get_item_name with the same body as the
# definition in the DeepAR section; consider keeping a single definition.
def get_item_name(item_id):
    """Return the name(s) stored in ``data`` for ``item_id``.

    Args:
        item_id: Value to match against the ``item_id`` column of ``data``.

    Returns:
        Array of matching ``name`` values from the de-duplicated
        (item_id, name) pairs; empty when the id does not occur in ``data``.
    """
    key = data[["item_id", "name"]].drop_duplicates()
    return key[key["item_id"] == item_id]["name"].values

# Item id of every sample in the TFT validation output
series = validation.x_to_index(x)["item_id"]

# At this point `x` and `raw_predictions` come from the TFT, so they are
# plotted with the TFT checkpoint rather than the DeepAR `best_model`.
# Plot at most 300 samples, never more than the validation set contains.
n_examples = min(300, len(series))
for idx in range(n_examples):
    print(idx)
    best_tft.plot_prediction(x, raw_predictions, idx=idx, add_loss_to_title=True)
    plt.suptitle(f"Series: {get_item_name(series.iloc[idx])[0]}")
    plt.show()