In [48]:
%load_ext autoreload
%autoreload 2
import contex

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
from series_datasets import GroupedSeriesDS
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [62]:
# Column roles used by the rest of the notebook.
time_idx = "time_idx"
target = "volume"
group_ids = ["agency", "sku"]

df = pd.read_parquet("../data/stallion.parquet")

# Month counter that starts at 0 for the earliest month in the data.
month_number = df["date"].dt.year * 12 + df["date"].dt.month
df["time_idx"] = month_number - month_number.min()

# Additional features. They are not among the columns kept by the selection below.
df["month"] = df["date"].dt.month.astype(str).astype("category")  # categories have to be strings
df["log_volume"] = np.log(df["volume"] + 1e-8)
for key in ("sku", "agency"):
    # Mean volume per (time step, sku) and per (time step, agency).
    df[f"avg_volume_by_{key}"] = (
        df.groupby(["time_idx", key], observed=True)["volume"].transform("mean")
    )

# Order by time, keep only the index, target and group columns, renumber rows.
df = (
    df.sort_values(by=time_idx)[[time_idx, target] + group_ids]
    .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,time_idx,volume,agency,sku
0,0,52.272,Agency_22,SKU_01
1,0,3324.2697,Agency_32,SKU_04
2,0,110.7,Agency_22,SKU_02
3,0,0.0,Agency_58,SKU_23
4,0,28.32,Agency_48,SKU_07
5,0,238.5387,Agency_22,SKU_05
6,0,0.0,Agency_58,SKU_17
7,0,126.36,Agency_31,SKU_01
8,0,475.790396,Agency_48,SKU_02
9,0,1.1502,Agency_40,SKU_04


In [63]:
# Number of rows in the prepared frame.
len(df)

21000

In [88]:
group_cols = ['agency', 'sku']

# For each group column, number its labels 0, 1, 2, ... in order of first
# appearance in the frame.
cat_to_int = {
    col: {label: code for code, label in enumerate(df[col].unique())}
    for col in group_cols
}

# Store the integer code of every row next to the original label column.
for col, label_to_code in cat_to_int.items():
    df[col + '_conv'] = df[col].map(label_to_code)

In [89]:
# Check that the *_conv columns hold the integer codes of agency / sku.
df.head()

Unnamed: 0,time_idx,volume,agency,sku,agency_conv,sku_conv
0,0,52.272,Agency_22,SKU_01,0,0
1,0,3324.2697,Agency_32,SKU_04,1,1
2,0,110.7,Agency_22,SKU_02,0,2
3,0,0.0,Agency_58,SKU_23,2,3
4,0,28.32,Agency_48,SKU_07,3,4


In [98]:
# Build the windowed dataset from the encoded frame: the integer-coded
# agency / sku columns identify the groups, `volume` is the target and
# `time_idx` the only numerical covariate. Window sizes are 3 lagged steps
# and a 3-step horizon (see the sample item printed further down).
ds = GroupedSeriesDS.from_dataframe(
    df,
    target_cols=['volume'],
    group_cols=['agency_conv', 'sku_conv'],
    num_covariate_cols=['time_idx'],
    lagged_window=3,
    horizon=3,
    keep_groups=True,
)

In [99]:
# Number of items the dataset exposes.
len(ds)

19250

In [100]:
# Fetch the first item to inspect the keys and tensor shapes the dataset returns.
itm = ds[0]
itm

{'lagged_categorical_covariates': tensor([[13,  0],
         [13,  0],
         [13,  0]]),
 'categorical_covariates': tensor([[13,  0],
         [13,  0],
         [13,  0]]),
 'lagged_numerical_covariates': tensor([[0.],
         [1.],
         [2.]], dtype=torch.float64),
 'numerical_covariates': tensor([[3.],
         [4.],
         [5.]], dtype=torch.float64),
 'lagged_targets': tensor([[ 80.6760],
         [ 98.0640],
         [133.7040]], dtype=torch.float64),
 'targets': tensor([[147.3120],
         [175.6080],
         [180.7920]], dtype=torch.float64)}

In [101]:
# Same categorical codes, cast to 32-bit integers.
itm['lagged_categorical_covariates'].int()

tensor([[13,  0],
        [13,  0],
        [13,  0]], dtype=torch.int32)

In [102]:
# Two equivalent ways to find the SKU label that received the largest code:
# by label (looking its code up) and by (label, code) pair.
(
    max(cat_to_int['sku'], key=lambda label: cat_to_int['sku'][label]),
    max(cat_to_int['sku'].items(), key=lambda pair: pair[1]),
)

('SKU_26', ('SKU_26', 24))

In [103]:
# Largest integer code assigned to a SKU (codes start at 0).
max(cat_to_int['sku'].values())

24

In [109]:
# Overwrite the first categorical column with 0, in place on the fetched tensor.
# Presumably done so every index fits the small SKU-sized embedding tried below.
# NOTE(review): whether this also changes the data held inside `ds` depends on
# how GroupedSeriesDS builds its items — confirm before reusing `ds`.
itm['lagged_categorical_covariates'][:, 0] = 0

In [112]:
# Confirm the first column is now all zeros.
itm['lagged_categorical_covariates']

tensor([[0, 0],
        [0, 0],
        [0, 0]])

In [110]:
# nn.Embedding accepts indices 0 .. num_embeddings - 1. SKU codes run from 0 to
# the largest code inclusive, so the table needs one row per SKU (largest
# code + 1); sizing it with the largest code would reject the last SKU.
emb = nn.Embedding(len(cat_to_int['sku']), 2)

In [113]:
# Look up a 2-d vector for every code; the result has one extra trailing axis of size 2.
emb(itm['lagged_categorical_covariates'])

tensor([[[-1.1855,  1.9368],
         [-1.1855,  1.9368]],

        [[-1.1855,  1.9368],
         [-1.1855,  1.9368]],

        [[-1.1855,  1.9368],
         [-1.1855,  1.9368]]], grad_fn=<EmbeddingBackward0>)

In [None]:
class PoissonReg(torch.nn.Module):
    """Feed-forward regressor trained with a Poisson negative log-likelihood.

    The network is Linear(in_features, 200) -> Dropout(0.3) ->
    Linear(200, out_features) -> Softplus, so every output is a
    non-negative value that is used as a Poisson rate.

    NOTE(review): there is no non-linearity between the two linear layers;
    confirm that this is intended.

    Args:
        in_features: Size of each input sample.
        out_features: Number of rates predicted per sample.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear_stack = nn.Sequential(
            nn.Linear(in_features, 200),
            nn.Dropout(0.3),
            nn.Linear(200, out_features),
            nn.Softplus() # nn.ReLU()
        )

    def forward(self, x):
        """Return the predicted Poisson rate(s) for the batch ``x``."""
        return self.linear_stack(x)

    def loss(self, x, y, eps=1e-8):
        """Mean Poisson negative log-likelihood of ``y`` given inputs ``x``.

        Args:
            x: Batch of inputs accepted by ``forward``.
            y: Observed counts, broadcastable against the predictions.
            eps: Lower bound applied to the predicted rate before it is
                handed to the distribution.

        Returns:
            Scalar tensor holding the mean negative log-likelihood.
        """
        # Softplus underflows to exactly 0 for very negative inputs, and a
        # zero rate gives log-probability -inf for any positive count. The
        # floor keeps the loss finite without touching ordinary predictions.
        rate = self(x).clamp_min(eps)
        dist = torch.distributions.Poisson(rate)
        return -dist.log_prob(y).mean()
    
def train_loop(dataloader, model, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the batch losses.

    Args:
        dataloader: Iterable of batches; each batch is a mapping holding the
            inputs under ``'x'`` and the targets under ``'y'``.
        model: Module exposing ``loss(x, y)`` that returns a scalar tensor.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        list[float]: The loss of every batch, in iteration order.
    """
    model.train()  # training mode, so layers such as dropout behave accordingly

    # Batches are moved to the device of the model's parameters (if it has any)
    # so a model placed on the GPU does not receive CPU tensors.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    loss_vals = []
    # Iterate the loader that was passed in, not a notebook-level global.
    for sample in dataloader:
        X = sample['x']
        y = sample['y']
        if target_device is not None:
            X, y = X.to(target_device), y.to(target_device)
        # Compute prediction and loss
        loss = model.loss(X, y)
        loss_vals.append(loss.item())
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss_vals

In [None]:
loader = DataLoader(train_set, batch_size=64, shuffle=True)
mdl = PoissonReg(train_set.in_dim, train_set.out_dim).to(device)
optim = torch.optim.Adam(mdl.parameters(), lr=1e-3)

patience = 100     # stop once the best loss is older than this many epochs
max_epochs = 1000  # hard cap on the number of training epochs
epoch_losses = []
# A bounded loop runs at most `max_epochs` epochs; `epochs` ends up holding
# the number of epochs actually trained.
for epochs in range(1, max_epochs + 1):
    losses = train_loop(loader, mdl, optim)
    epoch_losses.append(np.average(losses))
    # Early stopping: the overall best loss did not occur within the last
    # `patience` epochs, i.e. there has been no improvement for that long.
    if len(epoch_losses) > patience and min(epoch_losses) < min(epoch_losses[-patience:]):
        print('Break', epochs, min(epoch_losses), min(epoch_losses[-patience:]))
        break

plt.plot(epoch_losses)

# Predict in evaluation mode (dropout off) and without autograd bookkeeping;
# inputs go to the model's device and the result comes back to the CPU so it
# can be converted to NumPy.
mdl.eval()
with torch.no_grad():
    y_pred = mdl(torch.from_numpy(X).float().to(device)).cpu().numpy()
print(y_pred.mean(), y['n_orders_created'].mean())