# M4 Dataset Benchmark Code
> Generic code to experiment and produce the final benchmark.py codes \


The minimum numbers of observations in the training
- 13 for yearly
- 16 for quarterly
- 42 for monthly
- 80 for weekly
- 93 for daily
- 700 for hourly series.



Cada batch é composto de sequencias de diferentes séries, com no máximo block_size steps. \
Caso a serie possuir menos de block_size instantes, realiza-se o padding

In [1]:
# python
import numpy as np
# ml
from sklearn.preprocessing import MinMaxScaler
import mlflow
# local
from models.benchmark import NaivePredictor
from models.cnn import SimpleCNN
from models.transformer import VanillaTransformer, DecoderOnlyTransformer
from utils.plot import plot_predictions
from utils.ml import EarlyStopperPercent
from experiment import Experiment
from utils.m4 import smape, mase
from utils.m4 import load_m4_data
import pandas as pd, numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler

## Experiment Definition

In [2]:
TRACK = False
PAD = -20
model_name = ['cnn','naive', 'vanilla_transformer','decoder_transformer'][3]
run_sp = 'Weekly'

In [3]:
def get_model(model_name, model_conf):
    if model_name == 'cnn':
        return SimpleCNN(model_conf['block_size'], model_conf['d_model'])
    elif model_name == 'naive':
        return NaivePredictor()
    elif model_name == 'vanilla_transformer':
        return  VanillaTransformer(model_conf)
    elif model_name == 'decoder_transformer':
        return  DecoderOnlyTransformer(model_conf)
    else: 
        raise Exception('Undefined Model')
    
    
if TRACK:
    mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
    mlflow.set_experiment(f"M4Benchmark {model_name}")
    mlflow.set_experiment_tag('model', model_name)


In [4]:
def get_x_y(data, block_size):
    if len(data) > block_size:
        idx_start = torch.arange(0, len(data)-block_size)
        # idx_start = torch.randint(len(data)-block_size, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in idx_start])
        y = torch.stack([data[i+1:i+block_size+1] for i in idx_start])
        x_pad = (x == PAD)
        return x, y, x_pad
    else: # need to pad
        x = np.pad(data, (0, block_size-len(data)), constant_values=PAD).reshape(1, -1)# batch
        # y = data[1:].reshape(1, -1)# batch
        y = np.pad(data[1:], (0, block_size-len(data[1:])), constant_values=PAD).reshape(1, -1)# batch
        x_pad = (x == PAD)
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32), torch.tensor(x_pad, dtype=torch.float32)
#
# Test
#
def test_get_x_y():
    data = torch.arange(1,200)
    x, y, x_pad = get_x_y(data, block_size=10)
    assert(x[0, 1] == y[0, 0]) # 

    x, y, x_pad = get_x_y(data, block_size=1000)
    assert (x.shape == (1, 1000))
    assert (x[0, len(data)+1] == PAD)

test_get_x_y() 

In [5]:

class MultiSerieGenerator():
    def __init__(self, freq, device, verbose=False):
        self.device = device
        self.verbose = verbose
        if verbose: print('Loading M4 Data...')
        self.data_dict, self.df_info = load_m4_data(freq)
        if verbose: print('Loaded:')
        for SP in freq:print(f"    => {SP} has {self.data_dict[SP]['num']} series")
    
    def get_batches(self, block_size, n_series=None, random=False, seed=None):
        if seed is not None:
            np.random.seed(seed)
        df_info, data_dict = self.df_info, self.data_dict
        if n_series is None:
            n_series = len(df_info)
        else:
            n_series = min(n_series, len(df_info))
        #
        if random:
            idx = np.random.randint(low=0, high=len(df_info), size=n_series)
        else:
            idx = range(n_series)
        if self.verbose: print(f'Generating {len(idx)} series..')
        # faz o scalind individual das series completas
        scaler = MinMaxScaler((-1, 1))
        batch_x, batch_y, batch_masks = [], [], []
        for serie_index in idx:
            
            serie_info = df_info.iloc[serie_index]
            serie_id = serie_info.M4id
            if self.verbose: print(serie_id, end=', ')
            serie_sp = serie_info.SP
            train_df = data_dict[serie_sp]['train']
            
            # the V1 column is the name of the serie
            train_serie = train_df[train_df.V1 == serie_id].dropna(axis=1).values.reshape(-1)[1:]
            #
            train_serie = scaler.fit_transform(np.asarray(train_serie, dtype=np.float32).reshape(-1, 1)).reshape(-1)
            train_serie = torch.tensor(train_serie, dtype=torch.float32)
            x, y, x_pad = get_x_y(train_serie, block_size=block_size)
            batch_x.append(x), batch_y.append(y), batch_masks.append(x_pad)
        #
        batch_x = torch.vstack(batch_x).unsqueeze(-1).to(self.device)
        batch_y = torch.vstack(batch_y).unsqueeze(-1).to(self.device)
        batch_masks = torch.vstack(batch_masks).to(self.device)

        return batch_x, batch_y, batch_masks#TransformerDataset(all_enc_x, all_dec_x, all_tgt_y)


In [6]:
from models.transformer import DecoderOnlyTransformer
from utils.ml import DecoderDataset

In [7]:
#
# Inicializations
#
block_size = 512
n_series = 500
#
# Model Hiperparams
#
model = DecoderOnlyTransformer({
    'd_model': 32, 
    'num_heads': 4, 
    'num_layers': 4,
    'dim_feedforward':128,
    'block_size':block_size,
    'device':'cuda',
    'pad_token':PAD
}).to('cuda')
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Num of weights:',pytorch_total_params)
train_dataset = DecoderDataset(*MultiSerieGenerator(['Weekly'], device='cuda',verbose=True).get_batches(block_size, n_series))

Num of weights: 67297
Loading M4 Data...
Loaded:
    => Weekly has 359 series
Generating 359 series..
W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, W16, W17, W18, W19, W20, W21, W22, W23, W24, W25, W26, W27, W28, W29, W30, W31, W32, W33, W34, W35, W36, W37, W38, W39, W40, W41, W42, W43, W44, W45, W46, W47, W48, W49, W50, W51, W52, W53, W54, W55, W56, W57, W58, W59, W60, W61, W62, W63, W64, W65, W66, W67, W68, W69, W70, W71, W72, W73, W74, W75, W76, W77, W78, W79, W80, W81, W82, W83, W84, W85, W86, W87, W88, W89, W90, W91, W92, W93, W94, W95, W96, W97, W98, W99, W100, W101, W102, W103, W104, W105, W106, W107, W108, W109, W110, W111, W112, W113, W114, W115, W116, W117, W118, W119, W120, W121, W122, W123, W124, W125, W126, W127, W128, W129, W130, W131, W132, W133, W134, W135, W136, W137, W138, W139, W140, W141, W142, W143, W144, W145, W146, W147, W148, W149, W150, W151, W152, W153, W154, W155, W156, W157, W158, W159, W160, W161, W162, W163, W164, W165, W166, W167, W168

In [8]:
# model = torch.load('decoder_only_weekly_1.model')

In [9]:
batch_size = 800
epochs = 100
lr = 1e-3
train_conf = {
    'epochs':epochs,
    'lr':lr, 
    'batch_size':batch_size,
    'verbose':True, # stop training if loss dont decrease 0.5% 5 consecutive steps
    # 'early_stop':EarlyStopperPercent(patience=5, min_percent=0.005, verbose=False),
    'train_dataset':train_dataset
}
model.fit(train_conf) 

Starting train. 272 batches 216888/800
Epoch 1/100 [497.712secs] -> Train loss: 0.04742
Epoch 2/100 [501.675secs] -> Train loss: 0.01139
Epoch 3/100 [506.052secs] -> Train loss: 0.00809
Epoch 4/100 [506.472secs] -> Train loss: 0.00696
Epoch 5/100 [506.294secs] -> Train loss: 0.00636
Epoch 6/100 [505.678secs] -> Train loss: 0.00599
Epoch 7/100 [504.128secs] -> Train loss: 0.00576


KeyboardInterrupt: 

## Test

In [None]:
from utils.m4 import M4DatasetGenerator
m4_data = M4DatasetGenerator(['Weekly'])
scaler = MinMaxScaler((-1,1))


@torch.no_grad()
def predict(self, src, forecast_horizon):
    self.eval()
    src = src[:, -self.block_size:, :].clone()
    h_len = src.shape[1]
    
    for i in range(forecast_horizon):
        x = src[:, -self.block_size:, :]
        y = self(x)[:, -1:, :]
        src = torch.concat((src, y), dim=1)
    return src[:, h_len:, :]

for train_serie, test_serie, serie_id, fh, eq, serie_sp in m4_data.generate(n_series=1, random=False):
    print(serie_id)
    scaler.fit(train_serie.reshape(-1, 1))    
    x = scaler.transform(train_serie.reshape(-1, 1)).reshape(1, -1, 1)
    x = torch.tensor(x, dtype=torch.float32).to('cuda')
    # predict(model, x, len(test_serie))
    pred_y = predict(model, x, len(test_serie))#.cpu().numpy()
    print(pred_y.shape, test_serie.shape)
    # # pred_y = model.predict(x, len(test_serie)).cpu().numpy()
    # pred_y = scaler.inverse_transform(pred_y.reshape(-1,1)).reshape(-1)
    # plot_predictions(train_serie, test_serie, pred_y)

Loading M4 Data...
Loaded:
    => Weekly has 359 series
W1
torch.Size([1, 13, 1]) (13,)


In [None]:
block_size = 512
m = MultiSerieGenerator(['Weekly'], device='cpu', verbose=True)
X, M = m.get_batches(block_size=block_size, n_series=2)
print(X.shape)

Loading M4 Data...
Loaded:
    => Weekly has 359 series
Generating 2 series..
W1, W1, 

ValueError: too many values to unpack (expected 2)

In [None]:
# test
last_train_values = train_serie[-block_size:]
pred_y = exp.predict(last_train_values, fh)

# check if negative or extreme (M4)
pred_y[pred_y < 0] = 0
pred_y[pred_y > (1000 * np.max(train_serie))] = np.max(train_serie)

# Metrics
metrics_table['serie_id'].append(serie_id)
metrics_table['smape'].append(smape(test_serie, pred_y)*100)
metrics_table['mase'].append(mase(train_serie, test_serie, pred_y, freq))
print(f'Serie {serie_id}-{serie_sp} Finished -> smape: {smape(test_serie, pred_y)*100} | mase:{mase(train_serie, test_serie, pred_y, freq)}')
# plot_predictions(train_serie, test_serie, pred_y)

#
metrics_dict = {
'smape_mean': np.round(np.mean(metrics_table['smape'], dtype=float), 3), 
'mase_mean':  np.round(np.mean(metrics_table['mase'], dtype=float), 3),
#
'smape_std':  np.round(np.std(metrics_table['smape'], dtype=float), 3),
'mase_std':   np.round(np.std(metrics_table['mase'], dtype=float), 3),
}
if TRACK:
mlflow.log_metrics(metrics_dict)
mlflow.log_table(metrics_table, artifact_file='metrics_table')

print(f'Full Pass {1+full_pass_i:5}:', end='')
for k, v in metrics_dict.items(): print(f'      {k}: {v}', end='')
print()

IndentationError: expected an indented block after 'if' statement on line 24 (186095145.py, line 25)