In [4]:
import pandas as pd
from utils.plot import plot_serie, plot_predictions
import torch
from torch import nn

from layers.posencoding import positional_encoding

In [5]:
def to_sample(x):
    return torch.tensor(x, dtype=torch.float32).view(1, -1, 1).to('cuda')

PAD = -1e12
def get_x_y(data, block_size, step=1):
    data = torch.tensor(data, dtype=torch.float32).to('cuda')
    if len(data) > block_size:
        idx_start = torch.arange(0, len(data)-block_size, step)
        x = torch.stack([data[i:i+block_size] for i in idx_start])
        y = torch.stack([data[i+1:i+block_size+1] for i in idx_start])
        x_pad = (x == PAD)
        return x.unsqueeze(-1), y.unsqueeze(-1), x_pad

In [6]:
from models.transformer import PositionalEncoding

In [22]:

class DecoderOnlyTransformer(torch.nn.Module):
    def __init__(self, block_size):
        super(DecoderOnlyTransformer, self).__init__()
        self.block_size = block_size
        #
        d_model = 64
        num_layers = 3
        dim_feedforward = 64
        num_heads = 8
        # norm='BatchNorm'
        
        # self.pos_emb = positional_encoding(pe=None, learn_pe=True, num_patch=1, d_model=d_model)
#nn.Embedding(num_embeddings=block_size, embedding_dim=d_model)
        self.pos_enc = positional_encoding('sincos', True, block_size, d_model) #PositionalEncoding(d_model, dropout=0)
        #
        self.decoder_embedding = nn.Linear(in_features=1, out_features=d_model)
        # self.output_layer1 = nn.Linear(in_features=d_model, out_features=d_model)
        self.output_layer = nn.Linear(in_features=d_model, out_features=1)
        self.decoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model, 
                nhead=num_heads, 
                dim_feedforward=dim_feedforward,
                dropout=0.1, batch_first=True, device='cuda'), num_layers, norm=None)
    
    def num_weights(self, ): print('Num of weights:',sum(p.numel() for p in self.parameters() if p.requires_grad))
    
    def forward(self, src, mask=None, pad_mask=None):
        # src = self.decoder_embedding(src) + self.pos_emb(torch.arange(src.shape[1]).to('cuda'))# (B, T) --> (B, T, Emb)
        src = self.decoder_embedding(src) + self.pos_enc# (B, T) --> (B, T, Emb)
        # src = self.pos_enc(self.decoder_embedding(src))
 
        pred = self.decoder(src, mask=mask, src_key_padding_mask=pad_mask)
        pred = self.output_layer(pred)
        return pred

block_size = 250
model = DecoderOnlyTransformer(block_size).to('cuda')
model.num_weights()

serie = pd.read_csv('./data/AirPassengers.csv')['#Passengers'].values
train_serie = serie[:-30]
test_serie = serie[-30:]


serie = pd.read_csv('./data/solarpanelspower/PV_Elec_Gas2.csv').rename(columns={'Unnamed: 0':'date'}).set_index('date').Elec_kW.values
serie = ((serie-serie.mean()) / serie.std()) /4
print(len(serie), serie.max(), serie.min())

train_serie = serie[:2000]
test_serie = serie[2000:]

X, Y, M = get_x_y(train_serie, block_size, step=10)
print(X.shape, model(X).shape)


Num of weights: 91841
2948 0.7492105345967958 -0.7248135836847985
torch.Size([175, 250, 1]) torch.Size([175, 250, 1])


In [23]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

In [32]:
model.train()
for i in range(400):
      optimizer.zero_grad() # current batch zero-out the loss
      src_mask = nn.Transformer.generate_square_subsequent_mask(X.shape[1]).to('cuda')
      pred_y = model(X, mask=src_mask, pad_mask=M) # mask x is very very important!!!
      loss = loss_fn(pred_y, Y) # loss with padding
      
      loss.backward()
      optimizer.step()
      if i % 5 == 0:
        print(loss.item())


0.022890597581863403
0.02284262515604496
0.022764701396226883
0.022646794095635414
0.022677289322018623
0.022444287315011024
0.02248753234744072
0.022307511419057846
0.022212931886315346
0.022040100768208504
0.022042816504836082
0.02197580598294735
0.021952714771032333
0.02175196073949337
0.021697474643588066
0.02175058238208294
0.02155494876205921
0.021486366167664528
0.021498549729585648
0.0215382669121027
0.02143828757107258
0.021343693137168884
0.02138364128768444
0.02130691148340702
0.02124393731355667
0.021138742566108704
0.021098589524626732
0.021092616021633148
0.021055396646261215
0.020961860194802284
0.020990043878555298
0.020945465192198753
0.02093508653342724
0.020904947072267532
0.020824894309043884
0.02082512155175209
0.020819326862692833
0.020851988345384598
0.020756935700774193
0.020773230120539665
0.020764192566275597
0.02064567245543003
0.020671414211392403
0.020641935989260674
0.020695269107818604
0.02064160630106926
0.020616643130779266
0.02061108686029911
0.0206402

In [34]:
with torch.no_grad():
    model.eval()
    src = train_serie[-block_size:]
    h_len = len(src)
    src = to_sample(src)
    for i in range(500):
        # src_mask = nn.Transformer.generate_square_subsequent_mask(block_size).to('cuda')
        y = model(src[:, -block_size:, :])[:,-1:, :]
        src = torch.concat((src, y), dim=1)
    pred_y = src[:,h_len:,:].squeeze().cpu().numpy()

plot_predictions(train_serie,test_serie, pred_y)
    # plot_predictions(train_serie, test_serie, pred_y)

In [1]:
from utils.plot import plot_predictions, plot_serie
import pandas as pd
import numpy as np
from utils.timeserie import generate_time_series
from models.transformer import DecoderOnlyTransformer
from utils.ml import DecoderDataset
# from utils.m4 import get_x_y
from sklearn.preprocessing import MinMaxScaler
import torch
import tqdm

Load Data

In [2]:
# solar_power = pd.read_csv('./data/solarpanelspower/PV_Elec_Gas2.csv').rename(columns={'Unnamed: 0':'date'}).set_index('date')
# train_serie = solar_power[:'2014-10-31'].Elec_kW.values
# val_serie = solar_power['2014-11-01':'2015-11-18'].Elec_kW.values
# # test_set = solar_power['2019-11-18':]
# print('Proportion of train_set : {:.2f}%'.format(len(train_serie)/len(solar_power)))
# print('Proportion of valid_set : {:.2f}%'.format(len(val_serie)/len(solar_power)))
# # print('Proportion of test_set : {:.2f}%'.format(len(test_set)/len(solar_power)))

solar_power = generate_time_series(2000, seasonal_strength=np.random.randn(), trend_strength=np.random.randn(),
                                   noise_strength=0.00, scale_fac=1.0) 
train_serie = solar_power[:1500]
val_serie = solar_power[1500:]
plot_serie(train_serie)


Train Model

In [3]:

PAD = -20
def get_x_y(data, block_size):
    if len(data) > block_size:
        ten_percent_step = int(len(data)*0.1)
        idx_start = torch.arange(0, len(data)-block_size, ten_percent_step)
        # idx_start = torch.randint(len(data)-block_size, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in idx_start])
        y = torch.stack([data[i+1:i+block_size+1] for i in idx_start])
        x_pad = (x == PAD)
        return x, y, x_pad
    else: # need to pad
        x = np.pad(data, (0, block_size-len(data)), constant_values=PAD).reshape(1, -1)# batch
        # y = data[1:].reshape(1, -1)# batch
        y = np.pad(data[1:], (0, block_size-len(data[1:])), constant_values=PAD).reshape(1, -1)# batch
        x_pad = (x == PAD)
        x[x_pad] = 0.0
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32), torch.tensor(x_pad, dtype=torch.float32)
#

In [24]:


block_size = 128
model = DecoderOnlyTransformer({
    'd_model': 16, 
    'num_heads': 4, 
    'num_layers': 4,
    'dim_feedforward':64,
    'block_size':block_size,
    'device':'cuda',
    'pad_token':PAD
}).to('cuda')
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Num of weights:',pytorch_total_params)


Num of weights: 13169


In [25]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
scaler = MinMaxScaler((0,1))
def scale(raw_serie, sc, train=False):
  if train:
    return sc.fit_transform(raw_serie.reshape(-1, 1)).reshape(-1)
  return sc.transform(raw_serie.reshape(-1, 1)).reshape(-1)

def get_deltas(x):
  return x[1:] - x[:-1]

def unscale(raw_serie, sc):
    return sc.inverse_transform(raw_serie.reshape(-1,1)).reshape(-1)

In [26]:
train_x = scale(get_deltas(train_serie), sc=scaler, train=True)
# train_x = scale(train_serie, sc=scaler, train=True)

train_x = torch.tensor(train_x, dtype=torch.float32)
x, y, m = get_x_y(train_x, block_size)
print(train_serie.shape)
print(x.shape)

(1500,)
torch.Size([10, 128])


In [33]:
epochs = 4000
lr = 0.001
train_conf = {
    'epochs':epochs,
    'lr':lr, 
    'batch_size':100,
    'verbose':True,
    'train_dataset':DecoderDataset(x.unsqueeze(-1), y.unsqueeze(-1), m),
    'save':False,
}
model.fit(train_conf) 

Starting train. 1 batches 10/100
Epoch 1/4000 [0.076secs] -> Train loss: 0.00087
Epoch 2/4000 [0.052secs] -> Train loss: 0.00362
Epoch 3/4000 [0.056secs] -> Train loss: 0.00193
Epoch 4/4000 [0.087secs] -> Train loss: 0.00208
Epoch 5/4000 [0.048secs] -> Train loss: 0.00160
Epoch 6/4000 [0.025secs] -> Train loss: 0.00184
Epoch 7/4000 [0.024secs] -> Train loss: 0.00173
Epoch 8/4000 [0.047secs] -> Train loss: 0.00134
Epoch 9/4000 [0.073secs] -> Train loss: 0.00106
Epoch 10/4000 [0.068secs] -> Train loss: 0.00128
Epoch 11/4000 [0.030secs] -> Train loss: 0.00167
Epoch 12/4000 [0.029secs] -> Train loss: 0.00132
Epoch 13/4000 [0.025secs] -> Train loss: 0.00104
Epoch 14/4000 [0.048secs] -> Train loss: 0.00098
Epoch 15/4000 [0.037secs] -> Train loss: 0.00107
Epoch 16/4000 [0.055secs] -> Train loss: 0.00099
Epoch 17/4000 [0.019secs] -> Train loss: 0.00120
Epoch 18/4000 [0.018secs] -> Train loss: 0.00111
Epoch 19/4000 [0.026secs] -> Train loss: 0.00120
Epoch 20/4000 [0.026secs] -> Train loss: 0.00

KeyboardInterrupt: 

In [35]:

pred_y = model.predict(train_x.to('cuda').view(1, -1, 1), len(val_serie))
pred_y = pred_y.cpu().numpy()
pred_y = unscale(pred_y, scaler).cumsum()
# pred_y = unscale(pred_y, scaler)

# pred_y = pred_y.cumsum()
plot_predictions(train_serie, val_serie, pred_y)

In [75]:
# from torch import nn
# with torch.no_grad():
#     src_mask = nn.Transformer.generate_square_subsequent_mask(block_size).to('cuda')
#     x = train_x[-block_size:]
#     pred_y = model(x.to('cuda').view(1, -1, 1))
#     pred_y = pred_y.cpu().numpy()
#     for i in range(block_size-1, 0, -1):
#         print(x[i], '-->', pred_y[0,i])



In [67]:
from torch import nn
@torch.no_grad()
def predict(self, src, forecast_horizon):
        self.eval()
        src_mask = nn.Transformer.generate_square_subsequent_mask(block_size).to('cuda')
        src = src[:, -self.block_size:, :].clone()
        h_len = src.shape[1]
        
        for i in range(forecast_horizon):
            x = src[:, -self.block_size:, :]
            y = self(x, mask=src_mask)[:, -1:, :]
            src = torch.concat((src, y), dim=1)
        return src[:, :, :]
            
pred_y = predict(model, train_x[-block_size:].to('cuda').view(1, -1, 1), 100)
pred_y = pred_y.cpu().numpy()
pred_y = scaler.inverse_transform(pred_y.reshape(-1,1)).reshape(-1)
plot_serie(pred_y)

In [19]:

# pred_y = model.predict(train_x.to('cuda').view(1, -1, 1), len(val_serie))
# pred_y = pred_y.cpu().numpy()
# pred_y = scaler.inverse_transform(pred_y.reshape(-1,1)).reshape(-1)
# print(pred_y.sum())
plot_predictions(train_serie[-block_size:], val_serie, pred_y)

## Transfer Learning

In [6]:


block_size = 48
PAD = -20
model = DecoderOnlyTransformer({
    'd_model': 8, 
    'num_heads': 4, 
    'num_layers': 4,
    'dim_feedforward':128,
    'block_size':block_size,
    'device':'cuda',
    'pad_token':PAD
}).to('cuda')
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Num of weights:',pytorch_total_params)


Num of weights: 10425


In [7]:
def generate_batch(block_size, n_series=10, random=False, seed=None):
    scaler = MinMaxScaler((-1, 1))
    batch_x, batch_y, batch_masks = [], [], []
    for _ in range(n_series):
        train_serie = generate_time_series(1000, seasonal_strength=np.random.randn(), trend_strength=np.random.randn(),
                                noise_strength=0.00, scale_fac=np.random.randn())            #
        train_serie = scaler.fit_transform(np.asarray(train_serie, dtype=np.float32).reshape(-1, 1)).reshape(-1)
        train_serie = torch.tensor(train_serie, dtype=torch.float32)
        x, y, x_pad = get_x_y(train_serie, block_size=block_size)
        batch_x.append(x), batch_y.append(y), batch_masks.append(x_pad)
    #
    batch_x = torch.vstack(batch_x).unsqueeze(-1).to('cuda')
    batch_y = torch.vstack(batch_y).unsqueeze(-1).to('cuda')
    batch_masks = torch.vstack(batch_masks).to('cuda')

    return batch_x, batch_y, batch_masks




In [119]:
np.random.seed(101)
for i in range(10):
    lr = 1e-4
    train_conf = {
        'epochs':1024,
        'lr':lr, 
        'batch_size':10_000,
        'verbose':True,
        'train_dataset':DecoderDataset(*generate_batch(block_size, n_series=100)),
        'save':False,
    }
    model.fit(train_conf) 


Starting train. 1 batches 1000/10000
Epoch 1/1024 [0.079secs] -> Train loss: 0.00504
Epoch 2/1024 [0.081secs] -> Train loss: 0.00490
Epoch 3/1024 [0.083secs] -> Train loss: 0.00488
Epoch 4/1024 [0.075secs] -> Train loss: 0.00490
Epoch 5/1024 [0.075secs] -> Train loss: 0.00489
Epoch 6/1024 [0.075secs] -> Train loss: 0.00487
Epoch 7/1024 [0.074secs] -> Train loss: 0.00497
Epoch 8/1024 [0.073secs] -> Train loss: 0.00490
Epoch 9/1024 [0.086secs] -> Train loss: 0.00491
Epoch 10/1024 [0.084secs] -> Train loss: 0.00483
Epoch 11/1024 [0.084secs] -> Train loss: 0.00481
Epoch 12/1024 [0.084secs] -> Train loss: 0.00484
Epoch 13/1024 [0.084secs] -> Train loss: 0.00487
Epoch 14/1024 [0.084secs] -> Train loss: 0.00478
Epoch 15/1024 [0.084secs] -> Train loss: 0.00486
Epoch 16/1024 [0.084secs] -> Train loss: 0.00484
Epoch 17/1024 [0.084secs] -> Train loss: 0.00485
Epoch 18/1024 [0.083secs] -> Train loss: 0.00478
Epoch 19/1024 [0.085secs] -> Train loss: 0.00491
Epoch 20/1024 [0.084secs] -> Train loss: 

KeyboardInterrupt: 

Test

In [127]:
N_SERIES = 1

# np.random.seed(1)

scaler = MinMaxScaler((-1,1))
for i in range(N_SERIES):
    serie = generate_time_series(1000, seasonal_strength=np.random.randn(), trend_strength=np.random.randn(),
                                noise_strength=0.00, scale_fac=np.random.randn())   

    history, future = serie[:500], serie[500:]
    x = torch.tensor(scaler.fit_transform(history.reshape(-1, 1)).reshape(-1), dtype=torch.float32).to('cuda').view(1, -1, 1)
        
    y = model.predict(x, len(future)).cpu().numpy()
    y = scaler.inverse_transform(y.reshape(-1,1)).reshape(-1)
    plot_predictions(history, future, y)

In [1]:
import numpy as np
import torch
from torch import nn
import math, time, tqdm
from torch.utils.data import DataLoader
import plotly.graph_objects as go

PAD = -777

In [2]:

# Função para gerar uma série temporal aleatória com tendência, ruído e sazonalidade
def generate_time_series(length=100, trend_strength=0.0, seasonal_strength=0.0, noise_strength=1.0, scale=2):
    time = np.arange(length)
    trend = trend_strength * time / length
    seasonal = seasonal_strength * np.sin(scale * np.pi * time / 12)
    noise = noise_strength * np.random.randn(length)
    series = trend + seasonal + noise
    return series

THEME = 'plotly_dark'
def plot_serie(x):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=np.arange(0, len(x)), y=x, mode='lines',
                              name=f'Serie', line=dict(color='#0099ff'), hovertemplate='$%{y:.2f}')) # past values
    # Layout and configurations
    config = {
        'template':THEME,
        'hovermode':'x unified',
        'xaxis_rangeselector_font_color':'black',
        'legend':dict(orientation="h",yanchor="bottom",y=1.02,xanchor="right",x=0.9),
        }
    fig.update_layout(config)
    fig.layout.yaxis.fixedrange = True # block vertical zoom
    fig.show()


def plot_predictions(history, real_serie, pred_y):
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=np.arange(0, len(history)), y=history, mode='lines',
                              name=f'History', line=dict(color='#0099ff'), hovertemplate='$%{y:.2f}')) # past values

    fig.add_trace(go.Scatter(x=np.arange(len(history), len(history)+len(real_serie)), y=real_serie, mode='lines',
                              name=f'Future', line=dict(color='yellow'), hovertemplate='$%{y:.2f}')) # real future

    fig.add_trace(go.Scatter(x=np.arange(len(history), len(history)+len(pred_y)), y=pred_y, mode='lines',
                            name='Forecasted', line=dict(color='red'), hovertemplate='$%{y:.2f}')) # predicted

    # Layout and configurations
    config = {
        'template':THEME,
        'hovermode':'x unified',
        'xaxis_rangeselector_font_color':'black',
        'legend':dict(orientation="h",yanchor="bottom",y=1.02,xanchor="right",x=0.9),
        }
    fig.update_layout(config)
    fig.layout.yaxis.fixedrange = True # block vertical zoom
    fig.show()



def get_x_y(data, block_size, perc_destilation=None):
  if len(data) > block_size:
    if perc_destilation is not None:
      step = int(len(data)*perc_destilation) # step
    else:
      step = 1
    idx_start = torch.arange(0, len(data)-block_size, step)
    # idx_start = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in idx_start])
    y = torch.stack([data[i+1:i+block_size+1] for i in idx_start])
    x_pad = (x == PAD)
    return x, y, torch.tensor(x_pad, dtype=torch.bool)
  else: # need to pad
    x = np.pad(data, (0, block_size-len(data)), constant_values=PAD).reshape(1, -1)# batch
    # y = data[1:].reshape(1, -1)# batch
    y = np.pad(data[1:], (0, block_size-len(data[1:])), constant_values=PAD).reshape(1, -1)# batch
    x_pad = (x == PAD)
    return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32), torch.tensor(x_pad, dtype=torch.bool)


#
# Dada uma serie, gerar chunks de tamanhos aleatorios
def generate_chunks(x, min, max, block_size, samples):
  X, Y, M = [],[],[]
  for i in range(samples):
    data = sample_sequence(serie, min, max)
    x = np.pad(data, (0, block_size-len(data)), constant_values=PAD).reshape(1, -1)# batch
    y = np.pad(data[1:], (0, block_size-len(data[1:])), constant_values=PAD).reshape(1, -1)# batch
    x_pad = (x == PAD)
    X.append(x); Y.append(y); M.append(x_pad)
  X = torch.tensor(np.vstack(X), dtype=torch.float32).unsqueeze(-1)
  Y = torch.tensor(np.vstack(Y), dtype=torch.float32).unsqueeze(-1)
  M = torch.tensor(np.vstack(M), dtype=torch.float32)
  return X, Y, M
# generate_chunks(train_serie, min=10, max=13, block_size=block_size, samples=1_000)[0].shape

In [3]:
from torch.utils.data import Dataset
class DecoderDataset(Dataset):
    def __init__(self, x, y, mask):
        self.x = x
        self.y = y
        self.mask = mask

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self,idx):
        return self.x[idx], self.y[idx], self.mask[idx]

def sample_sequence(serie, min, max):
  block_size =  np.random.randint(min, max)
  idx = np.random.randint(0, len(serie)-block_size)
  return serie[idx:idx+block_size]

def sample_sequence(serie, min, max):
  block_size =  np.random.randint(min, max)
  idx = np.random.randint(0, len(serie)-min)
  return serie[idx:idx+block_size]


In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler((-1,1))
def scale(raw_serie, sc, train=False):
  if train:
    return sc.fit_transform(raw_serie.reshape(-1, 1)).reshape(-1)
  return sc.transform(raw_serie.reshape(-1, 1)).reshape(-1)

def get_deltas(x):
  return x[1:] - x[:-1]

def unscale(raw_serie, sc):
    return sc.inverse_transform(raw_serie.reshape(-1,1)).reshape(-1)

In [6]:
from models.transformer import DecoderOnlyTransformer

In [7]:
scaler = MinMaxScaler((-1,1))

# generate
serie = generate_time_series(length=2000, trend_strength=0.5, seasonal_strength=0.1, noise_strength=0.0, scale=1)
# split
train_serie_raw, test_serie_raw = serie[:1500], serie[1500:]
# scale
train_serie = scale(train_serie_raw, scaler, train=True)
test_serie = scale(test_serie_raw, scaler)
# diff
train_serie = get_deltas(train_serie)
test_serie = get_deltas(test_serie)
#
#
#
block_size = 48 # bigger than train serie
model = DecoderOnlyTransformer({
    'd_model': 16,
    'num_heads': 2,
    'num_layers': 4,
    'dim_feedforward':16,
    'block_size':block_size,
    'device':'cuda',
    'pad_token': PAD
}).to('cuda')
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Num of weights:',pytorch_total_params)


Num of weights: 7601


In [9]:
train_dataset = DecoderDataset(*get_x_y(
    torch.tensor(train_serie, dtype=torch.float32), block_size))

  return x, y, torch.tensor(x_pad, dtype=torch.bool)


In [8]:
train_dataset = DecoderDataset(*
                               generate_chunks(train_serie,
                                               min=block_size//10, max=block_size,
                                               block_size=block_size,
                                               samples=1000))

In [10]:
batch_size = 1000
epochs = 100
lr = 1e-3
train_conf = {
    'epochs':epochs,
    'lr':lr,
    'batch_size':batch_size,
    'verbose':True,
    'train_dataset':train_dataset
}
model.fit(train_conf)

Starting train. 2 batches 1451/1000


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1000x48 and 1x16)

tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0',
       grad_fn=<SqueezeBackward0>)

In [25]:


y = model.predict(src, len(test_serie_raw)).cpu().numpy()
y = y.cumsum()
y = unscale(y, scaler)
plot_predictions(train_serie_raw, test_serie_raw, y)

In [34]:
m = DecoderOnlyTransformer({
    'd_model': 16,
    'num_heads': 2,
    'num_layers': 4,
    'dim_feedforward':16,
    'block_size':block_size,
    'device':'cuda',
    'pad_token': PAD
}).to('cuda')
m = m.eval()

In [137]:
model = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=1,
                nhead=1, 
                dim_feedforward=12,
                dropout=0, batch_first=True, device='cuda'), 
        2, norm=None)