In [3]:
import torch
from torch.nn import LSTM
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.func import functional_call
from torch.nn.functional import normalize

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

from dataset import TSDataset
from models import LSTMModel, AttentionModel, AttentionPMModel
from train_utils import train_and_test
from neural_memory import NeuralMemory
from titans import MACTitanLayer, MACTitan

from tqdm import tqdm
from copy import deepcopy

# Autograd is enabled by default in PyTorch; the explicit call protects against
# a kernel in which an earlier run left gradient tracking switched off.
torch.set_grad_enabled(True)

# Seed the random number generators once, up front, so that a
# "Restart Kernel -> Run All" reproduces weight initialisation and the order
# of shuffled batches. Without this the metrics of the models trained below
# are not comparable from one run to the next.
SEED = 0
torch.manual_seed(SEED)
# Kept as the last statement of the cell on purpose: it returns None, so the
# cell no longer echoes the repr of the set_grad_enabled context manager.
np.random.seed(SEED)

# ===========================================================================================

<torch.autograd.grad_mode.set_grad_enabled at 0x7f1154bb2d10>

In [5]:
# Mini-batch size shared by the three loaders below.
BATCH_SIZE = 4

# The three splits of the 'weather' dataset.
# NOTE(review): the meaning of the third positional argument (False) is
# defined by TSDataset in dataset.py — confirm there before changing it.
train_data = TSDataset('weather', 'train', False)
valid_data = TSDataset('weather', 'valid', False)
test_data = TSDataset('weather', 'test', False)

# Only the training loader shuffles. Validation and test batches are served in
# dataset order so that model selection and the reported test metric are
# evaluated on the same batch sequence every time; shuffling evaluation data
# adds run-to-run variation without any benefit.
_train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
_valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)
_test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [4]:
# Baseline 1: LSTM model.
# The module printed by this cell has a 5 -> 16 embedding layer feeding a
# 16-unit LSTM, which accounts for the first two constructor arguments.
# NOTE(review): the third argument (16) is presumably the input sequence
# length — confirm against models.py.
model = LSTMModel(5, 16, 16)
optimizer = optim.Adam(params=model.parameters())

# The last positional argument is the number of epochs (the progress bar of
# this cell counts 50 steps). The call is deliberately the final expression
# so that the returned module is displayed as the cell output.
train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    50,
)

Training:


100%|██████████| 50/50 [00:37<00:00,  1.32it/s]


 Testing the best model:
Test MSE: 0.035601225142416204




  result = _VF.lstm(


LSTMModel(
  (emb_layer): Linear(in_features=5, out_features=16, bias=True)
  (relu): ReLU()
  (lstm): LSTM(16, 16, batch_first=True)
  (final_layer): Linear(in_features=16, out_features=1, bias=True)
)

In [5]:
# Baseline 2: Transformer-encoder model.
# The printed module shows a 5 -> 16 embedding, two encoder layers of width 16
# and a final linear layer with 256 (= 16 * 16) inputs.
# NOTE(review): the third argument (16) therefore looks like the sequence
# length whose per-step features are flattened — confirm against models.py.
model = AttentionModel(5, 16, 16)
optimizer = optim.Adam(params=model.parameters())

# Train for 50 epochs (last positional argument), then evaluate on the test
# loader. Kept as the final expression so the returned module is displayed.
train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    50,
)

Training:


  indices = torch.from_numpy(sliding_window_view(np.arange(total_len), self.seq_len))
100%|██████████| 50/50 [09:42<00:00, 11.64s/it]



 Testing the best model:
Test MSE: 0.03525110926283033



AttentionModel(
  (emb_layer): Linear(in_features=5, out_features=16, bias=True)
  (att_layers): ModuleList(
    (0-1): 2 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
      )
      (linear1): Linear(in_features=16, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=16, bias=True)
      (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (activation): SiLU()
    )
  )
  (final_layer): Linear(in_features=256, out_features=1, bias=True)
)

In [6]:
# Baseline 3: Transformer-encoder model with an extra fourth constructor
# argument. The printed module has a final linear layer with 320 inputs,
# i.e. 16 * (16 + 4).
# NOTE(review): this suggests the fourth argument (4) is a number of extra
# positions added to the length-16 sequence (presumably persistent-memory
# tokens, cf. `pm_len` on MACTitan) — confirm against models.py.
model = AttentionPMModel(5, 16, 16, 4)
optimizer = optim.Adam(params=model.parameters())

# Train for 50 epochs (last positional argument), then evaluate on the test
# loader. Kept as the final expression so the returned module is displayed.
train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    50,
)

Training:


100%|██████████| 50/50 [11:45<00:00, 14.11s/it]



 Testing the best model:
Test MSE: 0.03569375570667417



AttentionPMModel(
  (emb_layer): Linear(in_features=5, out_features=16, bias=True)
  (att_layers): ModuleList(
    (0-1): 2 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
      )
      (linear1): Linear(in_features=16, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=16, bias=True)
      (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (activation): SiLU()
    )
  )
  (final_layer): Linear(in_features=320, out_features=1, bias=True)
)

In [6]:
# MACTitan run with n_layers_nmm = 2.
# Hyperparameters are collected in one mapping so the configuration can be
# read (and compared with other runs) at a glance.
titan_config = dict(
    input_dim=5,
    hidden_dim=16,
    output_dim=1,
    context_window=16,
    pm_len=4,
    n_layers=2,
    n_layers_nmm=2,
    eta=0.9,
    theta=0.01,
)
# The model is placed on the GPU before the optimizer is created.
model = MACTitan(**titan_config).cuda()

# Adam receives only what the model exposes as `outer_params`, not
# `model.parameters()`.
optimizer = optim.Adam(model.outer_params)

# 10 epochs. NOTE(review): the trailing True presumably switches on the
# per-epoch validation-loss logging seen in this cell's output — confirm
# against train_utils.py.
best_model = train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    10,
    True,
)

Training:


  stz = torch.from_numpy(x[:,:-residual].reshape(x.shape[0], -1, self.context_window, self.context_window, self.input_dim)).cuda()
 10%|█         | 1/10 [01:35<14:18, 95.44s/it]

Validation loss in Epoch 0: 0.03467170451032488


 20%|██        | 2/10 [03:10<12:40, 95.01s/it]

Validation loss in Epoch 1: 0.03373166181539235


 30%|███       | 3/10 [04:46<11:09, 95.64s/it]

Validation loss in Epoch 2: 0.03181907081682431


 40%|████      | 4/10 [06:20<09:31, 95.17s/it]

Validation loss in Epoch 3: 0.02920794567387355


 50%|█████     | 5/10 [07:55<07:54, 94.95s/it]

Validation loss in Epoch 4: 0.02990819851034566


 60%|██████    | 6/10 [09:29<06:18, 94.72s/it]

Validation loss in Epoch 5: 0.03043873063043544


 70%|███████   | 7/10 [11:03<04:42, 94.30s/it]

Validation loss in Epoch 6: 0.032751848470223575


 80%|████████  | 8/10 [12:37<03:08, 94.23s/it]

Validation loss in Epoch 7: 0.032473287613768324


 90%|█████████ | 9/10 [14:11<01:34, 94.10s/it]

Validation loss in Epoch 8: 0.036109270429924914


100%|██████████| 10/10 [15:45<00:00, 94.57s/it]

Validation loss in Epoch 9: 0.03277809196396878

 Testing the best model:





Test MSE: 0.03569832993181128



In [7]:
# MACTitan run with n_layers_nmm = 1; every other hyperparameter is unchanged
# from the n_layers_nmm = 2 configuration.
titan_config = dict(
    input_dim=5,
    hidden_dim=16,
    output_dim=1,
    context_window=16,
    pm_len=4,
    n_layers=2,
    n_layers_nmm=1,
    eta=0.9,
    theta=0.01,
)
# The model is placed on the GPU before the optimizer is created.
model = MACTitan(**titan_config).cuda()

# Adam receives only what the model exposes as `outer_params`, not
# `model.parameters()`.
optimizer = optim.Adam(model.outer_params)

# 10 epochs. NOTE(review): the trailing True presumably switches on the
# per-epoch validation-loss logging seen in this cell's output — confirm
# against train_utils.py.
best_model = train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    10,
    True,
)

Training:


 10%|█         | 1/10 [01:24<12:41, 84.57s/it]

Validation loss in Epoch 0: 0.031554996830068134


 20%|██        | 2/10 [02:50<11:24, 85.60s/it]

Validation loss in Epoch 1: 0.02973860088539751


 30%|███       | 3/10 [04:20<10:10, 87.28s/it]

Validation loss in Epoch 2: 0.03255043621910246


 40%|████      | 4/10 [05:45<08:38, 86.48s/it]

Validation loss in Epoch 3: 0.028653166717604588


 50%|█████     | 5/10 [07:09<07:08, 85.68s/it]

Validation loss in Epoch 4: 0.0312347532495072


 60%|██████    | 6/10 [08:34<05:41, 85.38s/it]

Validation loss in Epoch 5: 0.03278367313507356


 70%|███████   | 7/10 [09:59<04:15, 85.19s/it]

Validation loss in Epoch 6: 0.02949460878183967


 80%|████████  | 8/10 [11:24<02:50, 85.25s/it]

Validation loss in Epoch 7: 0.03408910743892193


 90%|█████████ | 9/10 [12:49<01:25, 85.22s/it]

Validation loss in Epoch 8: 0.03377002207072158


100%|██████████| 10/10 [14:15<00:00, 85.50s/it]

Validation loss in Epoch 9: 0.03529415703133533

 Testing the best model:





Test MSE: 0.037331398498070864

