In [1]:
import torch
from torch.nn import LSTM
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.func import functional_call
from torch.nn.functional import normalize

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

from dataset import TSDataset
from models import LSTMModel, AttentionModel
from train_utils import train_and_test
from neural_memory import NeuralMemory
from titans import MACTitanLayer, MACTitan

from tqdm import tqdm
from copy import deepcopy

torch.set_grad_enabled(True)

# Seed the RNGs behind weight initialisation and DataLoader shuffling so that
# "Restart Kernel -> Run All" gives the same numbers each time (up to any
# non-deterministic GPU kernels).
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# ===========================================================================================

# Mini-batch size shared by all three loaders.
BATCH_SIZE = 4

# NOTE(review): the meaning of the third positional argument (False) is defined
# in dataset.TSDataset and is not visible from this notebook.
train_data = TSDataset('sinwave', 'train', False)
valid_data = TSDataset('sinwave', 'valid', False)
test_data = TSDataset('sinwave', 'test', False)

# Only the training split is shuffled. Shuffling the evaluation splits brings
# no benefit and makes the batch composition (and hence the reported
# validation/test numbers) vary between runs.
_train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
_valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)
_test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [23]:
# Baseline 1: LSTM forecaster. The positional arguments are passed through
# unchanged; the printed module shows a 1 -> 16 embedding feeding a 16-unit
# LSTM and a 16 -> 1 output layer.
model = LSTMModel(1, 16, 16)

# Adam with PyTorch's default hyper-parameters over every model parameter.
optimizer = optim.Adam(params=model.parameters())

# The last positional argument (50) matches the length of the progress bar,
# so it is presumably the number of training epochs. The call is kept as the
# final expression so whatever it returns is still rendered as cell output.
train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    50,
)

Training:


100%|██████████| 50/50 [00:24<00:00,  2.07it/s]


 Testing the best model:
Test MSE: 0.015290285992835249






LSTMModel(
  (emb_layer): Linear(in_features=1, out_features=16, bias=True)
  (relu): ReLU()
  (lstm): LSTM(16, 16, batch_first=True)
  (final_layer): Linear(in_features=16, out_features=1, bias=True)
)

In [24]:
# Baseline 2: attention-only forecaster, built with the same three positional
# arguments as the LSTM baseline. The printed module shows a 1 -> 16 embedding,
# two TransformerEncoderLayer blocks and a 256 -> 1 output layer.
model = AttentionModel(1, 16, 16)

# Adam with PyTorch's default hyper-parameters over every model parameter.
optimizer = optim.Adam(params=model.parameters())

# The last positional argument (50) matches the length of the progress bar,
# so it is presumably the number of training epochs. The call is kept as the
# final expression so whatever it returns is still rendered as cell output.
train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    50,
)

Training:


100%|██████████| 50/50 [06:38<00:00,  7.97s/it]



 Testing the best model:
Test MSE: 0.03654162958264351



AttentionModel(
  (emb_layer): Linear(in_features=1, out_features=16, bias=True)
  (att_layers): ModuleList(
    (0-1): 2 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
      )
      (linear1): Linear(in_features=16, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=16, bias=True)
      (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (activation): SiLU()
    )
  )
  (final_layer): Linear(in_features=256, out_features=1, bias=True)
)

In [25]:
# Titans "memory as context" model. Keyword names are those accepted by
# titans.MACTitan; the printed module shows 2 MACTitanLayer blocks, each with
# an attention layer and a NeuralMemory sub-module.
# NOTE(review): eta / theta are presumably the neural-memory momentum decay and
# inner-loop step size - confirm against neural_memory.py.
model = MACTitan(
    input_dim=1,
    hidden_dim=16,
    output_dim=1,
    context_window=16,
    pm_len=4,
    n_layers=2,
    n_layers_nmm=2,
    eta=0.9,
    theta=0.01,
)

# Move to the GPU only when one is present. An unconditional .cuda() raises
# on CPU-only machines; with a GPU this performs exactly the same call as before.
if torch.cuda.is_available():
    model = model.cuda()

# Only `model.outer_params` is handed to Adam.
# NOTE(review): presumably the parameters trained by the outer loop, as opposed
# to memory weights updated inside the model - confirm in titans.py.
optimizer = optim.Adam(model.outer_params)

# The last positional argument (25) matches the length of the progress bar,
# so it is presumably the number of training epochs. The call is kept as the
# final expression so whatever it returns is still rendered as cell output.
train_and_test(model, optimizer, _train_loader, _valid_loader, _test_loader, 25)

Training:


  4%|▍         | 1/25 [01:04<25:40, 64.17s/it]

Validation loss in Epoch 0: 0.0464873415018831


  8%|▊         | 2/25 [02:09<24:49, 64.75s/it]

Validation loss in Epoch 1: 0.0319312254765204


 12%|█▏        | 3/25 [03:14<23:48, 64.93s/it]

Validation loss in Epoch 2: 0.02681960021810872


 16%|█▌        | 4/25 [04:20<22:48, 65.18s/it]

Validation loss in Epoch 3: 0.02354222722351551


 20%|██        | 5/25 [05:24<21:40, 65.00s/it]

Validation loss in Epoch 4: 0.021766939333506992


 24%|██▍       | 6/25 [06:30<20:37, 65.15s/it]

Validation loss in Epoch 5: 0.021335237632904736


 28%|██▊       | 7/25 [07:34<19:30, 65.02s/it]

Validation loss in Epoch 6: 0.02023250758647919


 32%|███▏      | 8/25 [08:40<18:28, 65.18s/it]

Validation loss in Epoch 7: 0.02087954490312508


 36%|███▌      | 9/25 [09:45<17:22, 65.15s/it]

Validation loss in Epoch 8: 0.0198475859527077


 40%|████      | 10/25 [10:50<16:17, 65.13s/it]

Validation loss in Epoch 9: 0.01949222667941025


 44%|████▍     | 11/25 [11:55<15:10, 65.04s/it]

Validation loss in Epoch 10: 0.019479639349239214


 48%|████▊     | 12/25 [13:00<14:03, 64.90s/it]

Validation loss in Epoch 11: 0.019049335537212237


 52%|█████▏    | 13/25 [14:05<12:59, 64.99s/it]

Validation loss in Epoch 12: 0.018819607155663626


 56%|█████▌    | 14/25 [15:10<11:55, 65.06s/it]

Validation loss in Epoch 13: 0.019604248819606644


 60%|██████    | 15/25 [16:15<10:50, 65.03s/it]

Validation loss in Epoch 14: 0.019615619948932104


 64%|██████▍   | 16/25 [17:20<09:44, 64.91s/it]

Validation loss in Epoch 15: 0.018826325450624737


 68%|██████▊   | 17/25 [18:27<08:46, 65.81s/it]

Validation loss in Epoch 16: 0.0190133236348629


 72%|███████▏  | 18/25 [19:32<07:38, 65.54s/it]

Validation loss in Epoch 17: 0.019050435509000506


 76%|███████▌  | 19/25 [20:38<06:32, 65.43s/it]

Validation loss in Epoch 18: 0.01891714587275471


 80%|████████  | 20/25 [21:42<05:26, 65.25s/it]

Validation loss in Epoch 19: 0.018840320780873297


 84%|████████▍ | 21/25 [22:47<04:20, 65.14s/it]

Validation loss in Epoch 20: 0.018297713782106127


 88%|████████▊ | 22/25 [23:52<03:15, 65.16s/it]

Validation loss in Epoch 21: 0.018687697499990462


 92%|█████████▏| 23/25 [24:58<02:10, 65.17s/it]

Validation loss in Epoch 22: 0.01859079035265105


 96%|█████████▌| 24/25 [26:03<01:05, 65.23s/it]

Validation loss in Epoch 23: 0.019545881130865642


100%|██████████| 25/25 [27:08<00:00, 65.12s/it]

Validation loss in Epoch 24: 0.018531270511448383

 Testing the best model:





Test MSE: 0.01826440165085452



MACTitan(
  (emb_layer): Linear(in_features=1, out_features=16, bias=True)
  (layers): ModuleList(
    (0-1): 2 x MACTitanLayer(
      (att_layer): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (activation): SiLU()
      )
      (Q): Linear(in_features=16, out_features=16, bias=True)
      (nm_module): NeuralMemory(
        (layers): ModuleList(
          (0): Sequential(
            (0): Linear(in_features=16, out_features=32,