In [3]:
import torch
from torch.nn import LSTM
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.func import functional_call
from torch.nn.functional import normalize

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

from dataset import TSDataset
from models import LSTMModel, AttentionModel, AttentionPMModel
from train_utils import train_and_test
from neural_memory import NeuralMemory
from titans import MACTitanLayer, MACTitan

from tqdm import tqdm
from copy import deepcopy

# Seed PyTorch's global RNG before any model or DataLoader is created, so
# that draws taken from it (e.g. default layer initialisation, shuffled
# DataLoader ordering) repeat under Restart Kernel -> Run All.
# NOTE(review): this does not by itself guarantee bit-identical GPU results,
# and any randomness drawn from other libraries is not covered.
SEED = 0
torch.manual_seed(SEED)

# Explicitly switch autograd on for the whole session. Kept as the cell's
# last expression, so the returned grad-mode object is what gets displayed.
torch.set_grad_enabled(True)

# ===========================================================================================

<torch.autograd.grad_mode.set_grad_enabled at 0x7f1154bb2d10>

In [10]:
# Build the 'train', 'valid' and 'test' variants of the 'sinwave' dataset.
# NOTE(review): the meaning of the trailing `False` flag is defined in
# dataset.TSDataset and is not visible here -- confirm before changing it.
train_data = TSDataset('sinwave', 'train', False)
valid_data = TSDataset('sinwave', 'valid', False)
test_data = TSDataset('sinwave', 'test', False)

# Single source of truth for the batch size shared by all three loaders.
BATCH_SIZE = 4

# Only the training loader shuffles. Validation and test batches are served
# in dataset order so that evaluation is repeatable from run to run (this
# matters in particular if a model carries state from one batch to the next).
_train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
_valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)
_test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [12]:
# Baseline 1: LSTM model.
# NOTE(review): the positional sizes (1, 16, 16) are interpreted by
# models.LSTMModel; the recorded repr shows a 1 -> 16 embedding feeding a
# 16 -> 16 LSTM, but which argument is which should be confirmed there.
model = LSTMModel(1, 16, 16)

# Adam with its default hyper-parameters over every parameter of the model.
optimizer = optim.Adam(params=model.parameters())

# Last positional argument is 50; the recorded progress bar counts 50
# iterations, so this is presumably the epoch budget -- confirm in
# train_utils. The call is left as the cell's final expression, so its
# return value is displayed.
train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    50,
)

Training:


100%|██████████| 50/50 [00:24<00:00,  2.05it/s]


 Testing the best model:
Test MSE: 0.015617506046380316




  result = _VF.lstm(


LSTMModel(
  (emb_layer): Linear(in_features=1, out_features=16, bias=True)
  (relu): ReLU()
  (lstm): LSTM(16, 16, batch_first=True)
  (final_layer): Linear(in_features=16, out_features=1, bias=True)
)

In [13]:
# Baseline 2: attention model.
# NOTE(review): the positional sizes (1, 16, 16) are interpreted by
# models.AttentionModel; the recorded repr shows a 1 -> 16 embedding and a
# final layer with 256 input features (presumably 16 * 16) -- confirm the
# argument order there.
model = AttentionModel(1, 16, 16)

# Adam with its default hyper-parameters over every parameter of the model.
optimizer = optim.Adam(params=model.parameters())

# Last positional argument is 50; the recorded progress bar counts 50
# iterations, so this is presumably the epoch budget -- confirm in
# train_utils. The call is left as the cell's final expression, so its
# return value is displayed.
train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    50,
)

Training:


100%|██████████| 50/50 [06:32<00:00,  7.85s/it]



 Testing the best model:
Test MSE: 0.026209833260093417



AttentionModel(
  (emb_layer): Linear(in_features=1, out_features=16, bias=True)
  (att_layers): ModuleList(
    (0-1): 2 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
      )
      (linear1): Linear(in_features=16, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=16, bias=True)
      (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (activation): SiLU()
    )
  )
  (final_layer): Linear(in_features=256, out_features=1, bias=True)
)

In [15]:
# Baseline 3: attention model variant taking one extra size argument (4).
# NOTE(review): the positional sizes (1, 16, 16, 4) are interpreted by
# models.AttentionPMModel; the recorded repr shows a final layer with 320
# input features (presumably (16 + 4) * 16), which suggests the 4 extends the
# sequence length -- confirm in models.py.
model = AttentionPMModel(1, 16, 16, 4)

# Adam with its default hyper-parameters over every parameter of the model.
optimizer = optim.Adam(params=model.parameters())

# Last positional argument is 50; the recorded progress bar counts 50
# iterations, so this is presumably the epoch budget -- confirm in
# train_utils. The call is left as the cell's final expression, so its
# return value is displayed.
train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    50,
)

Training:


100%|██████████| 50/50 [07:57<00:00,  9.55s/it]



 Testing the best model:
Test MSE: 0.02737562262586185



AttentionPMModel(
  (emb_layer): Linear(in_features=1, out_features=16, bias=True)
  (att_layers): ModuleList(
    (0-1): 2 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
      )
      (linear1): Linear(in_features=16, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=16, bias=True)
      (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (activation): SiLU()
    )
  )
  (final_layer): Linear(in_features=320, out_features=1, bias=True)
)

In [16]:
# MACTitan run with n_layers_nmm=2.
# The constructor arguments are collected in one mapping so the configuration
# can be read (and compared with other runs) at a glance.
# NOTE(review): the roles of n_layers_nmm, eta and theta are defined in
# titans.py and are not visible here -- confirm there before tuning.
titan_config = {
    "input_dim": 1,
    "hidden_dim": 16,
    "output_dim": 1,
    "context_window": 16,
    "pm_len": 4,
    "n_layers": 2,
    "n_layers_nmm": 2,
    "eta": 0.9,
    "theta": 0.01,
}
model = MACTitan(**titan_config).cuda()

# Adam (default hyper-parameters) is given only `model.outer_params`, not
# `model.parameters()`.
# NOTE(review): presumably the remaining parameters are updated by the model
# itself -- confirm in titans.py.
optimizer = optim.Adam(params=model.outer_params)

# 10 matches the number of epochs in the recorded log. The trailing True is
# presumably a verbosity flag (this run logs a validation loss per epoch) --
# confirm in train_utils. The returned object is kept as `best_model`.
best_model = train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    10,
    True,
)

Training:


 10%|█         | 1/10 [01:03<09:34, 63.80s/it]

Validation loss in Epoch 0: 0.04679072509918894


 20%|██        | 2/10 [02:08<08:35, 64.41s/it]

Validation loss in Epoch 1: 0.0287671905543123


 30%|███       | 3/10 [03:13<07:31, 64.49s/it]

Validation loss in Epoch 2: 0.023503007260816437


 40%|████      | 4/10 [04:18<06:27, 64.66s/it]

Validation loss in Epoch 3: 0.023342822811433246


 50%|█████     | 5/10 [05:23<05:24, 64.82s/it]

Validation loss in Epoch 4: 0.020683650140251433


 60%|██████    | 6/10 [06:27<04:18, 64.68s/it]

Validation loss in Epoch 5: 0.021096062287688254


 70%|███████   | 7/10 [07:32<03:13, 64.66s/it]

Validation loss in Epoch 6: 0.020785312088472504


 80%|████████  | 8/10 [08:37<02:09, 64.81s/it]

Validation loss in Epoch 7: 0.019927816305841718


 90%|█████████ | 9/10 [09:42<01:04, 64.77s/it]

Validation loss in Epoch 8: 0.019043979048728944


100%|██████████| 10/10 [10:46<00:00, 64.66s/it]

Validation loss in Epoch 9: 0.018900840649647373

 Testing the best model:





Test MSE: 0.018850508385470935



In [18]:
# MACTitan run with n_layers_nmm=1.
# The constructor arguments are collected in one mapping so the configuration
# can be read (and compared with other runs) at a glance.
# NOTE(review): the roles of n_layers_nmm, eta and theta are defined in
# titans.py and are not visible here -- confirm there before tuning.
titan_config = {
    "input_dim": 1,
    "hidden_dim": 16,
    "output_dim": 1,
    "context_window": 16,
    "pm_len": 4,
    "n_layers": 2,
    "n_layers_nmm": 1,
    "eta": 0.9,
    "theta": 0.01,
}
model = MACTitan(**titan_config).cuda()

# Adam (default hyper-parameters) is given only `model.outer_params`, not
# `model.parameters()`.
# NOTE(review): presumably the remaining parameters are updated by the model
# itself -- confirm in titans.py.
optimizer = optim.Adam(params=model.outer_params)

# 15 matches the number of epochs in the recorded log. The trailing True is
# presumably a verbosity flag (this run logs a validation loss per epoch) --
# confirm in train_utils. The returned object is kept as `best_model`.
best_model = train_and_test(
    model,
    optimizer,
    _train_loader,
    _valid_loader,
    _test_loader,
    15,
    True,
)

Training:


  7%|▋         | 1/15 [00:57<13:19, 57.11s/it]

Validation loss in Epoch 0: 0.0434265883905547


 13%|█▎        | 2/15 [01:54<12:24, 57.26s/it]

Validation loss in Epoch 1: 0.03195581351007734


 20%|██        | 3/15 [02:52<11:32, 57.73s/it]

Validation loss in Epoch 2: 0.027463582211307118


 27%|██▋       | 4/15 [03:51<10:37, 57.97s/it]

Validation loss in Epoch 3: 0.029187180740492685


 33%|███▎      | 5/15 [04:50<09:44, 58.40s/it]

Validation loss in Epoch 4: 0.02582288437655994


 40%|████      | 6/15 [05:49<08:46, 58.53s/it]

Validation loss in Epoch 5: 0.022780442610383034


 47%|████▋     | 7/15 [06:47<07:48, 58.60s/it]

Validation loss in Epoch 6: 0.019899009859987667


 53%|█████▎    | 8/15 [07:46<06:50, 58.58s/it]

Validation loss in Epoch 7: 0.019312060943671636


 60%|██████    | 9/15 [08:44<05:51, 58.53s/it]

Validation loss in Epoch 8: 0.021618760536823954


 67%|██████▋   | 10/15 [09:42<04:51, 58.36s/it]

Validation loss in Epoch 9: 0.019004022436482564


 73%|███████▎  | 11/15 [10:40<03:52, 58.24s/it]

Validation loss in Epoch 10: 0.019028938029493604


 80%|████████  | 12/15 [11:38<02:54, 58.12s/it]

Validation loss in Epoch 11: 0.019625601917505266


 87%|████████▋ | 13/15 [12:36<01:55, 57.94s/it]

Validation loss in Epoch 12: 0.018792823010257313


 93%|█████████▎| 14/15 [13:33<00:57, 57.82s/it]

Validation loss in Epoch 13: 0.0192281288760049


100%|██████████| 15/15 [14:31<00:00, 58.07s/it]

Validation loss in Epoch 14: 0.019239492767623493

 Testing the best model:





Test MSE: 0.018783531337976455

