In [4]:
import numpy as np
import wandb
import time
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt 

Data Preprocessing

In [37]:
out = np.load('training-val-test-data.npz')
th_train = out['th']
u_train = out['u']

def create_IO_data(u,y,na,nb):
    X = []
    Y = []
    for k in range(max(na,nb), len(y)):
        X.append(np.concatenate([u[k-nb:k],y[k-na:k]]))
        Y.append(y[k])
    return np.array(X), np.array(Y)

na = 11
nb = 10
X, Y = create_IO_data(u_train, th_train, na, nb)

# Split into train/val/test
Xtemp, Xtest_final, Ytemp, Ytest_final = train_test_split(X, Y, test_size=0.15, random_state=42)
Xtrain_final, Xval, Ytrain_final, Yval = train_test_split(Xtemp, Ytemp, test_size=0.1765, random_state=42)
Xtrain_final = np.expand_dims(Xtrain_final, axis=-1)
Xval = np.expand_dims(Xval, axis=-1)

In [38]:
print(Xtrain_final[0].shape)


(21, 1)


The LSTM

In [13]:
class NOELSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(NOELSTM, self).__init__()
        # self.hidden_size = hidden_size
        # self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True).double()
        self.fc = nn.Linear(hidden_size, output_size).double()

    def forward(self, x):

        out, _ = self.lstm(x)            # out: (batch_size, seq_len, hidden_size)
        out = self.fc(out[:, -1, :])     # Take output from the last time step
        return out

Training the network

In [39]:
# Initialize W&B
wandb.init(
    project="5SC28",
    name="LSTM 32 size 1 layers n = 11-10",
    config={
        "input_size": 1,
        "hidden_size":32,
        "num_layers":1,
        "output_size":1,
        "learning_rate": 0.01,
        "epochs": 1000,
        "batch_size": 256,
        "batched": True,
        "lr_decay": "plateau",         # options: "step", "plateau"        
        "lr_gamma": 0.5,
        "lr_step_size": 500
    }
)

train_losses = []
val_losses = []
train_NRMS_list = []
val_NRMS_list = []

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model setup
model = NOELSTM(wandb.config.input_size, wandb.config.hidden_size, wandb.config.num_layers, wandb.config.output_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
wandb.watch(model, log="all")

# Learning rate scheduler
if wandb.config.lr_decay == "step":
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=wandb.config.lr_step_size,
        gamma=wandb.config.lr_gamma
    )
elif wandb.config.lr_decay == "plateau":
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=wandb.config.lr_gamma,
        patience=20,
    )
else:
    scheduler = None

# Convert to torch tensors
Xtrain_tensor = torch.tensor(Xtrain_final).to(device)
Ytrain_tensor = torch.tensor(Ytrain_final).to(device)
Xval_tensor = torch.tensor(Xval).to(device)
Yval_tensor = torch.tensor(Yval).to(device)
Xtest_tensor = torch.tensor(Xtest_final).to(device)
Ytest_tensor = torch.tensor(Ytest_final).to(device)

# y_mean = y_tensor.mean()
# y_std = y_tensor.std()
# u_tensor = (u_tensor - u_tensor.mean()) / u_tensor.std()
# y_tensor = (y_tensor - y_mean) / y_std

grads = []
t_start = time.time()

# Training loop
for epoch in range(wandb.config.epochs):
    model.train()
    if wandb.config.batched:
        for i in range(0, len(Xtrain_tensor) - wandb.config.batch_size, wandb.config.batch_size):
            optimizer.zero_grad()
            y = model(Xtrain_tensor[i:i + wandb.config.batch_size])
            loss = torch.mean((y - Ytrain_tensor[i:i + wandb.config.batch_size]) ** 2)
            loss.backward()
            optimizer.step()
            batch_grads = [torch.mean(par.grad**2).item()**0.5 for par in list(model.parameters())[::2]]
            grads.append(batch_grads)
    else:
        optimizer.zero_grad()
        y = model(Xtrain_tensor)
        loss = torch.mean((y - Ytrain_tensor) ** 2)
        loss.backward()
        optimizer.step()
        batch_grads = [torch.mean(par.grad**2).item()**0.5 for par in list(model.parameters())[::2]]
        grads.append(batch_grads)

    # Validation
    model.eval()
    with torch.no_grad():
        val_pred = model(Xval_tensor)
        val_loss = torch.mean((val_pred - Yval_tensor) ** 2)
        val_nrms = val_loss.item() / torch.std(Yval_tensor).item()

    # # Compute NRMS
    # NRMS = Loss.item() / torch.std(y_tensor).item()

    # if scheduler:
    #     if wandb.config.lr_decay == "plateau":
    #         scheduler.step(val_loss.item())
    #     else:
    #         scheduler.step()
    #     current_lr = scheduler.get_last_lr()[0]
    # else:
    #     current_lr = wandb.config.learning_rate

    train_nrms = loss.item() / torch.std(Ytrain_tensor).item()

    train_losses.append(loss.item())
    val_losses.append(val_loss.item())
    train_NRMS_list.append(train_nrms)
    val_NRMS_list.append(val_nrms)

    wandb.log({
        "epoch": epoch,
        "train_loss": loss.item(),
        "train_NRMS": train_nrms,
        "val_loss": val_loss.item(),
        "val_NRMS": val_nrms,
        "updates_per_sec": (epoch + 1) / (time.time() - t_start),
        "learning_rate": wandb.config.learning_rate
    })

    # Print progress
    if epoch % 100 == 0:
        print(f"Epoch {epoch}/{wandb.config.epochs} | Train NRMS: {train_nrms:.5f} | Val NRMS: {val_nrms:.5f} | LR: {current_lr:.6f}")
        print(f"{(epoch + 1) / (time.time() - t_start):.2f} updates/sec")

# Save model
# torch.save(model.state_dict(), "model_val.pt")
# wandb.save("model_val.pt")

# Finish W&B session
wandb.finish()

Using device: cuda
Epoch 0/1000 | Train NRMS: 0.46251 | Val NRMS: 0.47640 | LR: 0.000000
1.29 updates/sec
Epoch 100/1000 | Train NRMS: 0.46089 | Val NRMS: 0.47483 | LR: 0.000000
1.53 updates/sec
Epoch 200/1000 | Train NRMS: 0.46089 | Val NRMS: 0.47483 | LR: 0.000000
1.45 updates/sec
Epoch 300/1000 | Train NRMS: 0.46089 | Val NRMS: 0.47483 | LR: 0.000000
1.43 updates/sec
Epoch 400/1000 | Train NRMS: 0.46089 | Val NRMS: 0.47483 | LR: 0.000000
1.43 updates/sec
Epoch 500/1000 | Train NRMS: 0.46089 | Val NRMS: 0.47483 | LR: 0.000000
1.46 updates/sec


KeyboardInterrupt: 

In [40]:
wandb.finish()

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_NRMS,█▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
updates_per_sec,▇███▇▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_NRMS,▁▂▅█▂▂▂▃▃▄▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
val_loss,█▆▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,546.0
learning_rate,0.01
train_NRMS,0.46089
train_loss,0.22115
updates_per_sec,1.4506
val_NRMS,0.47483
val_loss,0.22541
