# Debug on curve simulated

- 2 output dimensions (full matrix model)
- would probably profit from a val set, but for GPs we don't use this

Tried & didn't work:
- Larger network did not help
- H no sum() is invalid
- dropout at 0.1 or 0.3
- gelu
- higher batch size

Worked:
- silu! is also better than geLU
- Larger learning rate! Otherwise it gets stuck!
- lower patience and larger weight decay help too
- Model can "fit" test data so works
- In 2D every divergence free vector field is symplectic?

In [1]:
import torch.nn as nn 
import torch

class dfNN(nn.Module):
    def __init__(self, input_dim = 2, hidden_dim = 32):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = 1  # Scalar potential

        # HACK: SiLu() worked much better than ReLU() for this model

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, self.output_dim),
        )

    def forward(self, x):
        """
        Turn x1, x2 locations into vector fields
        x: [batch_size, input_dim]
        Returns: [batch_size, input_dim]  # Symplectic gradient
        """
        # Retrieve scalar potential
        H = self.net(x)

        partials = torch.autograd.grad(
                outputs = H.sum(), # we can sum here because every H row only depend on every x row
                inputs = x,
                create_graph = True
            )[0]
        
        # Symplectic gradient
        # flip columns (last dim) for x2, x1 order. Multiply x2 by -1
        symp = partials.flip(-1) * torch.tensor([1, -1], dtype = torch.float32, device = x.device)

        # return symp, H # NOTE: return H as well if we want to see what is going on
        return symp

In [45]:
class dfNN_matrix(nn.Module):
    def __init__(self, input_dim = 2, hidden_dim = 32):
        super().__init__()
        self.input_dim = input_dim

        # NOTE: different
        # for 2D input the NN output dim is now 4 (2x2) in the full matrix case
        self.output_dim = int((input_dim * input_dim))

        # HACK: SiLu() worked much better than ReLU() for this model

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, self.output_dim),
        )

    def forward(self, x):
        # put deterministic transformations here with torch functional
        # more computation but performs better!?

        def A(x):

            # RUN THROUGH NET
            M = self.net(x)

            # make square
            M = M.view(self.input_dim, self.input_dim)

            # construct an Anti-symmetric matrix from the output of the NN
            # the diagonal of the matrix M is irrelevant technically
            A = M - M.mT

            return A
        
        # torch.diagonal(jacfwd(A)(x), dim1 = 1, dim2 = 2).sum(-1)

        return jacfwd(A)(x), torch.diagonal(jacfwd(A)(x), dim1 = 1, dim2 = 2), torch.diagonal(jacfwd(A)(x), dim1 = 1, dim2 = 2).sum(-1)

In [84]:
import torch
import torch.nn as nn

class dfNN_matrix(nn.Module):
    def __init__(self, input_dim = 2, hidden_dim = 32):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = input_dim * input_dim  # flattened 2x2 matrix

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, self.output_dim),
        )

    def forward(self, x):
        x.requires_grad_(True)

        M = self.net(x).view(-1, 2, 2)
        # Make antisymmetric matrix
        A = M - M.transpose(1, 2)

        # picks u
        u = A[:, 0, 1]  # since A = [[0, u], [-u, 0]]

        du_dx = torch.autograd.grad(u.sum(), x, create_graph = True)[0]  # [B, 2]
        print(du_dx)

        symplectic = du_dx.flip(-1) * torch.tensor([1.0, -1.0], device = x.device)

        return symplectic  # or return du_dx.sum(dim=1) for divergence-style scalar

In [71]:
import torch
import torch.nn as nn

class dfNN_matrix(nn.Module):
    def __init__(self, input_dim = 2, hidden_dim = 32):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = input_dim * input_dim  # flattened 2x2 matrix

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, self.output_dim),
        )

    def forward(self, x):
        """
        x: Tensor of shape [batch_size, input_dim]
        Returns: batch of scalar values representing trace of Jacobian of anti-symmetric A(x)
        """

        batch_size = x.size(0)
        x.requires_grad_(True)

        # Pass input through network and reshape output to matrix
        M = self.net(x)  # [batch_size, output_dim]
        M = M.view(batch_size, self.input_dim, self.input_dim)  # [B, 2, 2]

        # Construct anti-symmetric matrix A = M - M^T
        A = M - M.transpose(1, 2)  # [B, 2, 2]
        # NOTE: A's first row is [0, U] and second row is [U, 0]

        print(A)

        # Compute Jacobian of A with respect to input x
        # Result: [B, input_dim, input_dim, input_dim]
        J = []
        for i in range(self.input_dim):

            grad_outputs = torch.zeros_like(A)
            # Select row (has only one non-zero element)
            grad_outputs[:, i, :] = 1.0  # dA[i, :] / dx

            grad = torch.autograd.grad(
                outputs = A,
                inputs = x,
                grad_outputs = grad_outputs,
                create_graph = True,
            )[0]
            print(grad)
            J.append(grad)

        # Append row-wise
        # Stack to get full Jacobian: shape [B, output_dim (rows*cols), input_dim]
        J = torch.stack(J, dim = 1)  # [B, input_dim, input_dim]

        print(J)
        
        # Return trace of the diagonal Jacobian blocks: sum of dA_ii/dx_i
        trace = torch.diagonal(J, dim1 = 1, dim2 = 2)  # [B]
        print(trace)

        return trace

In [5]:
from utils import set_seed, make_grid
from configs import N_SIDE
_, x_test = make_grid(N_SIDE)

In [82]:
dfNN_matrix_model = dfNN_matrix()
dfNN_matrix_model.train()

x_test = x_test.requires_grad_()
# A, B, C = dfNN_matrix_model(x_test[0])
C = dfNN_matrix_model(x_test[0].unsqueeze(0))

tensor([[0., 0.]], grad_fn=<MmBackward0>)


In [83]:
C

tensor([[0., -0.]], grad_fn=<MulBackward0>)

In [24]:
from torch.func import jacfwd

torch.diagonal(jacfwd(A)(x_test), dim1 = 1, dim2 = 2).sum(-1)

TypeError: 'Tensor' object is not callable

In [None]:
model_name = "dfNN"

# import configs to we can access the hypers with getattr
import configs
from configs import PATIENCE, MAX_NUM_EPOCHS, NUM_RUNS, WEIGHT_DECAY
# also import x_test grid size and std noise for training data
from configs import N_SIDE, STD_GAUSSIAN_NOISE

# Reiterating import for visibility
MAX_NUM_EPOCHS = MAX_NUM_EPOCHS
MAX_NUM_EPOCHS = 4000
NUM_RUNS = NUM_RUNS
WEIGHT_DECAY = WEIGHT_DECAY
PATIENCE = PATIENCE

# TODO: Delete overwrite, run full
NUM_RUNS = 1

# assign model-specific variable
MODEL_LEARNING_RATE = getattr(configs, f"{model_name}_SIM_LEARNING_RATE")
MODEL_SIM_RESULTS_DIR = getattr(configs, f"{model_name}_SIM_RESULTS_DIR")
import os
os.makedirs(MODEL_SIM_RESULTS_DIR, exist_ok = True)

# for all models with NN components train on batches
if model_name in ["dfNGP", "dfNN", "PINN"]:
    from configs import BATCH_SIZE
    from torch.utils.data import DataLoader, TensorDataset

if model_name in ["dfNGP", "dfNN"]:
    from NN_models import dfNN

# universals 
from metrics import compute_RMSE, compute_MAE, compute_divergence_field

# basics
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# utilitarian
from utils import set_seed, make_grid
# reproducibility
set_seed(42)
import gc

# setting device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# overwrite if needed: # device = 'cpu'
print('Using device:', device)
print()

### START TIMING ###
import time
start_time = time.time()  # Start timing after imports

### SIMULATION ###
# Import all simulation functions
from simulate import (
    simulate_detailed_curve,
)

# Define simulations as a dictionary with names as keys to function objects
# alphabectic order here
simulations = {
    "curve": simulate_detailed_curve,
}

########################
### x_train & x_test ###
########################

# Load training inputs (once for all simulations)
x_train = torch.load("data/sim_data/x_train_lines_discretised_0to1.pt", weights_only = False).float()

# Generate x_test (grid) once for all simulations
x_test_grid, x_test = make_grid(N_SIDE)
# x_test is long format (N_SIDE ** 2, 2)

#################################
### LOOP 1 - over SIMULATIONS ###
#################################

# Make y_train_dict: Iterate over all simulation functions
for sim_name, sim_func in simulations.items():

    ########################
    ### y_train & y_test ###
    ########################

    # Generate training observations
    # NOTE: sim_func() needs to be on CPU, so we move x_train to CPU
    y_train = sim_func(x_train.cpu()).to(device)
    y_test = sim_func(x_test.cpu()).to(device)
    
    x_test = x_test.to(device)
    x_train = x_train.to(device)
    
    # Print details
    print(f"=== {sim_name.upper()} ===")
    print(f"Training inputs shape: {x_train.shape}")
    print(f"Training observations shape: {y_train.shape}")
    print(f"Training inputs dtype: {x_train.dtype}")
    print(f"Training inputs device: {y_train.device}")
    print(f"Training observations device: {y_train.device}")
    print()

    # Print details
    print(f"=== {sim_name.upper()} ===")
    print(f"Test inputs shape: {x_test.shape}")
    print(f"Test observations shape: {y_test.shape}")
    print(f"Test inputs dtype: {x_test.dtype}")
    print(f"Test inputs device: {x_test.device}")
    print(f"Test observations device: {y_test.device}")
    print()

    # NOTE: This is different to the real data experiments
    # calculate the mean magnitude of the test data as we use this to scale the noise
    sim_mean_magnitude_for_noise = torch.norm(y_test, dim = -1).mean().to(device)
    sim_noise = STD_GAUSSIAN_NOISE * sim_mean_magnitude_for_noise

    # Store metrics for the simulation (used for *metrics_summary* report and *metrics_per_run*)
    simulation_results = [] 

    ##################################
    ### LOOP 2 - over training run ###
    ##################################

    for run in range(NUM_RUNS):

        print(f"\n--- Training Run {run + 1}/{NUM_RUNS} ---")

        # Add Noise before data loader is defined
        y_train_noisy = y_train.to(device) + (torch.randn(y_train.shape, device = device) * sim_noise)

        # convert to DataLoader for batching
        # NOTE: For the simulated experiments we use noisy data
        dataset = TensorDataset(x_train, y_train_noisy)
        # now giving it all test data to see if it converges then
        # dataset = TensorDataset(x_test, y_test)
        dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = True)

        # initialise new model for run (seeded so this is reproducible)
        dfNN_model = dfNN().to(device)
        dfNN_model.train()

        # define loss function (MSE for regression)
        criterion = torch.nn.MSELoss()

        # AdamW as optimizer for some regularisation/weight decay
        optimizer = optim.AdamW(dfNN_model.parameters(), lr = MODEL_LEARNING_RATE, weight_decay = WEIGHT_DECAY)

        # _________________
        # BEFORE EPOCH LOOP
        
        # Export the convergence just for first run only
        if run == 0:
            # initialise tensors to store losses over epochs (for convergence plot)
            train_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # by-product
            test_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

        # Early stopping variables
        best_loss = float('inf')
        # counter starts at 0
        epochs_no_improve = 0

        ############################
        ### LOOP 3 - over EPOCHS ###
        ############################
        print("\nStart Training")

        for epoch in range(MAX_NUM_EPOCHS):

            # accumulate losses over batches for each epoch 
            train_losses_RMSE_over_batches = 0.0

            #############################
            ### LOOP 4 - over BATCHES ###
            #############################

            for batch in dataloader:

                # set model to training mode
                dfNN_model.train()

                x_batch, y_batch = batch
                # put on GPU if available
                # NOTE: requires_grad_() is used to compute gradients for the input
                x_batch, y_batch = x_batch.to(device).requires_grad_(), y_batch.to(device)

                # Forward pass
                # NOTE: We used to do this with vmaps, but now we do it with the model directly (not faster)
                y_pred_batch, _ = dfNN_model(x_batch)

                # Compute loss (RMSE for same units as data) 
                loss = torch.sqrt(criterion(y_pred_batch, y_batch)) 

                # Add losses to the epoch loss (over batches)
                train_losses_RMSE_over_batches += loss.item()

                # backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            ###############################
            ### END LOOP 4 over BATCHES ###
            ###############################
            
            # for every epoch...

            dfNN_model.eval()

            # Compute average loss for the epoch (e.g. 7 batches / epoch)
            avg_train_loss_RMSE_for_epoch = train_losses_RMSE_over_batches / len(dataloader)

            # Print for epoch
            print(f"{sim_name} {model_name} Run {run + 1}/{NUM_RUNS}, Epoch {epoch + 1}/{MAX_NUM_EPOCHS}, Training Loss (RMSE): {avg_train_loss_RMSE_for_epoch:.4f}")

            # Early stopping check
            if avg_train_loss_RMSE_for_epoch < best_loss:
                best_loss = avg_train_loss_RMSE_for_epoch
                epochs_no_improve = 0  # reset counter
                best_model_state = dfNN_model.state_dict()  # save best model
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= PATIENCE:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                break

            # For Run 1 we save a bunch of metrics, while for the rest we only update (above)
            if run == 0:

                # Train 
                # NOTE: We do this again because we want to pass through the full dataset, not just batches
                y_train_pred, H_train = dfNN_model(x_train.to(device).requires_grad_())
                # Compute train loss for loss convergence plot
                train_rmse_loss = torch.sqrt(criterion(y_train_pred, y_train.to(device))).item()
                # TODO: Maybe detach here

                # Test 
                # No batches, but full dataset
                y_test_pred, H_test = dfNN_model(x_test.to(device).requires_grad_())
                test_rmse_loss = torch.sqrt(criterion(y_test_pred, y_test.to(device))).item()

                train_losses_RMSE_over_epochs[epoch] = train_rmse_loss
                test_losses_RMSE_over_epochs[epoch] = test_rmse_loss

        ##############################
        ### END LOOP 3 over EPOCHS ###
        ##############################

Using device: cuda

=== CURVE ===
Training inputs shape: torch.Size([196, 2])
Training observations shape: torch.Size([196, 2])
Training inputs dtype: torch.float32
Training inputs device: cuda:0
Training observations device: cuda:0

=== CURVE ===
Test inputs shape: torch.Size([400, 2])
Test observations shape: torch.Size([400, 2])
Test inputs dtype: torch.float32
Test inputs device: cuda:0
Test observations device: cuda:0


--- Training Run 1/1 ---

Start Training
curve dfNN Run 1/1, Epoch 1/4000, Training Loss (RMSE): 0.7634
curve dfNN Run 1/1, Epoch 2/4000, Training Loss (RMSE): 0.4332
curve dfNN Run 1/1, Epoch 3/4000, Training Loss (RMSE): 0.3683
curve dfNN Run 1/1, Epoch 4/4000, Training Loss (RMSE): 0.3086
curve dfNN Run 1/1, Epoch 5/4000, Training Loss (RMSE): 0.2671
curve dfNN Run 1/1, Epoch 6/4000, Training Loss (RMSE): 0.2995
curve dfNN Run 1/1, Epoch 7/4000, Training Loss (RMSE): 0.2747
curve dfNN Run 1/1, Epoch 8/4000, Training Loss (RMSE): 0.2826
curve dfNN Run 1/1, Epoch 

# Tests

The training loss ceiling is around 0.22
On train we do converge to close to 0

In [None]:
from simulate import simulate_detailed_curve
from utils import make_grid
from matplotlib import pyplot as plt

In [None]:
# long grid
_, x_grid = make_grid(N_SIDE)

vector_field, psi, psi_vector_field, simulated_vector_field = simulate_detailed_curve(x_test.cpu(), True)

In [None]:
def symplectic_field_of_merge(x):
    comp1 = ((x[:, 1] + 0.5)**3 / 3)
    comp2 = torch.cos(x[:, 0] * torch.pi) / torch.pi

    return comp1 + comp2

H_merge = symplectic_field_of_merge(x_grid)

In [None]:
# plt.imshow(H_merge.reshape(20, 20).cpu().detach().numpy())
# plt.imshow(psi.reshape(20, 20).cpu().detach().numpy())
H_sim = 0.2 * psi + 0.8 * H_merge
plt.imshow(H_sim.reshape(20, 20).cpu().detach().numpy())

In [None]:
plt.imshow(H_test.reshape(20, 20).cpu().detach().numpy())

In [None]:
print("H_test mean and std")
print(H_test.mean().item(), H_test.std().item())
# print(pred_v.norm(dim=1).mean().item())

print("H_test min and max")
print(H_test.min().item(), H_test.max().item())

In [None]:
import torch
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # for 3D plotting
import numpy as np

from utils import make_grid
x_grid_sq, x_grid = make_grid(N_SIDE)

# x_grid: [400, 2], assumed to be on a regular 20x20 grid
# H_test: [400, 1]
# Make sure to detach everything from the computation graph
H_vals = H_test.view(20, 20).cpu().detach().numpy()

# Recover grid for plotting
x = x_grid[:, 0].view(20, 20).cpu().numpy()
y = x_grid[:, 1].view(20, 20).cpu().numpy()

# 3D Surface Plot
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(x, y, H_vals, cmap='viridis')
ax.set_title("Scalar Potential H(x, y)")
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_zlabel("H(x, y)")
plt.tight_layout()
plt.show()