In [None]:
def divergence_free_se_kernel_multi_input(X1_rows,
                              X2_rows, 
                              X1_columns, 
                              X2_columns, 
                              hyperparameters):
    """
    Calculate the divergence-free SE kernel from separate x1 and x2 coordinate tensors.

    Inputs:
        X1_rows, X2_rows: x1 and x2 coordinates of the row points. Any shape
            holding n_rows values, e.g. torch.Size([n_rows]) or torch.Size([n_rows, 1])
        X1_columns, X2_columns: x1 and x2 coordinates of the column points. Any shape
            holding n_columns values
        hyperparameters: indexable; [0] is sigma_f (a tensor, .square() is called on it)
            and [1] is the single lengthscale l

    Returns:
        K: torch.Size([2 * n_rows, 2 * n_columns]), laid out as
            [[upper_left, upper_right], [lower_left, lower_right]]
    """

    # Extract hyperparameters
    sigma_f = hyperparameters[0]
    l = hyperparameters[1]

    # Signed pairwise differences (not distances: negative values occur as well)
    # Explicit column/row reshapes always give torch.Size([n_rows, n_columns]).
    # A bare .squeeze() would also drop the row/column axis when n_rows == 1 or
    # n_columns == 1 and break the concatenation below.
    # ALTERNATIVE (pos.) torch.cdist(X1_rows, X1_columns): this is just torch.abs(X1_dist)
    X1_dist = X1_rows.reshape(-1, 1) - X1_columns.reshape(1, -1)
    X2_dist = X2_rows.reshape(-1, 1) - X2_columns.reshape(1, -1)

    # Diagonal blocks, each torch.Size([n_rows, n_columns])
    upper_left = (1 - X2_dist.square().div(l**2)).div(l**2)
    lower_right = (1 - X1_dist.square().div(l**2)).div(l**2)

    # Off-diagonal blocks: elementwise product of the differences, divided by a scalar
    # Matlab version has negative values here!
    upper_right = torch.mul(X1_dist, X2_dist).div(l**4)
    lower_left = upper_right

    # Concatenate upper and lower blocks column-wise, and then concatenate them row-wise
    # torch.Size([2 * n_rows, 2 * n_columns])
    block = torch.cat((torch.cat((upper_left, upper_right), 1), torch.cat((lower_left, lower_right), 1)), 0)

    # Squared-exponential factor, tiled so that it multiplies each of the four blocks
    se_factor = (X1_dist.square() + X2_dist.square()).div(-2 * l**2).exp()

    # torch.Size([2 * n_rows, 2 * n_columns]), elementwise multiplication
    K = sigma_f.square() * block.mul(se_factor.tile(2, 2))

    return K

In [None]:
# Separate grid inputs and outputs: now tensors
def simulate_convergence(X1, X2):
    """Return the field components (U, V) = (X2, X1) for the given coordinates."""
    # Both inputs are passed through unchanged; only their order is swapped
    return X2, X1

def simulate_merge(X1, X2):
    """Return the field components U = (X2 + 0.5)^2 and V = sin(pi * X1)."""
    shifted_x2 = X2 + 0.5
    phase = X1 * math.pi
    return shifted_x2**2, np.sin(phase)

def simulate_branching(X1, X2):
    """Return the field components U = X1 * X2 and V = -0.5 * X2^2 + (X1 - 0.8)."""
    product = X1 * X2
    negative_half_square = - 0.5 * X2**2
    shifted_x1 = X1 - 0.8
    return product, negative_half_square + shifted_x1

def simulate_deflection(X1, X2):
    """Return U = a^2 + b^2 + 3 and V = -2 * a * b with a = 6 * X2 - 3 and b = 6 * X1 - 3."""
    # Both coordinates go through the same affine map before being combined
    a = X2 * 6 - 3
    b = X1 * 6 - 3
    return a**2 + b**2 + 3, -2 * a * b

def simulate_ridge(X1, X2):
    """Return the field components U = X2 + 1 and V = -cos(3 * pi * X1^3)."""
    angle = 3 * X1**3 * math.pi
    return X2 + 1, - np.cos(angle)

In [None]:


##############################################################
### OPTIMIZATION ###
def optimize_hypers_on_train_df(
        hyperparameters_df_initial, 
        X_train, 
        Y_train_noisy, 
        X_test,
        max_optimisation_iterations = 1000,
        patience = 20,
        learning_rate = 0.0001):
    """
    Optimise hyperparameters by minimising the negative log marginal likelihood (NLML).

    Runs Adam on the negated third return value of predict(..., divergence_free_bool = True)
    and stops early once the loss has not improved for `patience` consecutive iterations.

    Inputs:
        hyperparameters_df_initial: sequence of at least three tensors; the first three
            are handed to the optimiser (presumably sigma_n, sigma_f and l - verify
            against predict). Tensors that should be trained need requires_grad = True.
            The inputs are cloned and never modified.
        X_train, Y_train_noisy, X_test: forwarded unchanged to predict
        max_optimisation_iterations: upper bound on the number of Adam steps
        patience: number of consecutive non-improving iterations that triggers early stopping
        learning_rate: Adam learning rate

    Returns:
        best_hypers: list of detached copies of the hyperparameters at the lowest
            observed loss, or None if no iteration produced a loss below infinity
    """

    def _format_hypers(hypers):
        # One entry per tensor with three decimals; multi-element tensors
        # (e.g. a two-element lengthscale) are shown in brackets
        parts = []
        for h in hypers:
            values = h.detach().flatten().tolist()
            if len(values) == 1:
                parts.append(f"{values[0]:.3f}")
            else:
                parts.append("[" + ", ".join(f"{v:.3f}" for v in values) + "]")
        return ", ".join(parts)

    # Clone hyperparameters to avoid modifying the original tensors: the optimiser
    # updates its parameters in place, so it must own independent leaf copies
    hyperparameters_df = [
        h.detach().clone().requires_grad_(h.requires_grad)
        for h in hyperparameters_df_initial]

    # The three tensors that are optimised (and reported)
    trainable = [hyperparameters_df[0], hyperparameters_df[1], hyperparameters_df[2]]

    _, _, lml_initial = predict(
        X_train,
        Y_train_noisy,
        X_test,
        hyperparameters_df,
        divergence_free_bool = True)

    print(f"Initial hyperparameters: {_format_hypers(trainable)}")
    print(f"Initial LML: {lml_initial.item():.2f}")

    # Adam skips parameters whose .grad is None, so it doesn't matter if we
    # leave hyperparameters_df[0] in here even when it receives no gradient
    optimizer = optim.Adam(trainable, lr = learning_rate)

    best_loss = float('inf') # initialise as infinity
    best_hypers = None
    no_improvement_count = 0

    for trial in range(max_optimisation_iterations):

        _, _, lml = predict(
            X_train,
            Y_train_noisy,
            X_test,
            hyperparameters_df,
            divergence_free_bool = True)

        # We are minimising the negative log marginal likelihood, like a loss function
        loss = - lml # NLML
        loss_value = loss.item()

        # Check for improvement (a NaN loss never compares as smaller)
        if loss_value < best_loss:
            best_loss = loss_value
            # Snapshot copies: the live tensors are changed by optimizer.step()
            best_hypers = [h.clone().detach() for h in hyperparameters_df]
            no_improvement_count = 0  # Reset counter
        else:
            no_improvement_count += 1  # Increase counter

        # Stop if loss has stagnated
        if no_improvement_count >= patience:
            print(f"Stopping early after {trial+1} iterations.")
            if best_hypers is None:
                print("No loss below infinity was observed; no best hyperparameters to report.")
            else:
                print(f"Best hyperparameters: {_format_hypers(best_hypers[:3])}")
                print(f"Best LML: {(- best_loss):.2f}")
            break

        optimizer.zero_grad()  # Reset gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update hypers

    return best_hypers

### OPTIMIZATION ###
def optimize_hypers_on_train_bd(
        hyperparameters_bd_initial, 
        X_train, 
        Y_train_noisy, 
        X_test,
        max_optimisation_iterations = 1000,
        patience = 20,
        learning_rate = 0.0001):
    """
    Optimise four hyperparameters by minimising the negative log marginal likelihood (NLML).

    Runs Adam on the negated third return value of predict and stops early once the
    loss has not improved for `patience` consecutive iterations.

    Inputs:
        hyperparameters_bd_initial: sequence of at least four tensors; the first four
            are handed to the optimiser. Tensors that should be trained need
            requires_grad = True. The inputs are cloned and never modified.
        X_train, Y_train_noisy, X_test: forwarded unchanged to predict
        max_optimisation_iterations: upper bound on the number of Adam steps
        patience: number of consecutive non-improving iterations that triggers early stopping
        learning_rate: Adam learning rate

    Returns:
        best_hypers_bd: list of detached copies of the hyperparameters at the lowest
            observed loss, or None if no iteration produced a loss below infinity
    """

    def _format_hypers(hypers):
        # One entry per tensor with three decimals; multi-element tensors
        # are shown in brackets
        parts = []
        for h in hypers:
            values = h.detach().flatten().tolist()
            if len(values) == 1:
                parts.append(f"{values[0]:.3f}")
            else:
                parts.append("[" + ", ".join(f"{v:.3f}" for v in values) + "]")
        return ", ".join(parts)

    # Clone hyperparameters to avoid modifying the original tensors: the optimiser
    # updates its parameters in place, so it must own independent leaf copies
    hyperparameters_bd = [
        h.detach().clone().requires_grad_(h.requires_grad)
        for h in hyperparameters_bd_initial]

    optimizer = optim.Adam(
        [hyperparameters_bd[0], hyperparameters_bd[1], hyperparameters_bd[2], hyperparameters_bd[3]],
        lr = learning_rate)

    best_loss = float('inf')
    best_hypers_bd = None
    no_improvement_count = 0

    for trial in range(max_optimisation_iterations):

        # NOTE(review): divergence_free_bool = True is passed exactly as in
        # optimize_hypers_on_train_df - confirm this is intended for the "bd" model
        _, _, lml = predict(
            X_train,
            Y_train_noisy,
            X_test,
            hyperparameters_bd,
            divergence_free_bool = True)

        # The value returned by predict is negated to obtain the loss (NLML)
        loss = - lml
        loss_value = loss.item()

        # Check for improvement (a NaN loss never compares as smaller)
        if loss_value < best_loss:
            best_loss = loss_value
            # Snapshot copies: the live tensors are changed by optimizer.step()
            best_hypers_bd = [h.clone().detach() for h in hyperparameters_bd]
            no_improvement_count = 0  # Reset counter
        else:
            no_improvement_count += 1  # Increase counter

        # Stop if loss has stagnated
        if no_improvement_count >= patience:
            print(f"Stopping early after {trial+1} iterations.")
            if best_hypers_bd is None:
                print("No loss below infinity was observed; no best hyperparameters to report.")
            else:
                # Report all four optimised hyperparameters
                print(f"Best hyperparameters: {_format_hypers(best_hypers_bd[:4])}")
                print(f"Best NLML: {best_loss:.2f}")
            break

        optimizer.zero_grad()  # Reset gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update hypers

    return best_hypers_bd


In [None]:
def divergence_free_se_kernel_l2d(
        row_tensor, # torch.Size([n_rows, 2])
        column_tensor, # torch.Size([n_columns, 2])
        hyperparameters):
    
    """
    Calculate the divergence-free SE kernel for two sets of points in 2D space.
    R^2 -> R^2

    Inputs:
        row_tensor: torch.Size([n_rows, 2])
        column_tensor: torch.Size([n_columns, 2])
        hyperparameters: list of length 3 containing sigma_n, sigma_f and l.
            sigma_n is not used here. l is a tensor holding either one shared
            lengthscale (any single-element shape, including 0-d) or two
            lengthscales (l[0] for x1, l[1] for x2).

    Returns:
        K: torch.Size([n_rows * 2, n_columns * 2]), laid out as
            [[upper_left, upper_right], [lower_left, lower_right]]
    """
    
    # Extract hyperparameters (except for sigma_n)
    sigma_f = hyperparameters[1] 
    l = hyperparameters[2]

    # Accommodate for single or double lengthscale
    # numel() also accepts a 0-d lengthscale, which cannot be indexed with l[0]
    if l.numel() == 1:
        l1 = l
        l2 = l
    else:
        l1 = l[0]
        l2 = l[1]

    # Add dimensions (broadcasting) for the difference calculation:
    # torch.Size([n_rows, 1, 2]) - torch.Size([1, n_columns, 2])
    # yields negative values as well
    diff = row_tensor[:, None, :] - column_tensor[None, :, :]

    # Signed differences per coordinate, each torch.Size([n_rows, n_columns])
    x1_diff = diff[:, :, 0]
    x2_diff = diff[:, :, 1]

    ### 2x2 BLOCKS ###
    # x2 diffs
    upper_left = (1 - x2_diff.square().div(l2.square())).div(l2.square())

    # x1 diffs
    lower_right = (1 - x1_diff.square().div(l1.square())).div(l1.square())

    # Elementwise multiplication of x1 and x2 diffs and division by l1^2 * l2^2
    # Matlab version has negative values here!
    upper_right = torch.prod(diff, dim = -1).div(l1.square() * l2.square())

    # same as other off-diagonal block
    lower_left = upper_right

    # Concatenate upper and lower blocks column-wise, and then concatenate them row-wise
    # torch.Size([2 * n_rows, 2 * n_columns])
    blocks = torch.cat((
        torch.cat((upper_left, upper_right), 1), 
        torch.cat((lower_left, lower_right), 1)
        ), 0)

    # SE factor with each coordinate scaled by its own lengthscale:
    # exp(-0.5 * (d1^2 / l1^2 + d2^2 / l2^2)).
    # The blocks above are the second derivatives of exactly this function, so
    # dividing the summed squared distance by 2 * l1 * l2 instead is only
    # consistent with them when l1 == l2 (where both forms coincide).
    scaled_sq_dist = x1_diff.square().div(l1.square()) + x2_diff.square().div(l2.square())
    se_factor = scaled_sq_dist.div(-2).exp()

    # torch.Size([2 * n_rows, 2 * n_columns])
    # elementwise multiplication; the SE factor is tiled to cover all four blocks
    K = sigma_f.square() * blocks.mul(se_factor.tile(2, 2))

    return K