# Start with dfGP for real data

- check if noise is enough
- likelihood: observation-specific noise

In [1]:
model_name = "dfGP"
from gpytorch_models import dfGP

# import configs so we can access the hypers with getattr
import configs
from configs import PATIENCE, MAX_NUM_EPOCHS, NUM_RUNS, WEIGHT_DECAY
from configs import TRACK_EMISSIONS_BOOL

# Debug overrides for this notebook session: these two assignments shadow the
# values imported from configs above.
# PATIENCE and WEIGHT_DECAY are used exactly as imported from configs.
MAX_NUM_EPOCHS = 20
NUM_RUNS = 1

# assign model-specific variable
MODEL_LEARNING_RATE = getattr(configs, f"{model_name}_REAL_LEARNING_RATE")
MODEL_REAL_RESULTS_DIR = getattr(configs, f"{model_name}_REAL_RESULTS_DIR")
import os
os.makedirs(MODEL_REAL_RESULTS_DIR, exist_ok = True)

# basics
import pandas as pd
import torch
import gpytorch

# universals 
from metrics import compute_divergence_field, quantile_coverage_error_2d
from utils import set_seed, make_grid
import gc
import warnings
set_seed(42)

# setting device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# overwrite if needed: # device = 'cpu'
print('Using device:', device)
print()

### START TIMING ###
import time
start_time = time.time()  # Start timing after imports

### START TRACKING EXPERIMENT EMISSIONS ###
if TRACK_EMISSIONS_BOOL:
    from codecarbon import EmissionsTracker
    tracker = EmissionsTracker(project_name = "dfGP_real_experiments", output_dir = MODEL_REAL_RESULTS_DIR)
    tracker.start()

#############################
### LOOP 1 - over REGIONS ###
#############################

# for region_name in ["region_lower_byrd", "region_mid_byrd", "region_upper_byrd"]:
for region_name in ["region_lower_byrd"]:

    print(f"\nTraining for {region_name.upper()}...")

    # Store metrics for the current region (used for *metrics_summary* report and *metrics_per_run*)
    region_results = []

    ##########################################
    ### x_train & y_train, x_test & y_test ###
    ##########################################

    # define paths based on region_name
    path_to_training_tensor = "data/real_data/" + region_name + "_train_tensor.pt"
    path_to_test_tensor = "data/real_data/" + region_name + "_test_tensor.pt"

    # load and transpose to have rows as points
    train = torch.load(path_to_training_tensor, weights_only = False).T 
    test = torch.load(path_to_test_tensor, weights_only = False).T

    # The train and test tensors have the following columns:
    # [:, 0] = x
    # [:, 1] = y
    # [:, 2] = surface elevation (s)
    # [:, 3] = ice flux in x direction (u)
    # [:, 4] = ice flux in y direction (v)
    # [:, 5] = ice flux error in x direction (u_err)
    # [:, 6] = ice flux error in y direction (v_err)
    # [:, 7] = source age

    # train
    x_train = train[:, [0, 1]].to(device)
    y_train = train[:, [3, 4]].to(device)

    # test
    x_test = test[:, [0, 1]].to(device)
    y_test = test[:, [3, 4]].to(device)

    # Rescale the input coordinates by a factor of 10, applied identically to
    # train and test so both stay in the same coordinate system.
    # NOTE(review): the reason for the factor 10 is not recorded here — confirm.
    x_train = x_train * 10
    x_test = x_test * 10

    # NOTE: Here we estimate the noise variance 
    """
    ### NOISE MODEL ###
    # TRAIN
    # noise variance (h * sigma_u)^2 and (h * sigma_v)^2 (tensor contains [h sig_u, h sig_v] stds)
    noise_var_h_times_uv_train = torch.concat((train[:, 5], train[:, 6]), dim = 0)**2
    # assume age dependent noise sigma_h on ice thickness measurements: ~10 - 20 m std (1000 scaling)
    sigma_h = 0.01 * torch.log(train[:, 7] + 3)
    # calculate noise variance (u * sigma_h)^2 and (v * sigma_h)^2
    noise_var_uv_times_h_train = (torch.concat((train[:, 3], train[:, 4]), dim = 0) * torch.cat([sigma_h, sigma_h]))**2
    # combine both noise variances into the std for each dimension
    train_noise_diag = torch.sqrt(noise_var_h_times_uv_train + noise_var_uv_times_h_train).to(device)

    # Compute midpoint
    midpoint = train_noise_diag.shape[0] // 2

    # Print noise levels for train, formatted to 4 decimal places
    print(f"Mean noise std per x dimension: {train_noise_diag[:midpoint].mean(dim = 0).item():.4f}")
    print(f"Mean noise std per y dimension: {train_noise_diag[midpoint:].mean(dim = 0).item():.4f}")

    # TEST
    # noise variance (h * sigma_u)^2 and (h * sigma_v)^2 (tensor contains [h sig_u, h sig_v] stds)
    noise_var_h_times_uv_test = torch.concat((test[:, 5], test[:, 6]), dim = 0)**2
    # assume age dependent noise sigma_h on ice thickness measurements: ~10 - 20 m std (1000 scaling)
    sigma_h = 0.01 * torch.log(test[:, 7] + 3)
    # calculate noise variance (u * sigma_h)^2 and (v * sigma_h)^2
    noise_var_uv_times_h_test = (torch.concat((test[:, 3], test[:, 4]), dim = 0) * torch.cat([sigma_h, sigma_h]))**2
    # combine both noise variances into the std for each dimension
    test_noise_diag = torch.sqrt(noise_var_h_times_uv_test + noise_var_uv_times_h_test).to(device)
    """

    # Print train details
    print(f"=== {region_name.upper()} ===")
    print(f"Training inputs shape: {x_train.shape}")
    print(f"Training observations shape: {y_train.shape}")
    print(f"Training inputs dtype: {x_train.dtype}")
    print()

    # Print test details
    print(f"=== {region_name.upper()} ===")
    print(f"Test inputs shape: {x_test.shape}")
    print(f"Test observations shape: {y_test.shape}")
    print(f"Test inputs dtype: {x_test.dtype}")
    print()

    ##################################
    ### LOOP 2 - over training run ###
    ##################################

    # NOTE: GPs don't train on batches, use full data

    for run in range(NUM_RUNS):

        print(f"\n--- Training Run {run + 1}/{NUM_RUNS} ---")

        # Initialise the likelihood for the GP model (estimates noise)
        # NOTE: we use a multitask likelihood for the dfGP model but with a global noise term
        likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
            num_tasks = 2,
            rank = 2,
            has_global_noise = True, 
            has_task_noise = False, # HACK: This still needs to be manually turned off
            ).to(device)

        model = dfGP(
            x_train,
            y_train, 
            likelihood
            ).to(device)
        
        model.likelihood.noise = torch.tensor([0.02]).to(device)
        model.covar_module.outputscale = torch.tensor([1.8]).to(device)
        model.covar_module.base_kernel.lengthscale = torch.tensor([0.5, 0.3]).to(device)
        
        optimizer = torch.optim.AdamW(model.parameters(), lr = MODEL_LEARNING_RATE, weight_decay = WEIGHT_DECAY)
        
        # Use ExactMarginalLogLikelihood
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        model.train()
        likelihood.train()
        # _________________
        # BEFORE EPOCH LOOP
        
        # Export the convergence just for first run only
        if run == 0:
            # initialise tensors to store losses over epochs (for convergence plot)
            train_losses_NLML_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # objective
            train_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # by-product
            # monitor performance transfer to test (only RMSE easy to calc without covar)
            test_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

            # NOTE: Here, we estimate the noise
            l1_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            l2_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            outputscale_var_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            noise_var_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

        # Early stopping variables
        best_loss = float('inf')
        # counter starts at 0
        epochs_no_improve = 0

        ############################
        ### LOOP 3 - over EPOCHS ###
        ############################
        print("\nStart Training")

        for epoch in range(MAX_NUM_EPOCHS):

            # Set to train
            model.train()
            likelihood.train()

            # Do a step
            optimizer.zero_grad()
            # model outputs a multivariate normal distribution
            train_pred_dist = model(x_train.to(device))
            # Train on the noisy targets
            # NOTE: We only have observational y_train i.e. noisy data
            loss = - mll(train_pred_dist, y_train.reshape(-1).to(device))  # negative marginal log likelihood
            loss.backward()
            optimizer.step()

            # Run 1 only: record convergence diagnostics after each optimiser step
            if run == 0:

                model.eval()
                likelihood.eval()

                # Both evaluation passes run with autograd disabled (the values are
                # only read out via .item() below) and with gpytorch debug off.
                with torch.no_grad(), gpytorch.settings.debug(False):
                    # Only in eval it computes it the right way
                    train_pred_dist_eval = model(x_train.to(device))
                    test_pred_dist_eval = model(x_test.to(device))

                # Compute RMSE for training and test predictions against the observed targets
                train_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(train_pred_dist_eval, y_train.to(device)).mean())
                test_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(test_pred_dist_eval, y_test.to(device)).mean())

                # Save losses for convergence plot
                train_losses_NLML_over_epochs[epoch] = loss.item()
                train_losses_RMSE_over_epochs[epoch] = train_RMSE.item()
                test_losses_RMSE_over_epochs[epoch] = test_RMSE.item()

                # Save evolution of hypers for convergence plot
                l1_over_epochs[epoch] = model.base_kernel.lengthscale[0].item()
                l2_over_epochs[epoch] = model.base_kernel.lengthscale[1].item()
                outputscale_var_over_epochs[epoch] = model.covar_module.outputscale.item()
                noise_var_over_epochs[epoch] = model.likelihood.noise.item()

                if epoch % 20 == 0:
                    # Print a bit more information for the first run
                    print(f"{region_name} {model_name} Run {run + 1}/{NUM_RUNS}, Epoch {epoch + 1}/{MAX_NUM_EPOCHS}, Training Loss (NLML): {loss:.4f}, Training RMSE: {train_RMSE:.4f}")

                    # Free up memory every 20 epochs. These are two separate
                    # statements on purpose: gc.collect() returns the number of
                    # collected objects, so chaining with `and` would skip the
                    # cache flush whenever that number is 0.
                    gc.collect()
                    torch.cuda.empty_cache()

        ##############################
        ### END LOOP 3 over EPOCHS ###
        ##############################

        # for every run...
        #######################################################
        ### EVALUATE after all training for RUN is finished ###
        #######################################################

        model.eval()
        likelihood.eval()

        # Underlying (latent) distribution and predictive distribution
        # with gpytorch.settings.debug(False):
        #    with torch.no_grad():
                # dist_train = model(x_train)
                # pred_dist_train = likelihood(dist_train)
                # Make it interleaved structure?

        import sys

        def trace_calls(frame, event, arg):
            """Print the name and location of each function call seen while tracing.

            Has the callback signature expected by ``sys.settrace``
            (frame, event, arg). Returns None for any event other than
            'call'; for 'call' events it prints one line and returns itself.
            """
            if event != 'call':
                return None
            code = frame.f_code
            print(f"Call to {code.co_name} in {code.co_filename}:{frame.f_lineno}")
            return trace_calls

        # Trace the GP prediction. The finally clause guarantees that tracing
        # is switched off again even if the forward pass raises.
        sys.settrace(trace_calls)
        try:
            dist_test = model(x_test)
        finally:
            sys.settrace(None)
        pred_dist_test = likelihood(dist_test)

        # with warnings.catch_warnings():
        #    warnings.simplefilter("ignore", gpytorch.utils.warnings.GPInputWarning)
            # dist_train = model(x_train_grad)
        
        # pred_dist_train = likelihood(dist_train)
        
        # Compute divergence field (from latent distribution)
        # test_div_field = compute_divergence_field(dist_test.mean, x_test_grad)
        # train_div_field = compute_divergence_field(dist_train.mean, x_train_grad)

        # Compute TEST metrics (convert tensors to float) for every run's tuned model
        test_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(
            pred_dist_test, y_test.to(device)).mean()).item()
        test_MAE = gpytorch.metrics.mean_absolute_error(
            pred_dist_test, y_test.to(device)).mean().item()
        print("Here")
        test_NLL = gpytorch.metrics.negative_log_predictive_density(
            pred_dist_test, y_test.to(device)).item()
        test_QCE = quantile_coverage_error_2d(
            pred_dist_test, y_test.to(device), quantile = 95.0).item()
        ## NOTE: It is important to use the absolute value of the divergence field, since both positive and negative deviations are violations and shouldn't cancel each other out 
        # test_MAD = test_div_field.abs().mean().item()


Using device: cuda


Training for REGION_LOWER_BYRD...
=== REGION_LOWER_BYRD ===
Training inputs shape: torch.Size([536, 2])
Training observations shape: torch.Size([536, 2])
Training inputs dtype: torch.float32

=== REGION_LOWER_BYRD ===
Test inputs shape: torch.Size([293, 2])
Test observations shape: torch.Size([293, 2])
Test inputs dtype: torch.float32


--- Training Run 1/1 ---

Start Training


AttributeError: 'MultivariateNormal' object has no attribute '_interleaved'

In [None]:
pred_dist_test.batch_shape

In [None]:
pred_dist_test._batch_shape
pred_dist_test._event_shape
pred_dist_test._extended_shape
pred_dist_test._interleaved
pred_dist_test._validate_args # Why?
pred_dist_test.covariance_matrix - pred_dist_test.covariance_matrix.T

In [None]:
pred_dist_test.covariance_matrix - pred_dist_test.covariance_matrix.T

In [None]:
torch.set_printoptions(precision = 3, sci_mode = False)
pred_dist_test.covariance_matrix[0:4, 0:4]

In [None]:
from gpytorch_models import dfRBFKernel
my_kernel = dfRBFKernel()

noise = torch.tensor([0.02]).to(device)
outputscale = torch.tensor([1.8]).to(device)
my_kernel.lengthscale = torch.tensor([0.5, 0.3]).to(device)

K_train_train = my_kernel(x_train, x_train).evaluate().detach()
K_test_train = my_kernel(x_test, x_train).evaluate().detach()

In [None]:
torch.set_printoptions(precision = 4, sci_mode = False)
K_test_train[0:4, 0:4]

In [None]:
(K_train_train - K_train_train.T).max()

In [None]:
import matplotlib.pyplot as plt
plt.imshow(train_pred_dist_eval.covariance_matrix.detach().cpu())

In [None]:
out = dist_test.from_batch_mvn(dist_test, interleaved = False)

In [None]:
print(testy._interleaved)
testy.covariance_matrix - testy.covariance_matrix.T

In [None]:
import gpytorch

# NOTE(review): the object created on the next line is discarded and never
# entered with a `with` statement; presumably this leaves gpytorch's debug
# setting unchanged for later cells — confirm. The training cell uses the
# same call as a context manager: `with gpytorch.settings.debug(False):`.
gpytorch.settings.debug(False)

In [None]:
isinstance(train_pred_dist, gpytorch.distributions.MultitaskMultivariateNormal) 

# train pred is block diagonal & symmetric
print(train_pred_dist._interleaved)
print(pred_dist_test._interleaved)

In [None]:
train_NLL = gpytorch.metrics.negative_log_predictive_density(
            likelihood(train_pred_dist), y_train.to(device)).item()
print(f"Train NLL: {train_NLL:.4f}")
# For train it helps to wrap a large likelihood around the train_pred_dist

(train_pred_dist.covariance_matrix - train_pred_dist.covariance_matrix.T).max()

In [None]:
print(model.likelihood.noise.item())
print(model.covar_module.outputscale.item())

In [None]:
# train_NLL = gpytorch.metrics.negative_log_predictive_density(
#            likelihood(pred_dist_test), y_train.to(device)).item()
# print(f"Train NLL: {train_NLL:.4f}")
# For train it helps to wrap a large likelihood around the train_pred_dist

(pred_dist_test.covariance_matrix - pred_dist_test.covariance_matrix.T).max()

# Look 

In [None]:
pred_dist_test.covariance_matrix - pred_dist_test.covariance_matrix.T

In [None]:
pred_dist_test.covariance_matrix
import matplotlib.pyplot as plt

# Confusing block and interleaved structure?
plt.imshow(pred_dist_test.covariance_matrix.cpu().numpy(), cmap = 'viridis')

In [None]:
y_train.transpose(-1, -2).reshape(*y_train.shape[:-2], -1)

In [None]:
y_train.shape[:-2]

In [None]:
y_train.reshape(*y_train.shape[:-2], -1)

In [None]:
plt.imshow(train_pred_dist.covariance_matrix.detach().cpu().numpy(), cmap = 'viridis')

In [None]:
def block_to_interleaved(K_block):
    """
    Convert a (..., 2N, 2N) block covariance matrix to interleaved ordering.

    Assumes that the last two dimensions of K_block are ordered as:
        [ K_uu  K_uv ]
        [ K_vu  K_vv ]
    i.e. rows/columns run (u_1, ..., u_N, v_1, ..., v_N). In the result they
    run (u_1, v_1, ..., u_N, v_N). Any leading batch dimensions are kept.

    Args:
        K_block: tensor whose last two dimensions are square with even size.

    Returns:
        A new tensor with the same shape, dtype and device as K_block.

    Raises:
        ValueError: if K_block has fewer than two dimensions, or its last two
            dimensions are not square with an even size.
    """
    if K_block.dim() < 2:
        raise ValueError(
            f"K_block must have at least 2 dimensions, got shape {tuple(K_block.shape)}"
        )
    n_rows, n_cols = K_block.shape[-2:]
    if n_rows != n_cols or n_rows % 2 != 0:
        raise ValueError(
            f"Last two dimensions of K_block must be (2N, 2N), got ({n_rows}, {n_cols})"
        )
    N = n_rows // 2

    # Every entry is written below (the four strided views tile the matrix),
    # so the output buffer does not need to be zero-initialised.
    K_interleaved = torch.empty_like(K_block)

    # Ellipsis indexing keeps the slicing on the last two dimensions, so
    # batched inputs are handled the same way as a single matrix.
    K_interleaved[..., 0::2, 0::2] = K_block[..., :N, :N]  # K_uu
    K_interleaved[..., 0::2, 1::2] = K_block[..., :N, N:]  # K_uv
    K_interleaved[..., 1::2, 0::2] = K_block[..., N:, :N]  # K_vu
    K_interleaved[..., 1::2, 1::2] = K_block[..., N:, N:]  # K_vv

    return K_interleaved

plt.imshow(block_to_interleaved(train_pred_dist.covariance_matrix.detach().cpu())[0:100, 0:100], cmap = 'viridis')

In [None]:
K_uv

In [None]:
block_to_interleaved(train_pred_dist.covariance_matrix.detach().cpu())

In [None]:
# 1. Build a model from the same init args used in the training cell
#    (x_train, y_train, likelihood and device are defined there)
original_model = dfGP(
    x_train, 
    y_train, 
    likelihood).to(device)

original_model.train()  # (or eval, if you want to copy at that stage)

# 2. Save state dict
state = original_model.state_dict()

# 3. Re-instantiate a clean model of the same class and load weights
copied_model = dfGP(x_train, y_train, likelihood).to(device)
copied_model.load_state_dict(state)

In [None]:
(train_pred_dist.covariance_matrix - train_pred_dist.covariance_matrix.T)

In [None]:
for name, value in model.named_parameters():
    print(f"{name}: {value}")

In [None]:
pred_dist_test.covariance_matrix

In [None]:
# REAL DATA EXPERIMENTS
# RUN WITH python run_real_experiments_dfGP.py
#               _                 _   _      
#              | |               | | (_)     
#    __ _ _ __ | |_ __ _ _ __ ___| |_ _  ___ 
#   / _` | '_ \| __/ _` | '__/ __| __| |/ __|
#  | (_| | | | | || (_| | | | (__| |_| | (__ 
#   \__,_|_| |_|\__\__,_|_|  \___|\__|_|\___|
# 
model_name = "dfGP"
from gpytorch_models import dfGP, dfRBFKernel

# import configs so we can access the hypers with getattr
import configs
from configs import PATIENCE, MAX_NUM_EPOCHS, NUM_RUNS, WEIGHT_DECAY, N_SIDE

# Debug overrides for this notebook session: these two assignments shadow the
# values imported from configs above.
# PATIENCE and WEIGHT_DECAY are used exactly as imported from configs.
MAX_NUM_EPOCHS = 1
NUM_RUNS = 1

# assign model-specific variable
MODEL_LEARNING_RATE = getattr(configs, f"{model_name}_REAL_LEARNING_RATE")
MODEL_REAL_RESULTS_DIR = getattr(configs, f"{model_name}_REAL_RESULTS_DIR")
import os
os.makedirs(MODEL_REAL_RESULTS_DIR, exist_ok = True)

# basics
import pandas as pd
import torch
import gpytorch

# universals 
from metrics import compute_divergence_field, quantile_coverage_error_2d
from utils import set_seed, make_grid
import gc
import warnings
set_seed(42)

# setting device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# overwrite if needed: # device = 'cpu'
print('Using device:', device)
print()

#############################
### LOOP 1 - over REGIONS ###
#############################

# for region_name in ["region_lower_byrd"]:
# for region_name in ["region_lower_byrd", "region_mid_byrd", "region_upper_byrd"]:
for region_name in ["region_mid_byrd"]:

    print(f"\nTraining for {region_name.upper()}...")

    # Store metrics for the current region (used for *metrics_summary* report and *metrics_per_run*)
    region_results = []

    ##########################################
    ### x_train & y_train, x_test & y_test ###
    ##########################################

    # define paths based on region_name
    path_to_training_tensor = "data/real_data/" + region_name + "_train_tensor.pt"
    path_to_test_tensor = "data/real_data/" + region_name + "_test_tensor.pt"

    # load and transpose to have rows as points
    train = torch.load(path_to_training_tensor, weights_only = False).T 
    test = torch.load(path_to_test_tensor, weights_only = False).T

    # The train and test tensors have the following columns:
    # [:, 0] = x
    # [:, 1] = y
    # [:, 2] = surface elevation (s)
    # [:, 3] = ice flux in x direction (u)
    # [:, 4] = ice flux in y direction (v)
    # [:, 5] = ice flux error in x direction (u_err)
    # [:, 6] = ice flux error in y direction (v_err)
    # [:, 7] = source age

    # train
    x_train = train[:, [0, 1]].to(device)
    y_train = train[:, [3, 4]].to(device)

    # test
    x_test = test[:, [0, 1]].to(device)
    y_test = test[:, [3, 4]].to(device)

    _, x_test_grid = make_grid(N_SIDE)

    # Print train details
    print(f"=== {region_name.upper()} ===")
    print(f"Training inputs shape: {x_train.shape}")
    print(f"Training observations shape: {y_train.shape}")
    print(f"Training inputs dtype: {x_train.dtype}")
    print()

    # Print test details
    print(f"=== {region_name.upper()} ===")
    print(f"Test inputs shape: {x_test.shape}")
    print(f"Test observations shape: {y_test.shape}")
    print(f"Test inputs dtype: {x_test.dtype}")
    print()

    ##################################
    ### LOOP 2 - over training run ###
    ##################################

    # NOTE: GPs don't train on batches, use full data

    for run in range(NUM_RUNS):

        print(f"\n--- Training Run {run + 1}/{NUM_RUNS} ---")

        # Initialise the likelihood for the GP model (estimates noise)
        # NOTE: we use a multitask likelihood for the dfGP model but with a global noise term
        likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
            num_tasks = 2,
            has_global_noise = True, 
            has_task_noise = False, # HACK: This still needs to be manually turned off
            ).to(device)
        
        # likelihood.noise = torch.tensor([0.02], device = device)  # initial noise variance (global noise)

        # Initialise fresh GP model with x_train and y_train
        model = dfGP(
            x_train,
            y_train, 
            likelihood
            ).to(device)
        
        # model.covar_module.outputscale = torch.tensor(10.0, device = device)
        # model.base_kernel.register_constraint("raw_lengthscale", gpytorch.constraints.GreaterThan(0.3))
        
        optimizer = torch.optim.AdamW(model.parameters(), lr = MODEL_LEARNING_RATE, weight_decay = WEIGHT_DECAY)
        
        # Use ExactMarginalLogLikelihood
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        ############################
        ### LOOP 3 - over EPOCHS ###
        ############################
        print("\nStart Training")

        for epoch in range(MAX_NUM_EPOCHS):

            # Set to train
            model.train()
            likelihood.train()

            # Do a step
            optimizer.zero_grad()
            # model outputs a multivariate normal distribution
            train_pred_dist = model(x_train.to(device))
    
            # loss = - mll(train_pred_dist, y_train.to(device))  # negative marginal log likelihood
            # loss.backward()
            # optimizer.step()

        print("Training was no problem ")
        
        model.eval()
        likelihood.eval()

        with torch.no_grad(), gpytorch.settings.fast_pred_var():
            # train_pred_dist = model(x_train.to(device))

            dist_test = model(x_test)
            pred_dist_test = likelihood(dist_test)

            print("No problem yet")
            test_NLL = gpytorch.metrics.negative_log_predictive_density(
                    pred_dist_test, y_test.to(device)).item()

In [None]:
train_pred_dist.covariance_matrix[0:5, 0:5]

In [None]:
pred_dist_test.covariance_matrix[0:5, 0:5]

In [None]:
torch.set_printoptions(precision = 3, sci_mode=False)
dist_test.covariance_matrix[0:5, 0:5]

In [None]:
train_pred_dist.covariance_matrix[0:5, 0:5]

In [None]:
kernel = dfRBFKernel()
kernel = model.covar_module

K_test_test = kernel(x_test).evaluate()
# same as K_test_test = kernel(x_test, x_test).evaluate()

K_train_train = kernel(x_train).evaluate()

K_test_train = kernel(x_test, x_train).evaluate()

(K_train_train - K_train_train.T).max()

In [None]:
torch.set_printoptions(precision=3, sci_mode=False)
K_train_train

In [None]:
K_test_train

In [None]:
K_test_train

In [None]:
train_pred_dist.covariance_matrix

In [None]:
dist_test.covariance_matrix[0:5, 0:5]

In [None]:
# SIMULATED DATA EXPERIMENTS
# RUN WITH python run_sim_experiments_dfGP.py
# 
#       ooooooooooooooooooooooooooooooooooooo
#      8                                .d88
#      8  oooooooooooooooooooooooooooood8888
#      8  8888888888888888888888888P"   8888    oooooooooooooooo
#      8  8888888888888888888888P"      8888    8              8
#      8  8888888888888888888P"         8888    8             d8
#      8  8888888888888888P"            8888    8            d88
#      8  8888888888888P"               8888    8           d888
#      8  8888888888P"                  8888    8          d8888
#      8  8888888P"                     8888    8         d88888
#      8  8888P"                        8888    8        d888888
#      8  8888oooooooooooooooooooooocgmm8888    8       d8888888
#      8 .od88888888888888888888888888888888    8      d88888888
#      8888888888888888888888888888888888888    8     d888888888
#                                               8    d8888888888
#         ooooooooooooooooooooooooooooooo       8   d88888888888
#        d                       ...oood8b      8  d888888888888
#       d              ...oood888888888888b     8 d8888888888888
#      d     ...oood88888888888888888888888b    8d88888888888888
#     dood8888888888888888888888888888888888b
#
#
# This artwork is a visual reminder that this script is for the sim experiments.

model_name = "dfGP"
from gpytorch_models import dfGP

# import configs so we can access the hypers with getattr
import configs
from configs import PATIENCE, MAX_NUM_EPOCHS, NUM_RUNS, WEIGHT_DECAY
# also import x_test grid size and std noise for training data
from configs import N_SIDE, STD_GAUSSIAN_NOISE
from configs import TRACK_EMISSIONS_BOOL

# Debug overrides for this notebook session: these two assignments shadow the
# values imported from configs above.
# PATIENCE and WEIGHT_DECAY are used exactly as imported from configs.
MAX_NUM_EPOCHS = 100
NUM_RUNS = 1

# assign model-specific variable
MODEL_LEARNING_RATE = getattr(configs, f"{model_name}_SIM_LEARNING_RATE")
MODEL_SIM_RESULTS_DIR = getattr(configs, f"{model_name}_SIM_RESULTS_DIR")
import os
os.makedirs(MODEL_SIM_RESULTS_DIR, exist_ok = True)

# basics
import pandas as pd
import torch
import gpytorch

# universals 
from metrics import compute_divergence_field, quantile_coverage_error_2d
from utils import set_seed, make_grid
import gc
import warnings
set_seed(42)

# setting device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# overwrite if needed: # device = 'cpu'
print('Using device:', device)
print()

### START TIMING ###
import time
start_time = time.time()  # Start timing after imports

### START TRACKING EXPERIMENT EMISSIONS ###
if TRACK_EMISSIONS_BOOL:
    from codecarbon import EmissionsTracker
    tracker = EmissionsTracker(project_name = "dfGP_simulation_experiments", output_dir = MODEL_SIM_RESULTS_DIR)
    tracker.start()

### SIMULATION ###
# Import all simulation functions
from simulate import (
    simulate_detailed_branching,
    simulate_detailed_curve,
    simulate_detailed_deflection,
    simulate_detailed_edge,
    simulate_detailed_ridges,
)

# Define simulations as a dictionary with names as keys to function objects
# alphabetical order here
simulations = {
    "curve": simulate_detailed_curve,
}

########################
### x_train & x_test ###
########################

# Load training inputs (once for all simulations)
x_train = torch.load("data/sim_data/x_train_lines_discretised_0to1.pt", weights_only = False).float()

# Generate x_test (long) once for all simulations
_, x_test = make_grid(N_SIDE)
# x_test is long format (N_SIDE ** 2, 2)

#################################
### LOOP 1 - over SIMULATIONS ###
#################################

# Make y_train_dict: Iterate over all simulation functions
for sim_name, sim_func in simulations.items():

    ########################
    ### y_train & y_test ###
    ########################

    # Generate training observations
    # NOTE: sim_func() needs to be on CPU, so we move x_train to CPU
    y_train = sim_func(x_train.cpu()).to(device)
    y_test = sim_func(x_test.cpu()).to(device)
    
    x_test = x_test.to(device)
    x_train = x_train.to(device)

    # Print train details (shape, dtype and device of inputs and observations)
    print(f"=== {sim_name.upper()} ===")
    print(f"Training inputs shape: {x_train.shape}")
    print(f"Training observations shape: {y_train.shape}")
    print(f"Training inputs dtype: {x_train.dtype}")
    # Report the device of the inputs themselves, not of the observations
    print(f"Training inputs device: {x_train.device}")
    print(f"Training observations device: {y_train.device}")
    print()

    # Print details
    print(f"=== {sim_name.upper()} ===")
    print(f"Test inputs shape: {x_test.shape}")
    print(f"Test observations shape: {y_test.shape}")
    print(f"Test inputs dtype: {x_test.dtype}")
    print(f"Test inputs device: {x_test.device}")
    print(f"Test observations device: {y_test.device}")
    print()

    # NOTE: This is different to the real data experiments
    # calculate the mean magnitude of the test data as we use this to scale the noise
    sim_mean_magnitude_for_noise = torch.norm(y_test, dim = -1).mean().to(device)
    sim_noise = STD_GAUSSIAN_NOISE * sim_mean_magnitude_for_noise

    # Store metrics for the simulation (used for *metrics_summary* report and *metrics_per_run*)
    simulation_results = [] 

    ##################################
    ### LOOP 2 - over training run ###
    ##################################

    # NOTE: GPs don't train on batches, use full data

    for run in range(NUM_RUNS):

        print(f"\n--- Training Run {run + 1}/{NUM_RUNS} ---")

        # Additive noise model: independent Gaussian noise
        # For every run we have a FIXED NOISY TARGET. Draw from standard normal with appropriate std
        y_train_noisy = y_train + (torch.randn(y_train.shape, device = device) * sim_noise)

        # Initialise the likelihood for the GP model (estimates noise)
        # NOTE: we use a multitask likelihood for the dfGP model but with a global noise term
        likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
            num_tasks = 2,
            has_global_noise = True, 
            has_task_noise = False, # HACK: This still needs to be manually turned off
            ).to(device)

        # Initialise fresh GP model with flat x_train and y_train_noisy (block-flat)
        model = dfGP(
            x_train,
            y_train_noisy, 
            likelihood
            ).to(device)
        
        # NOTE: model parameters contains likelihood parameters as well
        optimizer = torch.optim.AdamW(model.parameters(), lr = MODEL_LEARNING_RATE, weight_decay = WEIGHT_DECAY)
        
        # Use ExactMarginalLogLikelihood as the reward i.e. inverse loss function
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        model.train()
        likelihood.train()
        
        # _________________
        # BEFORE EPOCH LOOP

        # Export the convergence just for first run only
        if run == 0:
            # initialise tensors to store losses over epochs (for convergence plot)
            train_losses_NLML_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # objective
            train_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # by-product
            # monitor performance transfer to test (only RMSE easy to calc without covar)
            test_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

            l1_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            l2_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            outputscale_var_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            noise_var_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

        # Early stopping variables
        best_loss = float('inf')
        # counter starts at 0
        epochs_no_improve = 0

        ############################
        ### LOOP 3 - over EPOCHS ###
        ############################

        print("\nStart Training")

        for epoch in range(MAX_NUM_EPOCHS):

            # Set to train
            model.train()
            likelihood.train()

            # Do a step
            optimizer.zero_grad()
            # model outputs a multivariate normal distribution
            train_pred_dist = model(x_train.to(device))
            # Train on the noisy targets
    
            loss = - mll(train_pred_dist, y_train_noisy.to(device))  # negative marginal log likelihood
            loss.backward()
            optimizer.step()

            # For Run 1 we save a bunch of metrics and update, while for the rest we only update
            if run == 0:

                model.eval()
                likelihood.eval()

                with torch.no_grad():
                # with warnings.catch_warnings():
                #    warnings.simplefilter("ignore", gpytorch.utils.warnings.GPInputWarning)
                #    train_pred_dist = model(x_train.to(device))
                    test_pred_dist = model(x_test.to(device))

                # Compute RMSE for training and test predictions (given true data, not noisy)
                train_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(train_pred_dist, y_train.to(device)).mean())
                test_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(test_pred_dist, y_test.to(device)).mean())

                # Save losses for convergence plot
                train_losses_NLML_over_epochs[epoch] = loss.item()
                train_losses_RMSE_over_epochs[epoch] = train_RMSE.item()
                test_losses_RMSE_over_epochs[epoch] = test_RMSE.item()

                # Save evolution of hypers for convergence plot
                l1_over_epochs[epoch] = model.base_kernel.lengthscale[0].item()
                l2_over_epochs[epoch] = model.base_kernel.lengthscale[1].item()
                outputscale_var_over_epochs[epoch] = model.covar_module.outputscale.item()
                noise_var_over_epochs[epoch] = model.likelihood.noise.item()

                # Print a bit more information for the first run
                if epoch % 20 == 0:
                    print(f"{sim_name} {model_name} Run {run + 1}/{NUM_RUNS}, Epoch {epoch + 1}/{MAX_NUM_EPOCHS}, Training Loss (NLML): {loss:.4f}, Training RMSE: {train_RMSE:.4f}")

In [None]:
from linear_operator.operators import to_linear_operator 
from linear_operator.operators import CatLinearOperator

# Build the (2N x 2M) block-structured cross-covariance between test and train
# inputs as linear operators. N = x_test.shape[0], M = x_train.shape[0].

# l1 scales the first input coordinate (r1), l2 the second (r2)
l1, l2 = model.covar_module.base_kernel.lengthscale[0], model.covar_module.base_kernel.lengthscale[1]

# Pairwise differences: diff[i, j] = x_test[i] - x_train[j], shape (N, M, 2)
diff = x_test[:, None, :] - x_train[None, :, :]
r1, r2 = diff[..., 0], diff[..., 1]

# Exponential factor shared by all four blocks
exp_term = to_linear_operator(torch.exp(-0.5 * ((r1 / l1)**2 + (r2 / l2)**2)))

# Unscaled block components (each N x M)
K_uu = to_linear_operator((1 - (r2**2 / l2**2)) / l2**2)
K_uv = to_linear_operator((r1 * r2) / (l1**2 * l2**2))
K_vv = to_linear_operator((1 - (r1**2 / l1**2)) / l1**2)


# STEP 4: Combine and stack
# Final scaled components (each shape N × M)
K_uu = K_uu * exp_term
K_uv = K_uv * exp_term
K_vu = K_uv # NOTE: K_vu is equal to K_uv
K_vv = K_vv * exp_term

# Concatenate along columns: each half is (N, 2M).
# The operators are passed as they are, exactly like for `bottom`; the previous
# `.expand(0)` calls supplied a single size, which is not a valid expansion of
# a 2-D operator.
top = CatLinearOperator(K_uu, K_uv, dim = 1)
bottom = CatLinearOperator(K_vu, K_vv, dim = 1)

# Final 2N × 2M block matrix (u rows first, then v rows)
K_block = CatLinearOperator(top, bottom, dim = 0)
K_block.to_dense().shape

In [None]:
K_uv.to_dense()[0:4, 0:4]

In [None]:
K_uv.to_dense().T.shape

In [None]:
from linear_operator.operators import to_linear_operator 
from linear_operator.operators import CatLinearOperator

# Recompute the four kernel blocks between test and train inputs (no stacking here).
# l1 scales the first input coordinate (r1), l2 the second (r2).
l1, l2 = model.covar_module.base_kernel.lengthscale[0], model.covar_module.base_kernel.lengthscale[1]
# Pairwise differences: diff[i, j] = x_test[i] - x_train[j]
diff = x_test[:, None, :] - x_train[None, :, :]
r1, r2 = diff[..., 0], diff[..., 1]
# Exponential factor that multiplies every block below
exp_term = to_linear_operator(torch.exp(-0.5 * ((r1 / l1)**2 + (r2 / l2)**2)))

# Unscaled block components, wrapped as linear operators
K_uu = to_linear_operator((1 - (r2**2 / l2**2)) / l2**2)
K_uv = to_linear_operator((r1 * r2) / (l1**2 * l2**2))
K_vv = to_linear_operator((1 - (r1**2 / l1**2)) / l1**2)


# STEP 4: Combine and stack
# Final scaled components (each shape N × M)
K_uu = K_uu * exp_term
K_uv = K_uv * exp_term
K_vu = K_uv # NOTE: K_vu is equal to K_uv (same object, not a transpose)
K_vv = K_vv * exp_term

In [None]:
import linear_operator

# Scratch cell: try out a few linear_operator building blocks on 2 x 2 inputs.
Z_uu = linear_operator.operators.ZeroLinearOperator(2, 2)

I = linear_operator.operators.IdentityLinearOperator((2))

ones = torch.ones(2, 2)

M = linear_operator.operators.MaskedLinearOperator(
    base = ones,
    row_mask = torch.tensor([True, False]),
    col_mask = torch.tensor([False, False]))

# It slices, it does not mask
M.to_dense()

# One-hot 2 x 2 matrix with a single 1 in the top-left entry
indicator = torch.zeros(2, 2)
indicator[0, 0] = 1.0

# Last expression of the cell (was misspelled `inticator`, which raised a NameError)
indicator

In [None]:
import linear_operator.operators.kronecker_product_linear_operator

### Step 1: Create indicators for the "interleaving" Kronecker product
# Each indicator E_pq is a 2 x 2 one-hot matrix. kron(K, E_pq) places K[i, j] at
# entry (2i + p, 2j + q) and zeros elsewhere, so summing the four expanded
# blocks yields the interleaved (2N x 2M) matrix.
K_uu_indicator = torch.zeros(2, 2).to(device)
K_uu_indicator[0, 0] = 1.0
# Wrap this cell's own indicator (previously the unrelated CPU tensor
# `indicator` from another cell was wrapped, ignoring K_uu_indicator).
K_uu_indicator_lo = to_linear_operator(K_uu_indicator)

K_uv_indicator = torch.zeros(2, 2).to(device)
K_uv_indicator[0, 1] = 1.0
K_uv_indicator_lo = to_linear_operator(K_uv_indicator)

K_vu_indicator = torch.zeros(2, 2).to(device)
K_vu_indicator[1, 0] = 1.0
K_vu_indicator_lo = to_linear_operator(K_vu_indicator)

K_vv_indicator = torch.zeros(2, 2).to(device)
K_vv_indicator[1, 1] = 1.0
K_vv_indicator_lo = to_linear_operator(K_vv_indicator)

# Step 2: Create Kronecker product linear operators
# NOTE: The argument order matters in all four calls: the Kronecker product is
# not commutative, and the kernel block must come first to get interleaving.
K_uu_expand = linear_operator.operators.KroneckerProductLinearOperator(
    K_uu,
    K_uu_indicator_lo,
)

K_uv_expand = linear_operator.operators.KroneckerProductLinearOperator(
    K_uv,
    K_uv_indicator_lo,
)

K_vu_expand = linear_operator.operators.KroneckerProductLinearOperator(
    K_vu,
    K_vu_indicator_lo,
)

K_vv_expand = linear_operator.operators.KroneckerProductLinearOperator(
    K_vv,
    K_vv_indicator_lo,
)

# Step 3: The four expanded blocks have disjoint non-zero entries; add them up
K_interleave = K_uu_expand + K_uv_expand + K_vu_expand + K_vv_expand

In [None]:
K_interleave = K_uu_expand + K_uv_expand + K_vu_expand + K_vv_expand

print(K_interleave.to_dense())

In [None]:
K_uu.shape

# Two operators are added with `+`. torch.sum's second positional argument is
# a dimension to reduce over, not a second operand.
K_uu_expand + K_uv_expand

In [None]:
# `.sum(...)` takes a dimension, not another operator; add the operators instead
(K_uu_expand + K_uv_expand).shape

In [None]:
print(K_uv_expand.to_dense())

In [None]:
import linear_operator

# NOTE(review): another cell calls ZeroLinearOperator(2, 2) with separate sizes;
# confirm that passing a single tuple is accepted here.
nothong = linear_operator.operators.ZeroLinearOperator((10, 10))
linear_operator.operators.BlockDiagLinearOperator(base_linear_op = nothong)

# Signature pasted from the linear_operator docs, kept as a comment for
# reference (as bare text it was a syntax error that broke the whole cell):
# class linear_operator.operators.KernelLinearOperator(x1, x2, covar_func, num_outputs_per_input=(1, 1), num_nonbatch_dimensions=None, **params)
# https://linear-operator.readthedocs.io/en/latest/data_sparse_operators.html

In [None]:
torch.stack([K_uu.to_dense(), K_uv.to_dense()], dim = -1).shape

In [None]:
nothong.to_dense()

In [None]:
def interleave_permutation(n):
    """Return the index order that interleaves two stacked halves of length ``n``.

    The result is the 1-D tensor ``[0, n, 1, n + 1, ..., n - 1, 2n - 1]``:
    entry ``2 * i`` picks ``i`` from the first half ``[0, n)`` and entry
    ``2 * i + 1`` picks ``n + i`` from the second half ``[n, 2n)``.
    """
    # Lay 0..2n-1 out as two rows (first half, second half), then read it
    # column by column.
    halves = torch.arange(2 * n).reshape(2, n)
    return halves.t().reshape(-1)

# Block sizes are read off K_uu; the other three blocks are assumed to match.
N = K_uu.shape[0]
M = K_uu.shape[1]

perm_rows, perm_cols = interleave_permutation(N), interleave_permutation(M)

# Reordering the rows of an identity matrix gives a permutation matrix.
# (For a dense matrix the same reordering could be done by direct indexing
# with torch.index_select / gather instead of a matmul.)
P_row = torch.eye(2 * N).index_select(0, perm_rows).to(device)
P_col = torch.eye(2 * M).index_select(0, perm_cols).to(device)

# If your K_block is a dense tensor: permute rows from the left, columns from the right
K_interleaved = (P_row @ K_block) @ P_col.t()

In [None]:
plt.imshow(K_interleaved.to_dense().detach().cpu().numpy(), cmap='viridis')

In [None]:
# Attempt to build the interleaved matrix row by row: for every index i, append
# one "u" entry made from K_uu / K_uv and one "v" entry made from K_vu / K_vv.
rows = []
# NOTE(review): the range covers 2 * K_uu.shape[0] indices, but i is used to
# index the first dimension of K_uu itself - looks like it should stop at
# K_uu.shape[0]; confirm.
for i in range(K_uu.shape[0] * 2):  # intended to iterate over N
    # NOTE(review): dim = 0 concatenates the two row slices along their first
    # dimension; confirm that K_uu[i, :] is still a linear operator here.
    row_u = CatLinearOperator(K_uu[i, :], K_uv[i, :], dim = 0)
    row_v = CatLinearOperator(K_vu[i, :], K_vv[i, :], dim = 0)
    rows.append(row_u)
    rows.append(row_v)

# Stack rows interleaved → intended shape (2N, 2M)
K_interleaved = CatLinearOperator(*rows, dim=-2)

# NOTE(review): the next line only references the method without calling it
K_interleaved.to_dense
import matplotlib.pyplot as plt
# Show the dense matrix as a heat map
plt.imshow(K_interleaved.to_dense().detach().cpu().numpy(), cmap = 'viridis')

In [None]:
row_u = CatLinearOperator(K_uu[0, :].expand(1, -1), K_uv[0, :].expand(1, -1), dim = 0)

In [None]:
K_uu[0].expand(1, -1)

In [None]:
# Experiment: expand each block with a leading dimension of size 1 and
# concatenate along that new leading dimension instead of along rows/columns.
top = CatLinearOperator(K_uu.expand(1, -1, -1), K_uv.expand(1, -1, -1), dim = 0)
bottom = CatLinearOperator(K_vu.expand(1, -1, -1), K_vv.expand(1, -1, -1), dim = 0)
# Same idea one level up: expand top/bottom to four dimensions and concatenate again
block = CatLinearOperator(top.expand(1, 2, -1, -1), bottom.expand(1, 2, -1, -1), dim = 0)
block.shape
# Swap the two leading dimensions
out = block.permute(1, 0, 2, 3)
# Shape noted from a previous run: torch.Size([2, 2, 293, 536])
# NOTE: 293 and 536 below are hard-coded from that recorded shape
out.view(2, 2*293, 536)

In [None]:
K_block.shape

In [None]:
top.expand(1, 2, -1, -1)

In [None]:
K_uu.expand(1, -1, -1).shape

In [None]:
K_block.permute()

In [None]:
train_pred_dist.covariance_matrix - train_pred_dist.covariance_matrix.T

In [None]:
(test_pred_dist.covariance_matrix - test_pred_dist.covariance_matrix.T)

In [None]:
print(train_pred_dist._interleaved)
print(test_pred_dist._interleaved)

In [None]:
import matplotlib.pyplot as plt
# plt.imshow(train_pred_dist.covariance_matrix.detach().cpu().numpy()[0:20, 0:20], cmap='viridis')
plt.imshow(train_pred_dist.covariance_matrix.detach().cpu().numpy(), cmap='viridis')

In [None]:
# plt.imshow(test_pred_dist.covariance_matrix.detach().cpu().numpy()[0:20, 0:20], cmap='viridis')
plt.imshow(test_pred_dist.covariance_matrix.detach().cpu().numpy(), cmap='viridis')

In [None]:
torch.set_printoptions(precision = 7, sci_mode=False)
train_pred_dist.covariance_matrix

In [None]:
test_pred_dist.covariance_matrix - test_pred_dist.covariance_matrix.T

In [None]:
test_pred_dist.covariance_matrix - test_pred_dist.covariance_matrix .T

In [None]:
torch.diag(test_pred_dist_eval.covariance_matrix).min()
torch.diag(train_pred_dist_eval.covariance_matrix).min()

In [None]:
model.base_kernel.lengthscale

In [None]:
torch.diag(dist_test.covariance_matrix).min()

In [None]:
torch.diag(dist_test.covariance_matrix).min()

In [None]:
torch.diag(pred_dist_test.covariance_matrix).min()

In [None]:
# Extract the diagonal of the predictive test covariance (the variances)
diagonal_variances = torch.diag(pred_dist_test.covariance_matrix)

# Get the index of the minimum variance
min_index = torch.argmin(diagonal_variances)

# NOTE(review): min_index is a position on the covariance diagonal of the TEST
# predictions. Below it is used to index x_test and also y_train; confirm that
# the diagonal has one entry per row of x_test and that indexing the training
# targets with a test index is intended.
x_test[min_index]
y_train[min_index]

y_train[min_index -1]
x_test[min_index ]

In [None]:
torch.diag(test_pred_dist.covariance_matrix)

In [None]:
train_dist_early = train_pred_dist
test_dist_early = test_pred_dist

In [None]:
row_tensor = x_train
column_tensor = x_train

l1, l2 = model.base_kernel.lengthscale[0].to(device), model.base_kernel.lengthscale[1].to(device)

# STEP 1: Pairwise differences of shape [N, M, 2]
# Expand row_tensor [N, 2] -> [N, 1, 2] and column_tensor [M, 2] -> [1, M, 2]
diff = (row_tensor[:, None, :] - column_tensor[None, :, :]).to(device)
# diffs are negative too

# Extract the relative components (columns of diff) for convenience, matching paper notation
r1 = diff[:, :, 0]
r2 = diff[:, :, 1]
# diagonal of r1 and r2 are 0

In [None]:
K_uu # diagonal is 1/l2**2
K_vv # diagonal is 1/l1**2

In [None]:
l1, l2 = model.base_kernel.lengthscale[0].to(device), model.base_kernel.lengthscale[1].to(device)

# STEP 1: Pairwise differences of shape [N, M, 2]
# Expand row_tensor [N, 2] -> [N, 1, 2] and column_tensor [M, 2] -> [1, M, 2]
diff = (row_tensor[:, None, :] - column_tensor[None, :, :]).to(device)

# Extract the relative components (columns of diff) for convenience, matching paper notation
r1 = diff[:, :, 0]
r2 = diff[:, :, 1]
        
# STEP 2: Block matrix

# Block components (shape N × M each)
K_uu = (1 - (r2**2 / l2**2)) / l2**2
K_uv = (r1 * r2) / (l1**2 * l2**2)
K_vu = K_uv  
K_vv = (1 - (r1**2 / l1**2)) / l1**2

In [None]:
train_dist_early.covariance_matrix

In [None]:
# Smallest diagonal entry of the saved train / test covariance matrices
torch.diag(train_dist_early.covariance_matrix).min()
torch.diag(test_dist_early.covariance_matrix).min()

# NOTE(review): exp_term is computed here but not referenced again in this
# cell, so the blocks stacked below are used as they currently stand; confirm
# whether they should be multiplied by exp_term first.
exp_term = torch.exp(-0.5 * ((r1 / l1) ** 2 + (r2 / l2) ** 2))

# Now interleave rows and columns
# Stack into shape (N, M, 2, 2)
K_blocks = torch.stack([
            torch.stack([K_uu, K_uv], dim = -1),
            torch.stack([K_vu, K_vv], dim = -1)
        ], dim = -2)  # shape (N, M, 2, 2)

# HACK: GPytorch needs the interleaved matrix for the Multitask distribution
# Reshape into (2N, 2M) interleaved matrix: permute to (N, 2, M, 2), then flatten
K_interleaved = K_blocks.permute(0, 2, 1, 3).reshape(2 * row_tensor.shape[0], 2 * column_tensor.shape[0])

In [None]:
 # Now interleave rows and columns
# Stack into shape (N, M, 2, 2)
# Per-pair 2 x 2 blocks, shape (N, M, 2, 2): the last two axes index
# [[uu, uv], [vu, vv]] for every (row point, column point) pair.
upper = torch.stack((K_uu, K_uv), dim = -1)
lower = torch.stack((K_vu, K_vv), dim = -1)
K_blocks = torch.stack((upper, lower), dim = -2)

# HACK: GPytorch needs the interleaved matrix for the Multitask distribution
# Reorder the axes to (N, 2, M, 2) and flatten to the (2N, 2M) interleaved layout
n_rows, n_cols = row_tensor.shape[0], column_tensor.shape[0]
K_interleaved = K_blocks.permute(0, 2, 1, 3).reshape(2 * n_rows, 2 * n_cols)

In [None]:
K_uu

In [None]:
K_interleaved[0:6, 0:6]

In [None]:
model.covar_module.outputscale.item()

In [None]:
K_interleaved

In [None]:
# Look at the covar
torch.diag(train_dist_early.covariance_matrix).min()

# GP

In [None]:
# REAL DATA EXPERIMENTS
# RUN WITH python run_real_experiments_GP.py
#               _                 _   _      
#              | |               | | (_)     
#    __ _ _ __ | |_ __ _ _ __ ___| |_ _  ___ 
#   / _` | '_ \| __/ _` | '__/ __| __| |/ __|
#  | (_| | | | | || (_| | | | (__| |_| | (__ 
#   \__,_|_| |_|\__\__,_|_|  \___|\__|_|\___|
# 
model_name = "GP"
from gpytorch_models import GP

# import configs so we can access the hypers with getattr
import configs
from configs import PATIENCE, MAX_NUM_EPOCHS, NUM_RUNS, WEIGHT_DECAY
from configs import TRACK_EMISSIONS_BOOL

# Reiterating import for visibility
MAX_NUM_EPOCHS = MAX_NUM_EPOCHS
MAX_NUM_EPOCHS = 100
NUM_RUNS = NUM_RUNS
NUM_RUNS = 1
WEIGHT_DECAY = WEIGHT_DECAY
PATIENCE = PATIENCE

# assign model-specific variable
MODEL_LEARNING_RATE = getattr(configs, f"{model_name}_REAL_LEARNING_RATE")
MODEL_REAL_RESULTS_DIR = getattr(configs, f"{model_name}_REAL_RESULTS_DIR")
import os
os.makedirs(MODEL_REAL_RESULTS_DIR, exist_ok = True)

# basics
import pandas as pd
import torch
import gpytorch

# universals 
from metrics import compute_divergence_field, quantile_coverage_error_2d
from utils import set_seed, make_grid
import gc
import warnings
set_seed(42)

# setting device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# overwrite if needed: # device = 'cpu'
print('Using device:', device)
print()

### START TIMING ###
import time
start_time = time.time()  # Start timing after imports

### START TRACKING EXPERIMENT EMISSIONS ###
if TRACK_EMISSIONS_BOOL:
    from codecarbon import EmissionsTracker
    tracker = EmissionsTracker(project_name = "GP_real_experiments", output_dir = MODEL_REAL_RESULTS_DIR)
    tracker.start()

#############################
### LOOP 1 - over REGIONS ###
#############################

for region_name in ["region_lower_byrd"]:

    print(f"\nTraining for {region_name.upper()}...")

    # Store metrics for the current region (used for *metrics_summary* report and *metrics_per_run*)
    region_results = []

    ##########################################
    ### x_train & y_train, x_test & y_test ###
    ##########################################

    # define paths based on region_name
    path_to_training_tensor = "data/real_data/" + region_name + "_train_tensor.pt"
    path_to_test_tensor = "data/real_data/" + region_name + "_test_tensor.pt"

    # load and transpose to have rows as points
    train = torch.load(path_to_training_tensor, weights_only = False).T 
    test = torch.load(path_to_test_tensor, weights_only = False).T

    # The train and test tensors have the following columns:
    # [:, 0] = x
    # [:, 1] = y
    # [:, 2] = surface elevation (s)
    # [:, 3] = ice flux in x direction (u)
    # [:, 4] = ice flux in y direction (v)
    # [:, 5] = ice flux error in x direction (u_err)
    # [:, 6] = ice flux error in y direction (v_err)
    # [:, 7] = source age

    # train
    x_train = train[:, [0, 1]].to(device)
    y_train = train[:, [3, 4]].to(device)

    # test
    x_test = test[:, [0, 1]].to(device)
    y_test = test[:, [3, 4]].to(device)

    # NOTE: Here we estimate the noise variance 
    """
    ### NOISE MODEL ###
    # TRAIN
    # noise variance (h * sigma_u)^2 and (h * sigma_v)^2 (tensor contains [h sig_u, h sig_v] stds)
    noise_var_h_times_uv_train = torch.concat((train[:, 5], train[:, 6]), dim = 0)**2
    # assume age dependent noise sigma_h on ice thickness measurements: ~10 - 20 m std (1000 scaling)
    sigma_h = 0.01 * torch.log(train[:, 7] + 3)
    # calculate noise variance (u * sigma_h)^2 and (v * sigma_h)^2
    noise_var_uv_times_h_train = (torch.concat((train[:, 3], train[:, 4]), dim = 0) * torch.cat([sigma_h, sigma_h]))**2
    # combine both noise variances into the std for each dimension
    train_noise_diag = torch.sqrt(noise_var_h_times_uv_train + noise_var_uv_times_h_train).to(device)

    # Compute midpoint
    midpoint = train_noise_diag.shape[0] // 2

    # Print noise levels for train, formatted to 4 decimal places
    print(f"Mean noise std per x dimension: {train_noise_diag[:midpoint].mean(dim = 0).item():.4f}")
    print(f"Mean noise std per y dimension: {train_noise_diag[midpoint:].mean(dim = 0).item():.4f}")

    # TEST
    # noise variance (h * sigma_u)^2 and (h * sigma_v)^2 (tensor contains [h sig_u, h sig_v] stds)
    noise_var_h_times_uv_test = torch.concat((test[:, 5], test[:, 6]), dim = 0)**2
    # assume age dependent noise sigma_h on ice thickness measurements: ~10 - 20 m std (1000 scaling)
    sigma_h = 0.01 * torch.log(test[:, 7] + 3)
    # calculate noise variance (u * sigma_h)^2 and (v * sigma_h)^2
    noise_var_uv_times_h_test = (torch.concat((test[:, 3], test[:, 4]), dim = 0) * torch.cat([sigma_h, sigma_h]))**2
    # combine both noise variances into the std for each dimension
    test_noise_diag = torch.sqrt(noise_var_h_times_uv_test + noise_var_uv_times_h_test).to(device)
    """

    # Print train details
    print(f"=== {region_name.upper()} ===")
    print(f"Training inputs shape: {x_train.shape}")
    print(f"Training observations shape: {y_train.shape}")
    print(f"Training inputs dtype: {x_train.dtype}")
    print()

    # Print test details
    print(f"=== {region_name.upper()} ===")
    print(f"Test inputs shape: {x_test.shape}")
    print(f"Test observations shape: {y_test.shape}")
    print(f"Test inputs dtype: {x_test.dtype}")
    print()

    ##################################
    ### LOOP 2 - over training run ###
    ##################################

    # NOTE: GPs don't train on batches, use full data

    for run in range(NUM_RUNS):

        print(f"\n--- Training Run {run + 1}/{NUM_RUNS} ---")

        # Initialise the likelihood for the GP model (estimates noise)
        # NOTE: we use a multitask likelihood for the GP model but with a global noise term
        likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
            num_tasks = 2,
            has_global_noise = True, 
            has_task_noise = False, # HACK: This still needs to be manually turned off
            ).to(device)

        model = GP(
            x_train,
            y_train, 
            likelihood
            ).to(device)
        
        optimizer = torch.optim.AdamW(model.parameters(), lr = MODEL_LEARNING_RATE, weight_decay = WEIGHT_DECAY)
        
        # Use ExactMarginalLogLikelihood
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        model.train()
        likelihood.train()
        # _________________
        # BEFORE EPOCH LOOP
        
        # Export the convergence just for first run only
        if run == 0:
            # initialise tensors to store losses over epochs (for convergence plot)
            train_losses_NLML_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # objective
            train_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # by-product
            # monitor performance transfer to test (only RMSE easy to calc without covar)
            test_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

            # NOTE: Here, we estimate the noise
            l1_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            l2_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            Buu_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            Buv_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            Bvu_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            Bvv_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            noise_var_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

        # Early stopping variables
        best_loss = float('inf')
        # counter starts at 0
        epochs_no_improve = 0

        ############################
        ### LOOP 3 - over EPOCHS ###
        ############################
        print("\nStart Training")

        for epoch in range(MAX_NUM_EPOCHS):

            # Set to train
            model.train()
            likelihood.train()

            # Do a step
            optimizer.zero_grad()
            # model outputs a multivariate normal distribution
            train_pred_dist = model(x_train.to(device))
            # Train on the observed targets
            # NOTE: We only have observational y_train i.e. noisy data
            loss = - mll(train_pred_dist, y_train.to(device))  # negative marginal log likelihood
            loss.backward()
            optimizer.step()

            # For Run 1 we save a bunch of metrics and update, while for the rest we only update
            if run == 0:

                model.eval()
                likelihood.eval()

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", gpytorch.utils.warnings.GPInputWarning)
                    train_pred_dist_eval = model(x_train.to(device))
                test_pred_dist = model(x_test.to(device))

                # Compute RMSE for training and test predictions against the observed (noisy) targets
                train_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(train_pred_dist_eval, y_train.to(device)).mean())
                test_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(test_pred_dist, y_test.to(device)).mean())

                # Save losses for convergence plot
                train_losses_NLML_over_epochs[epoch] = loss.item()
                train_losses_RMSE_over_epochs[epoch] = train_RMSE.item()
                test_losses_RMSE_over_epochs[epoch] = test_RMSE.item()

                # Save evolution of hypers for convergence plot
                # NOTE: This is different to dfGPs
                l1_over_epochs[epoch] = model.covar_module.data_covar_module.lengthscale[0, 0].item()
                l2_over_epochs[epoch] = model.covar_module.data_covar_module.lengthscale[0, 1].item()

                # Reconstruct B first via FF.T + D where F is the covar_factor and D is the diagonal matrix of task variances var
                B = model.covar_module.task_covar_module.covar_factor @ model.covar_module.task_covar_module.covar_factor.T + torch.diag(model.covar_module.task_covar_module.var)
                # Extract items
                Buu_over_epochs[epoch] = B[0, 0].item()
                Buv_over_epochs[epoch] = B[0, 1].item()
                Bvu_over_epochs[epoch] = B[1, 0].item()
                Bvv_over_epochs[epoch] = B[1, 1].item()

                noise_var_over_epochs[epoch] = model.likelihood.noise.item()

                # Print a bit more information for the first run
                if epoch % 20 == 0:
                    print(f"{region_name} {model_name} Run {run + 1}/{NUM_RUNS}, Epoch {epoch + 1}/{MAX_NUM_EPOCHS}, Training Loss (NLML): {loss:.4f}, Training RMSE: {train_RMSE:.4f}")

                # delete after printing and saving
                # NOTE: keep loss for early stopping check
                # del train_pred_dist, test_pred_dist, train_RMSE, test_RMSE
                
                # Free up memory every 20 epochs
                # NOTE: gc.collect() returns the number of unreachable objects it found.
                # Chained with `and`, the cache was only emptied when that number was
                # non-zero, so the two calls are made independently.
                if epoch % 20 == 0:
                    gc.collect()
                    torch.cuda.empty_cache()
            
            # For all runs after the first we run a minimal version using only lml_train
            else:

                if epoch % 20 == 0:
                    # After run 1 we only print lml, nothing else
                    print(f"{region_name} {model_name} Run {run + 1}/{NUM_RUNS}, Epoch {epoch + 1}/{MAX_NUM_EPOCHS}, Training Loss (NLML): {loss:.4f}")
                
            # EVERY EPOCH: Early stopping check
            # Track the best loss as a plain float so that best_loss does not keep
            # the loss tensor of an earlier epoch alive.
            current_loss = loss.item()
            if current_loss < best_loss:
                best_loss = current_loss
                # reset counter if loss improves
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= PATIENCE:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                # exit epoch loop
                break

        ##############################
        ### END LOOP 3 over EPOCHS ###
        ##############################

        # for every run...
        #######################################################
        ### EVALUATE after all training for RUN is finished ###
        #######################################################

        model.eval()
        likelihood.eval()

        # Need gradients for autograd divergence: We clone and detach
        x_test_grad = x_test.to(device).clone().requires_grad_(True)
        x_train_grad = x_train.to(device).clone().requires_grad_(True)

        # Underlying (latent) distribution and predictive distribution
        dist_test = model(x_test_grad)
        pred_dist_test = likelihood(dist_test)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", gpytorch.utils.warnings.GPInputWarning)
            dist_train = model(x_train_grad)
            pred_dist_train = likelihood(dist_train)
        
        # Compute divergence field (from latent distribution)
        test_div_field = compute_divergence_field(dist_test.mean, x_test_grad)
        train_div_field = compute_divergence_field(dist_train.mean, x_train_grad)

        # Only save mean_pred, covar_pred and divergence fields for the first run
        if run == 0:

            # (1) Save predictions from first run so we can visualise them later
            torch.save(pred_dist_test.mean, f"{MODEL_REAL_RESULTS_DIR}/{region_name}_{model_name}_test_mean_predictions.pt")
            torch.save(pred_dist_test.covariance_matrix, f"{MODEL_REAL_RESULTS_DIR}/{region_name}_{model_name}_test_covar_predictions.pt")

            # (2) Save divergence field
            torch.save(test_div_field, f"{MODEL_REAL_RESULTS_DIR}/{region_name}_{model_name}_test_prediction_divergence_field.pt")

            # (3) Since all epoch training is finished, we can save the losses over epochs
            df_losses = pd.DataFrame({
                'Epoch': list(range(train_losses_NLML_over_epochs.shape[0])), # pythonic indexing
                'Train NLML': train_losses_NLML_over_epochs.tolist(),
                'Train RMSE': train_losses_RMSE_over_epochs.tolist(),
                'Test RMSE': test_losses_RMSE_over_epochs.tolist(),
                # hyperparameters
                'l1': l1_over_epochs.tolist(),
                'l2': l2_over_epochs.tolist(),
                'Buu': Buu_over_epochs.tolist(),
                'Buv': Buv_over_epochs.tolist(),
                'Bvu': Bvu_over_epochs.tolist(),
                'Bvv': Bvv_over_epochs.tolist(),
                'noise_var': noise_var_over_epochs.tolist(),
                })
            
            df_losses.to_csv(f"{MODEL_REAL_RESULTS_DIR}/{region_name}_{model_name}_losses_over_epochs.csv", index = False, float_format = "%.5f") # reduce to 5 decimals for readability

        # Compute TRAIN metrics (convert tensors to float) for every run's tuned model
        # NOTE: gpytorch outputs metrics per task
        train_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(
            pred_dist_train, y_train.to(device)).mean()).item()
        train_MAE = gpytorch.metrics.mean_absolute_error(
            pred_dist_train, y_train.to(device)).mean().item()
        train_NLL = gpytorch.metrics.negative_log_predictive_density(
            pred_dist_train, y_train.to(device)).item()
        train_QCE = quantile_coverage_error_2d(
            pred_dist_train, y_train.to(device), quantile = 95.0).item()
        ## NOTE: It is important to use the absolute value of the divergence field, since both positive and negative deviations are violations and shouldn't cancel each other out 
        train_MAD = train_div_field.abs().mean().item()

        # Compute TEST metrics (convert tensors to float) for every run's tuned model
        test_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(
            pred_dist_test, y_test.to(device)).mean()).item()
        test_MAE = gpytorch.metrics.mean_absolute_error(
            pred_dist_test, y_test.to(device)).mean().item()
        test_NLL = gpytorch.metrics.negative_log_predictive_density(
            pred_dist_test, y_test.to(device)).item()
        test_QCE = quantile_coverage_error_2d(
            pred_dist_test, y_test.to(device), quantile = 95.0).item()
        ## NOTE: It is important to use the absolute value of the divergence field, since both positive and negative deviations are violations and shouldn't cancel each other out 
        test_MAD = test_div_field.abs().mean().item()

In [None]:
torch.set_printoptions(precision = 7, sci_mode=False)
(pred_dist_test.covariance_matrix - pred_dist_test.covariance_matrix.T)

In [None]:
print(train_pred_dist._interleaved)
print(train_pred_dist_eval._interleaved)

import matplotlib.pyplot as plt
# plt.imshow(train_pred_dist.covariance_matrix.detach().cpu().numpy()[0:20, 0:20], cmap = 'viridis')
plt.imshow(train_pred_dist_eval.covariance_matrix.detach().cpu().numpy()[0:20, 0:20], cmap='viridis')

In [None]:
# Interleave

In [None]:
import torch
import matplotlib.pyplot as plt
import gpytorch

# Toy example: 8 points with two tasks (u, v), used to check the block layout
# of a multitask covariance matrix.
mean_u = torch.linspace(0, 7, 8).unsqueeze(-1)
mean_v = torch.linspace(10, 17, 8).unsqueeze(-1)

# Shape (8, 2): column 0 holds u, column 1 holds v
mean_2d = torch.cat((mean_u, mean_v), dim = 1)

# Squared pairwise differences per column, shape (8, 8, 2)
diff = (mean_2d[:, None, :] - mean_2d[None, :, :]).square()
r1 = torch.exp(- diff[:, :, 0])
r2 = torch.exp(- diff[:, :, 1])

# Four (8, 8) blocks with different scales so they are easy to tell apart in a plot
K_uu = 0.5 * r1
K_vv = 0.8 * r2
K_uv = 0.1 * r1 * r2
K_vu = K_uv

# Assemble [[K_uu, K_uv], [K_vu, K_vv]]: blocks of one block-row sit side by side
# (dim = 1) and the two block-rows are stacked on top of each other (dim = 0).
# The dims were previously swapped, which produced [[K_uu, K_vu], [K_uv, K_vv]];
# that only matched because K_vu and K_uv are the same tensor here.
K_upper = torch.cat((K_uu, K_uv), dim = 1)
K_lower = torch.cat((K_vu, K_vv), dim = 1)
K_block = torch.cat((K_upper, K_lower), dim = 0)

plt.imshow(K_block, cmap = 'viridis')

dist_2d = gpytorch.distributions.MultitaskMultivariateNormal(
    mean = mean_2d,
    covariance_matrix = K_block,
    interleaved = False
)

print(dist_2d._interleaved)
print(dist_2d.mean)

plt.imshow(dist_2d.covariance_matrix.detach().cpu().numpy(), cmap = 'viridis')

mean_mvn = mean_2d.transpose(-1, -2).reshape(*mean_2d.shape[:-2], -1)
mean_mvn

print(dist_2d.mean.shape) 
print(dist_2d._output_shape)