# Start with dfGP for real data

In [1]:
# REAL DATA EXPERIMENTS
# RUN WITH python run_real_experiments_dfGP.py
#               _                 _   _      
#              | |               | | (_)     
#    __ _ _ __ | |_ __ _ _ __ ___| |_ _  ___ 
#   / _` | '_ \| __/ _` | '__/ __| __| |/ __|
#  | (_| | | | | || (_| | | | (__| |_| | (__ 
#   \__,_|_| |_|\__\__,_|_|  \___|\__|_|\___|
# 
model_name = "dfGP"
from gpytorch_models import dfGP

# import configs to we can access the hypers with getattr
import configs
from configs import PATIENCE, MAX_NUM_EPOCHS, NUM_RUNS, WEIGHT_DECAY
from configs import TRACK_EMISSIONS_BOOL
from configs import REAL_L_RANGE, REAL_OUTPUTSCALE_VAR_RANGE, REAL_NOISE_VAR_RANGE
from configs import SCALE_INPUT_region_lower_byrd, SCALE_INPUT_region_mid_byrd, SCALE_INPUT_region_upper_byrd
from configs import REAL_L_RANGE, REAL_NOISE_VAR_RANGE, REAL_OUTPUTSCALE_VAR_RANGE

SCALE_INPUT = {
    "region_lower_byrd": SCALE_INPUT_region_lower_byrd,
    "region_mid_byrd": SCALE_INPUT_region_mid_byrd,
    "region_upper_byrd": SCALE_INPUT_region_upper_byrd,
}

# Reiterating import for visibility
MAX_NUM_EPOCHS = 1
NUM_RUNS = 1
WEIGHT_DECAY = WEIGHT_DECAY
PATIENCE = PATIENCE

# assign model-specific variable
MODEL_LEARNING_RATE = getattr(configs, f"{model_name}_REAL_LEARNING_RATE")
MODEL_REAL_RESULTS_DIR = getattr(configs, f"{model_name}_REAL_RESULTS_DIR")
import os
os.makedirs(MODEL_REAL_RESULTS_DIR, exist_ok = True)

# basics
import pandas as pd
import torch
import gpytorch

# universals 
from metrics import compute_divergence_field, quantile_coverage_error_2d
from utils import set_seed, make_grid
import gc
import warnings
set_seed(42)

# setting device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# overwrite if needed: # device = 'cpu'
print('Using device:', device)
print()

### START TIMING ###
import time
start_time = time.time()  # Start timing after imports

### START TRACKING EXPERIMENT EMISSIONS ###
if TRACK_EMISSIONS_BOOL:
    from codecarbon import EmissionsTracker
    tracker = EmissionsTracker(project_name = "dfGP_real_experiments", output_dir = MODEL_REAL_RESULTS_DIR)
    tracker.start()

#############################
### LOOP 1 - over REGIONS ###
#############################

for region_name in ["region_lower_byrd", "region_mid_byrd", "region_upper_byrd"]:

    SCALE_DOMAIN = SCALE_INPUT[region_name]

    print(f"\nTraining for {region_name.upper()}...")

    # Store metrics for the current region (used for *metrics_summary* report and *metrics_per_run*)
    region_results = []

    ##########################################
    ### x_train & y_train, x_test & x_test ###
    ##########################################

    # define paths based on region_name
    path_to_training_tensor = "data/real_data/" + region_name + "_train_tensor.pt"
    path_to_test_tensor = "data/real_data/" + region_name + "_test_tensor.pt"

    # load and tranpose to have rows as points
    train = torch.load(path_to_training_tensor, weights_only = False).T 
    test = torch.load(path_to_test_tensor, weights_only = False).T

    # The train and test tensors have the following columns:
    # [:, 0] = x
    # [:, 1] = y
    # [:, 2] = surface elevation (s)
    # [:, 3] = ice flux in x direction (u)
    # [:, 4] = ice flux in y direction (v)
    # [:, 5] = ice velocity error in x direction (u_err)
    # [:, 6] = ice velocity error in y direction (v_err)
    # [:, 7] = ice velocity in x direction (u)
    # [:, 8] = ice velocity in y direction (v)
    # [:, 9] = thickness
    # [:, 10] = source age
    # [:, 11] = sqrt flux scale (used for scaling the fluxes)

    # train
    x_train = train[:, [0, 1]].to(device)
    y_train = train[:, [3, 4]].to(device)

    # test
    x_test = test[:, [0, 1]].to(device)
    y_test = test[:, [3, 4]].to(device)

    # HACK: Coordinate scaling helps with numerical stability
    # Units are now in km 
    x_test = x_test * SCALE_DOMAIN
    x_train = x_train * SCALE_DOMAIN

    ### NOISE MODEL ###
    # TRAIN
    # noise variance (h * sigma_u)^2 and (h * sigma_v)^2 (tensor contains [h sig_u, h sig_v] stds)
    noise_var_h_times_uv_train = torch.concat((train[:, 5], train[:, 6]), dim = 0)**2
    # assume age dependent noise sigma_h on ice thickness measurements: ~10 - 20 m std (1000 scaling)
    sigma_h = 0.01 * torch.log(train[:, 7] + 3)
    # calculate noise variance (u * sigma_h)^2 and (v * sigma_h)^2
    noise_var_uv_times_h_train = (torch.concat((train[:, 3], train[:, 4]), dim = 0) * torch.cat([sigma_h, sigma_h]))**2
    # combine both noise variances into the std for each dimension
    train_noise_diag = torch.sqrt(noise_var_h_times_uv_train + noise_var_uv_times_h_train).to(device)

    # Compute midpoint
    midpoint = train_noise_diag.shape[0] // 2

    # Print noise levels for train, formatted to 4 decimal places
    print(f"Mean noise std per x dimension: {train_noise_diag[:midpoint].mean(dim = 0).item():.4f}")
    print(f"Mean noise std per y dimension: {train_noise_diag[midpoint:].mean(dim = 0).item():.4f}")

    ##################################
    ### LOOP 2 - over training run ###
    ##################################

    # NOTE: GPs don't train on batches, use full data

    for run in range(NUM_RUNS):

        print(f"\n--- Training Run {run + 1}/{NUM_RUNS} ---")

        # Initialise the likelihood for the GP model (estimates noise)
        # NOTE: we use a multitask likelihood for the dfGP model but with a global noise term
        likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
            num_tasks = 2,
            has_global_noise = True, 
            has_task_noise = False, # HACK: This still needs to be manually turned off
            ).to(device)

        model = dfGP(
            x_train,
            y_train, 
            likelihood
            ).to(device)
        
        model.base_kernel.lengthscale = torch.tensor([[5.0, 8.0]]).to(device)
        
        optimizer = torch.optim.AdamW(model.parameters(), lr = MODEL_LEARNING_RATE, weight_decay = WEIGHT_DECAY)
        
        # Use ExactMarginalLogLikelihood
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        model.train()
        likelihood.train()
        # _________________
        # BEFORE EPOCH LOOP
        
        # Export the convergence just for first run only
        if run == 0:
            # initialise tensors to store losses over epochs (for convergence plot)
            train_losses_NLML_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # objective
            train_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS) # by-product
            # monitor performance transfer to test (only RMSE easy to calc without covar)
            test_losses_RMSE_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

            # NOTE: Here, we estimate the noise
            l1_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            l2_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            outputscale_var_over_epochs = torch.zeros(MAX_NUM_EPOCHS)
            noise_var_over_epochs = torch.zeros(MAX_NUM_EPOCHS)

        # Early stopping variables
        best_loss = float('inf')
        # counter starts at 0
        epochs_no_improve = 0

        ############################
        ### LOOP 3 - over EPOCHS ###
        ############################
        print("\nStart Training")

        for epoch in range(MAX_NUM_EPOCHS):

            # Set to train
            model.train()
            likelihood.train()

            # Do a step
            optimizer.zero_grad()
            # model outputs a multivariate normal distribution
            train_pred_dist = model(x_train.to(device))
            # Train on noisy or targets
            # NOTE: We only have observational y_train i.e. noisy data
            loss = - mll(train_pred_dist, y_train.to(device))  # negative marginal log likelihood
            loss.backward()
            optimizer.step()

            # For Run 1 we save a bunch of metrics and update, while for the rest we only update
            if run == 0:

                model.eval()
                likelihood.eval()

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", gpytorch.utils.warnings.GPInputWarning)
                    train_pred_dist = model(x_train.to(device))
                test_pred_dist = model(x_test.to(device))

                # Compute RMSE for training and test predictions (given true data, not noisy)
                train_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(train_pred_dist, y_train.to(device)).mean())
                test_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(test_pred_dist, y_test.to(device)).mean())

                # Save losses for convergence plot
                train_losses_NLML_over_epochs[epoch] = loss.item()
                train_losses_RMSE_over_epochs[epoch] = train_RMSE.item()
                test_losses_RMSE_over_epochs[epoch] = test_RMSE.item()

                # Save evolution of hypers for convergence plot
                # NOTE: lengthscale is [1, 2] in shape
                l1_over_epochs[epoch] = model.base_kernel.lengthscale[:, 0].item()
                l2_over_epochs[epoch] = model.base_kernel.lengthscale[:, 1].item()
                outputscale_var_over_epochs[epoch] = model.covar_module.outputscale.item()
                noise_var_over_epochs[epoch] = model.likelihood.noise.item()

                # Print a bit more information for the first run
                if epoch % 20 == 0:
                    print(f"{region_name} {model_name} Run {run + 1}/{NUM_RUNS}, Epoch {epoch + 1}/{MAX_NUM_EPOCHS}, Training Loss (NLML): {loss:.4f}, Training RMSE: {train_RMSE:.4f}")

                # delete after printing and saving
                # NOTE: keep loss for early stopping check
                del train_pred_dist, test_pred_dist, train_RMSE, test_RMSE
                
                # Free up memory every 20 epochs
                if epoch % 20 == 0:
                    gc.collect() and torch.cuda.empty_cache()
            
            # For all runs after the first we run a minimal version using only lml_train
            else:

                if epoch % 20 == 0:
                    # After run 1 we only print lml, nothing else
                    print(f"{region_name} {model_name} Run {run + 1}/{NUM_RUNS}, Epoch {epoch + 1}/{MAX_NUM_EPOCHS}, Training Loss (NLML): {loss:.4f}")
                
            # EVERY EPOCH: Early stopping check
            if loss < best_loss:
                best_loss = loss
                # reset counter if loss improves
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= PATIENCE:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                # exit epoch loop
                break

        ##############################
        ### END LOOP 3 over EPOCHS ###
        ##############################

        # for every run...
        #######################################################
        ### EVALUATE after all training for RUN is finished ###
        #######################################################

        model.eval()
        likelihood.eval()

        # Need gradients for autograd divergence: We clone and detach
        x_test_grad = x_test.to(device).clone().requires_grad_(True)
        x_train_grad = x_train.to(device).clone().requires_grad_(True)

        # Underlying (latent) distribution and predictive distribution
        dist_test = model(x_test_grad)
        pred_dist_test = likelihood(dist_test)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", gpytorch.utils.warnings.GPInputWarning)
            dist_train = model(x_train_grad)
            pred_dist_train = likelihood(dist_train)
        
        # Compute divergence field (from latent distribution)
        test_div_field = compute_divergence_field(dist_test.mean, x_test_grad)
        train_div_field = compute_divergence_field(dist_train.mean, x_train_grad)

        # Only save mean_pred, covar_pred and divergence fields for the first run
        if run == 0:

            # (3) Since all epoch training is finished, we can save the losses over epochs
            df_losses = pd.DataFrame({
                'Epoch': list(range(train_losses_NLML_over_epochs.shape[0])), # pythonic indexing
                'Train NLML': train_losses_NLML_over_epochs.tolist(),
                'Train RMSE': train_losses_RMSE_over_epochs.tolist(),
                'Test RMSE': test_losses_RMSE_over_epochs.tolist(),
                # hyperparameters
                'l1': l1_over_epochs.tolist(),
                'l2': l2_over_epochs.tolist(),
                'outputscale_var': outputscale_var_over_epochs.tolist(),
                'noise_var': noise_var_over_epochs.tolist(),
                })
    

        # Compute TRAIN metrics (convert tensors to float) for every run's tuned model
        # NOTE: gpytorch outputs metrics per task
        train_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(
            pred_dist_train, y_train.to(device)).mean()).item()
        train_MAE = gpytorch.metrics.mean_absolute_error(
            pred_dist_train, y_train.to(device)).mean().item()
        train_NLL = gpytorch.metrics.negative_log_predictive_density(
            pred_dist_train, y_train.to(device)).item()
        train_QCE = quantile_coverage_error_2d(
            pred_dist_train, y_train.to(device), quantile = 95.0).item()
        ## NOTE: It is important to use the absolute value of the divergence field, since both positive and negative deviations are violations and shouldn't cancel each other out 
        train_MAD = train_div_field.abs().mean().item()

        # Compute TEST metrics (convert tensors to float) for every run's tuned model
        test_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(
            pred_dist_test, y_test.to(device)).mean()).item()
        test_MAE = gpytorch.metrics.mean_absolute_error(
            pred_dist_test, y_test.to(device)).mean().item()
        test_NLL = gpytorch.metrics.negative_log_predictive_density(
            pred_dist_test, y_test.to(device)).item()
        test_QCE = quantile_coverage_error_2d(
            pred_dist_test, y_test.to(device), quantile = 95.0).item()
        ## NOTE: It is important to use the absolute value of the divergence field, since both positive and negative deviations are violations and shouldn't cancel each other out 
        test_MAD = test_div_field.abs().mean().item()

        region_results.append([
            run + 1,
            train_RMSE, train_MAE, train_NLL, train_QCE, train_MAD,
            test_RMSE, test_MAE, test_NLL, test_QCE, test_MAD
        ])

        # clean up
        del dist_train, dist_test, pred_dist_train, pred_dist_test, test_div_field, train_div_field
        gc.collect()
        torch.cuda.empty_cache()

    ############################
    ### END LOOP 2 over RUNS ###
    ############################

    print(f"Test RMSE: {test_RMSE:.4f}, Test NLL: {test_NLL:.4f}")

Using device: cuda


Training for REGION_LOWER_BYRD...
Mean noise std per x dimension: 0.0132
Mean noise std per y dimension: 0.0169

--- Training Run 1/1 ---

Start Training
region_lower_byrd dfGP Run 1/1, Epoch 1/1, Training Loss (NLML): 9.3248, Training RMSE: 0.2575




NotPSDError: Matrix not positive definite after repeatedly adding jitter up to 1.0e-04.

In [None]:
    # The train and test tensors have the following columns:
    # [:, 0] = x
    # [:, 1] = y
    # [:, 2] = surface elevation (s)
    # [:, 3] = ice flux in x direction (u)
    # [:, 4] = ice flux in y direction (v)
    # [:, 5] = ice velocity error in x direction (u_err)
    # [:, 6] = ice velocity error in y direction (v_err)
    # [:, 7] = ice velocity in x direction (u)
    # [:, 8] = ice velocity in y direction (v)
    # [:, 9] = thickness
    # [:, 10] = source age
    # [:, 11] = sqrt flux scale (used for scaling the fluxes)

In [4]:
factor = 7.5 / train[:, 11]
# factor = 7.5 / train[:, 11]
sigma_t = torch.cat([
    factor * torch.log(train[:, 10] + 3),
    factor * torch.log(train[:, 10] + 3)
], dim = 0)

sigma_t * train[:, 11]

RuntimeError: The size of tensor a (1072) must match the size of tensor b (536) at non-singleton dimension 0

In [9]:
factor * torch.log(train[:, 10] + 3)

tensor([0.0606, 0.0606, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430,
        0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430,
        0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430,
        0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430,
        0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430,
        0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0430, 0.0368, 0.0368,
        0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368,
        0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368,
        0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368,
        0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368,
        0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368,
        0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368, 0.0368,
        0.0368, 0.0368, 0.0368, 0.0368, 

In [21]:
### NOISE MODEL ###
# Thickness^2 * error_uv^2 (t^2 * sigma_u^2, t^2 * sigma_v^2)
noise_var_t_sq_times_uv_var = torch.cat([
    (train[:, 9]**2 * train[:, 5]**2),
    (train[:, 9]**2 * train[:, 6]**2),
], dim = 0)

# UV^2 * error_thickness^2 (u^2 * sigma_t^2, v^2 * sigma_t^2)
# Calculate the factor for sigma_t (indpendent of scaling)
factor = 7.5 / train[:, 11]
# noise std level: only dependent on age, not depth, abou 15 m std
sigma_t = torch.cat([
    factor * torch.log(train[:, 10] + 3),
    factor * torch.log(train[:, 10] + 3)
], dim = 0)
noise_var_uv_sq_times_t_var = (torch.concat((train[:, 7]**2, train[:, 8]**2), dim = 0) * sigma_t**2)

# Combine via independed error propagation
noise_var = noise_var_t_sq_times_uv_var + noise_var_uv_sq_times_t_var

# Get quantiles for prior
lower_noise_var = torch.quantile(noise_var, 0.05, dim = 0).item()
upper_noise_var = torch.quantile(noise_var, 0.95, dim = 0).item()

print(f"Lower noise var: {lower_noise_var:.4f}")
print(f"Upper noise var: {upper_noise_var:.4f}")

Lower noise var: 0.0000
Upper noise var: 0.0008


In [17]:
(factor * torch.log(train[:, 10] + 3) * train[:, 11]).mean()

tensor(15.7113)

In [None]:
torch.concat((train[:, 7]**2, train[:, 8]**2)

In [None]:
noise_var_uv_sq_times_t_var

In [None]:
### NOISE MODEL ###
# (t^2 * sigma_u^2, t^2 * sigma_v^2)
noise_var_t_sq_times_uv_var = torch.cat((train[:, 9]**2 * train[:, 5])**2, (train[:, 9]**2 * train[:, 6])**2, dim = 0)
# (u^2 * sigma_t^2, v^2 * sigma_t^2)
train[:, 11]
# Calculate the factor for sigma_t
factor = 7.5 / train[:, 11]
sigma_t = torch.cat((train[:, 9] * factor * torch.log(train[:, 7] + 3), train[:, 10] * factor * torch.log(train[:, 7] + 3)), dim = 0)
noise_var_uv_sq_times_t_var = (torch.concat((train[:, 7]**2, train[:, 8]**2), dim = 0) * sigma_t**2)
noise_var = noise_var_t_sq_times_uv_var + noise_var_uv_sq_times_t_var

# noise variance (h * sigma_u)^2 and (h * sigma_v)^2 (tensor contains [h sig_u, h sig_v] stds)
noise_var_h_times_uv_train = torch.concat((train[:, 5], train[:, 6]), dim = 0)**2
# assume age dependent noise sigma_h on ice thickness measurements: with 0.1 ~10 - 20 m std (x1000 scaling)
sigma_h = 0.01 * torch.log(train[:, 7] + 3)

# calculate noise variance (u * sigma_h)^2 and (v * sigma_h)^2
noise_var_uv_times_h_train = (torch.concat((train[:, 3], train[:, 4]), dim = 0) * torch.cat([sigma_h, sigma_h]))**2
# combine noise variances for both dimensions
noise_var = noise_var_h_times_uv_train + noise_var_uv_times_h_train

In [None]:
(train_noise_diag[:midpoint] + train_noise_diag[midpoint:])/2  # This is just to show the concatenation
# torch.mean(train_noise_diag[:midpoint]), torch.mean(train_noise_diag[midpoint:])

In [None]:
### NOISE MODEL ###
# TRAIN
# noise variance (h * sigma_u)^2 and (h * sigma_v)^2 (tensor contains [h sig_u, h sig_v] stds)
noise_var_h_times_uv_train = torch.concat((train[:, 5], train[:, 6]), dim = 0)**2
# assume age dependent noise sigma_h on ice thickness measurements: with 0.1 ~10 - 20 m std (x1000 scaling)
sigma_h = 0.01 * torch.log(train[:, 7] + 3)

# calculate noise variance (u * sigma_h)^2 and (v * sigma_h)^2
noise_var_uv_times_h_train = (torch.concat((train[:, 3], train[:, 4]), dim = 0) * torch.cat([sigma_h, sigma_h]))**2
# combine noise variances for both dimensions
noise_var = noise_var_h_times_uv_train + noise_var_uv_times_h_train

# Print noise levels for train, formatted to 4 decimal places
print(f"Mean noise var: {noise_var.mean(dim = 0).item():.4f}")
print(f"Std noise var: {noise_var.std(dim = 0).item():.4f}")
print(f"Mean noise std: {torch.sqrt(noise_var).mean(dim = 0).item():.4f}")
print(f"Std noise std: {torch.sqrt(noise_var).std(dim = 0).item():.4f}")
lower_noise_var = torch.quantile(noise_var, 0.10, dim = 0).item()
upper_Noise_var = torch.quantile(noise_var, 0.90, dim = 0).item()

print(f"Lower noise var: {lower_noise_var:.4f}")
print(f"Upper noise var: {upper_Noise_var:.4f}")

In [None]:
sigma_h

In [None]:
noise_var
import matplotlib.pyplot as plt
plt.hist(torch.sqrt(noise_var).cpu().numpy(), bins = 100)

In [None]:
noise_var_h_times_uv_train

In [None]:
0.01 * torch.log(train[:, 7] + 3)