In [6]:
# REAL DATA EXPERIMENTS
# RUN WITH python run_real_experiments_dfNGP.py
#               _                 _   _      
#              | |               | | (_)     
#    __ _ _ __ | |_ __ _ _ __ ___| |_ _  ___ 
#   / _` | '_ \| __/ _` | '__/ __| __| |/ __|
#  | (_| | | | | || (_| | | | (__| |_| | (__ 
#   \__,_|_| |_|\__\__,_|_|  \___|\__|_|\___|
# 
model_name = "dfNGP"
from gpytorch_models import dfNGP

# Import the configs module itself so that model-specific hyperparameters can be looked up with getattr
import configs
from configs import PATIENCE, MAX_NUM_EPOCHS, NUM_RUNS, WEIGHT_DECAY
from configs import TRACK_EMISSIONS_BOOL
from configs import SCALE_INPUT_region_lower_byrd, SCALE_INPUT_region_mid_byrd, SCALE_INPUT_region_upper_byrd
from configs import REAL_L_RANGE, REAL_NOISE_VAR_RANGE, REAL_OUTPUTSCALE_VAR_RANGE

# Per-region factor that the input coordinates are multiplied with, keyed by region name
SCALE_INPUT = dict(
    region_lower_byrd = SCALE_INPUT_region_lower_byrd,
    region_mid_byrd = SCALE_INPUT_region_mid_byrd,
    region_upper_byrd = SCALE_INPUT_region_upper_byrd,
)

# MAX_NUM_EPOCHS, WEIGHT_DECAY and PATIENCE are used exactly as imported from configs.
# Optional manual override: # MAX_NUM_EPOCHS = 4200
# NUM_RUNS from configs is overridden here: a single training run per region
NUM_RUNS = 1

# Model-specific settings live in configs under names prefixed with the model name
# NOTE(review): MODEL_LEARNING_RATE is looked up here, but the optimizer in the training
# loop of this script uses fixed learning rates instead - confirm this is intended
MODEL_LEARNING_RATE = getattr(configs, model_name + "_REAL_LEARNING_RATE")
MODEL_REAL_RESULTS_DIR = getattr(configs, model_name + "_REAL_RESULTS_DIR")
import os
# Make sure the results directory exists before anything is saved into it
os.makedirs(MODEL_REAL_RESULTS_DIR, exist_ok = True)

# basics
import pandas as pd
import torch
import gpytorch

# universals 
from metrics import compute_divergence_field, quantile_coverage_error_2d
from utils import set_seed, make_grid
import gc
import warnings
# Seed once, before any data is loaded or any model is built
set_seed(42)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# overwrite if needed: # device = 'cpu'
print('Using device:', device)
print()

#############################
### LOOP 1 - over REGIONS ###
#############################

# Train and evaluate one dfNGP model per region and per run:
#   1. load the region's train/test tensors and scale the input coordinates,
#   2. fit the GP by minimising the negative marginal log likelihood (with early stopping),
#   3. for the first run only, save predictions on a regular grid,
#   4. compute and print train/test metrics.
for region_name in ["region_lower_byrd", "region_mid_byrd", "region_upper_byrd"]:
    SCALE_DOMAIN = SCALE_INPUT[region_name]

    print(f"\nTraining for {region_name.upper()}...")

    # Store metrics for the current region (one row is appended per run)
    # NOTE(review): the rows are collected but not written to disk anywhere in this loop
    region_results = []

    ##########################################
    ### x_train & y_train, x_test & y_test ###
    ##########################################

    # define paths based on region_name
    path_to_training_tensor = "data/real_data/" + region_name + "_train_tensor.pt"
    path_to_test_tensor = "data/real_data/" + region_name + "_test_tensor.pt"

    # load and transpose to have rows as points
    # NOTE: weights_only = False unpickles arbitrary objects, so only load trusted files here
    train = torch.load(path_to_training_tensor, weights_only = False).T 
    test = torch.load(path_to_test_tensor, weights_only = False).T

    # The train and test tensors have the following columns:
    # [:, 0] = x
    # [:, 1] = y
    # [:, 2] = surface elevation (s)
    # [:, 3] = ice flux in x direction (u)
    # [:, 4] = ice flux in y direction (v)
    # [:, 5] = ice velocity error in x direction (u_err)
    # [:, 6] = ice velocity error in y direction (v_err)
    # [:, 7] = ice velocity in x direction (u)
    # [:, 8] = ice velocity in y direction (v)
    # [:, 9] = thickness
    # [:, 10] = source age
    # [:, 11] = sqrt flux scale (used for scaling the fluxes)

    # train: inputs are the (x, y) coordinates, targets are the two flux components
    x_train = train[:, [0, 1]].to(device)
    y_train = train[:, [3, 4]].to(device)

    # test
    x_test = test[:, [0, 1]].to(device)
    y_test = test[:, [3, 4]].to(device)

    # HACK: Scaling helps with numerical stability
    # Units are not in km 
    x_test = x_test * SCALE_DOMAIN
    x_train = x_train * SCALE_DOMAIN

    # NOTE: Here we estimate the noise variance 

    # Print train details
    print(f"=== {region_name.upper()} ===")
    print(f"Training inputs shape: {x_train.shape}")
    print(f"Training observations shape: {y_train.shape}")
    print(f"Training inputs dtype: {x_train.dtype}")
    print()

    # Print test details
    print(f"=== {region_name.upper()} ===")
    print(f"Test inputs shape: {x_test.shape}")
    print(f"Test observations shape: {y_test.shape}")
    print(f"Test inputs dtype: {x_test.dtype}")
    print()

    ##################################
    ### LOOP 2 - over training run ###
    ##################################

    # NOTE: GPs don't train on batches, use full data

    for run in range(NUM_RUNS):

        print(f"\n--- Training Run {run + 1}/{NUM_RUNS} ---")

        # Initialise the likelihood for the GP model (estimates noise)
        # NOTE: we use a multitask likelihood for the dfNGP model but with a global noise term
        likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
            num_tasks = 2,
            has_global_noise = True, 
            has_task_noise = False, # HACK: This still needs to be manually turned off
            ).to(device)

        # NOTE: This was needed (the training inputs must be a leaf tensor that tracks gradients)
        x_train = x_train.clone().detach().requires_grad_(True)

        model = dfNGP(
            x_train,
            y_train, 
            likelihood
            ).to(device)
        
        ### REGISTER PRIORS & CONSTRAINTS ###
        # PRIOR: outputscale variance
        outputscale_prior = gpytorch.priors.SmoothedBoxPrior(
            REAL_OUTPUTSCALE_VAR_RANGE[0], REAL_OUTPUTSCALE_VAR_RANGE[1]).to(device)
        
        # NOTE(review): passing the parameter name "raw_outputscale" makes gpytorch evaluate
        # the prior on the raw (unconstrained) parameter, not on the transformed outputscale.
        # Confirm this is intended; left unchanged so that training behaviour stays the same.
        model.covar_module.register_prior(
            "outputscale_prior",
            outputscale_prior,
            "raw_outputscale"
        )

        # CONSTRAINT: Domain-informed noise variance constraint
        model.likelihood.register_constraint(
            "raw_noise", gpytorch.constraints.Interval(REAL_NOISE_VAR_RANGE[0], REAL_NOISE_VAR_RANGE[1])
        )
        
        ### INITIALISE HYPERPARAMETERS ###
        # Overwrite default lengthscale hyperparameter initialisation with REAL data lengthscale range init
        model.base_kernel.lengthscale = torch.empty([1, 2], device = device).uniform_( * REAL_L_RANGE)

        # Overwrite default outputscale variance initialisation with sample from prior
        outputscale_sample = outputscale_prior.sample().to(device)
        model.covar_module.outputscale = outputscale_sample
        
        # Overwrite default noise variance initialisation with REAL data noise range init
        model.likelihood.noise = torch.empty(1, device = device).uniform_( * REAL_NOISE_VAR_RANGE)
        
        # NOTE: This part is different from dfGP
        # Two parameter groups: the mean module and the kernel + likelihood hyperparameters
        # NOTE(review): learning rates are hard-coded here; MODEL_LEARNING_RATE is not used
        optimizer = torch.optim.AdamW([
            {"params": model.mean_module.parameters(), 
             "weight_decay": WEIGHT_DECAY, "lr": 0.001},
            {"params": list(model.covar_module.parameters()) + list(model.likelihood.parameters()), 
             "weight_decay": WEIGHT_DECAY, "lr": 0.005},
            ])
        
        # Use ExactMarginalLogLikelihood
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

        model.train()
        likelihood.train()
        # _________________
        # BEFORE EPOCH LOOP
        
        # Early stopping variables
        best_loss = float('inf')
        # counter starts at 0
        epochs_no_improve = 0

        ############################
        ### LOOP 3 - over EPOCHS ###
        ############################
        print("\nStart Training")

        for epoch in range(MAX_NUM_EPOCHS):

            # Set to train
            model.train()
            likelihood.train()

            # Do a step
            optimizer.zero_grad()

            # Fresh leaf tensor every epoch so no input gradients carry over between steps
            x_train = x_train.clone().detach().requires_grad_(True)
            # model outputs a multivariate normal distribution
            train_pred_dist = model(x_train)
            # Train on noisy targets
            # NOTE: We only have observational y_train i.e. noisy data
            loss = - mll(train_pred_dist, y_train)  # negative marginal log likelihood
            loss.backward()
            optimizer.step()

            # Convert to a plain float once: logging and early stopping then do not keep
            # the autograd graph of the best epoch alive
            loss_value = loss.item()
            
            # For all runs after the first we run a minimal version using only lml_train

            if epoch % 20 == 0:
                # After run 1 we only print lml, nothing else
                print(f"{region_name} {model_name} Run {run + 1}/{NUM_RUNS}, Epoch {epoch + 1}/{MAX_NUM_EPOCHS}, Training Loss (NLML): {loss_value:.4f}")
                
            # EVERY EPOCH: Early stopping check
            if loss_value < best_loss:
                best_loss = loss_value
                # reset counter if loss improves
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= PATIENCE:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                # exit epoch loop
                break

        ##############################
        ### END LOOP 3 over EPOCHS ###
        ##############################

        # for every run...
        #######################################################
        ### EVALUATE after all training for RUN is finished ###
        #######################################################

        model.eval()
        likelihood.eval()

        ### --- dfNGP only: grid inference --- ###

        if run == 0:

            _, x_grid = make_grid(n_side = 30) 
            x_grid = x_grid * SCALE_DOMAIN # scale grid to match training data
            x_grid.requires_grad_(True) # need gradients for divergence field

            # Latent distribution on the grid and predictive distribution (with likelihood noise)
            dist_grid = model(x_grid.to(device))
            pred_dist_grid = likelihood(dist_grid)

            # Detach before saving so the files hold plain values without autograd history
            torch.save(pred_dist_grid.mean.detach(), f"{MODEL_REAL_RESULTS_DIR}/{region_name}_{model_name}_grid_mean_predictions.pt")
            torch.save(pred_dist_grid.covariance_matrix.detach(), f"{MODEL_REAL_RESULTS_DIR}/{region_name}_{model_name}_grid_covar_predictions.pt")
            torch.save(dist_grid.covariance_matrix.detach(), f"{MODEL_REAL_RESULTS_DIR}/{region_name}_{model_name}_grid_latent_covar_predictions.pt")



        ### ---------------------------------- ###

        # Need gradients for autograd divergence: We clone and detach so that both inputs
        # are fresh leaf tensors, independent of the graph built during training
        x_test_grad = x_test.clone().detach().requires_grad_(True)
        x_train_grad = x_train.clone().detach().requires_grad_(True)

        # Underlying (latent) distribution and predictive distribution
        dist_test = model(x_test_grad)
        pred_dist_test = likelihood(dist_test)

        # Evaluating on the training inputs in eval mode triggers a GPInputWarning; silence it
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", gpytorch.utils.warnings.GPInputWarning)
            dist_train = model(x_train_grad)
            pred_dist_train = likelihood(dist_train)
        
        # Compute divergence field (from latent distribution)
        test_div_field = compute_divergence_field(dist_test.mean, x_test_grad)
        train_div_field = compute_divergence_field(dist_train.mean, x_train_grad)

        # Only save mean_pred, covar_pred and divergence fields for the first run

        # Compute TRAIN metrics (convert tensors to float) for every run's tuned model
        # NOTE: gpytorch outputs metrics per task
        train_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(
            pred_dist_train, y_train).mean()).item()
        train_MAE = gpytorch.metrics.mean_absolute_error(
            pred_dist_train, y_train).mean().item()
        train_NLL = gpytorch.metrics.negative_log_predictive_density(
            pred_dist_train, y_train).item()
        train_QCE = quantile_coverage_error_2d(
            pred_dist_train, y_train, quantile = 95.0).item()
        ## NOTE: It is important to use the absolute value of the divergence field, since both positive and negative deviations are violations and shouldn't cancel each other out 
        train_MAD = train_div_field.abs().mean().item()

        # Compute TEST metrics (convert tensors to float) for every run's tuned model
        test_RMSE = torch.sqrt(gpytorch.metrics.mean_squared_error(
            pred_dist_test, y_test).mean()).item()
        test_MAE = gpytorch.metrics.mean_absolute_error(
            pred_dist_test, y_test).mean().item()
        test_NLL = gpytorch.metrics.negative_log_predictive_density(
            pred_dist_test, y_test).item()
        test_QCE = quantile_coverage_error_2d(
            pred_dist_test, y_test, quantile = 95.0).item()
        ## NOTE: It is important to use the absolute value of the divergence field, since both positive and negative deviations are violations and shouldn't cancel each other out 
        test_MAD = test_div_field.abs().mean().item()

        region_results.append([
            run + 1,
            train_RMSE, train_MAE, train_NLL, train_QCE, train_MAD,
            test_RMSE, test_MAE, test_NLL, test_QCE, test_MAD
        ])

        print(f"\n{region_name.upper()} {model_name} Run {run + 1}/{NUM_RUNS} Metrics:")
        print(f"Train RMSE: {train_RMSE:.5f}, Train MAE: {train_MAE:.5f}, Train NLL: {train_NLL:.5f}, Train QCE: {train_QCE:.5f}, Train MAD: {train_MAD:.5f}")
        print(f"Test RMSE: {test_RMSE:.5f}, Test MAE: {test_MAE:.5f}, Test NLL: {test_NLL:.5f}, Test QCE: {test_QCE:.5f}, Test MAD: {test_MAD:.5f}")

###############################
### END LOOP 1 over REGIONS ###
###############################

Using device: cuda


Training for REGION_LOWER_BYRD...
=== REGION_LOWER_BYRD ===
Training inputs shape: torch.Size([487, 2])
Training observations shape: torch.Size([487, 2])
Training inputs dtype: torch.float32

=== REGION_LOWER_BYRD ===
Test inputs shape: torch.Size([330, 2])
Test observations shape: torch.Size([330, 2])
Test inputs dtype: torch.float32


--- Training Run 1/1 ---

Start Training
region_lower_byrd dfNGP Run 1/1, Epoch 1/2000, Training Loss (NLML): 4.9580
region_lower_byrd dfNGP Run 1/1, Epoch 21/2000, Training Loss (NLML): 4.1327
region_lower_byrd dfNGP Run 1/1, Epoch 41/2000, Training Loss (NLML): 3.3656
region_lower_byrd dfNGP Run 1/1, Epoch 61/2000, Training Loss (NLML): 2.6027
region_lower_byrd dfNGP Run 1/1, Epoch 81/2000, Training Loss (NLML): 1.9926
region_lower_byrd dfNGP Run 1/1, Epoch 101/2000, Training Loss (NLML): 1.6149
region_lower_byrd dfNGP Run 1/1, Epoch 121/2000, Training Loss (NLML): 1.3365
region_lower_byrd dfNGP Run 1/1, Epoch 141/2000, Training L