# Important Note
RAM and VRAM measurements are dependent on the computer state, and should only be interpreted relative to each other. In order to obtain RAM and VRAM measurements, perform the following steps:

1 - Restart the Kernel

2 - Run the "Loading Required Packages and Helper Functions" cell

3 - Run the "Loading Data" cell

4 - Run ONLY ONE iteration of the desired method, and read the RAM and VRAM usage reports printed by the cell

# Loading Required Packages and Helper Functions
If you would like to use Cuda, set gpu = True. Otherwise set gpu = False. 

Step 1: Run the following cell to import the required packages and helper functions. Set the number of replicates desired.

Step 2: Load the Data

Step 3: Execute the cells under the method you wish to replicate.

# Step 1

In [1]:
import time
import torch
import gpytorch
import pynvml
import psutil
import statistics
from tqdm import trange, tqdm

import torch
import pynvml
import psutil

def log_memory():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)

    max_allocated = torch.cuda.max_memory_allocated() / 1024**2  # MB
    max_reserved = torch.cuda.max_memory_reserved() / 1024**2    # MB
    gpu_used = meminfo.used / 1024**2                            # MB
    sys_used = psutil.virtual_memory().used / 1024**3            # GB
    print(f"[PyTorch] Max Allocated: {max_allocated:.2f} MB | Max Reserved: {max_reserved:.2f} MB")
    print(f"[GPU VRAM] Used (nvidia-smi): {gpu_used:.2f} MB | [System RAM]: {sys_used:.2f} GB")

    return max_allocated, max_reserved, gpu_used, sys_used


In [2]:
gpu = True
n_replicates = 10

import math
import torch
import gpytorch
import time
from matplotlib import pyplot as plt
import gc
import statistics
import numpy as np
import tqdm
import psutil
from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy


%matplotlib inline
%load_ext autoreload
%autoreload 2

def get_mem():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss/(1024**2)

max_vram = 0
def vram_usage():
    global max_vram
    max_vram = max(max_vram, torch.cuda.memory_allocated())

from torch.utils.data import TensorDataset, DataLoader
import urllib.request
import os
import pandas as pd
from scipy.io import loadmat
from math import floor
from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.mlls import DeepApproximateMLL
from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel, InducingPointKernel
from gpytorch.distributions import MultivariateNormal
from torch.utils.data import TensorDataset, DataLoader

max_vram = 0
max_ram = 0

def vram_usage():
    global max_vram
    max_vram = max(max_vram, torch.cuda.memory_allocated())


%matplotlib inline
from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy

# Loading Data
Step 2: Load the data (note: must run the DataGenerator.Rmd file first)

In [5]:
import os
import numpy as np
import pandas as pd
import torch

gpu = torch.cuda.is_available()
smoke_test = ('CI' in os.environ)

coords_df = pd.read_csv('Data/coordinates.csv')
all_x = torch.tensor(coords_df.values, dtype=torch.float32)

expr_df = pd.read_csv('Data/Mbp.csv')
all_y = torch.tensor(expr_df.iloc[:, 0].values, dtype=torch.float32)


all_x = all_x.contiguous()
all_y = all_y.contiguous()


print("all_x shape:", all_x.shape)
print("all_y shape:", all_y.shape)



all_x shape: torch.Size([393542, 2])
all_y shape: torch.Size([393542])


In [6]:
import torch
import numpy as np

def splitter(x_cpu, y_cpu, n_train=80000, n_test=20000, random_state=42, move_to_gpu=True):
    assert x_cpu.shape[0] == y_cpu.shape[0], "Mismatch in number of samples"
    total_samples = x_cpu.shape[0]
    assert n_train + n_test <= total_samples, "Not enough samples to split"

    # Set random seed
    rng = np.random.default_rng(seed=random_state)

    # Randomly permute indices
    indices = rng.permutation(total_samples)

    # Select subsets
    train_idx = indices[:n_train]
    test_idx  = indices[n_train:n_train + n_test]

    train_x = x_cpu[train_idx].contiguous()
    train_y = y_cpu[train_idx].contiguous()
    test_x  = x_cpu[test_idx].contiguous()
    test_y  = y_cpu[test_idx].contiguous()

    if move_to_gpu and torch.cuda.is_available():
        train_x = train_x.cuda()
        train_y = train_y.cuda()
        test_x = test_x.cuda()
        test_y = test_y.cuda()

    return train_x, train_y, test_x, test_y



In [8]:
import pandas as pd
import torch
import numpy as np


coords_df = pd.read_csv('Data/coordinates.csv')
expr_df = pd.read_csv('Data/Mbp.csv')

all_x = torch.tensor(coords_df.values, dtype=torch.float32).contiguous()
all_y = torch.tensor(expr_df.iloc[:, 0].values, dtype=torch.float32).contiguous()


total_samples = all_x.shape[0]
assert total_samples >= 100_000, "Not enough samples for split"

rng = np.random.default_rng(seed=42)
indices = rng.permutation(total_samples)

train_idx = indices[:80_000]
test_idx  = indices[80_000:100_000]

train_x = all_x[train_idx]
train_y = all_y[train_idx]
test_x  = all_x[test_idx]
test_y  = all_y[test_idx]


if torch.cuda.is_available():
    train_x = train_x.cuda()
    train_y = train_y.cuda()
    test_x = test_x.cuda()
    test_y = test_y.cuda()


print("train_x shape:", train_x.shape)
print("train_y shape:", train_y.shape)
print("test_x shape:", test_x.shape)
print("test_y shape:", test_y.shape)


train_x shape: torch.Size([80000, 2])
train_y shape: torch.Size([80000])
test_x shape: torch.Size([20000, 2])
test_y shape: torch.Size([20000])


# Simulations
Step 3: Execute the simulations to be reproduced. If all simulations are run, there is a summarizer at the end. Otherwise, the relevant statistics are printed at the end of each method.

# Deep Kernel

In [8]:
import time
import os
import torch
import gpytorch
import pynvml
import psutil
import statistics
from tqdm import trange, tqdm

# Memory tracking function from the first prompt
def log_memory():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    
    print(f"[GPU] Used: {meminfo.used / 1024**2:.2f} MB")
    print(f"[PyTorch] Max Allocated: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB")
    print(f"[System RAM] Used: {psutil.virtual_memory().used / 1024**3:.2f} GB")

# Process memory tracker (RAM used by the model/training)
def get_process_ram_mb():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024**2  # in MB

# Hyperparameters
n_replicates = 10
training_iterations = 10
batch_size = 32#00
gpu = torch.cuda.is_available()

# Model definition
class LargeFeatureExtractor(torch.nn.Sequential):
    def __init__(self, input_dim):
        super().__init__()
        self.add_module('linear1', torch.nn.Linear(input_dim, 1000))
        self.add_module('relu1', torch.nn.ReLU())
        self.add_module('linear2', torch.nn.Linear(1000, 500))
        self.add_module('relu2', torch.nn.ReLU())
        self.add_module('linear3', torch.nn.Linear(500, 50))
        self.add_module('relu3', torch.nn.ReLU())
        self.add_module('linear4', torch.nn.Linear(50, 2))

class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, feature_extractor):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5)),
            num_dims=2, grid_size=100
        )
        self.feature_extractor = feature_extractor
        self.scale_to_bounds = gpytorch.utils.grid.ScaleToBounds(-1., 1.)

    def forward(self, x):
        x = self.feature_extractor(x)
        x = self.scale_to_bounds(x)
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )

# Benchmark loop
mse_list, time_list = [], []

for i in range(n_replicates):
    print(f"\n=== Replicate {i + 1}/{n_replicates} ===")
    
    
    
    # Sample new data
    train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000, random_state=i)

    # Record initial process RAM usage
    initial_ram = get_mem()
    
    # Model + likelihood reset
    feature_extractor = LargeFeatureExtractor(train_x.size(-1))
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood, feature_extractor)

    if gpu:
        model, likelihood = model.cuda(), likelihood.cuda()
        train_x, train_y, test_x, test_y = train_x.cuda(), train_y.cuda(), test_x.cuda(), test_y.cuda()

    model.train(); likelihood.train()

    optimizer = torch.optim.Adam([
        {'params': model.feature_extractor.parameters()},
        {'params': model.covar_module.parameters()},
        {'params': model.mean_module.parameters()},
        {'params': model.likelihood.parameters()},
    ], lr=0.5)

    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Train
    start = time.time()
    pbar = trange(training_iterations, desc=f"Training (Rep {i + 1})", leave=False)
    for j in pbar:
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()
        pbar.set_postfix(loss=loss.item())
    elapsed = time.time() - start

    final_ram = get_mem()

    # Print memory usage information
    log_memory()
    
    # Record final process RAM usage after training and evaluation
    
    print(f"[RAM Tracker] Model & Training RAM Usage: {final_ram:.2f} MB (Initial: {initial_ram:.2f} MB, Increase: {final_ram - initial_ram:.2f} MB)")

    # Evaluate
    model.eval(); likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        pred = likelihood(model(test_x)).mean.cpu()
    mse = torch.mean((pred - test_y.cpu())**2).item()

    # Record results
    mse_list.append(mse)
    time_list.append(elapsed)

    
    # GPU cleanup
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Summary
print(f"\nMSE: {round(statistics.mean(mse_list), 5)} ± {round(statistics.stdev(mse_list), 5)}")
print(f"Time: {round(statistics.mean(time_list), 2)}s ± {round(statistics.stdev(time_list), 2)}s")



=== Replicate 1/10 ===


  if nonzero_indices.storage():
  res = cls(index_tensor, value_tensor, interp_size)
  res = cls(index_tensor, value_tensor, interp_size)
                                                        

KeyboardInterrupt: 

In [None]:
print(f"\nMSE mean: {statistics.mean(mse_list)}")
print(f"MSE std: {statistics.stdev(mse_list)}")

print(f"Time mean: {statistics.mean(time_list)}s")
print(f"Time std: {statistics.stdev(time_list)}s")

mse_mean = statistics.mean(mse_list)
mse_std = statistics.stdev(mse_list)
time_mean = statistics.mean(time_list)
time_std = statistics.stdev(time_list)

print(f"{mse_mean:.5f}  & ({mse_std:.5f})  & {time_mean:.2f}  & ({time_std:.2f})")



# Sparse GP

In [None]:
from tqdm import trange, tqdm

# Hyperparameters
n_replicates = 10
training_iterations = 350
batch_size = 3200
gpu = torch.cuda.is_available()


class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = ConstantMean()
        self.base_covar_module = ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=2))
        self.covar_module = InducingPointKernel(self.base_covar_module, inducing_points=train_x[torch.randperm(train_x.shape[0])[:100]].clone(), likelihood=likelihood)
        #print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        #print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")
        return MultivariateNormal(mean_x, covar_x)
    

class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = ConstantMean()
        self.base_covar_module = ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=2))
        self.covar_module = InducingPointKernel(self.base_covar_module, inducing_points=train_x[::300].clone(), likelihood=likelihood)
        #print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        #print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")
        return MultivariateNormal(mean_x, covar_x)

# Benchmark loop
mse_list, time_list = [], []

for i in range(n_replicates):
    print(f"\n=== Replicate {i + 1}/{n_replicates} ===")

    # Sample new data
    train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000, random_state=i)


    initial_ram = get_mem()

    # Model + likelihood reset
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)

    if gpu:
        model, likelihood = model.cuda(), likelihood.cuda()
        train_x, train_y, test_x, test_y = train_x.cuda(), train_y.cuda(), test_x.cuda(), test_y.cuda()

    model.train(); likelihood.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Train
    start = time.time()
    pbar = trange(training_iterations, desc=f"Training (Rep {i + 1})", leave=False)
    for j in pbar:
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()
        pbar.set_postfix(loss=loss.item())
        #torch.cuda.empty_cache()
    elapsed = time.time() - start

    final_ram = get_mem()
    log_memory()
    print(f"[RAM Tracker] Model & Training RAM Usage: {final_ram:.2f} MB (Initial: {initial_ram:.2f} MB, Increase: {final_ram - initial_ram:.2f} MB)")

    # Evaluate
    model.eval(); likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        pred = likelihood(model(test_x)).mean.cpu()
    mse = torch.mean((pred - test_y.cpu())**2).item()

    # Record results
    mse_list.append(mse)
    time_list.append(elapsed)

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Summary
print(f"\nMSE: {round(statistics.mean(mse_list), 5)} ± {round(statistics.stdev(mse_list), 5)}")
print(f"Time: {round(statistics.mean(time_list), 2)}s ± {round(statistics.stdev(time_list), 2)}s")



=== Replicate 1/10 ===


                                                                             

KeyboardInterrupt: 

In [None]:
mse_mean = statistics.mean(mse_list)
mse_std = statistics.stdev(mse_list)
time_mean = statistics.mean(time_list)
time_std = statistics.stdev(time_list)

print(f"{mse_mean:.5f}  & ({mse_std:.5f})  & {time_mean:.2f}  & ({time_std:.2f})")

# LOVE

In [None]:
from tqdm import trange, tqdm

# Hyperparameters
n_replicates = 10
training_iterations = 10
batch_size = 3200
gpu = torch.cuda.is_available()


class LargeFeatureExtractor(torch.nn.Sequential):
    def __init__(self, input_dim):
        super(LargeFeatureExtractor, self).__init__()
        self.add_module('linear1', torch.nn.Linear(input_dim, 1000))
        self.add_module('relu1', torch.nn.ReLU())
        self.add_module('linear2', torch.nn.Linear(1000, 500))
        self.add_module('relu2', torch.nn.ReLU())
        self.add_module('linear3', torch.nn.Linear(500, 50))
        self.add_module('relu3', torch.nn.ReLU())
        self.add_module('linear4', torch.nn.Linear(50, 2))
        print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2), "MB")


class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)

        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.ScaleKernel(
                gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=2)
            ),
            grid_size=100, num_dims=2,
        )
        self.feature_extractor = LargeFeatureExtractor(input_dim=train_x.size(-1))

    def forward(self, x):
        projected_x = self.feature_extractor(x)
        projected_x = projected_x - projected_x.min(0)[0]
        projected_x = 2 * (projected_x / projected_x.max(0)[0]) - 1

        mean_x = self.mean_module(projected_x)
        covar_x = self.covar_module(projected_x)
        vram_usage()
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


mse_l_love = []
time_l_love = []

for i in range(n_replicates):
    print(f"\n=== Replicate {i + 1}/{n_replicates} ===")

    # Split data
    train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000, random_state=i)

    process = psutil.Process()
    ram_before = process.memory_info().rss / (1024 ** 2)
    mem_begin = get_mem()

    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)

    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()
        train_x, train_y = train_x.cuda(), train_y.cuda()
        test_x, test_y = test_x.cuda(), test_y.cuda()

    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Training loop
    def train():
        pbar = trange(training_iterations, desc=f"Training (Rep {i + 1})", leave=False)
        for _ in pbar:
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()
            vram_usage()
            pbar.set_postfix(loss=loss.item())

    start = time.time()
    train()
    uTime = time.time() - start

    log_memory()

    print("Time:", uTime)
    print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")

    # Evaluation
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))
        means = observed_pred.mean.cpu()
        mse = torch.mean((means - test_y.cpu()) ** 2).item()

    mse_l_love.append(mse)
    time_l_love.append(uTime)

    print("Test MSE:", mse)
    print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    ram_after = process.memory_info().rss / (1024 ** 2)
    print("RAM Delta (MB):", ram_after - ram_before)

# Final results summary
print(f"\nMSE: {round(statistics.mean(mse_l_love), 5)} ± {round(statistics.stdev(mse_l_love), 5)}")
print(f"Time: {round(statistics.mean(time_l_love), 2)}s ± {round(statistics.stdev(time_l_love), 2)}s")

# LaTeX-friendly output
print(f"{round(statistics.mean(mse_l_love), 5)}  & ({round(statistics.stdev(mse_l_love), 5)})  & "
      f"{round(statistics.mean(time_l_love), 2)}  & ({round(statistics.stdev(time_l_love), 2)})")


NameError: name 'torch' is not defined

# NGD

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import statistics

my_batch_size = 320
n_replicates = 10
gpu = torch.cuda.is_available()

class GPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(0))
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=False
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5))

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        vram_usage()
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


mse_l_ngd = []
time_l_ngd = []

for i in range(n_replicates):
    print(f"\n=== Replicate {i + 1}/{n_replicates} ===")

    # Data split
    train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000, random_state=i)

    # DataLoaders
    train_dataset = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_dataset, batch_size=my_batch_size, shuffle=True)

    test_dataset = TensorDataset(test_x, test_y)
    test_loader = DataLoader(test_dataset, batch_size=my_batch_size, shuffle=False)

    mem_begin = get_mem()

    inducing_points = train_x[::100]
    model = GPModel(inducing_points=inducing_points)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()

    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()
        train_x, train_y = train_x.cuda(), train_y.cuda()
        test_x, test_y = test_x.cuda(), test_y.cuda()

    variational_ngd_optimizer = gpytorch.optim.NGD(
        model.variational_parameters(), num_data=train_y.size(0), lr=0.01
    )

    hyperparameter_optimizer = torch.optim.Adam([
        {'params': model.hyperparameters()},
        {'params': likelihood.parameters()},
    ], lr=0.1)

    print("VRAM Usage:", torch.cuda.memory_allocated() / (1024 ** 2), "MB")

    model.train()
    likelihood.train()

    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

    num_epochs = 5
    epochs_iter = tqdm(range(num_epochs), desc="Epoch")

    start = time.time()

    for epoch in epochs_iter:
        minibatch_iter = tqdm(train_loader, desc="Minibatch", leave=False, position=0)
        for x_batch, y_batch in minibatch_iter:
            variational_ngd_optimizer.zero_grad()
            hyperparameter_optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            minibatch_iter.set_postfix(loss=loss.item())
            variational_ngd_optimizer.step()
            hyperparameter_optimizer.step()

    uTime = time.time() - start
    print("Time:", uTime)
    mem_diff = get_mem() - mem_begin
    print("Memory Usage:", mem_diff / (1024 ** 2), "MB")
    log_memory()

    # Evaluation
    model.eval()
    likelihood.eval()
    means = torch.tensor([0.])
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            preds = model(x_batch)
            means = torch.cat([means, preds.mean.cpu()])
    means = means[1:]
    mse = torch.mean((means - test_y.cpu()) ** 2).item()

    mse_l_ngd.append(mse)
    time_l_ngd.append(uTime)

    print("Test MSE:", mse)
    print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Summary
print(f"\nMSE: {round(statistics.mean(mse_l_ngd), 5)} ± {round(statistics.stdev(mse_l_ngd), 5)}")
print(f"Time: {round(statistics.mean(time_l_ngd), 2)}s ± {round(statistics.stdev(time_l_ngd), 2)}s")

# LaTeX-style output
print(f"{round(statistics.mean(mse_l_ngd), 5)}  & ({round(statistics.stdev(mse_l_ngd), 5)})  & "
      f"{round(statistics.mean(time_l_ngd), 2)}  & ({round(statistics.stdev(time_l_ngd), 2)})")


# SVGP_CI

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from tqdm import trange, tqdm
import statistics
import time

# Hyperparameters
my_batch_size = 3200
n_replicates = 10
num_epochs = 10
gpu = torch.cuda.is_available()

# Use every 1000-th point as inducing points (will be re-sampled from train_x each replicate)
# Note: 'splitter' should return train_x, train_y, test_x, test_y.
    
class GPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(0))
        variational_strategy = gpytorch.variational.CiqVariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super(GPModel, self).__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=2)
        )
        # Specific initialization for the 3droad dataset
        self.covar_module.base_kernel.initialize(lengthscale=0.01)
        print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2), "MB")

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        vram_usage()
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


mse_l_svgpci = []
time_l_svgpci = []

for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")
    
    # Split the data for this replicate
    train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000, random_state=rep)
    
    # Create DataLoaders
    train_dataset = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_dataset, batch_size=my_batch_size, shuffle=True)
    
    test_dataset = TensorDataset(test_x, test_y)
    test_loader = DataLoader(test_dataset, batch_size=my_batch_size, shuffle=False)
    
    mem_begin = get_mem()
    
    # Define inducing points from the current training set
    inducing_points = train_x[::1000]
    model = GPModel(inducing_points=inducing_points)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    
    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()
        train_x, train_y = train_x.cuda(), train_y.cuda()
        test_x, test_y = test_x.cuda(), test_y.cuda()
    
    # Setup optimizers: one for the variational parameters and one for hyperparameters.
    variational_ngd_optimizer = gpytorch.optim.NGD(
        model.variational_parameters(), num_data=train_y.size(0), lr=0.1
    )
    
    hyperparameter_optimizer = torch.optim.Adam([
        {'params': model.hyperparameters()},
        {'params': likelihood.parameters()},
    ], lr=0.002)
    
    model.train()
    likelihood.train()
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
    
    # Training loop over epochs with minibatch training
    start = time.time()
    epochs_iter = trange(num_epochs, desc="Epoch")
    for epoch in epochs_iter:
        minibatch_iter = tqdm(train_loader, desc="Minibatch", leave=False, position=0)
        for x_batch, y_batch in minibatch_iter:
            variational_ngd_optimizer.zero_grad()
            hyperparameter_optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            variational_ngd_optimizer.step()
            vram_usage()
            hyperparameter_optimizer.step()
            minibatch_iter.set_postfix(loss=loss.item())
    
    uTime = time.time() - start
    print("Time:", uTime)
    print("RAM usage:", (get_mem() - mem_begin), "MB")
    log_memory()

    # Evaluation
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        if gpu:
            observed_pred = likelihood(model(test_x.to('cuda')))
        else:
            observed_pred = likelihood(model(test_x))
    means = observed_pred.mean.cpu()
    mse = torch.mean((means - test_y.cpu())**2).item()
    
    mse_l_svgpci.append(mse)
    time_l_svgpci.append(uTime)
    
    print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
    print("Test MSE:", mse)

# Summary of results
print(f"\nMSE: {round(statistics.mean(mse_l_svgpci), 5)} ± {round(statistics.stdev(mse_l_svgpci), 5)}")
print(f"Time: {round(statistics.mean(time_l_svgpci), 2)}s ± {round(statistics.stdev(time_l_svgpci), 2)}s")

# LaTeX-friendly output (mean & standard deviation)
print(f"{round(statistics.mean(mse_l_svgpci),5)}  & ({round(statistics.stdev(mse_l_svgpci),5)})  & "
      f"{round(statistics.mean(time_l_svgpci),5)}  & ({round(statistics.stdev(time_l_svgpci),5)})")


# SVGP

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from tqdm import trange, tqdm
import statistics
import time

# Hyperparameters
my_batch_size = 3200
n_replicates = 10
num_epochs = 20
gpu = torch.cuda.is_available()

# Define the GPModel using ApproximateGP with a CholeskyVariationalDistribution
from gpytorch.models import ApproximateGP
from gpytorch.variational import CholeskyVariationalDistribution, VariationalStrategy

class GPModel(ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = CholeskyVariationalDistribution(inducing_points.size(0))
        variational_strategy = VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=False
        )
        super(GPModel, self).__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5))
        # Uncomment these prints if needed:
        # print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")
    
    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        # Uncomment if using VRAM tracking:
        # vram_usage()
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Containers to store the metrics for each replicate
mse_l_svgp = []
time_l_svgp = []

for rep in range(n_replicates):
    print(f"\n=== Replicate: {rep + 1}/{n_replicates} ===")
    
    # Split the data for the current replicate
    train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000, random_state=rep)
    
    # Create DataLoaders
    train_dataset = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_dataset, batch_size=my_batch_size, shuffle=True)
    
    test_dataset = TensorDataset(test_x, test_y)
    test_loader = DataLoader(test_dataset, batch_size=my_batch_size, shuffle=False)
    
    # Measure initial RAM
    mem_begin = get_mem()
    
    # Select inducing points from the current training set
    inducing_points = train_x[::100]
    model = GPModel(inducing_points=inducing_points)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    
    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()
    
    mem_diff = get_mem() - mem_begin
    print("RAM:", mem_diff / (1024 ** 2), "MB")
    
    # Set up the optimizer and loss (marginal log likelihood)
    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam([
        {'params': model.parameters()},
        {'params': likelihood.parameters()},
    ], lr=0.001)
    
    # Here we use the exact marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    
    mem_diff = get_mem() - mem_begin
    print("RAM:", mem_diff / (1024 ** 2), "MB")
    
    # Initialize variables to track maximum RAM and VRAM usage during training
    max_ram = 0
    max_vram = 0
    
    epochs_iter = trange(num_epochs, desc="Epoch")
    start = time.time()
    for epoch in epochs_iter:
        minibatch_iter = tqdm(train_loader, desc="Minibatch", leave=False, position=0)
        for x_batch, y_batch in minibatch_iter:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            # Update max RAM usage
            max_ram = max(max_ram, get_mem() - mem_begin)
            optimizer.step()
            if gpu:
                max_vram = max(max_vram, torch.cuda.memory_allocated())
            minibatch_iter.set_postfix(loss=loss.item())
        print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
            epoch + 1, num_epochs, loss.item(),
            model.covar_module.base_kernel.lengthscale.item(),
            likelihood.noise.item()
        ))
    uTime = time.time() - start
    print("Time:", uTime)
    print("RAM:", max_ram, "MB")
    log_memory()
    
    # Evaluation
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        if gpu:
            observed_pred = likelihood(model(test_x.to('cuda')))
        else:
            observed_pred = likelihood(model(test_x))
    means = observed_pred.mean.cpu()
    mse = torch.mean((means - test_y.cpu())**2).item()
    mse_l_svgp.append(mse)
    time_l_svgp.append(uTime)
    print("Test MSE:", mse)

# Summary
print(f"\nMSE: {round(statistics.mean(mse_l_svgp),5)} ± {round(statistics.stdev(mse_l_svgp),5)}")
print(f"Time: {round(statistics.mean(time_l_svgp),5)}s ± {round(statistics.stdev(time_l_svgp),5)}s")

# LaTeX-friendly output: mean & (std)
print(f"{round(statistics.mean(mse_l_svgp),5)}  & ({round(statistics.stdev(mse_l_svgp),5)})  & "
      f"{round(statistics.mean(time_l_svgp),5)}  & ({round(statistics.stdev(time_l_svgp),5)})")


In [None]:
# Summary
print(f"\nMSE: {round(statistics.mean(mse_l_svgp),5)} ± {round(statistics.stdev(mse_l_svgp),5)}")
print(f"Time: {round(statistics.mean(time_l_svgp),5)}s ± {round(statistics.stdev(time_l_svgp),5)}s")

# LaTeX-friendly output: mean & (std)
print(f"{round(statistics.mean(mse_l_svgp),5)}  & ({round(statistics.stdev(mse_l_svgp),5)})  & "
      f"{round(statistics.mean(time_l_svgp),5)}  & ({round(statistics.stdev(time_l_svgp),5)})")

# SKI - Can only handle up to 40,000 datapoints before running out of memory

In [None]:

model = None
likelihood = None

if gpu:
    gc.collect()


In [None]:
from tqdm import trange, tqdm
import statistics
import time

# Hyperparameters
n_replicates = 10
training_iterations = 30
gpu = torch.cuda.is_available()

# Define the Exact GP Regression Model using SKI
class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
        # Choose grid size for SKI
        grid_size = gpytorch.utils.grid.choose_grid_size(train_x, 2)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.GridInterpolationKernel(
                gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5)),
                grid_size=grid_size, num_dims=2
            )
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Containers for metrics
mse_l_ski = []
time_l_ski = []

for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")
    # Split the data for SKI (assumes splitter returns train_x_ski, train_y_ski, test_x, test_y)
    train_x_ski, train_y_ski, test_x, test_y = splitter(all_x, all_y, n_train=40000, n_test=20000, random_state=rep)
    
    # Track RAM usage (get_mem() is assumed to be defined elsewhere)
    mem_begin = get_mem()
    max_vram = 0

    # Initialize model and likelihood
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x_ski, train_y_ski, likelihood)
    
    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()
        train_x_ski, train_y_ski, test_x, test_y = train_x_ski.cuda(), train_y_ski.cuda(), test_x.cuda(), test_y.cuda()
    
    print("Initial RAM:", (get_mem() - mem_begin) / (1024 ** 2), "MB")
    
    # Set to training mode and initialize optimizer and loss
    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    if gpu:
        mll = mll.cuda()
    
    print("RAM after initialization:", (get_mem() - mem_begin) / (1024 ** 2), "MB")
    
    # Training loop with tqdm loss updates
    start = time.time()
    tbar = tqdm(range(training_iterations), desc="Train", leave=False, position=0)
    for _ in tbar:
        optimizer.zero_grad()
        if gpu:
            max_vram = max(max_vram, torch.cuda.memory_allocated())
        output = model(train_x_ski)
        loss = -mll(output, train_y_ski)
        loss.backward()
        if gpu:
            max_vram = max(max_vram, torch.cuda.memory_allocated())
        optimizer.step()
        tbar.set_postfix(loss=loss.item())
    uTime = time.time() - start
    print("Training Time:", uTime)
    print("RAM Usage:", (get_mem() - mem_begin), "MB")
    log_memory()
    
    # Evaluation in prior mode
    model.eval()
    with gpytorch.settings.prior_mode():
        output = model(test_x)
    means = output.mean.cpu()
    mse = torch.mean((means - test_y.cpu())**2).item()
    mse_l_ski.append(mse)
    time_l_ski.append(uTime)
    print("Test MSE:", mse)

# Summary of results
print(f"\nMSE: {round(statistics.mean(mse_l_ski),5)} ± {round(statistics.stdev(mse_l_ski),5)}")
print(f"Time: {round(statistics.mean(time_l_ski),5)}s ± {round(statistics.stdev(time_l_ski),5)}s")
print(f"{round(statistics.mean(mse_l_ski),5)}  & ({round(statistics.stdev(mse_l_ski),5)})  & {round(statistics.mean(time_l_ski),5)}  & ({round(statistics.stdev(time_l_ski),5)})")


# VNN

In [None]:
import gc
import time
import statistics
import torch
import gpytorch
import faiss  # if needed elsewhere
from torch.utils.data import TensorDataset, DataLoader
from tqdm import trange, tqdm

# Hyperparameters
my_batch_size = 32
n_replicates = 10
smoke_test = False
gpu = torch.cuda.is_available()

# Set training hyperparameters depending on smoke_test flag
if smoke_test:
    k = 32
    training_batch_size = 32
    num_epochs = 1
else:
    # You can adjust these values as needed.
    k = 160  # or 320 as required
    training_batch_size = 320 * 4
    num_epochs = 30  # or 30 as desired

# Define the GPModel using NNVariationalStrategy
from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy

class GPModel(ApproximateGP):
    def __init__(self, inducing_points, likelihood, k=256, training_batch_size=256):
        # Get shape of inducing points
        m, d = inducing_points.shape
        self.m = m
        self.k = k
        print("Step 1")
        variational_distribution = gpytorch.variational.MeanFieldVariationalDistribution(m)
        if gpu:
            inducing_points = inducing_points.cuda()
        print("Step 2")
        variational_strategy = NNVariationalStrategy(
            self, inducing_points, variational_distribution,
            k=k, training_batch_size=training_batch_size
        )
        print("Step 21")
        super(GPModel, self).__init__(variational_strategy)
        print("Step 22")
        self.mean_module = gpytorch.means.ZeroMean()
        print("Step 23")
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=d)
        )
        print("Step 3")
        self.likelihood = likelihood

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        vram_usage()  # assumed to be defined elsewhere
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

    def __call__(self, x, prior=False, **kwargs):
        if x is not None:
            if x.dim() == 1:
                x = x.unsqueeze(-1)
        return self.variational_strategy(x=x, prior=False, **kwargs)

# Containers for metrics
mse_l_vnn = []
time_l_vnn = []

for rep in range(n_replicates):
    print(f"\n=== Replicate: {rep} ===")
    # Re-split the data for each replicate
    # Assumes splitter(all_x, all_y, n_train, n_test, random_state) returns:
    # train_x, train_y, test_x, test_y
    train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000, random_state=rep)
    
    # If GPU is available, move the data to GPU
    if gpu:
        train_x = train_x.cuda()
        train_y = train_y.cuda()
        test_x = test_x.cuda()
        test_y = test_y.cuda()
    
    # Create DataLoader for evaluation (training uses variational mini-batching inside the model)
    train_dataset = TensorDataset(train_x, train_y)
    test_dataset = TensorDataset(test_x, test_y)
    test_loader = DataLoader(test_dataset, batch_size=my_batch_size, shuffle=False)
    
    mem_begin = get_mem()  # assumes get_mem() is defined elsewhere
    print("Memory before training:", (get_mem()-mem_begin)/(1024**2), "MB")
    
    # Initialize likelihood and model
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    # Note: for the inducing points, we use the full training set
    model = GPModel(inducing_points=train_x[::1].contiguous(), likelihood=likelihood, k=64, training_batch_size=training_batch_size)
    
    if gpu:
        likelihood = likelihood.cuda()
        model = model.cuda()
    
    print("Time since start:", time.time())
    
    # Set number of epochs (the number of outer iterations) based on your hyperparameters
    # num_batches is determined by the variational strategy internal variable
    num_batches = model.variational_strategy._total_training_batches

    model.train()
    likelihood.train()
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # Here we use the Exact Marginal Log Likelihood; adjust if needed.
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    
    begin_train = time.time()
    epochs_iter = trange(num_epochs, desc="Epoch", leave=True, position=0)
    for epoch in epochs_iter:
        minibatch_iter = tqdm(range(num_batches), leave=True, position=0)
        for batch_idx in minibatch_iter:
            optimizer.zero_grad()
            output = model(x=None)  # x is not used; model uses current_training_indices
            # Get current mini-batch indices from variational strategy
            current_training_indices = model.variational_strategy.current_training_indices
            # Retrieve the corresponding y_batch (ensure consistency between train_x and train_y)
            y_batch = train_y[..., current_training_indices]
            if gpu:
                y_batch = y_batch.cuda()
            loss = -mll(output, y_batch)
            minibatch_iter.set_postfix(loss=loss.item())
            loss.backward()
            vram_usage()  # Track VRAM usage (assumed defined)
            optimizer.step()
    uTime = time.time() - begin_train
    print("Training Time:", uTime)
    log_memory()
    print("RAM:", (get_mem()-mem_begin), "MB")
    
    # Evaluation on test set
    model.eval()
    likelihood.eval()
    means = torch.tensor([])  # Leave it on CPU

    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            if gpu:
                x_batch = x_batch.cuda()
                y_batch = y_batch.cuda()
            preds = model(x_batch)
            means = torch.cat([means, preds.mean.cpu()])
    mse = torch.mean((means - test_y.cpu())**2).item()
    mse_l_vnn.append(mse)
    time_l_vnn.append(uTime)
    print("Test MSE:", mse)
    
    # Clean up between replicates
    model = None
    likelihood = None
    mll = None
    optimizer = None
    epochs_iter = None
    gc.collect()
    print("Test MAE:", torch.mean((means - test_y.cpu())**2).item())

# Summary of results
print("\nSummary:")
print("Mean MSE:", statistics.mean(mse_l_vnn))
print("Std MSE:", statistics.stdev(mse_l_vnn))
print("Mean Time:", statistics.mean(time_l_vnn))
print("Std Time:", statistics.stdev(time_l_vnn))
print(f"{round(statistics.mean(mse_l_vnn),5)}  & ({round(statistics.stdev(mse_l_vnn),5)})  & {round(statistics.mean(time_l_vnn),5)}  & ({round(statistics.stdev(time_l_vnn),5)})")



=== Replicate: 0 ===
Memory before training: 0.0 MB
Step 1
Step 2


  x.storage().data_ptr() + x.storage_offset() * 4)


Step 21
Step 22
Step 23
Step 3
Time since start: 1742979758.891962


100%|██████████| 64/64 [00:00<00:00, 127.14it/s, loss=1.42]
100%|██████████| 64/64 [00:00<00:00, 184.21it/s, loss=1.4] 
100%|██████████| 64/64 [00:00<00:00, 176.86it/s, loss=1.27]
100%|██████████| 64/64 [00:00<00:00, 194.59it/s, loss=1.24]
100%|██████████| 64/64 [00:00<00:00, 184.77it/s, loss=1.08]
100%|██████████| 64/64 [00:00<00:00, 191.66it/s, loss=0.922]
100%|██████████| 64/64 [00:00<00:00, 191.68it/s, loss=0.0861]
100%|██████████| 64/64 [00:00<00:00, 185.52it/s, loss=-0.37] 
100%|██████████| 64/64 [00:00<00:00, 192.98it/s, loss=-0.358]
100%|██████████| 64/64 [00:00<00:00, 189.93it/s, loss=-0.222]
100%|██████████| 64/64 [00:00<00:00, 193.13it/s, loss=-0.373]
100%|██████████| 64/64 [00:00<00:00, 196.09it/s, loss=-0.683]
100%|██████████| 64/64 [00:00<00:00, 184.78it/s, loss=-0.643]
100%|██████████| 64/64 [00:00<00:00, 189.99it/s, loss=-0.723]
100%|██████████| 64/64 [00:00<00:00, 183.93it/s, loss=-0.719]
100%|██████████| 64/64 [00:00<00:00, 190.81it/s, loss=-0.682]
100%|██████████| 64

Training Time: 10.50261640548706
[GPU] Used: 3539.25 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.74 GB
RAM: 657.1015625 MB
Test MSE: 0.7416879534721375
Test MAE: 0.7416879534721375

=== Replicate: 1 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742979805.1846957


100%|██████████| 64/64 [00:00<00:00, 179.04it/s, loss=1.43]
100%|██████████| 64/64 [00:00<00:00, 190.32it/s, loss=1.34]
100%|██████████| 64/64 [00:00<00:00, 186.29it/s, loss=1.36]
100%|██████████| 64/64 [00:00<00:00, 177.64it/s, loss=1.16]
100%|██████████| 64/64 [00:00<00:00, 181.93it/s, loss=1.02]
100%|██████████| 64/64 [00:00<00:00, 183.78it/s, loss=0.767]
100%|██████████| 64/64 [00:00<00:00, 188.67it/s, loss=0.148] 
100%|██████████| 64/64 [00:00<00:00, 186.28it/s, loss=-0.352]
100%|██████████| 64/64 [00:00<00:00, 185.39it/s, loss=-0.315]
100%|██████████| 64/64 [00:00<00:00, 192.29it/s, loss=-0.254]
100%|██████████| 64/64 [00:00<00:00, 188.10it/s, loss=-0.434]
100%|██████████| 64/64 [00:00<00:00, 183.92it/s, loss=-0.759]
100%|██████████| 64/64 [00:00<00:00, 187.77it/s, loss=-0.628]
100%|██████████| 64/64 [00:00<00:00, 184.10it/s, loss=-0.702]
100%|██████████| 64/64 [00:00<00:00, 185.78it/s, loss=-0.74] 
100%|██████████| 64/64 [00:00<00:00, 188.02it/s, loss=-0.699]
100%|██████████| 64

Training Time: 10.33055830001831
[GPU] Used: 3538.87 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.74 GB
RAM: 255.6640625 MB
Test MSE: 0.7424277067184448
Test MAE: 0.7424277067184448

=== Replicate: 2 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742979850.7772772


100%|██████████| 64/64 [00:00<00:00, 186.43it/s, loss=1.42]
100%|██████████| 64/64 [00:00<00:00, 189.66it/s, loss=1.4] 
100%|██████████| 64/64 [00:00<00:00, 186.59it/s, loss=1.33]
100%|██████████| 64/64 [00:00<00:00, 188.37it/s, loss=1.2] 
100%|██████████| 64/64 [00:00<00:00, 187.83it/s, loss=1.1]  
100%|██████████| 64/64 [00:00<00:00, 184.92it/s, loss=0.838]
100%|██████████| 64/64 [00:00<00:00, 184.99it/s, loss=-0.00252]
100%|██████████| 64/64 [00:00<00:00, 186.02it/s, loss=-0.304]
100%|██████████| 64/64 [00:00<00:00, 184.53it/s, loss=-0.333]
100%|██████████| 64/64 [00:00<00:00, 186.52it/s, loss=-0.255]
100%|██████████| 64/64 [00:00<00:00, 188.22it/s, loss=-0.406]
100%|██████████| 64/64 [00:00<00:00, 188.88it/s, loss=-0.838]
100%|██████████| 64/64 [00:00<00:00, 185.64it/s, loss=-0.667]
100%|██████████| 64/64 [00:00<00:00, 188.06it/s, loss=-0.824]
100%|██████████| 64/64 [00:00<00:00, 184.39it/s, loss=-0.763]
100%|██████████| 64/64 [00:00<00:00, 183.39it/s, loss=-0.739]
100%|██████████|

Training Time: 10.350124597549438
[GPU] Used: 3534.79 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.71 GB
RAM: 255.859375 MB
Test MSE: 0.741576611995697
Test MAE: 0.741576611995697

=== Replicate: 3 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742979896.2889376


100%|██████████| 64/64 [00:00<00:00, 188.56it/s, loss=1.39]
100%|██████████| 64/64 [00:00<00:00, 193.41it/s, loss=1.39]
100%|██████████| 64/64 [00:00<00:00, 188.06it/s, loss=1.29]
100%|██████████| 64/64 [00:00<00:00, 189.93it/s, loss=1.23]
100%|██████████| 64/64 [00:00<00:00, 194.94it/s, loss=1.1] 
100%|██████████| 64/64 [00:00<00:00, 194.09it/s, loss=0.68] 
100%|██████████| 64/64 [00:00<00:00, 188.88it/s, loss=0.15]  
100%|██████████| 64/64 [00:00<00:00, 192.37it/s, loss=-0.258]
100%|██████████| 64/64 [00:00<00:00, 188.65it/s, loss=-0.289]
100%|██████████| 64/64 [00:00<00:00, 189.80it/s, loss=-0.252]
100%|██████████| 64/64 [00:00<00:00, 191.38it/s, loss=-0.419]
100%|██████████| 64/64 [00:00<00:00, 185.01it/s, loss=-0.807]
100%|██████████| 64/64 [00:00<00:00, 182.14it/s, loss=-0.646]
100%|██████████| 64/64 [00:00<00:00, 181.99it/s, loss=-0.759]
100%|██████████| 64/64 [00:00<00:00, 179.12it/s, loss=-0.574]
100%|██████████| 64/64 [00:00<00:00, 178.78it/s, loss=-0.551]
100%|██████████| 64

Training Time: 10.512325763702393
[GPU] Used: 3504.29 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.77 GB
RAM: 255.80078125 MB
Test MSE: 0.7372857332229614
Test MAE: 0.7372857332229614

=== Replicate: 4 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742979941.9534883


100%|██████████| 64/64 [00:00<00:00, 177.85it/s, loss=1.35]
100%|██████████| 64/64 [00:00<00:00, 179.32it/s, loss=1.32]
100%|██████████| 64/64 [00:00<00:00, 175.30it/s, loss=1.31]
100%|██████████| 64/64 [00:00<00:00, 179.32it/s, loss=1.13]
100%|██████████| 64/64 [00:00<00:00, 177.77it/s, loss=1.06]
100%|██████████| 64/64 [00:00<00:00, 179.76it/s, loss=0.643]
100%|██████████| 64/64 [00:00<00:00, 182.97it/s, loss=-0.0847] 
100%|██████████| 64/64 [00:00<00:00, 178.61it/s, loss=-0.448]
100%|██████████| 64/64 [00:00<00:00, 177.68it/s, loss=-0.392]
100%|██████████| 64/64 [00:00<00:00, 180.39it/s, loss=-0.325]
100%|██████████| 64/64 [00:00<00:00, 175.40it/s, loss=-0.312]
100%|██████████| 64/64 [00:00<00:00, 181.70it/s, loss=-0.837]
100%|██████████| 64/64 [00:00<00:00, 178.85it/s, loss=-0.578]
100%|██████████| 64/64 [00:00<00:00, 180.82it/s, loss=-0.668]
100%|██████████| 64/64 [00:00<00:00, 173.99it/s, loss=-0.74] 
100%|██████████| 64/64 [00:00<00:00, 176.34it/s, loss=-0.661]
100%|██████████| 

Training Time: 10.744665384292603
[GPU] Used: 3503.26 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.62 GB
RAM: 255.8125 MB
Test MSE: 0.7406355738639832
Test MAE: 0.7406355738639832

=== Replicate: 5 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742979990.784418


100%|██████████| 64/64 [00:00<00:00, 141.26it/s, loss=1.47]
100%|██████████| 64/64 [00:00<00:00, 126.54it/s, loss=1.38]
100%|██████████| 64/64 [00:00<00:00, 144.97it/s, loss=1.34]
100%|██████████| 64/64 [00:00<00:00, 143.22it/s, loss=1.2] 
100%|██████████| 64/64 [00:00<00:00, 150.42it/s, loss=1.12] 
100%|██████████| 64/64 [00:00<00:00, 153.19it/s, loss=0.738]
100%|██████████| 64/64 [00:00<00:00, 146.75it/s, loss=-0.00962]
100%|██████████| 64/64 [00:00<00:00, 166.29it/s, loss=-0.316]
100%|██████████| 64/64 [00:00<00:00, 170.09it/s, loss=-0.379]
100%|██████████| 64/64 [00:00<00:00, 177.10it/s, loss=-0.272]
100%|██████████| 64/64 [00:00<00:00, 161.36it/s, loss=-0.337]
100%|██████████| 64/64 [00:00<00:00, 174.70it/s, loss=-0.827]
100%|██████████| 64/64 [00:00<00:00, 161.65it/s, loss=-0.695]
100%|██████████| 64/64 [00:00<00:00, 164.13it/s, loss=-0.649]
100%|██████████| 64/64 [00:00<00:00, 184.81it/s, loss=-0.702]
100%|██████████| 64/64 [00:00<00:00, 188.59it/s, loss=-0.749]
100%|██████████|

Training Time: 11.735092639923096
[GPU] Used: 4385.17 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 20.18 GB
RAM: 255.796875 MB
Test MSE: 0.7341556549072266
Test MAE: 0.7341556549072266

=== Replicate: 6 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742980037.263741


100%|██████████| 64/64 [00:00<00:00, 181.90it/s, loss=1.46]
100%|██████████| 64/64 [00:00<00:00, 184.47it/s, loss=1.38]
100%|██████████| 64/64 [00:00<00:00, 180.23it/s, loss=1.28]
100%|██████████| 64/64 [00:00<00:00, 182.76it/s, loss=1.19]
100%|██████████| 64/64 [00:00<00:00, 184.94it/s, loss=1.03]
100%|██████████| 64/64 [00:00<00:00, 183.89it/s, loss=0.71] 
100%|██████████| 64/64 [00:00<00:00, 183.57it/s, loss=0.0926] 
100%|██████████| 64/64 [00:00<00:00, 183.43it/s, loss=-0.378]
100%|██████████| 64/64 [00:00<00:00, 175.76it/s, loss=-0.357]
100%|██████████| 64/64 [00:00<00:00, 177.93it/s, loss=-0.335]
100%|██████████| 64/64 [00:00<00:00, 184.38it/s, loss=-0.307]
100%|██████████| 64/64 [00:00<00:00, 183.44it/s, loss=-0.727]
100%|██████████| 64/64 [00:00<00:00, 178.23it/s, loss=-0.672]
100%|██████████| 64/64 [00:00<00:00, 188.21it/s, loss=-0.771]
100%|██████████| 64/64 [00:00<00:00, 182.83it/s, loss=-0.671]
100%|██████████| 64/64 [00:00<00:00, 187.23it/s, loss=-0.903]
100%|██████████| 6

Training Time: 10.54262375831604
[GPU] Used: 4384.27 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.70 GB
RAM: 255.80078125 MB
Test MSE: 0.7324655652046204
Test MAE: 0.7324655652046204

=== Replicate: 7 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742980081.9121838


100%|██████████| 64/64 [00:00<00:00, 192.68it/s, loss=1.38]
100%|██████████| 64/64 [00:00<00:00, 196.13it/s, loss=1.29]
100%|██████████| 64/64 [00:00<00:00, 199.48it/s, loss=1.31]
100%|██████████| 64/64 [00:00<00:00, 190.63it/s, loss=1.17]
100%|██████████| 64/64 [00:00<00:00, 191.78it/s, loss=1.16]
100%|██████████| 64/64 [00:00<00:00, 194.28it/s, loss=0.693]
100%|██████████| 64/64 [00:00<00:00, 190.10it/s, loss=0.0808]
100%|██████████| 64/64 [00:00<00:00, 188.50it/s, loss=-0.304]
100%|██████████| 64/64 [00:00<00:00, 190.39it/s, loss=-0.378]
100%|██████████| 64/64 [00:00<00:00, 187.38it/s, loss=-0.173]
100%|██████████| 64/64 [00:00<00:00, 187.60it/s, loss=-0.387]
100%|██████████| 64/64 [00:00<00:00, 194.10it/s, loss=-0.807]
100%|██████████| 64/64 [00:00<00:00, 190.39it/s, loss=-0.631]
100%|██████████| 64/64 [00:00<00:00, 191.61it/s, loss=-0.767]
100%|██████████| 64/64 [00:00<00:00, 192.70it/s, loss=-0.819]
100%|██████████| 64/64 [00:00<00:00, 196.41it/s, loss=-0.563]
100%|██████████| 64

Training Time: 10.07969045639038
[GPU] Used: 4363.63 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.76 GB
RAM: 255.796875 MB
Test MSE: 0.7241892218589783
Test MAE: 0.7241892218589783

=== Replicate: 8 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742980126.808674


100%|██████████| 64/64 [00:00<00:00, 183.38it/s, loss=1.41]
100%|██████████| 64/64 [00:00<00:00, 192.40it/s, loss=1.39]
100%|██████████| 64/64 [00:00<00:00, 199.30it/s, loss=1.34]
100%|██████████| 64/64 [00:00<00:00, 194.75it/s, loss=1.25]
100%|██████████| 64/64 [00:00<00:00, 196.84it/s, loss=0.997]
100%|██████████| 64/64 [00:00<00:00, 195.44it/s, loss=0.661]
100%|██████████| 64/64 [00:00<00:00, 195.13it/s, loss=0.0519]
100%|██████████| 64/64 [00:00<00:00, 199.39it/s, loss=-0.343]
100%|██████████| 64/64 [00:00<00:00, 192.24it/s, loss=-0.285]
100%|██████████| 64/64 [00:00<00:00, 195.43it/s, loss=-0.305]
100%|██████████| 64/64 [00:00<00:00, 195.47it/s, loss=-0.356]
100%|██████████| 64/64 [00:00<00:00, 188.13it/s, loss=-0.762]
100%|██████████| 64/64 [00:00<00:00, 190.31it/s, loss=-0.68] 
100%|██████████| 64/64 [00:00<00:00, 193.36it/s, loss=-0.748]
100%|██████████| 64/64 [00:00<00:00, 192.90it/s, loss=-0.654]
100%|██████████| 64/64 [00:00<00:00, 186.00it/s, loss=-0.74] 
100%|██████████| 6

Training Time: 10.065503597259521
[GPU] Used: 4328.72 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.64 GB
RAM: 255.796875 MB
Test MSE: 0.7310738563537598
Test MAE: 0.7310738563537598

=== Replicate: 9 ===
Memory before training: 0.0 MB
Step 1
Step 2
Step 21
Step 22
Step 23
Step 3
Time since start: 1742980171.4772363


100%|██████████| 64/64 [00:00<00:00, 189.80it/s, loss=1.43]
100%|██████████| 64/64 [00:00<00:00, 195.61it/s, loss=1.35]
100%|██████████| 64/64 [00:00<00:00, 195.30it/s, loss=1.38]
100%|██████████| 64/64 [00:00<00:00, 188.81it/s, loss=1.25]
100%|██████████| 64/64 [00:00<00:00, 195.11it/s, loss=1.01]
100%|██████████| 64/64 [00:00<00:00, 193.41it/s, loss=0.689]
100%|██████████| 64/64 [00:00<00:00, 194.38it/s, loss=0.013]   
100%|██████████| 64/64 [00:00<00:00, 190.11it/s, loss=-0.282]
100%|██████████| 64/64 [00:00<00:00, 185.08it/s, loss=-0.291]
100%|██████████| 64/64 [00:00<00:00, 193.39it/s, loss=-0.29] 
100%|██████████| 64/64 [00:00<00:00, 191.26it/s, loss=-0.371]
100%|██████████| 64/64 [00:00<00:00, 185.25it/s, loss=-0.898]
100%|██████████| 64/64 [00:00<00:00, 188.28it/s, loss=-0.732]
100%|██████████| 64/64 [00:00<00:00, 193.04it/s, loss=-0.618]
100%|██████████| 64/64 [00:00<00:00, 187.96it/s, loss=-0.659]
100%|██████████| 64/64 [00:00<00:00, 188.95it/s, loss=-0.688]
100%|██████████| 

Training Time: 10.16049861907959
[GPU] Used: 4330.16 MB
[PyTorch] Max Allocated: 245.70 MB
[System RAM] Used: 19.50 GB
RAM: 255.77734375 MB
Test MSE: 0.7243191003799438
Test MAE: 0.7243191003799438

Summary:
Mean MSE: 0.7349816977977752
Std MSE: 0.00694132325302037
Mean Time: 10.502369952201843
Std Time: 0.4852394503488141
0.73498  & (0.00694)  & 10.50237  & (0.48524)


In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())


# Compile Table (MSE and Time only)

SKI
SGPR
LOVE
DKL
SVGP-CI
SVGP
NGD
VNN


In [None]:
print("SKI     --- MSE:",statistics.mean(mse_l_ski), "(",statistics.stdev(mse_l_ski),")  Time:", statistics.mean(time_l_ski), "(",statistics.stdev(time_l_ski),")")
print("SGPR    --- MSE:",statistics.mean(mse_l_sgpr), "(",statistics.stdev(mse_l_sgpr),")  Time:", statistics.mean(time_l_sgpr), "(",statistics.stdev(time_l_sgpr),")")
print("LOVE    --- MSE:",statistics.mean(mse_l_love), "(",statistics.stdev(mse_l_love),")  Time:", statistics.mean(time_l_love), "(",statistics.stdev(time_l_love),")")
print("DKL     --- MSE:",statistics.mean(mse_l_dkl), "(",statistics.stdev(mse_l_dkl),")  Time:", statistics.mean(time_l_dkl), "(",statistics.stdev(time_l_dkl),")")
print("SVGP-CI --- MSE:",statistics.mean(mse_l_svgpci), "(",statistics.stdev(mse_l_svgpci),")  Time:", statistics.mean(time_l_svgpci), "(",statistics.stdev(time_l_svgpci),")")
print("SVGP    --- MSE:",statistics.mean(mse_l_svgp), "(",statistics.stdev(mse_l_svgp),")  Time:", statistics.mean(time_l_svgp), "(",statistics.stdev(time_l_svgp),")")
print("NGD     --- MSE:",statistics.mean(mse_l_ngd), "(",statistics.stdev(mse_l_ngd),")  Time:", statistics.mean(time_l_ngd), "(",statistics.stdev(time_l_ngd),")")
print("VNN     --- MSE:",statistics.mean(mse_l_vnn), "(",statistics.stdev(mse_l_vnn),")  Time:", statistics.mean(time_l_vnn), "(",statistics.stdev(time_l_vnn),")")

Reordering
SVGP
SVGP-CI
VNN
NGD
DKL
SGPR
SKI
LOVE

In [None]:
print("SVGP    --- MSE:",statistics.mean(mse_l_svgp), "(",statistics.stdev(mse_l_svgp),")  Time:", statistics.mean(time_l_svgp), "(",statistics.stdev(time_l_svgp),")")
print("SVGP-CI --- MSE:",statistics.mean(mse_l_svgpci), "(",statistics.stdev(mse_l_svgpci),")  Time:", statistics.mean(time_l_svgpci), "(",statistics.stdev(time_l_svgpci),")")
print("VNN     --- MSE:",statistics.mean(mse_l_vnn), "(",statistics.stdev(mse_l_vnn),")  Time:", statistics.mean(time_l_vnn), "(",statistics.stdev(time_l_vnn),")")
print("NGD     --- MSE:",statistics.mean(mse_l_ngd), "(",statistics.stdev(mse_l_ngd),")  Time:", statistics.mean(time_l_ngd), "(",statistics.stdev(time_l_ngd),")")
print("DKL     --- MSE:",statistics.mean(mse_l_dkl), "(",statistics.stdev(mse_l_dkl),")  Time:", statistics.mean(time_l_dkl), "(",statistics.stdev(time_l_dkl),")")
print("SGPR    --- MSE:",statistics.mean(mse_l_sgpr), "(",statistics.stdev(mse_l_sgpr),")  Time:", statistics.mean(time_l_sgpr), "(",statistics.stdev(time_l_sgpr),")")
print("SKI     --- MSE:",statistics.mean(mse_l_ski), "(",statistics.stdev(mse_l_ski),")  Time:", statistics.mean(time_l_ski), "(",statistics.stdev(time_l_ski),")")
print("LOVE    --- MSE:",statistics.mean(mse_l_love), "(",statistics.stdev(mse_l_love),")  Time:", statistics.mean(time_l_love), "(",statistics.stdev(time_l_love),")")

In [None]:
import torch
print(torch.cuda.is_available())  # Should print True if a GPU is accessible
