# Important Note
RAM and VRAM measurements are dependent on the computer state, and should only be interpreted relative to each other. In order to obtain RAM and VRAM measurements, perform the following steps:

1 - Restart the Kernel

2 - Run the "Loading Required Packages and Helper Functions" cell

3 - Run the "Loading Data" cell

4 - Run ONLY ONE iteration of the desired method, and read the RAM and VRAM usage reports printed by the cell

# Loading Required Packages and Helper Functions
If you would like to use Cuda, set gpu = True. Otherwise set gpu = False. 

Step 1: Run the following cell to import the required packages and helper functions. Set the number of replicates desired.

Step 2: Load the Data

Step 3: Execute the cells under the method you wish to replicate.

# Step 1

In [None]:
gpu = True
n_replicates = 10

In [2]:
import math
import torch
import gpytorch
import time
from matplotlib import pyplot as plt
import gc
import statistics
import numpy as np

from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy
# Make plots inline
%matplotlib inline
%load_ext autoreload
%autoreload 2
import psutil
def get_mem():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss
import torch
import numpy as np
import time
import gc
import tqdm
import gpytorch
import tqdm
import urllib.request
import os
import pandas as pd
from scipy.io import loadmat
from math import floor
from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.mlls import DeepApproximateMLL

from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel, InducingPointKernel
from gpytorch.distributions import MultivariateNormal
from torch.utils.data import TensorDataset, DataLoader

from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.mlls import DeepApproximateMLL
from torch.utils.data import TensorDataset, DataLoader
from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy
from gpytorch.models import ApproximateGP
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.variational import VariationalStrategy

import time
import torch
import gpytorch
import pynvml
import psutil
import statistics
from tqdm import trange, tqdm

import torch
import pynvml
import psutil

def log_memory():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    max_allocated = torch.cuda.max_memory_allocated() / 1024**2  # MB
    max_reserved = torch.cuda.max_memory_reserved() / 1024**2    # MB
    gpu_used = meminfo.used / 1024**2                            # MB
    sys_used = psutil.virtual_memory().used / 1024**3            # GB
    print(f"[PyTorch] Max Allocated: {max_allocated:.2f} MB | Max Reserved: {max_reserved:.2f} MB")
    print(f"[GPU VRAM] Used (nvidia-smi): {gpu_used:.2f} MB | [System RAM]: {sys_used:.2f} GB")
    return max_allocated, max_reserved, gpu_used, sys_used


max_vram = 0
max_ram = 0
def get_mem():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss/(1024**2)

max_vram = 0
def vram_usage():
    global max_vram
    if gpu:
        max_vram = max(max_vram, torch.cuda.memory_allocated())


print("GPU availability: ", torch.cuda.is_available())
print(psutil.virtual_memory().used / (1024 ** 2))




GPU availability:  True
22408.5390625


# Loading Data
Step 2: Load the data (note: must run the DataGenerator.Rmd file first)

In [3]:
import pandas as pd
import torch

x = pd.read_csv('Data/x_100k.csv', header=None).values.squeeze()
y = pd.read_csv('Data/y_100k.csv', header=None).values.squeeze()
all_x = torch.tensor(x, dtype=torch.float32).unsqueeze(1)
all_y = torch.tensor(y, dtype=torch.float32)
all_x = all_x.contiguous()
all_y = all_y.contiguous()
print("all_x shape:", all_x.shape)
print("all_y shape:", all_y.shape)


all_x shape: torch.Size([1000000, 1])
all_y shape: torch.Size([1000000])


In [4]:
def splitter(x_cpu, y_cpu, n_train=80000, n_test=20000, random_state=42, move_to_gpu=True):
    assert x_cpu.shape[0] == y_cpu.shape[0], "Mismatch in number of samples"
    total_samples = x_cpu.shape[0]
    assert n_train + n_test <= total_samples, "Not enough samples to split"
    rng = np.random.default_rng(seed=random_state)
    indices = rng.permutation(total_samples)
    train_idx = indices[:n_train]
    test_idx  = indices[n_train:n_train + n_test]
    train_x = x_cpu[train_idx].contiguous()
    train_y = y_cpu[train_idx].contiguous()
    test_x  = x_cpu[test_idx].contiguous()
    test_y  = y_cpu[test_idx].contiguous()
    if move_to_gpu and torch.cuda.is_available():
        train_x = train_x.cuda()
        train_y = train_y.cuda()
        test_x = test_x.cuda()
        test_y = test_y.cuda()
    return train_x, train_y, test_x, test_y

train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000)
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

torch.Size([80000, 1]) torch.Size([80000])
torch.Size([20000, 1]) torch.Size([20000])


In [5]:
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################

# Simulations
Step 3: Execute the simulations to be reproduced. If all simulations are run, there is a summarizer at the end. Otherwise, the relevant statistics are printed at the end of each method.

# SKI

In [6]:
import tqdm, time, gc
import torch, gpytorch
from memory_profiler import memory_usage


class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        grid_size = gpytorch.utils.grid.choose_grid_size(train_x, 1.0 / 25.0)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.GridInterpolationKernel(
                gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=1),
                grid_size=grid_size, num_dims=1
            )
        )
    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x)
        )

n_replicates = 11
training_iterations = 10#32
n_train, n_test = 400_000, 20_000
random_state = 42
mse_l_ski, time_l_ski = [], []

for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")
    train_x, train_y, test_x, test_y = splitter(
        all_x, all_y,
        n_train=n_train, n_test=n_test,
        random_state=random_state + rep,
        move_to_gpu=torch.cuda.is_available()
    )
    ram_before = get_mem() / (1024**2)
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)
    model.train(); likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    if torch.cuda.is_available():
        mll = mll.cuda()
    def train_fn():
        for _ in range(training_iterations):
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()
        return None
    start_time = time.time()
    peak_ram = memory_usage(
        (train_fn,),
        max_usage=True,
        retval=False,
        interval=0.01
    )
    elapsed = time.time() - start_time
    vram_peak = torch.cuda.max_memory_allocated() / (1024**2) if torch.cuda.is_available() else None
    ram_delta = peak_ram - ram_before
    model.eval(); likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        pred = likelihood(model(test_x)).mean.cpu()
    mse = torch.mean((pred - test_y.cpu()) ** 2).item()
    mse_l_ski.append(mse)
    time_l_ski.append(elapsed)
    print(
        f"Rep {rep+1}: MSE={mse:.4f}, Time={elapsed:.2f}s, "
        f"RAM before={ram_before:.1f}MB, peak={peak_ram:.1f}MB (Δ={ram_delta:.1f}MB)"
        + (f", VRAM peak={vram_peak:.1f}MB" if vram_peak is not None else "")
    )
    del model, likelihood
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()



=== Replicate 1/11 ===


  if nonzero_indices.storage():
  res = cls(index_tensor, value_tensor, interp_size)
  res = cls(index_tensor, value_tensor, interp_size)


Rep 1: MSE=0.3269, Time=3.40s, RAM before=0.0MB, peak=1033.9MB (Δ=1033.9MB), VRAM peak=5732.2MB

=== Replicate 2/11 ===
Rep 2: MSE=0.3249, Time=3.10s, RAM before=0.0MB, peak=1044.3MB (Δ=1044.3MB), VRAM peak=5778.9MB

=== Replicate 3/11 ===
Rep 3: MSE=0.3252, Time=3.10s, RAM before=0.0MB, peak=1044.8MB (Δ=1044.8MB), VRAM peak=5780.0MB

=== Replicate 4/11 ===
Rep 4: MSE=0.3286, Time=3.06s, RAM before=0.0MB, peak=1044.8MB (Δ=1044.8MB), VRAM peak=5780.3MB

=== Replicate 5/11 ===
Rep 5: MSE=0.3288, Time=3.04s, RAM before=0.0MB, peak=1044.8MB (Δ=1044.8MB), VRAM peak=5779.7MB

=== Replicate 6/11 ===
Rep 6: MSE=0.3307, Time=3.03s, RAM before=0.0MB, peak=1047.8MB (Δ=1047.8MB), VRAM peak=5780.1MB

=== Replicate 7/11 ===
Rep 7: MSE=0.3267, Time=2.97s, RAM before=0.0MB, peak=1047.8MB (Δ=1047.8MB), VRAM peak=5779.9MB

=== Replicate 8/11 ===
Rep 8: MSE=0.3266, Time=3.00s, RAM before=0.0MB, peak=1048.1MB (Δ=1048.1MB), VRAM peak=5780.1MB

=== Replicate 9/11 ===
Rep 9: MSE=0.3298, Time=2.99s, RAM befor

In [7]:
print(statistics.mean(mse_l_ski[1:]))
print(statistics.stdev(mse_l_ski[1:]))

print(statistics.mean(time_l_ski[1:]))
print(statistics.stdev(time_l_ski[1:]))


0.32774763703346255
0.002075905921105616
3.0282730579376222
0.04694740011788096


# Sparse GPR

In [8]:
gc.collect()
max_vram = 0

In [9]:
import torch
import gpytorch
import tqdm
import gc
import time
import numpy as np



class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        base_covar = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.MaternKernel(nu=1.5)
        )
        self.covar_module = gpytorch.kernels.InducingPointKernel(
            base_covar,
            #inducing_points=train_x[::15000].clone(),
            inducing_points=train_x[torch.randperm(train_x.shape[0])[:100]].clone(),
            likelihood=likelihood
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
    


class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = ConstantMean()
        self.base_covar_module = ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=2))
        self.covar_module = InducingPointKernel(self.base_covar_module, inducing_points=torch.linspace(train_x.min(), train_x.max(), steps=40).unsqueeze(-1), likelihood=likelihood)
        #print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        #print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")
        return MultivariateNormal(mean_x, covar_x)
    

# Experiment Parameters
# -----------------------------
n_replicates = 11
training_iterations = 10#125
n_train, n_test = 400000, 20000
random_state = 422
gpu = torch.cuda.is_available()
gpu = True

mse_l_sgpr = []
time_l_sgpr = []
vram_l_sgpr = []

# -----------------------------
# Replicates Loop
# -----------------------------
for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")

    # ---- Split Data ----
    train_x, train_y, test_x, test_y = splitter(
        all_x, all_y,
        n_train=n_train, n_test=n_test,
        random_state=random_state + rep,
        move_to_gpu=gpu
    )
    ram_init = get_mem()
    # ---- Initialize ----
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)
    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()
    
    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.02)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    model = model.double()
    likelihood = likelihood.double()

    # ---- Train ----
    start = time.time()
    iterator = tqdm.tqdm(range(training_iterations), desc=f"Train {rep + 1}")

    for it in iterator:
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        iterator.set_postfix(loss=loss.item())  # <-- Add this line
        optimizer.step()
        torch.cuda.empty_cache()
    elapsed = time.time() - start

    ram_diff = get_mem() - ram_init
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        pred = likelihood(model(test_x)).mean.cpu()
    mse = torch.mean((pred - test_y.cpu()) ** 2).item()
    peak_alloc, peak_reserved, gpu_used, sys_used = log_memory()
    mse_l_sgpr.append(mse)
    time_l_sgpr.append(elapsed)
    vram_l_sgpr.append(peak_alloc)
    print(f"Rep {rep + 1}: MSE={mse:.4f}, Time={elapsed:.2f}s, VRAM={peak_alloc:.2f}MB, RAM diff={ram_diff:.2f}MB")
    del model, likelihood
    gc.collect()
    if gpu:
        torch.cuda.empty_cache()



=== Replicate 1/11 ===


Train 1: 100%|██████████| 10/10 [00:00<00:00, 12.54it/s, loss=1.2]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3152.32 MB | [System RAM]: 22.39 GB
Rep 1: MSE=0.4671, Time=0.80s, VRAM=5779.93MB, RAM diff=12.77MB

=== Replicate 2/11 ===


Train 2: 100%|██████████| 10/10 [00:00<00:00, 13.08it/s, loss=1.2]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3466.93 MB | [System RAM]: 22.39 GB
Rep 2: MSE=0.4746, Time=0.76s, VRAM=5779.93MB, RAM diff=0.23MB

=== Replicate 3/11 ===


Train 3: 100%|██████████| 10/10 [00:00<00:00, 13.06it/s, loss=1.19]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3466.93 MB | [System RAM]: 22.39 GB
Rep 3: MSE=0.4822, Time=0.77s, VRAM=5779.93MB, RAM diff=0.00MB

=== Replicate 4/11 ===


Train 4: 100%|██████████| 10/10 [00:00<00:00, 13.26it/s, loss=1.2]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3450.93 MB | [System RAM]: 22.39 GB
Rep 4: MSE=0.4795, Time=0.76s, VRAM=5779.93MB, RAM diff=0.00MB

=== Replicate 5/11 ===


Train 5: 100%|██████████| 10/10 [00:00<00:00, 13.37it/s, loss=1.2]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3450.93 MB | [System RAM]: 22.39 GB
Rep 5: MSE=0.4785, Time=0.75s, VRAM=5779.93MB, RAM diff=0.00MB

=== Replicate 6/11 ===


Train 6: 100%|██████████| 10/10 [00:00<00:00, 13.39it/s, loss=1.19]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3423.31 MB | [System RAM]: 22.39 GB
Rep 6: MSE=0.4734, Time=0.75s, VRAM=5779.93MB, RAM diff=0.00MB

=== Replicate 7/11 ===


Train 7: 100%|██████████| 10/10 [00:00<00:00, 13.36it/s, loss=1.2]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3423.31 MB | [System RAM]: 22.39 GB
Rep 7: MSE=0.4690, Time=0.75s, VRAM=5779.93MB, RAM diff=0.00MB

=== Replicate 8/11 ===


Train 8: 100%|██████████| 10/10 [00:00<00:00, 13.46it/s, loss=1.19]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3423.31 MB | [System RAM]: 22.39 GB
Rep 8: MSE=0.4765, Time=0.74s, VRAM=5779.93MB, RAM diff=0.00MB

=== Replicate 9/11 ===


Train 9: 100%|██████████| 10/10 [00:00<00:00, 12.86it/s, loss=1.2]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3409.31 MB | [System RAM]: 22.39 GB
Rep 9: MSE=0.4793, Time=0.78s, VRAM=5779.93MB, RAM diff=0.00MB

=== Replicate 10/11 ===


Train 10: 100%|██████████| 10/10 [00:00<00:00, 13.43it/s, loss=1.2]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3409.31 MB | [System RAM]: 22.39 GB
Rep 10: MSE=0.4751, Time=0.75s, VRAM=5779.93MB, RAM diff=0.00MB

=== Replicate 11/11 ===


Train 11: 100%|██████████| 10/10 [00:00<00:00, 13.48it/s, loss=1.2]


[PyTorch] Max Allocated: 5779.93 MB | Max Reserved: 7968.00 MB
[GPU VRAM] Used (nvidia-smi): 3409.31 MB | [System RAM]: 22.39 GB
Rep 11: MSE=0.4752, Time=0.74s, VRAM=5779.93MB, RAM diff=0.00MB


In [10]:
print(statistics.mean(mse_l_sgpr[1:]))
print(statistics.stdev(mse_l_sgpr[1:]))

print(statistics.mean(time_l_sgpr[1:]))
print(statistics.stdev(time_l_sgpr[1:]))

0.47633379463561154
0.0037449226041651297
0.7551281929016114
0.01192968158144584


# LOVE

In [26]:
# Lanczos Variance Estimates (LOVE)
max_vram = 0
gc.collect()
if gpu:
    torch.cuda.empty_cache()

In [None]:
import torch
import gpytorch
import tqdm
import time
import gc
import numpy as np
import psutil

# -----------------------------
# Feature Extractor
# -----------------------------
class LargeFeatureExtractor(torch.nn.Sequential):
    def __init__(self, input_dim):
        super().__init__()
        self.add_module('linear1', torch.nn.Linear(input_dim, 1000))
        self.add_module('relu1', torch.nn.ReLU())
        self.add_module('linear2', torch.nn.Linear(1000, 500))
        self.add_module('relu2', torch.nn.ReLU())
        self.add_module('linear3', torch.nn.Linear(500, 50))
        self.add_module('relu3', torch.nn.ReLU())
        self.add_module('linear4', torch.nn.Linear(50, 1))

# -----------------------------
# GPRegressionModel
# -----------------------------
class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.ScaleKernel(
                gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=1)
            ),
            grid_size=100, num_dims=1
        )
        self.feature_extractor = LargeFeatureExtractor(input_dim=train_x.size(-1))

    def forward(self, x):
        projected_x = self.feature_extractor(x)
        projected_x = projected_x - projected_x.min(0)[0]
        projected_x = 2 * (projected_x / projected_x.max(0)[0]) - 1
        mean_x = self.mean_module(projected_x)
        covar_x = self.covar_module(projected_x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# -----------------------------
# Experiment Parameters
# -----------------------------
n_replicates = 11
# training_iterations = 10#100
# n_train, n_test = 80000, 20000
training_iterations = 10#25
n_train, n_test = 400000, 100000
random_state = 42
gpu = torch.cuda.is_available()

mse_l_love = []
time_l_love = []
vram_l_love = []
ram_l_love = []

# -----------------------------
# Replicates Loop
# -----------------------------
for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")

    # ---- Split Data ----
    train_x, train_y, test_x, test_y = splitter(
        all_x, all_y,
        n_train=n_train, n_test=n_test,
        random_state=random_state + rep,
        move_to_gpu=gpu
    )

    # ---- RAM before model ----
    mem_begin = psutil.virtual_memory().used / (1024 ** 2)  # in MB

    # ---- Initialize ----
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)
    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()

    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    if gpu:
        mll = mll.cuda()

    torch.cuda.reset_peak_memory_stats()

    # ---- Train ----
    start = time.time()
    iterator = tqdm.tqdm(range(training_iterations), desc=f"Train {rep + 1}")
    for it in iterator:
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()
        iterator.set_postfix(loss=loss.item())
    elapsed = time.time() - start

    # ---- RAM after ----
    mem_end = psutil.virtual_memory().used / (1024 ** 2)  # in MB
    delta_ram = mem_end - mem_begin

    # ---- Evaluate ----
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))
    means = observed_pred.mean.cpu()
    mse = torch.mean((means - test_y.cpu()) ** 2).item()

    # ---- Log Resources ----
    peak_alloc, peak_reserved, gpu_used, sys_used = log_memory()

    mse_l_love.append(mse)
    time_l_love.append(elapsed)
    vram_l_love.append(peak_alloc)
    ram_l_love.append(delta_ram)

    print(f"LoVE Rep {rep + 1}: MSE={mse:.4f}, Time={elapsed:.2f}s, RAM Δ={delta_ram:.2f}MB, VRAM={peak_alloc:.2f}MB")

    # ---- Cleanup ----
    del model, likelihood
    gc.collect()
    if gpu:
        torch.cuda.empty_cache()



=== Replicate 1/11 ===


Train 1: 100%|██████████| 10/10 [00:05<00:00,  1.85it/s, loss=0.829]


[PyTorch] Max Allocated: 6743.16 MB | Max Reserved: 16652.00 MB
[GPU VRAM] Used (nvidia-smi): 16345.73 MB | [System RAM]: 26.56 GB
LoVE Rep 1: MSE=0.2994, Time=5.40s, RAM Δ=25.19MB, VRAM=6743.16MB

=== Replicate 2/11 ===


Train 2: 100%|██████████| 10/10 [00:08<00:00,  1.21it/s, loss=0.835]


[PyTorch] Max Allocated: 6826.47 MB | Max Reserved: 13994.00 MB
[GPU VRAM] Used (nvidia-smi): 15124.86 MB | [System RAM]: 25.65 GB
LoVE Rep 2: MSE=0.2950, Time=8.24s, RAM Δ=6.14MB, VRAM=6826.47MB

=== Replicate 3/11 ===


Train 3: 100%|██████████| 10/10 [00:05<00:00,  1.92it/s, loss=0.831]


[PyTorch] Max Allocated: 6827.76 MB | Max Reserved: 14178.00 MB
[GPU VRAM] Used (nvidia-smi): 15309.05 MB | [System RAM]: 25.40 GB
LoVE Rep 3: MSE=0.2986, Time=5.22s, RAM Δ=16.01MB, VRAM=6827.76MB

=== Replicate 4/11 ===


Train 4: 100%|██████████| 10/10 [00:05<00:00,  1.94it/s, loss=0.858]


[PyTorch] Max Allocated: 6829.24 MB | Max Reserved: 13974.00 MB
[GPU VRAM] Used (nvidia-smi): 15105.36 MB | [System RAM]: 25.48 GB
LoVE Rep 4: MSE=0.3001, Time=5.15s, RAM Δ=11.27MB, VRAM=6829.24MB

=== Replicate 5/11 ===


Train 5: 100%|██████████| 10/10 [00:05<00:00,  1.94it/s, loss=0.83]


[PyTorch] Max Allocated: 6829.27 MB | Max Reserved: 14158.00 MB
[GPU VRAM] Used (nvidia-smi): 15291.80 MB | [System RAM]: 25.37 GB
LoVE Rep 5: MSE=0.3007, Time=5.15s, RAM Δ=-99.93MB, VRAM=6829.27MB

=== Replicate 6/11 ===


Train 6: 100%|██████████| 10/10 [00:05<00:00,  1.94it/s, loss=0.827]


[PyTorch] Max Allocated: 6830.70 MB | Max Reserved: 14158.00 MB
[GPU VRAM] Used (nvidia-smi): 15291.80 MB | [System RAM]: 25.36 GB
LoVE Rep 6: MSE=0.3000, Time=5.15s, RAM Δ=9.57MB, VRAM=6830.70MB

=== Replicate 7/11 ===


Train 7: 100%|██████████| 10/10 [00:05<00:00,  1.95it/s, loss=0.833]


[PyTorch] Max Allocated: 6827.37 MB | Max Reserved: 14158.00 MB
[GPU VRAM] Used (nvidia-smi): 15291.42 MB | [System RAM]: 25.34 GB
LoVE Rep 7: MSE=0.3000, Time=5.13s, RAM Δ=-5.06MB, VRAM=6827.37MB

=== Replicate 8/11 ===


Train 8: 100%|██████████| 10/10 [00:05<00:00,  1.94it/s, loss=0.828]


[PyTorch] Max Allocated: 6829.40 MB | Max Reserved: 14158.00 MB
[GPU VRAM] Used (nvidia-smi): 15291.48 MB | [System RAM]: 25.32 GB
LoVE Rep 8: MSE=0.2980, Time=5.16s, RAM Δ=-2.70MB, VRAM=6829.40MB

=== Replicate 9/11 ===


Train 9: 100%|██████████| 10/10 [00:05<00:00,  1.93it/s, loss=0.828]


[PyTorch] Max Allocated: 6828.76 MB | Max Reserved: 14158.00 MB
[GPU VRAM] Used (nvidia-smi): 15294.23 MB | [System RAM]: 25.37 GB
LoVE Rep 9: MSE=0.3010, Time=5.17s, RAM Δ=-3.19MB, VRAM=6828.76MB

=== Replicate 10/11 ===


Train 10: 100%|██████████| 10/10 [00:05<00:00,  1.90it/s, loss=0.831]


[PyTorch] Max Allocated: 6829.68 MB | Max Reserved: 14158.00 MB
[GPU VRAM] Used (nvidia-smi): 15296.67 MB | [System RAM]: 25.32 GB
LoVE Rep 10: MSE=0.2997, Time=5.25s, RAM Δ=-66.79MB, VRAM=6829.68MB

=== Replicate 11/11 ===


Train 11: 100%|██████████| 10/10 [00:05<00:00,  1.93it/s, loss=0.83]


[PyTorch] Max Allocated: 6829.37 MB | Max Reserved: 14158.00 MB
[GPU VRAM] Used (nvidia-smi): 15296.67 MB | [System RAM]: 25.29 GB
LoVE Rep 11: MSE=0.2997, Time=5.17s, RAM Δ=-3.50MB, VRAM=6829.37MB


In [28]:
print(statistics.mean(mse_l_love[1:]))
print(statistics.stdev(mse_l_love[1:]))

print(statistics.mean(time_l_love[1:]))
print(statistics.stdev(time_l_love[1:]))


0.2992790758609772
0.0017231493648240308
5.47954113483429
0.9719836564032389


# DKL

In [15]:
# Deep Kernel Learning
max_vram = 0
gc.collect()
if gpu:
    torch.cuda.empty_cache()

In [16]:
import torch
import gpytorch
import tqdm
import time
import gc
import numpy as np
import psutil


# -----------------------------
# DKL Feature Extractor
# -----------------------------
class LargeFeatureExtractor(torch.nn.Sequential):
    def __init__(self, input_dim):
        super().__init__()
        self.add_module('linear1', torch.nn.Linear(input_dim, 1000))
        self.add_module('relu1', torch.nn.ReLU())
        self.add_module('linear2', torch.nn.Linear(1000, 500))
        self.add_module('relu2', torch.nn.ReLU())
        self.add_module('linear3', torch.nn.Linear(500, 50))
        self.add_module('relu3', torch.nn.ReLU())
        self.add_module('linear4', torch.nn.Linear(50, 1))

# -----------------------------
# GPRegressionModel
# -----------------------------
class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, input_dim):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.ScaleKernel(
                gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=1)
            ),
            num_dims=1, grid_size=100
        )
        self.feature_extractor = LargeFeatureExtractor(input_dim=input_dim)
        self.scale_to_bounds = gpytorch.utils.grid.ScaleToBounds(-1., 1.)

    def forward(self, x):
        projected_x = self.feature_extractor(x)
        projected_x = self.scale_to_bounds(projected_x)
        mean_x = self.mean_module(projected_x)
        covar_x = self.covar_module(projected_x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# -----------------------------
# Experiment Parameters
# -----------------------------
n_replicates = 11
training_iterations = 10#80
n_train, n_test = 400000, 20000
random_state = 42
gpu = torch.cuda.is_available()

mse_l_dkl = []
time_l_dkl = []
vram_l_dkl = []
ram_l_dkl = []

# -----------------------------
# Replicates Loop
# -----------------------------
for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")

    # ---- Split Data ----
    train_x, train_y, test_x, test_y = splitter(
        all_x, all_y,
        n_train=n_train, n_test=n_test,
        random_state=random_state + rep,
        move_to_gpu=gpu
    )

    # ---- RAM before model ----
    mem_begin = psutil.virtual_memory().used / (1024 ** 2)  # in MB

    # ---- Initialize ----
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood, input_dim=train_x.size(-1))
    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()

    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam([
        {'params': model.feature_extractor.parameters()},
        {'params': model.covar_module.parameters()},
        {'params': model.mean_module.parameters()},
        {'params': model.likelihood.parameters()},
    ], lr=0.01)

    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    if gpu:
        mll = mll.cuda()

    torch.cuda.reset_peak_memory_stats()

    # ---- Train ----
    start = time.time()
    iterator = tqdm.tqdm(range(training_iterations), desc=f"Train {rep + 1}")
    for it in iterator:
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()
        iterator.set_postfix(loss=loss.item())
    elapsed = time.time() - start

    # ---- RAM after ----
    mem_end = psutil.virtual_memory().used / (1024 ** 2)
    delta_ram = mem_end - mem_begin

    # ---- Evaluate ----
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))
    means = observed_pred.mean.cpu()
    mse = torch.mean((means - test_y.cpu()) ** 2).item()

    # ---- Log Resources ----
    peak_alloc, peak_reserved, gpu_used, sys_used = log_memory()

    mse_l_dkl.append(mse)
    time_l_dkl.append(elapsed)
    vram_l_dkl.append(peak_alloc)
    ram_l_dkl.append(delta_ram)

    print(f"DKL Rep {rep + 1}: MSE={mse:.4f}, Time={elapsed:.2f}s, RAM Δ={delta_ram:.2f}MB, VRAM={peak_alloc:.2f}MB")

    # ---- Cleanup ----
    del model, likelihood
    gc.collect()
    if gpu:
        torch.cuda.empty_cache()



=== Replicate 1/11 ===


Train 1: 100%|██████████| 10/10 [00:05<00:00,  1.86it/s, loss=0.979]


[PyTorch] Max Allocated: 6822.24 MB | Max Reserved: 13550.00 MB
[GPU VRAM] Used (nvidia-smi): 14969.09 MB | [System RAM]: 23.44 GB
DKL Rep 1: MSE=0.3624, Time=5.39s, RAM Δ=32.34MB, VRAM=6822.24MB

=== Replicate 2/11 ===


Train 2: 100%|██████████| 10/10 [00:05<00:00,  1.84it/s, loss=1]   


[PyTorch] Max Allocated: 6784.74 MB | Max Reserved: 13552.00 MB
[GPU VRAM] Used (nvidia-smi): 14979.32 MB | [System RAM]: 23.55 GB
DKL Rep 2: MSE=0.7973, Time=5.45s, RAM Δ=87.16MB, VRAM=6784.74MB

=== Replicate 3/11 ===


Train 3: 100%|██████████| 10/10 [00:05<00:00,  1.88it/s, loss=0.995]


[PyTorch] Max Allocated: 6784.74 MB | Max Reserved: 13368.00 MB
[GPU VRAM] Used (nvidia-smi): 14812.01 MB | [System RAM]: 23.48 GB
DKL Rep 3: MSE=0.3661, Time=5.32s, RAM Δ=-62.34MB, VRAM=6784.74MB

=== Replicate 4/11 ===


Train 4: 100%|██████████| 10/10 [00:05<00:00,  1.88it/s, loss=0.997]


[PyTorch] Max Allocated: 6785.18 MB | Max Reserved: 13092.00 MB
[GPU VRAM] Used (nvidia-smi): 14537.60 MB | [System RAM]: 23.47 GB
DKL Rep 4: MSE=0.6384, Time=5.32s, RAM Δ=35.53MB, VRAM=6785.18MB

=== Replicate 5/11 ===


Train 5: 100%|██████████| 10/10 [00:05<00:00,  1.88it/s, loss=0.994]


[PyTorch] Max Allocated: 6783.89 MB | Max Reserved: 13092.00 MB
[GPU VRAM] Used (nvidia-smi): 14551.52 MB | [System RAM]: 23.50 GB
DKL Rep 5: MSE=1.2815, Time=5.32s, RAM Δ=8.20MB, VRAM=6783.89MB

=== Replicate 6/11 ===


Train 6: 100%|██████████| 10/10 [00:05<00:00,  1.86it/s, loss=0.993]


[PyTorch] Max Allocated: 6777.20 MB | Max Reserved: 13460.00 MB
[GPU VRAM] Used (nvidia-smi): 14919.08 MB | [System RAM]: 23.40 GB
DKL Rep 6: MSE=0.5832, Time=5.37s, RAM Δ=-92.55MB, VRAM=6777.20MB

=== Replicate 7/11 ===


Train 7: 100%|██████████| 10/10 [00:05<00:00,  1.94it/s, loss=1.22]


[PyTorch] Max Allocated: 6785.01 MB | Max Reserved: 13460.00 MB
[GPU VRAM] Used (nvidia-smi): 15245.32 MB | [System RAM]: 23.43 GB
DKL Rep 7: MSE=1.2984, Time=5.16s, RAM Δ=16.16MB, VRAM=6785.01MB

=== Replicate 8/11 ===


Train 8: 100%|██████████| 10/10 [00:05<00:00,  1.83it/s, loss=1.01]


[PyTorch] Max Allocated: 6776.99 MB | Max Reserved: 13458.00 MB
[GPU VRAM] Used (nvidia-smi): 15216.32 MB | [System RAM]: 23.41 GB
DKL Rep 8: MSE=1.0244, Time=5.47s, RAM Δ=17.54MB, VRAM=6776.99MB

=== Replicate 9/11 ===


Train 9: 100%|██████████| 10/10 [00:05<00:00,  1.90it/s, loss=0.993]


[PyTorch] Max Allocated: 6782.92 MB | Max Reserved: 13182.00 MB
[GPU VRAM] Used (nvidia-smi): 14950.12 MB | [System RAM]: 23.66 GB
DKL Rep 9: MSE=1.1792, Time=5.26s, RAM Δ=274.23MB, VRAM=6782.92MB

=== Replicate 10/11 ===


Train 10: 100%|██████████| 10/10 [00:05<00:00,  1.90it/s, loss=0.996]


[PyTorch] Max Allocated: 6784.72 MB | Max Reserved: 13460.00 MB
[GPU VRAM] Used (nvidia-smi): 15248.17 MB | [System RAM]: 23.75 GB
DKL Rep 10: MSE=0.5893, Time=5.26s, RAM Δ=66.23MB, VRAM=6784.72MB

=== Replicate 11/11 ===


Train 11: 100%|██████████| 10/10 [00:05<00:00,  1.87it/s, loss=0.971]


[PyTorch] Max Allocated: 6785.19 MB | Max Reserved: 13460.00 MB
[GPU VRAM] Used (nvidia-smi): 15359.80 MB | [System RAM]: 23.71 GB
DKL Rep 11: MSE=1.1768, Time=5.34s, RAM Δ=-3.68MB, VRAM=6785.19MB


In [17]:
print(statistics.mean(mse_l_dkl[1:]))
print(statistics.stdev(mse_l_dkl[1:]))

print(statistics.mean(time_l_dkl[1:]))
print(statistics.stdev(time_l_dkl[1:]))

0.8934573709964753
0.33908235064845654
5.327223777770996
0.09148011857315932


# SVGP CI

In [18]:
# SVGP_CI
max_vram = 0
gc.collect()
if gpu:
    torch.cuda.empty_cache()

In [19]:
import torch
import gpytorch
import tqdm
import time
import gc
import numpy as np
import psutil
from torch.utils.data import DataLoader, TensorDataset


class GPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(0))
        variational_strategy = gpytorch.variational.CiqVariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=2)
        )
        self.covar_module.base_kernel.initialize(lengthscale=0.01)

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


n_replicates = 11
num_epochs = 10#5
batch_size = 3200
random_state = 42
gpu = torch.cuda.is_available()

mse_l_svgpci = []
time_l_svgpci = []
vram_l_svgpci = []
ram_l_svgpci = []


for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")
    train_x, train_y, test_x, test_y = splitter(
        all_x, all_y,
        n_train=400000, n_test=20000,
        random_state=random_state + rep,
        move_to_gpu=gpu
    )

    mem_begin = psutil.virtual_memory().used / (1024 ** 2)
    torch.linspace(train_x.min(), train_x.max(), steps=80).unsqueeze(-1)
    train_dataset = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataset = TensorDataset(test_x, test_y)
    inducing_points = torch.linspace(0.2, 0.8, 100).unsqueeze(-1)
    model = GPModel(inducing_points=inducing_points)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()

    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()

    variational_ngd_optimizer = gpytorch.optim.NGD(
        model.variational_parameters(), num_data=train_y.size(0), lr=0.1
    )
    hyperparameter_optimizer = torch.optim.Adam([
        {'params': model.hyperparameters()},
        {'params': likelihood.parameters()},
    ], lr=0.002)

    model.train()
    likelihood.train()
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    epochs_iter = tqdm.tqdm(range(num_epochs), desc=f"Epoch {rep + 1}")
    for epoch in epochs_iter:
        minibatch_iter = tqdm.tqdm(train_loader, desc="Minibatch", leave=False, position=0)
        for x_batch, y_batch in minibatch_iter:
            variational_ngd_optimizer.zero_grad()
            hyperparameter_optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            variational_ngd_optimizer.step()
            hyperparameter_optimizer.step()
            minibatch_iter.set_postfix(loss=loss.item())
    elapsed = time.time() - start
    mem_end = psutil.virtual_memory().used / (1024 ** 2)
    delta_ram = mem_end - mem_begin
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))
    means = observed_pred.mean.cpu()
    mse = torch.mean((means - test_y.cpu()) ** 2).item()
    peak_alloc, peak_reserved, gpu_used, sys_used = log_memory()
    mse_l_svgpci.append(mse)
    time_l_svgpci.append(elapsed)
    vram_l_svgpci.append(peak_alloc)
    ram_l_svgpci.append(delta_ram)
    print(f"SVGP-CIQ Rep {rep + 1}: MSE={mse:.4f}, Time={elapsed:.2f}s, RAM Δ={delta_ram:.2f}MB, VRAM={peak_alloc:.2f}MB")
    del model, likelihood
    gc.collect()
    if gpu:
        torch.cuda.empty_cache()



=== Replicate 1/11 ===


Epoch 1: 100%|██████████| 10/10 [01:44<00:00, 10.41s/it]                


[PyTorch] Max Allocated: 734.46 MB | Max Reserved: 784.00 MB
[GPU VRAM] Used (nvidia-smi): 2548.07 MB | [System RAM]: 23.88 GB
SVGP-CIQ Rep 1: MSE=0.3011, Time=104.13s, RAM Δ=185.08MB, VRAM=734.46MB

=== Replicate 2/11 ===


Epoch 2: 100%|██████████| 10/10 [01:44<00:00, 10.45s/it]                


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 898.00 MB
[GPU VRAM] Used (nvidia-smi): 2520.16 MB | [System RAM]: 23.90 GB
SVGP-CIQ Rep 2: MSE=0.3024, Time=104.52s, RAM Δ=22.14MB, VRAM=731.37MB

=== Replicate 3/11 ===


Epoch 3: 100%|██████████| 10/10 [01:45<00:00, 10.59s/it]                


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2462.62 MB | [System RAM]: 23.87 GB
SVGP-CIQ Rep 3: MSE=0.2986, Time=105.87s, RAM Δ=-27.45MB, VRAM=731.37MB

=== Replicate 4/11 ===


Epoch 4: 100%|██████████| 10/10 [01:42<00:00, 10.27s/it]                


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2370.94 MB | [System RAM]: 23.96 GB
SVGP-CIQ Rep 4: MSE=0.3028, Time=102.73s, RAM Δ=90.41MB, VRAM=731.37MB

=== Replicate 5/11 ===


Epoch 5: 100%|██████████| 10/10 [01:42<00:00, 10.29s/it]                


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2453.15 MB | [System RAM]: 23.94 GB
SVGP-CIQ Rep 5: MSE=0.3057, Time=102.91s, RAM Δ=-4.32MB, VRAM=731.37MB

=== Replicate 6/11 ===


Epoch 6: 100%|██████████| 10/10 [01:42<00:00, 10.26s/it]                


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2328.50 MB | [System RAM]: 23.95 GB
SVGP-CIQ Rep 6: MSE=0.3074, Time=102.59s, RAM Δ=6.32MB, VRAM=731.37MB

=== Replicate 7/11 ===


Epoch 7: 100%|██████████| 10/10 [01:50<00:00, 11.03s/it]                


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2321.69 MB | [System RAM]: 23.95 GB
SVGP-CIQ Rep 7: MSE=0.3050, Time=110.27s, RAM Δ=-1.09MB, VRAM=731.37MB

=== Replicate 8/11 ===


Epoch 8: 100%|██████████| 10/10 [01:43<00:00, 10.32s/it]                


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2321.88 MB | [System RAM]: 23.97 GB
SVGP-CIQ Rep 8: MSE=0.3022, Time=103.23s, RAM Δ=31.02MB, VRAM=731.37MB

=== Replicate 9/11 ===


Epoch 9: 100%|██████████| 10/10 [01:43<00:00, 10.32s/it]                


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2318.62 MB | [System RAM]: 23.99 GB
SVGP-CIQ Rep 9: MSE=0.3050, Time=103.19s, RAM Δ=21.62MB, VRAM=731.37MB

=== Replicate 10/11 ===


Epoch 10: 100%|██████████| 10/10 [01:48<00:00, 10.81s/it]               


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2320.75 MB | [System RAM]: 24.34 GB
SVGP-CIQ Rep 10: MSE=0.3042, Time=108.09s, RAM Δ=358.32MB, VRAM=731.37MB

=== Replicate 11/11 ===


Epoch 11: 100%|██████████| 10/10 [01:43<00:00, 10.32s/it]               


[PyTorch] Max Allocated: 731.37 MB | Max Reserved: 896.00 MB
[GPU VRAM] Used (nvidia-smi): 2316.25 MB | [System RAM]: 24.44 GB
SVGP-CIQ Rep 11: MSE=0.3020, Time=103.18s, RAM Δ=97.77MB, VRAM=731.37MB


In [20]:
print(statistics.mean(mse_l_svgpci[1:]))
print(statistics.stdev(mse_l_svgpci[1:]))

print(statistics.mean(time_l_svgpci[1:]))
print(statistics.stdev(time_l_svgpci[1:]))

0.3035478860139847
0.002477861637851579
104.6592376947403
2.6280065018581458


# NGD

In [23]:
# NGD
max_vram = 0
gc.collect()
torch.cuda.empty_cache()

In [24]:
import torch
import gpytorch
import tqdm
import time
import gc
import numpy as np
import psutil
from torch.utils.data import TensorDataset, DataLoader



class GPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(0))
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=False
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5))

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

n_replicates = 11
num_epochs = 10#20#60
batch_size = 3200
random_state = 42
gpu = torch.cuda.is_available()

mse_l_ngd = []
time_l_ngd = []
vram_l_ngd = []
ram_l_ngd = []


for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")
    train_x, train_y, test_x, test_y = splitter(
        all_x, all_y,
        n_train=400000, n_test=20000,
        random_state=random_state + rep,
        move_to_gpu=gpu
    )
    mem_begin = psutil.virtual_memory().used / (1024 ** 2)
    inducing_points = torch.linspace(train_x.min(), train_x.max(), steps=50).unsqueeze(-1)
    train_dataset = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataset = TensorDataset(test_x, test_y)
    model = GPModel(inducing_points=inducing_points)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()
    variational_ngd_optimizer = gpytorch.optim.NGD(
        model.variational_parameters(), num_data=train_y.size(0), lr=0.001
    )
    hyperparameter_optimizer = torch.optim.Adam([
        {'params': model.hyperparameters()},
        {'params': likelihood.parameters()},
    ], lr=0.1)
    model.train()
    likelihood.train()
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    epochs_iter = tqdm.tqdm(range(num_epochs), desc=f"Epoch {rep + 1}", leave=False, position=0)
    for epoch in epochs_iter:
        minibatch_iter = tqdm.tqdm(train_loader, desc="Minibatch", leave=False, position=0)
        for x_batch, y_batch in minibatch_iter:
            variational_ngd_optimizer.zero_grad()
            hyperparameter_optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            variational_ngd_optimizer.step()
            hyperparameter_optimizer.step()
            minibatch_iter.set_postfix(loss=loss.item())
    elapsed = time.time() - start

    # ---- RAM after ----
    mem_end = psutil.virtual_memory().used / (1024 ** 2)
    delta_ram = mem_end - mem_begin

    # ---- Evaluate ----
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))
    means = observed_pred.mean.cpu()
    mse = torch.mean((means - test_y.cpu()) ** 2).item()

    # ---- Log Resources ----
    peak_alloc, peak_reserved, gpu_used, sys_used = log_memory()

    mse_l_ngd.append(mse)
    time_l_ngd.append(elapsed)
    vram_l_ngd.append(peak_alloc)
    ram_l_ngd.append(delta_ram)

    print(f"SVGP-NGD Rep {rep + 1}: MSE={mse:.4f}, Time={elapsed:.2f}s, RAM Δ={delta_ram:.2f}MB, VRAM={peak_alloc:.2f}MB")

    # ---- Cleanup ----
    del model, likelihood
    gc.collect()
    if gpu:
        torch.cuda.empty_cache()



=== Replicate 1/11 ===


                                                                        

[PyTorch] Max Allocated: 45.05 MB | Max Reserved: 62.00 MB
[GPU VRAM] Used (nvidia-smi): 1570.41 MB | [System RAM]: 24.37 GB
SVGP-NGD Rep 1: MSE=0.3027, Time=34.03s, RAM Δ=1.39MB, VRAM=45.05MB

=== Replicate 2/11 ===


                                                                        

[PyTorch] Max Allocated: 53.14 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1582.47 MB | [System RAM]: 24.36 GB
SVGP-NGD Rep 2: MSE=0.3007, Time=37.76s, RAM Δ=-13.51MB, VRAM=53.14MB

=== Replicate 3/11 ===


                                                                        

[PyTorch] Max Allocated: 53.53 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1582.03 MB | [System RAM]: 24.35 GB
SVGP-NGD Rep 3: MSE=0.3006, Time=35.54s, RAM Δ=31.96MB, VRAM=53.53MB

=== Replicate 4/11 ===


                                                                        

[PyTorch] Max Allocated: 53.14 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1577.52 MB | [System RAM]: 24.31 GB
SVGP-NGD Rep 4: MSE=0.3035, Time=35.33s, RAM Δ=-46.11MB, VRAM=53.14MB

=== Replicate 5/11 ===


                                                                        

[PyTorch] Max Allocated: 53.53 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1615.95 MB | [System RAM]: 24.38 GB
SVGP-NGD Rep 5: MSE=0.3092, Time=34.29s, RAM Δ=67.24MB, VRAM=53.53MB

=== Replicate 6/11 ===


                                                                        

[PyTorch] Max Allocated: 53.14 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1593.12 MB | [System RAM]: 24.36 GB
SVGP-NGD Rep 6: MSE=0.3088, Time=35.22s, RAM Δ=-17.62MB, VRAM=53.14MB

=== Replicate 7/11 ===


                                                                        

[PyTorch] Max Allocated: 53.53 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1604.34 MB | [System RAM]: 24.40 GB
SVGP-NGD Rep 7: MSE=0.3037, Time=33.68s, RAM Δ=35.67MB, VRAM=53.53MB

=== Replicate 8/11 ===


                                                                        

[PyTorch] Max Allocated: 53.14 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1900.88 MB | [System RAM]: 24.66 GB
SVGP-NGD Rep 8: MSE=0.3033, Time=32.68s, RAM Δ=263.70MB, VRAM=53.14MB

=== Replicate 9/11 ===


                                                                        

[PyTorch] Max Allocated: 53.53 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1900.73 MB | [System RAM]: 24.62 GB
SVGP-NGD Rep 9: MSE=0.3075, Time=34.49s, RAM Δ=-31.50MB, VRAM=53.53MB

=== Replicate 10/11 ===


                                                                        

[PyTorch] Max Allocated: 53.14 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1887.16 MB | [System RAM]: 24.63 GB
SVGP-NGD Rep 10: MSE=0.3056, Time=33.23s, RAM Δ=2.68MB, VRAM=53.14MB

=== Replicate 11/11 ===


                                                                        

[PyTorch] Max Allocated: 53.53 MB | Max Reserved: 82.00 MB
[GPU VRAM] Used (nvidia-smi): 1896.39 MB | [System RAM]: 24.52 GB
SVGP-NGD Rep 11: MSE=0.3020, Time=34.21s, RAM Δ=-124.29MB, VRAM=53.53MB


In [25]:
print(statistics.mean(mse_l_ngd[1:]))
print(statistics.stdev(mse_l_ngd[1:]))

print(statistics.mean(time_l_ngd[1:]))
print(statistics.stdev(time_l_ngd[1:]))

0.30449849665164946
0.0031727239269797457
34.64251048564911
1.4321243837925521


# VNN

In [None]:
# VNN
max_vram = 0
gc.collect()
if gpu:
    torch.cuda.empty_cache()

In [None]:
import faiss
import gc
import time
import statistics
import torch
import gpytorch
import faiss
from torch.utils.data import TensorDataset, DataLoader
import tqdm
from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy
class GPModel(ApproximateGP):
    def __init__(self, inducing_points, likelihood, k=256, training_batch_size=256):
        m, d = inducing_points.shape
        self.m = m
        self.k = k
        variational_distribution = gpytorch.variational.MeanFieldVariationalDistribution(m)
        if gpu:
            inducing_points = inducing_points.cuda()
        variational_strategy = NNVariationalStrategy(
            self, inducing_points, variational_distribution,
            k=k, training_batch_size=training_batch_size
        )
        super(GPModel, self).__init__(variational_strategy)
        self.mean_module = gpytorch.means.ZeroMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=d)
        )
        self.likelihood = likelihood

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        vram_usage()  # assumed to be defined elsewhere
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

    def __call__(self, x, prior=False, **kwargs):
        if x is not None:
            if x.dim() == 1:
                x = x.unsqueeze(-1)
        return self.variational_strategy(x=x, prior=False, **kwargs)


n_replicates = 11
num_epochs = 10#10#30
random_state = 42
gpu = torch.cuda.is_available()

if False:
    k = 32
    training_batch_size = 32
else:
    k = 160#320
    training_batch_size = 320 * 4

mse_l_vnn = []
time_l_vnn = []
vram_l_vnn = []
ram_l_vnn = []


for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")

    train_x, train_y, test_x, test_y = splitter(
        all_x, all_y,
        n_train=400000, n_test=20000,
        random_state=random_state + rep,
        move_to_gpu=gpu
    )

    batch_size = 32
    train_dataset = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataset = TensorDataset(test_x, test_y)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


    mem_begin = psutil.virtual_memory().used / (1024 ** 2)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPModel(
        inducing_points=train_x[::1].contiguous(),
        likelihood=likelihood,
        k=k,
        training_batch_size=training_batch_size
    )

    if gpu:
        likelihood = likelihood.cuda()
        model = model.cuda()

    num_batches = model.variational_strategy._total_training_batches

    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.02)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    epochs_iter = tqdm.tqdm(range(num_epochs), desc=f"Epoch {rep + 1}")
    for epoch in epochs_iter:
        minibatch_iter = tqdm.tqdm(range(num_batches), leave=False, position=0)
        for batch_idx in minibatch_iter:
            optimizer.zero_grad()
            output = model(x=None)
            current_indices = model.variational_strategy.current_training_indices
            y_batch = train_y[..., current_indices]
            if gpu:
                y_batch = y_batch.cuda()
            loss = -mll(output, y_batch)
            loss.backward()
            optimizer.step()
            minibatch_iter.set_postfix(loss=loss.item())
    elapsed = time.time() - start


    mem_end = psutil.virtual_memory().used / (1024 ** 2)
    delta_ram = mem_end - mem_begin

    model.eval()
    likelihood.eval()
    means = torch.tensor([0.])
    with torch.no_grad():
        for x_batch, _ in test_loader:
            preds = model(x_batch)
            means = torch.cat([means, preds.mean.cpu()])
    means = means[1:]
    mse = torch.mean((means - test_y.cpu()) ** 2).item()

    peak_alloc, peak_reserved, gpu_used, sys_used = log_memory()
    mse_l_vnn.append(mse)
    time_l_vnn.append(elapsed)
    vram_l_vnn.append(peak_alloc)
    ram_l_vnn.append(delta_ram)

    print(f"VNN Rep {rep + 1}: MSE={mse:.4f}, Time={elapsed:.2f}s, RAM Δ={delta_ram:.2f}MB, VRAM={peak_alloc:.2f}MB")
    del model, likelihood
    gc.collect()
    if gpu:
        torch.cuda.empty_cache()


In [None]:
print(statistics.mean(mse_l_vnn[1:]))
print(statistics.stdev(mse_l_vnn[1:]))

print(statistics.mean(time_l_vnn[1:]))
print(statistics.stdev(time_l_vnn[1:]))

print("VNN     --- MSE:", round(statistics.mean(mse_l_vnn[1:]), 4), "(", round(statistics.stdev(mse_l_vnn[1:]), 4), ")  Time:", round(statistics.mean(time_l_vnn[1:]), 4), "(", round(statistics.stdev(time_l_vnn[1:]), 4), ")")

# Compile Table (MSE and Time only)
Order:
SKI
SGPR
LOVE
DKL
SVGP-CI
SVGP
NGD
VNN



In [None]:
print("SKI     --- MSE:", statistics.mean(mse_l_ski[1:]), "(", statistics.stdev(mse_l_ski[1:]), ")  Time:", statistics.mean(time_l_ski[1:]), "(", statistics.stdev(time_l_ski[1:]), ")")
print("SGPR    --- MSE:", statistics.mean(mse_l_sgpr[1:]), "(", statistics.stdev(mse_l_sgpr[1:]), ")  Time:", statistics.mean(time_l_sgpr[1:]), "(", statistics.stdev(time_l_sgpr[1:]), ")")
print("LOVE    --- MSE:", statistics.mean(mse_l_love[1:]), "(", statistics.stdev(mse_l_love[1:]), ")  Time:", statistics.mean(time_l_love[1:]), "(", statistics.stdev(time_l_love[1:]), ")")
print("DKL     --- MSE:", statistics.mean(mse_l_dkl[1:]), "(", statistics.stdev(mse_l_dkl[1:]), ")  Time:", statistics.mean(time_l_dkl[1:]), "(", statistics.stdev(time_l_dkl[1:]), ")")
print("SVGP-CI --- MSE:", statistics.mean(mse_l_svgpci[1:]), "(", statistics.stdev(mse_l_svgpci[1:]), ")  Time:", statistics.mean(time_l_svgpci[1:]), "(", statistics.stdev(time_l_svgpci[1:]), ")")
print("SVGP    --- MSE:", statistics.mean(mse_l_svgp[1:]), "(", statistics.stdev(mse_l_svgp[1:]), ")  Time:", statistics.mean(time_l_svgp[1:]), "(", statistics.stdev(time_l_svgp[1:]), ")")
print("NGD     --- MSE:", statistics.mean(mse_l_ngd[1:]), "(", statistics.stdev(mse_l_ngd[1:]), ")  Time:", statistics.mean(time_l_ngd[1:]), "(", statistics.stdev(time_l_ngd[1:]), ")")
print("VNN     --- MSE:", statistics.mean(mse_l_vnn[1:]), "(", statistics.stdev(mse_l_vnn[1:]), ")  Time:", statistics.mean(time_l_vnn[1:]), "(", statistics.stdev(time_l_vnn[1:]), ")")


Reordering to match the table in the paper:
SVGP
SVGP-CI
VNN
NGD
DKL
SGPR
SKI
LOVE

In [None]:
print("SVGP    --- MSE:", statistics.mean(mse_l_svgp[1:]), "(", statistics.stdev(mse_l_svgp[1:]), ")  Time:", statistics.mean(time_l_svgp[1:]), "(", statistics.stdev(time_l_svgp[1:]), ")")
print("SVGP-CI --- MSE:", statistics.mean(mse_l_svgpci[1:]), "(", statistics.stdev(mse_l_svgpci[1:]), ")  Time:", statistics.mean(time_l_svgpci[1:]), "(", statistics.stdev(time_l_svgpci[1:]), ")")
print("VNN     --- MSE:", statistics.mean(mse_l_vnn[1:]), "(", statistics.stdev(mse_l_vnn[1:]), ")  Time:", statistics.mean(time_l_vnn[1:]), "(", statistics.stdev(time_l_vnn[1:]), ")")
print("NGD     --- MSE:", statistics.mean(mse_l_ngd[1:]), "(", statistics.stdev(mse_l_ngd[1:]), ")  Time:", statistics.mean(time_l_ngd[1:]), "(", statistics.stdev(time_l_ngd[1:]), ")")
print("DKL     --- MSE:", statistics.mean(mse_l_dkl[1:]), "(", statistics.stdev(mse_l_dkl[1:]), ")  Time:", statistics.mean(time_l_dkl[1:]), "(", statistics.stdev(time_l_dkl[1:]), ")")
print("SGPR    --- MSE:", statistics.mean(mse_l_sgpr[1:]), "(", statistics.stdev(mse_l_sgpr[1:]), ")  Time:", statistics.mean(time_l_sgpr[1:]), "(", statistics.stdev(time_l_sgpr[1:]), ")")
print("SKI     --- MSE:", statistics.mean(mse_l_ski[1:]), "(", statistics.stdev(mse_l_ski[1:]), ")  Time:", statistics.mean(time_l_ski[1:]), "(", statistics.stdev(time_l_ski[1:]), ")")
print("LOVE    --- MSE:", statistics.mean(mse_l_love[1:]), "(", statistics.stdev(mse_l_love[1:]), ")  Time:", statistics.mean(time_l_love[1:]), "(", statistics.stdev(time_l_love[1:]), ")")

In [None]:
print("SVGP    --- MSE:", round(statistics.mean(mse_l_svgp[1:]), 4), "(", round(statistics.stdev(mse_l_svgp[1:]), 4), ")  Time:", round(statistics.mean(time_l_svgp[1:]), 4), "(", round(statistics.stdev(time_l_svgp[1:]), 4), ")")
print("SVGP-CI --- MSE:", round(statistics.mean(mse_l_svgpci[1:]), 4), "(", round(statistics.stdev(mse_l_svgpci[1:]), 4), ")  Time:", round(statistics.mean(time_l_svgpci[1:]), 4), "(", round(statistics.stdev(time_l_svgpci[1:]), 4), ")")
#print("VNN     --- MSE:", round(statistics.mean(mse_l_vnn[1:]), 4), "(", round(statistics.stdev(mse_l_vnn[1:]), 4), ")  Time:", round(statistics.mean(time_l_vnn[1:]), 4), "(", round(statistics.stdev(time_l_vnn[1:]), 4), ")")
print("NGD     --- MSE:", round(statistics.mean(mse_l_ngd[1:]), 4), "(", round(statistics.stdev(mse_l_ngd[1:]), 4), ")  Time:", round(statistics.mean(time_l_ngd[1:]), 4), "(", round(statistics.stdev(time_l_ngd[1:]), 4), ")")
print("DKL     --- MSE:", round(statistics.mean(mse_l_dkl[1:]), 4), "(", round(statistics.stdev(mse_l_dkl[1:]), 4), ")  Time:", round(statistics.mean(time_l_dkl[1:]), 4), "(", round(statistics.stdev(time_l_dkl[1:]), 4), ")")
print("SGPR    --- MSE:", round(statistics.mean(mse_l_sgpr[1:]), 4), "(", round(statistics.stdev(mse_l_sgpr[1:]), 4), ")  Time:", round(statistics.mean(time_l_sgpr[1:]), 4), "(", round(statistics.stdev(time_l_sgpr[1:]), 4), ")")
print("SKI     --- MSE:", round(statistics.mean(mse_l_ski[1:]), 4), "(", round(statistics.stdev(mse_l_ski[1:]), 4), ")  Time:", round(statistics.mean(time_l_ski[1:]), 4), "(", round(statistics.stdev(time_l_ski[1:]), 4), ")")
print("LOVE    --- MSE:", round(statistics.mean(mse_l_love[1:]), 4), "(", round(statistics.stdev(mse_l_love[1:]), 4), ")  Time:", round(statistics.mean(time_l_love[1:]), 4), "(", round(statistics.stdev(time_l_love[1:]), 4), ")")


In [None]:
import statistics


methods = [
    ("SVGP",   mse_l_svgp,   time_l_svgp),
    ("SVGP-CI",mse_l_svgpci, time_l_svgpci),
    ("NGD",    mse_l_ngd,    time_l_ngd),
    ("DKL",    mse_l_dkl,    time_l_dkl),
    ("SGPR",   mse_l_sgpr,   time_l_sgpr),
    ("SKI",    mse_l_ski,    time_l_ski),
    ("LOVE",   mse_l_love,   time_l_love),
]

for name, mse_list, time_list in methods:
    data_mse  = mse_list[1:]
    data_time = time_list[1:]
    mean_mse  = statistics.mean(data_mse)
    sd_mse    = statistics.stdev(data_mse)
    mean_time = statistics.mean(data_time)
    sd_time   = statistics.stdev(data_time)
    print(f"{name:<8} & {mean_mse:.4f}  & ({sd_mse:.4f} )  & {mean_time:.4f}  & ({sd_time:.4f} )")
