# Important Note
RAM and VRAM measurements are dependent on the computer state, and should only be interpreted relative to each other. In order to obtain RAM and VRAM measurements, perform the following steps:

1 - Restart the Kernel

2 - Run the "Loading Required Packages and Helper Functions" cell

3 - Run the "Loading Data" cell

4 - Run ONLY ONE iteration of the desired method, and read the RAM and VRAM usage reports printed by the cell

# Loading Required Packages and Helper Functions
If you would like to use CUDA, set gpu = True. Otherwise, set gpu = False.

Step 1: Run the following cell to import the required packages and helper functions. Set the number of replicates desired.

Step 2: Load the Data

Step 3: Execute the cells under the method you wish to replicate.

# Step 1

In [1]:
# When True, the data tensors and every model/likelihood below are moved to CUDA.
gpu = True
# Number of independent train/evaluate repetitions executed for each method.
n_replicates = 2

In [2]:
import math
import torch
import gpytorch
import time
from matplotlib import pyplot as plt
import gc
import statistics
import numpy as np
import tqdm
import psutil

from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy
# Make plots inline

%matplotlib inline
%load_ext autoreload
%autoreload 2

def get_mem():
    """Return the resident set size (RSS) reported by psutil for this process."""
    current_pid = os.getpid()
    return psutil.Process(current_pid).memory_info().rss

max_vram = 0
def vram_usage():
    """Raise the global ``max_vram`` to the currently allocated CUDA memory if that is larger."""
    global max_vram
    allocated_now = torch.cuda.memory_allocated()
    if allocated_now > max_vram:
        max_vram = allocated_now

from torch.utils.data import TensorDataset, DataLoader



import urllib.request
import os
import pandas as pd
from scipy.io import loadmat
from math import floor

from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.mlls import DeepApproximateMLL

from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel, InducingPointKernel
from gpytorch.distributions import MultivariateNormal
from torch.utils.data import TensorDataset, DataLoader

# Running high-water marks; the cells below update them while training.
max_vram, max_ram = 0, 0


def vram_usage():
    """Record in the global ``max_vram`` the largest CUDA allocation observed so far."""
    global max_vram
    max_vram = max(torch.cuda.memory_allocated(), max_vram)


# Make plots inline
%matplotlib inline


from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy
import faiss



# Loading Data
Step 2: Load the data (note: you must run the DataGenerator.Rmd file first).

In [None]:

# Resident memory of the process (bytes scaled by 1024**2) before any data is loaded.
print(get_mem()/(1024**2))
# this is for running the notebook in our testing framework
smoke_test = ('CI' in os.environ)


def _load_inputs(path):
    """Read a header-less, space-delimited CSV of floats into a contiguous float tensor."""
    frame = pd.read_csv(path, header=None, dtype=float, delimiter=' ')
    return torch.tensor(frame.to_numpy()).float().contiguous()


def _load_targets(path):
    """Read column 0 of a header-less CSV into a contiguous float tensor."""
    frame = pd.read_csv(path, header=None)
    # Convert through NumPy so the column is copied in one shot instead of
    # being walked element by element as a generic Python sequence.
    return torch.tensor(frame[0].to_numpy()).float().contiguous()


train_x = _load_inputs('train_x_2d.csv')
train_y = _load_targets('train_y_2d.csv')
test_x = _load_inputs('test_x_2d.csv')
test_y = _load_targets('test_y_2d.csv')

train_n = len(train_x)

if gpu:
    train_x, train_y, test_x, test_y = train_x.cuda(), train_y.cuda(), test_x.cuda(), test_y.cuda()

In [None]:


# Sanity check: show the shape of every loaded tensor and the input dimensionality.
print(train_x.shape)
print(train_x.size(-1))
for loaded_tensor in (train_y, test_x, test_y):
    print(loaded_tensor.shape)

# Simulations
Step 3: Execute the simulations to be reproduced. If all simulations are run, there is a summarizer at the end. Otherwise, the relevant statistics are printed at the end of each method.

# Deep Kernel

In [None]:
smoke_test = False
my_batch_size = 32

from torch.utils.data import TensorDataset, DataLoader

# Pair inputs with targets; only the training loader shuffles.
train_dataset = TensorDataset(train_x, train_y)
test_dataset = TensorDataset(test_x, test_y)
train_loader = DataLoader(dataset=train_dataset, batch_size=my_batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=my_batch_size, shuffle=False)

# Size of the last input axis; LargeFeatureExtractor below reads this global.
data_dim = train_x.shape[-1]
print(data_dim)

class LargeFeatureExtractor(torch.nn.Sequential):
    """Fully connected ReLU network that maps the inputs to a single feature.

    Layer widths are ``input_dim -> 1000 -> 500 -> 50 -> 1``.

    Args:
        input_dim: Number of input features. When omitted, the module-level
            ``data_dim`` is used, which keeps ``LargeFeatureExtractor()``
            working exactly as before.
    """

    def __init__(self, input_dim=None):
        super(LargeFeatureExtractor, self).__init__()
        if input_dim is None:
            input_dim = data_dim
        self.add_module('linear1', torch.nn.Linear(input_dim, 1000))
        self.add_module('relu1', torch.nn.ReLU())
        self.add_module('linear2', torch.nn.Linear(1000, 500))
        self.add_module('relu2', torch.nn.ReLU())
        self.add_module('linear3', torch.nn.Linear(500, 50))
        self.add_module('relu3', torch.nn.ReLU())
        self.add_module('linear4', torch.nn.Linear(50, 1))

# Module-level extractor instance, built for the current input dimensionality (data_dim).
feature_extractor = LargeFeatureExtractor()


class GPRegressionModel(gpytorch.models.ExactGP):
    """Deep-kernel exact GP: a neural feature extractor feeding a grid-interpolated Matern-1.5 kernel.

    Args:
        train_x: Training inputs handed to ``ExactGP``.
        train_y: Training targets handed to ``ExactGP``.
        likelihood: Likelihood handed to ``ExactGP``.
    """

    def __init__(self, train_x, train_y, likelihood):
        super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5)),
            num_dims=1, grid_size=100
        )
        # Build a fresh extractor per model. Reusing one module-level instance
        # made every replicate continue from the previous replicate's trained
        # weights, so the replicates were not independent.
        self.feature_extractor = LargeFeatureExtractor()

        # This module will scale the NN features so that they're nice values
        self.scale_to_bounds = gpytorch.utils.grid.ScaleToBounds(-1., 1.)

    def forward(self, x):
        """Return the GP prior ``MultivariateNormal`` over the extracted, rescaled features of ``x``."""
        # We're first putting our data through a deep net (feature extractor)
        projected_x = self.feature_extractor(x)
        projected_x = self.scale_to_bounds(projected_x)  # Make the NN values "nice"

        mean_x = self.mean_module(projected_x)
        covar_x = self.covar_module(projected_x)
        # Sample the CUDA allocation while the kernel objects are alive.
        vram_usage()
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


mse_l_dkl = []
time_l_dkl = []


def _mean_and_stdev(values):
    """Return ``(mean, stdev)`` of ``values``; the stdev is NaN for fewer than two values."""
    # statistics.stdev raises StatisticsError on a single sample, which is the
    # n_replicates = 1 setting used for the RAM/VRAM measurements.
    spread = statistics.stdev(values) if len(values) > 1 else float('nan')
    return statistics.mean(values), spread


for replicate in range(n_replicates):
    mem_begin = get_mem()
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)

    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()

    training_iterations = 60

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # Use the adam optimizer
    optimizer = torch.optim.Adam([
        {'params': model.feature_extractor.parameters()},
        {'params': model.covar_module.parameters()},
        {'params': model.mean_module.parameters()},
        {'params': model.likelihood.parameters()},
    ], lr=0.02)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    def train():
        """Run ``training_iterations`` full-batch Adam steps on the negative marginal log likelihood."""
        iterator = tqdm.tqdm(range(training_iterations), leave=True)
        for _ in iterator:
            # Zero backprop gradients
            optimizer.zero_grad()
            # Get output from model
            output = model(train_x)
            # Calc loss and backprop derivatives
            loss = -mll(output, train_y)
            loss.backward()
            iterator.set_postfix(loss=loss.item())
            vram_usage()
            optimizer.step()

    begin = time.time()
    train()
    uTime = time.time() - begin
    print(uTime)
    mem_diff = get_mem() - mem_begin
    print("Memory Usage:", mem_diff / (1024 ** 2), "MB")
    print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")

    model.eval()
    likelihood.eval()

    # test_x was already moved to the GPU at load time when gpu is set, so no
    # per-device branch is needed here.
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))

    means = observed_pred.mean.cpu()
    residuals = means - test_y.cpu()
    MSE = torch.mean(residuals * residuals)
    mse_l_dkl.append(MSE.item())
    time_l_dkl.append(uTime)
    # The quantity is a mean *squared* error; the old label said "MAE".
    print('Test MSE: {}'.format(MSE))


mse_mean, mse_sd = _mean_and_stdev(mse_l_dkl)
time_mean, time_sd = _mean_and_stdev(time_l_dkl)
print(round(mse_mean, 5), round(mse_sd, 5), round(time_mean, 5), round(time_sd, 5))

# Sparse GP

In [None]:
class GPRegressionModel(gpytorch.models.ExactGP):
    """Exact GP whose scaled Matern-1.5 kernel is wrapped in an ``InducingPointKernel``.

    Every 300th training input (cloned) initialises the inducing points.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = ConstantMean()
        matern = gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=2)
        self.base_covar_module = ScaleKernel(matern)
        inducing_init = train_x[::300].clone()
        self.covar_module = InducingPointKernel(
            self.base_covar_module,
            inducing_points=inducing_init,
            likelihood=likelihood,
        )

    def forward(self, x):
        """Return the ``MultivariateNormal`` built from the mean and covariance modules at ``x``."""
        return MultivariateNormal(self.mean_module(x), self.covar_module(x))

my_batch_size = 320
smoke_test = False

from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size=my_batch_size, shuffle=True)

test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=my_batch_size, shuffle=False)


mse_l_sgpr = []
time_l_sgpr = []


def _mean_and_stdev(values):
    """Return ``(mean, stdev)`` of ``values``; the stdev is NaN for fewer than two values."""
    # statistics.stdev raises StatisticsError on a single sample, which is the
    # n_replicates = 1 setting used for the RAM/VRAM measurements.
    spread = statistics.stdev(values) if len(values) > 1 else float('nan')
    return statistics.mean(values), spread


for replicate in range(n_replicates):
    mem_begin = get_mem()

    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)

    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()

    # NOTE(review): the original carried a trailing "#350" here; confirm
    # whether a single iteration is really the intended setting.
    training_iterations = 1

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # Use the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    def train():
        """Run ``training_iterations`` full-batch Adam steps on the negative marginal log likelihood."""
        iterator = tqdm.tqdm(range(training_iterations), desc="Train")
        for _ in iterator:
            # Zero backprop gradients
            optimizer.zero_grad()
            # Get output from model
            output = model(train_x)
            # Calc loss and backprop derivatives
            loss = -mll(output, train_y)
            loss.backward()
            iterator.set_postfix(loss=loss.item())
            optimizer.step()
            vram_usage()
            torch.cuda.empty_cache()

    begin = time.time()
    train()
    uTime = time.time() - begin
    print("Time: ", uTime)

    model.eval()
    likelihood.eval()

    # Predict batch by batch and join the pieces once, instead of seeding the
    # result with a dummy zero that has to be sliced off afterwards.
    with torch.no_grad():
        batch_means = [model(x_batch).mean.cpu() for x_batch, _ in test_loader]
    means = torch.cat(batch_means)

    residuals = means - test_y.cpu()
    MSE = torch.mean(residuals * residuals)
    mse_l_sgpr.append(MSE.item())
    time_l_sgpr.append(uTime)
    # The quantity is a mean *squared* error; the old label said "MAE".
    print('Test MSE: {}'.format(MSE))

mse_mean, mse_sd = _mean_and_stdev(mse_l_sgpr)
time_mean, time_sd = _mean_and_stdev(time_l_sgpr)

print(mse_mean)
print(mse_sd)

print(time_mean)
print(time_sd)

print(round(mse_mean, 5), round(mse_sd, 5), round(time_mean, 5), round(time_sd, 5))

# LOVE

In [None]:
smoke_test = False
my_batch_size = 3200

# Training batches are shuffled; evaluation batches keep their order and use
# a fixed size of 320.
train_dataset = TensorDataset(train_x, train_y)
test_dataset = TensorDataset(test_x, test_y)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=my_batch_size)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=320)


class LargeFeatureExtractor(torch.nn.Sequential):
    """Fully connected ReLU network of widths ``input_dim -> 1000 -> 500 -> 50 -> 1``.

    Prints the currently allocated CUDA memory once all layers are registered.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, 1000, 500, 50, 1)
        final_layer = len(widths) - 1
        for idx in range(1, final_layer + 1):
            self.add_module(f'linear{idx}', torch.nn.Linear(widths[idx - 1], widths[idx]))
            # Every linear layer except the last is followed by a ReLU.
            if idx < final_layer:
                self.add_module(f'relu{idx}', torch.nn.ReLU())
        print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")
        


class GPRegressionModel(gpytorch.models.ExactGP):
    """Exact GP over one learned feature, using a grid-interpolated, scaled Matern-1.5 kernel."""

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)

        self.mean_module = gpytorch.means.ConstantMean()
        scaled_matern = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=1)
        )
        self.covar_module = gpytorch.kernels.GridInterpolationKernel(
            scaled_matern, grid_size=100, num_dims=1,
        )

        # The deep net is sized from the last axis of the training inputs.
        self.feature_extractor = LargeFeatureExtractor(input_dim=train_x.size(-1))

    def forward(self, x):
        """Extract features from ``x``, rescale them to [-1, 1] per column, and return the GP distribution."""
        features = self.feature_extractor(x)
        # Min-max rescaling computed from the batch being evaluated.
        shifted = features - features.min(0)[0]
        rescaled = 2 * (shifted / shifted.max(0)[0]) - 1

        mean_x = self.mean_module(rescaled)
        covar_x = self.covar_module(rescaled)
        vram_usage()
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)




mse_l_love = []
time_l_love = []


def _mean_and_stdev(values):
    """Return ``(mean, stdev)`` of ``values``; the stdev is NaN for fewer than two values."""
    # statistics.stdev raises StatisticsError on a single sample, which is the
    # n_replicates = 1 setting used for the RAM/VRAM measurements.
    spread = statistics.stdev(values) if len(values) > 1 else float('nan')
    return statistics.mean(values), spread


for replicate in range(n_replicates):
    mem_begin = get_mem()
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)

    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()

    training_iterations = 40

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # Use the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # Includes GaussianLikelihood parameters

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    def train():
        """Run ``training_iterations`` full-batch Adam steps on the negative marginal log likelihood."""
        iterator = tqdm.tqdm(range(training_iterations))
        for _ in iterator:
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            iterator.set_postfix(loss=loss.item())
            vram_usage()
            optimizer.step()

    begin = time.time()
    train()
    uTime = time.time() - begin
    print("Time: ", uTime)

    model.eval()
    likelihood.eval()

    # test_x was already moved to the GPU at load time when gpu is set, so no
    # per-device branch is needed here.
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))

    means = observed_pred.mean.cpu()
    residuals = means - test_y.cpu()
    MSE = torch.mean(residuals * residuals)
    mse_l_love.append(MSE.item())
    time_l_love.append(uTime)
    # The quantity is a mean *squared* error; the old label said "MAE".
    print('Test MSE: {}'.format(MSE))

mse_mean, mse_sd = _mean_and_stdev(mse_l_love)
time_mean, time_sd = _mean_and_stdev(time_l_love)

print(mse_mean)
print(mse_sd)

print(time_mean)
print(time_sd)

print(round(mse_mean, 5), round(mse_sd, 5), round(time_mean, 5), round(time_sd, 5))

# NGD

In [None]:
from torch.utils.data import TensorDataset, DataLoader

my_batch_size = 320

# Shuffled mini-batches for training, ordered mini-batches for evaluation.
train_dataset = TensorDataset(train_x, train_y)
test_dataset = TensorDataset(test_x, test_y)
train_loader = DataLoader(dataset=train_dataset, batch_size=my_batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=my_batch_size, shuffle=False)

class GPModel(gpytorch.models.ApproximateGP):
    """Variational GP with a natural variational distribution and fixed inducing locations.

    Uses a constant mean and a scaled Matern-1.5 kernel.
    """

    def __init__(self, inducing_points):
        num_inducing = inducing_points.size(0)
        natural_q = gpytorch.variational.NaturalVariationalDistribution(num_inducing)
        strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, natural_q, learn_inducing_locations=False,
        )
        super().__init__(strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        matern = gpytorch.kernels.MaternKernel(nu=1.5)
        self.covar_module = gpytorch.kernels.ScaleKernel(matern)

    def forward(self, x):
        """Return the ``MultivariateNormal`` at ``x`` and sample the CUDA allocation on the way."""
        prior_mean = self.mean_module(x)
        prior_covar = self.covar_module(x)
        vram_usage()
        return gpytorch.distributions.MultivariateNormal(prior_mean, prior_covar)

mse_l_ngd = []
time_l_ngd = []


def _mean_and_stdev(values):
    """Return ``(mean, stdev)`` of ``values``; the stdev is NaN for fewer than two values."""
    # statistics.stdev raises StatisticsError on a single sample, which is the
    # n_replicates = 1 setting used for the RAM/VRAM measurements.
    spread = statistics.stdev(values) if len(values) > 1 else float('nan')
    return statistics.mean(values), spread


for replicate in range(n_replicates):
    mem_begin = get_mem()

    # Every 100th training input serves as a (fixed) inducing point.
    inducing_points = train_x[::100]
    model = GPModel(inducing_points=inducing_points)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()

    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()

    # Natural-gradient steps for the variational parameters, Adam for the rest.
    variational_ngd_optimizer = gpytorch.optim.NGD(model.variational_parameters(), num_data=train_y.size(0), lr=0.01)

    hyperparameter_optimizer = torch.optim.Adam([
        {'params': model.hyperparameters()},
        {'params': likelihood.parameters()},
    ], lr=0.1)
    print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")

    model.train()
    likelihood.train()
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

    print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")
    num_epochs = 5
    epochs_iter = tqdm.tqdm(range(num_epochs), desc="Epoch")

    begin = time.time()

    for epoch in epochs_iter:
        minibatch_iter = tqdm.tqdm(train_loader, desc="Minibatch", leave=False, position=0)
        for x_batch, y_batch in minibatch_iter:
            # One joint step: NGD on the variational parameters, Adam on the hyperparameters.
            variational_ngd_optimizer.zero_grad()
            hyperparameter_optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            minibatch_iter.set_postfix(loss=loss.item())
            loss.backward()
            variational_ngd_optimizer.step()
            hyperparameter_optimizer.step()

    uTime = time.time() - begin
    print("Time: ", uTime)
    mem_diff = get_mem() - mem_begin
    print("Memory Usage:", (mem_diff) / (1024 ** 2), "MB")

    model.eval()
    likelihood.eval()

    # Predict batch by batch and join the pieces once, instead of seeding the
    # result with a dummy zero that has to be sliced off afterwards.
    with torch.no_grad():
        batch_means = [model(x_batch).mean.cpu() for x_batch, _ in test_loader]
    means = torch.cat(batch_means)

    residuals = means - test_y.cpu()
    MSE = torch.mean(residuals * residuals)
    mse_l_ngd.append(MSE.item())
    time_l_ngd.append(uTime)
    # The quantity is a mean *squared* error; the old label said "MAE".
    print('Test MSE: {}'.format(MSE))


mse_mean, mse_sd = _mean_and_stdev(mse_l_ngd)
time_mean, time_sd = _mean_and_stdev(time_l_ngd)

print(mse_mean)
print(mse_sd)

print(time_mean)
print(time_sd)

print(round(mse_mean, 5), round(mse_sd, 5), round(time_mean, 5), round(time_sd, 5))

# SVGP_CI

In [None]:
from torch.utils.data import TensorDataset, DataLoader

smoke_test = False
my_batch_size = 3200

# Shuffled mini-batches for training, ordered mini-batches for evaluation.
train_dataset = TensorDataset(train_x, train_y)
test_dataset = TensorDataset(test_x, test_y)
train_loader = DataLoader(dataset=train_dataset, batch_size=my_batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=my_batch_size, shuffle=False)

# Every 1000th training input initialises the inducing points.
inducing_points = train_x[0::1000]

class GPModel(gpytorch.models.ApproximateGP):
    """Variational GP using ``CiqVariationalStrategy`` with learnable inducing locations.

    Uses a constant mean and a scaled Matern-1.5 kernel with two ARD
    lengthscales, both initialised to 0.01.
    """

    def __init__(self, inducing_points):
        num_inducing = inducing_points.size(0)
        natural_q = gpytorch.variational.NaturalVariationalDistribution(num_inducing)
        ciq_strategy = gpytorch.variational.CiqVariationalStrategy(
            self, inducing_points, natural_q, learn_inducing_locations=True,
        )
        super().__init__(ciq_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        ard_matern = gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=2)
        self.covar_module = gpytorch.kernels.ScaleKernel(ard_matern)
        # NOTE(review): the original comment tied this value to the 3droad
        # dataset; confirm it is also appropriate for the data loaded here.
        self.covar_module.base_kernel.initialize(lengthscale=0.01)
        print("VRAM Usage:", torch.cuda.memory_allocated()/(1024**2) , "MB")

    def forward(self, x):
        """Return the ``MultivariateNormal`` at ``x`` and sample the CUDA allocation on the way."""
        prior_mean = self.mean_module(x)
        prior_covar = self.covar_module(x)
        vram_usage()
        return gpytorch.distributions.MultivariateNormal(prior_mean, prior_covar)



mse_l_svgpci = []
time_l_svgpci = []


def _mean_and_stdev(values):
    """Return ``(mean, stdev)`` of ``values``; the stdev is NaN for fewer than two values."""
    # statistics.stdev raises StatisticsError on a single sample, which is the
    # n_replicates = 1 setting used for the RAM/VRAM measurements.
    spread = statistics.stdev(values) if len(values) > 1 else float('nan')
    return statistics.mean(values), spread


for replicate in range(n_replicates):
    mem_begin = get_mem()

    model = GPModel(inducing_points=inducing_points)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()

    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()

    # Natural-gradient steps for the variational parameters, Adam for the rest.
    variational_ngd_optimizer = gpytorch.optim.NGD(model.variational_parameters(), num_data=train_y.size(0), lr=0.1)

    hyperparameter_optimizer = torch.optim.Adam([
        {'params': model.hyperparameters()},
        {'params': likelihood.parameters()},
    ], lr=0.002)

    model.train()
    likelihood.train()
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

    num_epochs = 10

    begin = time.time()
    epochs_iter = tqdm.tqdm(range(num_epochs), desc="Epoch")
    for epoch in epochs_iter:
        minibatch_iter = tqdm.tqdm(train_loader, desc="Minibatch", leave=False, position=0)

        for x_batch, y_batch in minibatch_iter:
            variational_ngd_optimizer.zero_grad()
            hyperparameter_optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            minibatch_iter.set_postfix(loss=loss.item())
            loss.backward()
            variational_ngd_optimizer.step()
            vram_usage()
            hyperparameter_optimizer.step()

    uTime = time.time() - begin
    print("Time: ", uTime)

    model.eval()
    likelihood.eval()

    # test_x was already moved to the GPU at load time when gpu is set, so no
    # per-device branch is needed here.
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))

    means = observed_pred.mean.cpu()
    residuals = means - test_y.cpu()
    MSE = torch.mean(residuals * residuals)
    mse_l_svgpci.append(MSE.item())
    time_l_svgpci.append(uTime)
    # The quantity is a mean *squared* error; the old label said "MAE".
    print('Test MSE: {}'.format(MSE))

mse_mean, mse_sd = _mean_and_stdev(mse_l_svgpci)
time_mean, time_sd = _mean_and_stdev(time_l_svgpci)

print(mse_mean)
print(mse_sd)

print(time_mean)
print(time_sd)

print(round(mse_mean, 5), round(mse_sd, 5), round(time_mean, 5), round(time_sd, 5))

# SVGP

In [None]:
from torch.utils.data import TensorDataset, DataLoader

my_batch_size = 3200

# Shuffled mini-batches for training, ordered mini-batches for evaluation.
train_dataset = TensorDataset(train_x, train_y)
test_dataset = TensorDataset(test_x, test_y)
train_loader = DataLoader(dataset=train_dataset, batch_size=my_batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=my_batch_size, shuffle=False)

from gpytorch.models import ApproximateGP
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.variational import VariationalStrategy

class GPModel(ApproximateGP):
    """Variational GP with a Cholesky variational distribution and fixed inducing locations.

    Uses a constant mean and a scaled Matern-1.5 kernel.
    """

    def __init__(self, inducing_points):
        num_inducing = inducing_points.size(0)
        cholesky_q = CholeskyVariationalDistribution(num_inducing)
        strategy = VariationalStrategy(
            self,
            inducing_points,
            cholesky_q,
            learn_inducing_locations=False,
        )
        super().__init__(strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        matern = gpytorch.kernels.MaternKernel(nu=1.5)
        self.covar_module = gpytorch.kernels.ScaleKernel(matern)

    def forward(self, x):
        """Return the ``MultivariateNormal`` built from the mean and covariance modules at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )

mse_l_svgp = []
time_l_svgp = []


def _mean_and_stdev(values):
    """Return ``(mean, stdev)`` of ``values``; the stdev is NaN for fewer than two values."""
    # statistics.stdev raises StatisticsError on a single sample, which is the
    # n_replicates = 1 setting used for the RAM/VRAM measurements.
    spread = statistics.stdev(values) if len(values) > 1 else float('nan')
    return statistics.mean(values), spread


for replicate in range(n_replicates):
    print("Replicate:" , replicate)
    mem_begin = get_mem()

    # Every 100th training input serves as a (fixed) inducing point.
    inducing_points = train_x[::100]
    model = GPModel(inducing_points=inducing_points)
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    # Follow the notebook-wide `gpu` switch, as every other cell does. Keying
    # on torch.cuda.is_available() would put the model on the GPU while the
    # data stays on the CPU whenever gpu = False on a CUDA machine.
    if gpu:
        model = model.cuda()
        likelihood = likelihood.cuda()
    mem_diff = get_mem() - mem_begin
    print("RAM: ", mem_diff / (1024 ** 2))

    num_epochs = 20

    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam([
        {'params': model.parameters()},
        {'params': likelihood.parameters()},
    ], lr=0.001)

    # Our loss object. We're using the VariationalELBO. The original then
    # replaced it with ExactMarginalLogLikelihood, so the variational model
    # was never actually trained on the ELBO.
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

    mem_diff = get_mem() - mem_begin
    print("RAM: ", mem_diff / (1024 ** 2))

    begin = time.time()
    for epoch in tqdm.tqdm(range(num_epochs), leave=False, position=0):
        # Within each iteration, we will go over each minibatch of data
        minibatch_iter = tqdm.tqdm(train_loader, desc="Minibatch", leave=False, position=0)
        for x_batch, y_batch in minibatch_iter:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = -mll(output, y_batch)
            loss.backward()
            # Track the peak RSS growth and peak CUDA allocation seen so far.
            max_ram = max(max_ram, (get_mem() - mem_begin))
            optimizer.step()
            if gpu:
                max_vram = max(max_vram, torch.cuda.memory_allocated())
        print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
            epoch + 1, num_epochs, loss.item(),
            model.covar_module.base_kernel.lengthscale.item(),
            likelihood.noise.item()
        ))
    uTime = time.time() - begin
    print("Time: ", uTime)
    print("RAM: ", max_ram / (1024 ** 2))
    print("VRAM: ", max_vram / (1024 ** 2))

    model.eval()
    likelihood.eval()

    # test_x was already moved to the GPU at load time when gpu is set, so no
    # per-device branch is needed here.
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        observed_pred = likelihood(model(test_x))

    means = observed_pred.mean.cpu()
    residuals = means - test_y.cpu()
    MSE = torch.mean(residuals * residuals)
    mse_l_svgp.append(MSE.item())
    time_l_svgp.append(uTime)
    # The quantity is a mean *squared* error; the old label said "MAE".
    print('Test MSE: {}'.format(MSE))

mse_mean, mse_sd = _mean_and_stdev(mse_l_svgp)
time_mean, time_sd = _mean_and_stdev(time_l_svgp)

print(mse_mean)
print(mse_sd)

print(time_mean)
print(time_sd)

print(round(mse_mean, 5), round(mse_sd, 5), round(time_mean, 5), round(time_sd, 5))

# SKI - Can only handle up to 40,000 datapoints before running out of memory

In [11]:

# Drop the references to the previous method's model and likelihood, then
# (in GPU runs) force a garbage-collection pass.
model = likelihood = None

if gpu:
    gc.collect()


In [12]:
# SKI trains on every second observation only.
train_x_ski, train_y_ski = train_x[::2], train_y[::2]

if gpu:
    train_x_ski = train_x_ski.cuda()
    train_y_ski = train_y_ski.cuda()

In [None]:



class GPRegressionModel(gpytorch.models.ExactGP):
    """Exact GP with a scaled, grid-interpolated (SKI) RBF kernel over two input dimensions."""

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)

        # SKI needs a grid size; let the GPyTorch utility pick one from the data.
        grid_size = gpytorch.utils.grid.choose_grid_size(train_x, 1)

        self.mean_module = gpytorch.means.ConstantMean()
        interpolated_rbf = gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.RBFKernel(), grid_size=grid_size, num_dims=2
        )
        self.covar_module = gpytorch.kernels.ScaleKernel(interpolated_rbf)

    def forward(self, x):
        """Return the ``MultivariateNormal`` built from the mean and covariance modules at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )

# this is for running the notebook in our testing framework
import os

# True when the CI environment variable is present.
smoke_test = "CI" in os.environ
training_iterations = 32

# Per-replicate SKI results: test MSE and training time.
mse_l_ski, time_l_ski = [], []


# Train and evaluate the SKI model n_replicates times, recording the test MSE
# and the wall-clock training time of every replicate.

# Number of optimizer steps per replicate (takes precedence over any value
# assigned to training_iterations earlier in the notebook).
training_iterations = 15

for replicate in range(n_replicates):
    mem_begin = get_mem()

    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x_ski, train_y_ski, likelihood)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    if gpu:
        # Move everything to the GPU before the optimizer is created.
        likelihood = likelihood.cuda()
        model = model.cuda()
        mll = mll.cuda()

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # Use the adam optimizer (model.parameters() includes the
    # GaussianLikelihood parameters).
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    begin = time.time()

    # The training loop uses a throw-away index so it does not clobber the
    # replicate counter.
    for _ in tqdm.tqdm(range(training_iterations), desc="Train", leave=False, position=0):
        optimizer.zero_grad()
        if gpu:
            max_vram = max(max_vram, torch.cuda.memory_allocated())
        output = model(train_x_ski)
        loss = -mll(output, train_y_ski)
        loss.backward()
        if gpu:
            max_vram = max(max_vram, torch.cuda.memory_allocated())
        optimizer.step()

    if gpu:
        # CUDA work is queued asynchronously; wait for it to finish so the
        # measured time covers the whole training run.
        torch.cuda.synchronize()
    uTime = time.time() - begin
    print(uTime)
    print("RAM: ", (get_mem() - mem_begin) / (1024 ** 2))
    print("VRAM: ", max_vram / (1024 ** 2))

    # Predict with the trained model. The previous code evaluated the model
    # inside gpytorch.settings.prior_mode(), which returns the GP prior and
    # ignores the training data, so the reported test error did not reflect
    # the fitted model.
    model.eval()
    likelihood.eval()
    test_inputs = test_x.cuda() if gpu else test_x
    with torch.no_grad():
        output = model(test_inputs)
    means = output.mean.cpu()

    residuals = means - test_y.cpu()
    MSE = torch.mean(residuals * residuals)
    mse_l_ski.append(MSE.item())
    time_l_ski.append(uTime)
    # The reported quantity is a mean squared error (it was mislabelled MAE).
    print('Test MSE: {}'.format(MSE))



# Summary of the SKI replicates: mean and sample standard deviation of the
# test MSE and of the training time.
# statistics.stdev raises StatisticsError when given fewer than two values,
# which happens when a single replicate is run (as recommended for the
# RAM/VRAM measurements), so the spread is reported as NaN in that case.
ski_mse_mean = statistics.mean(mse_l_ski)
ski_mse_sd = statistics.stdev(mse_l_ski) if len(mse_l_ski) > 1 else float("nan")
ski_time_mean = statistics.mean(time_l_ski)
ski_time_sd = statistics.stdev(time_l_ski) if len(time_l_ski) > 1 else float("nan")

print(ski_mse_mean)
print(ski_mse_sd)

print(ski_time_mean)
print(ski_time_sd)

# Same four numbers on one line, rounded to 5 decimals:
# MSE mean, MSE stdev, time mean, time stdev.
print(round(ski_mse_mean, 5), round(ski_mse_sd, 5), round(ski_time_mean, 5), round(ski_time_sd, 5))

# VNN

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Mini-batch size shared by the train and test loaders below.
my_batch_size = 32
smoke_test = False

# Pair inputs with targets for both splits.
train_dataset, test_dataset = TensorDataset(train_x, train_y), TensorDataset(test_x, test_y)

# Training batches are shuffled; test batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=my_batch_size)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=my_batch_size)


from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy


class GPModel(ApproximateGP):
    """Approximate GP that uses a nearest-neighbour variational strategy.

    The model has a zero mean and a scaled Matern kernel (nu=1.5) with ARD
    over the ``d`` input dimensions.

    Args:
        inducing_points: 2-D tensor of shape ``(m, d)``; moved to the GPU when
            the notebook-level ``gpu`` flag is set.
        likelihood: Likelihood stored on the model as ``self.likelihood``.
        k: Passed to ``NNVariationalStrategy`` as ``k``.
        training_batch_size: Passed to ``NNVariationalStrategy`` as
            ``training_batch_size``.
    """

    def __init__(self, inducing_points, likelihood, k=256, training_batch_size=256):
        m, d = inducing_points.shape
        self.m = m
        self.k = k

        variational_distribution = gpytorch.variational.MeanFieldVariationalDistribution(m)

        if gpu:
            inducing_points = inducing_points.cuda()

        # The strategy has to exist before the parent constructor runs,
        # because ApproximateGP.__init__ takes it as its argument.
        variational_strategy = NNVariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            k=k,
            training_batch_size=training_batch_size,
        )
        super().__init__(variational_strategy)

        self.mean_module = gpytorch.means.ZeroMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1.5, ard_num_dims=d))

        self.likelihood = likelihood

    def forward(self, x):
        """Return the multivariate normal given by the mean and kernel at ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        # Update the notebook-level peak VRAM tracker.
        vram_usage()
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

    def __call__(self, x, prior=False, **kwargs):
        """Evaluate the variational strategy at ``x``.

        ``x`` may be ``None`` (the training loop calls ``model(x=None)``);
        a 1-D ``x`` is reshaped to a single-feature column. ``prior`` is
        forwarded to the variational strategy (it used to be silently
        replaced by ``False``).
        """
        if x is not None and x.dim() == 1:
            x = x.unsqueeze(-1)
        return self.variational_strategy(x=x, prior=prior, **kwargs)



# Wall-clock reference taken before the VNN replicates are set up.
begin = time.time()

# Neighbour count k and training mini-batch size for the VNN model.
# The smoke-test branch used to be overwritten unconditionally right after
# it was evaluated, so the flag had no effect; the tuned values now live in
# the regular branch instead.
if smoke_test:
    k = 32
    training_batch_size = 32
else:
    k = 160  # 320 is noted as an alternative in the original
    training_batch_size = 320 * 4

# Per-replicate VNN results: test MSE and training time.
mse_l_vnn = []
time_l_vnn = []

# Train and evaluate the VNN model n_replicates times, recording the test MSE
# and the wall-clock training time of every replicate.
for replicate in range(n_replicates):
    print("Replicate: ", replicate)
    mem_begin = get_mem()
    # Separate timer for model construction; reusing `begin` here reported
    # the time since the previous replicate started training.
    setup_begin = time.time()

    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    # Note: one should use full training set as inducing points!
    model = GPModel(
        inducing_points=train_x.contiguous(),
        likelihood=likelihood,
        k=k,
        training_batch_size=training_batch_size,
    )

    if gpu:
        likelihood = likelihood.cuda()
        model = model.cuda()

    print(time.time() - setup_begin)

    num_epochs = 1 if smoke_test else 10  # 20 and 30 are noted as alternatives in the original
    num_batches = model.variational_strategy._total_training_batches

    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

    # NOTE(review): the original keeps
    #   gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
    # commented out and trains with ExactMarginalLogLikelihood instead;
    # confirm that this is the intended objective for this model.
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    begin = time.time()
    epochs_iter = tqdm.tqdm(range(num_epochs), desc="Epoch", leave=True, position=0)
    for epoch in epochs_iter:
        minibatch_iter = tqdm.tqdm(range(num_batches), leave=True, position=0)

        # Throw-away index so the replicate counter is not clobbered.
        for _ in minibatch_iter:
            optimizer.zero_grad()
            output = model(x=None)
            # Obtain the indices for mini-batch data
            current_training_indices = model.variational_strategy.current_training_indices
            # Obtain the y_batch using indices. It is important to keep the
            # same order of train_x and train_y. The indices are moved to
            # train_y's device first so the lookup works wherever train_y lives.
            y_batch = train_y[..., current_training_indices.to(train_y.device)]
            if gpu:
                y_batch = y_batch.cuda()
            loss = -mll(output, y_batch)
            minibatch_iter.set_postfix(loss=loss.item())
            loss.backward()
            vram_usage()
            optimizer.step()

    if gpu:
        # CUDA work is queued asynchronously; wait for it to finish so the
        # measured time covers the whole training run.
        torch.cuda.synchronize()
    uTime = time.time() - begin
    print("Time: ", uTime)
    print("VRAM: ", max_vram / (1024 ** 2))
    print("RAM: ", (get_mem() - mem_begin) / (1024 ** 2))

    model.eval()
    likelihood.eval()
    # Collect the per-batch predictive means and concatenate them once.
    batch_means = []
    with torch.no_grad():
        for x_batch, _ in test_loader:
            if gpu:
                # The model lives on the GPU, so its inputs must too.
                x_batch = x_batch.cuda()
            batch_means.append(model(x_batch).mean.cpu())
    means = torch.cat(batch_means)

    residuals = means - test_y.cpu()
    MSE = torch.mean(residuals * residuals)
    mse_l_vnn.append(MSE.item())
    time_l_vnn.append(uTime)
    # The reported quantity is a mean squared error (it was mislabelled MAE).
    print('Test MSE: {}'.format(MSE))

    # Drop every reference to this replicate's model before the next one is
    # built; output and loss still point at the last autograd graph.
    model = None
    likelihood = None
    mll = None
    optimizer = None
    epochs_iter = None
    output = None
    loss = None
    if gpu:
        gc.collect()



# Summary of the VNN replicates: mean and sample standard deviation of the
# test MSE and of the training time.
# statistics.stdev raises StatisticsError when given fewer than two values,
# which happens when a single replicate is run (as recommended for the
# RAM/VRAM measurements), so the spread is reported as NaN in that case.
vnn_mse_mean = statistics.mean(mse_l_vnn)
vnn_mse_sd = statistics.stdev(mse_l_vnn) if len(mse_l_vnn) > 1 else float("nan")
vnn_time_mean = statistics.mean(time_l_vnn)
vnn_time_sd = statistics.stdev(time_l_vnn) if len(time_l_vnn) > 1 else float("nan")

print(vnn_mse_mean)
print(vnn_mse_sd)

print(vnn_time_mean)
print(vnn_time_sd)

# Same four numbers on one line, rounded to 5 decimals:
# MSE mean, MSE stdev, time mean, time stdev.
print(round(vnn_mse_mean, 5), round(vnn_mse_sd, 5), round(vnn_time_mean, 5), round(vnn_time_sd, 5))

# Compile Table (MSE and Time only)

SKI
SGPR
LOVE
DKL
SVGP-CI
SVGP
NGD
VNN


In [None]:
def _print_summary_row(label, mse_values, time_values):
    """Print one table row: mean (stdev) of the MSE list and of the time list.

    The standard deviation is reported as NaN when a list holds fewer than
    two values, because statistics.stdev raises StatisticsError in that case
    (e.g. when only one replicate was run).
    """
    def _spread(values):
        return statistics.stdev(values) if len(values) > 1 else float("nan")

    # The label is left-aligned to 7 characters so the columns line up.
    print(f"{label:<7} --- MSE:", statistics.mean(mse_values), "(", _spread(mse_values), ")  Time:",
          statistics.mean(time_values), "(", _spread(time_values), ")")


# One row per method, in the order listed above.
_print_summary_row("SKI", mse_l_ski, time_l_ski)
_print_summary_row("SGPR", mse_l_sgpr, time_l_sgpr)
_print_summary_row("LOVE", mse_l_love, time_l_love)
_print_summary_row("DKL", mse_l_dkl, time_l_dkl)
_print_summary_row("SVGP-CI", mse_l_svgpci, time_l_svgpci)
_print_summary_row("SVGP", mse_l_svgp, time_l_svgp)
_print_summary_row("NGD", mse_l_ngd, time_l_ngd)
_print_summary_row("VNN", mse_l_vnn, time_l_vnn)

Reordering
SVGP
SVGP-CI
VNN
NGD
DKL
SGPR
SKI
LOVE

In [None]:
def _print_summary_row(label, mse_values, time_values):
    """Print one table row: mean (stdev) of the MSE list and of the time list.

    The standard deviation is reported as NaN when a list holds fewer than
    two values, because statistics.stdev raises StatisticsError in that case
    (e.g. when only one replicate was run).
    """
    def _spread(values):
        return statistics.stdev(values) if len(values) > 1 else float("nan")

    # The label is left-aligned to 7 characters so the columns line up.
    print(f"{label:<7} --- MSE:", statistics.mean(mse_values), "(", _spread(mse_values), ")  Time:",
          statistics.mean(time_values), "(", _spread(time_values), ")")


# Same rows as the previous table, in the reordered sequence listed above.
_print_summary_row("SVGP", mse_l_svgp, time_l_svgp)
_print_summary_row("SVGP-CI", mse_l_svgpci, time_l_svgpci)
_print_summary_row("VNN", mse_l_vnn, time_l_vnn)
_print_summary_row("NGD", mse_l_ngd, time_l_ngd)
_print_summary_row("DKL", mse_l_dkl, time_l_dkl)
_print_summary_row("SGPR", mse_l_sgpr, time_l_sgpr)
_print_summary_row("SKI", mse_l_ski, time_l_ski)
_print_summary_row("LOVE", mse_l_love, time_l_love)