# Important Note
RAM and VRAM measurements are dependent on the computer state, and should only be interpreted relative to each other. In order to obtain RAM and VRAM measurements, perform the following steps:

1 - Restart the Kernel

2 - Run the "Loading Required Packages and Helper Functions" cell

3 - Run the "Loading Data" cell

4 - Run ONLY ONE iteration of the desired method, and read the RAM and VRAM usage reports printed by the cell

# Loading Required Packages and Helper Functions
If you would like to use Cuda, set gpu = True. Otherwise set gpu = False. 

Step 1: Run the preliminary cells to load the packages and data

Step 2: Create the HSSVD method.

Step 3: Run the execution cells to fit the HSSVD method on the data.

# Step 1:
Loading Packages and setting benchmark parameters.

In [1]:
# Benchmark Parameters
gpu, n_replicates = True, 2


import os, time, gc, statistics, urllib.request
from math import floor
import torch, numpy as np, matplotlib, psutil, gpytorch, pynvml, pandas as pd
from matplotlib import pyplot as plt
from scipy.io import loadmat
from tqdm import trange, tqdm
from torch.utils.data import TensorDataset, DataLoader
from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy
from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.mlls import DeepApproximateMLL
from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel, InducingPointKernel
from gpytorch.distributions import MultivariateNormal


max_vram = 0

def log_memory():
    pynvml.nvmlInit()
    h = pynvml.nvmlDeviceGetHandleByIndex(0)
    info = pynvml.nvmlDeviceGetMemoryInfo(h)
    gpu_used = info.used / 1024**2
    torch_alloc = torch.cuda.max_memory_allocated() / 1024**2
    sys_used = psutil.virtual_memory().used / 1024**3
    print(f"[GPU] Used: {gpu_used:.2f} MB\n[PyTorch] Max Allocated: {torch_alloc:.2f} MB\n[System RAM] Used: {sys_used:.2f} GB")
    return gpu_used, torch_alloc, sys_used

def get_mem():
    return psutil.Process(os.getpid()).memory_info().rss

def vram_usage():
    global max_vram
    max_vram = max(max_vram, torch.cuda.memory_allocated())


Loading the data (note: must run the DataGenerator.Rmd file first)

In [2]:
import os, numpy as np, pandas as pd, torch

gpu = torch.cuda.is_available()
smoke_test = 'CI' in os.environ

coords_df = pd.read_csv('Data/coordinates.csv')
all_x = torch.tensor(coords_df.values, dtype=torch.float32).contiguous()

expr_df = pd.read_csv('Data/Mbp.csv')
all_y = torch.tensor(expr_df.iloc[:, 0].values, dtype=torch.float32).contiguous()

print("all_x shape:", all_x.shape)
print("all_y shape:", all_y.shape)

import torch, numpy as np

def splitter(x_cpu, y_cpu, n_train=80000, n_test=20000, random_state=42, move_to_gpu=True):
    assert x_cpu.shape[0] == y_cpu.shape[0]
    total = x_cpu.shape[0]
    assert n_train + n_test <= total
    rng = np.random.default_rng(seed=random_state)
    idx = rng.permutation(total)
    t, s = n_train, n_train + n_test
    train_x = x_cpu[idx[:t]].contiguous()
    train_y = y_cpu[idx[:t]].contiguous()
    test_x  = x_cpu[idx[t:s]].contiguous()
    test_y  = y_cpu[idx[t:s]].contiguous()
    if move_to_gpu and torch.cuda.is_available():
        train_x, train_y, test_x, test_y = (t.cuda() for t in (train_x, train_y, test_x, test_y))
    return train_x, train_y, test_x, test_y


all_x shape: torch.Size([393542, 2])
all_y shape: torch.Size([393542])


# Step 2:
Creating the HS-SVD method

In [3]:
import torch
import torch.optim as optim

def phifunc(m, x):
    device = x.device
    dtype = x.dtype
    n = x.shape[0]
    freq = torch.arange(1, m + 1, dtype=dtype, device=device)
    sin1 = torch.sin(torch.pi * freq.unsqueeze(1) * x[:, 0].unsqueeze(0))
    sin2 = torch.sin(torch.pi * freq.unsqueeze(1) * x[:, 1].unsqueeze(0))
    sin1_t = sin1.transpose(0, 1)
    sin2_t = sin2.transpose(0, 1)
    phi = (sin1_t.unsqueeze(2) * sin2_t.unsqueeze(1)).reshape(n, -1)
    return phi


def lambdafunc(m, eps, beta):
    device = eps.device
    dtype = eps.dtype
    freq = torch.arange(1, m + 1, dtype=dtype, device=device)
    I2 = (freq.unsqueeze(1)**2 + freq.unsqueeze(0)**2)
    lam = ((torch.pi**2) * I2 + eps**2)**(-beta)
    lam = lam.reshape(-1)
    lam = lam / torch.max(lam)
    return lam

def hssvd_predict_2d(x, y, x_new, init=torch.tensor([0.0, 0.0]), m=30, beta=1, 
                     train=True, lr=0.01, max_iter=1000, tol=1e-5):
    device = x.device
    dtype = x.dtype
    print(device)
    print(dtype)
    init = init.to(device=device, dtype=dtype)
    print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
    n = x.shape[0]
    Phi = phifunc(m, x)
    MM = Phi.shape[1]
    phi_T_phi = Phi.t() @ Phi
    phi_T_y = Phi.t() @ y
    sum_y2 = torch.sum(y**2)
    print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
    
    def nlik(log_params):
        eps = torch.exp(log_params[0])
        s2  = torch.exp(log_params[1])
        Lambda = lambdafunc(m, eps, beta)
        A = s2 * torch.diag(1.0 / Lambda) + phi_T_phi + 1e-10 * torch.eye(MM, device=device, dtype=dtype)
        L = torch.linalg.cholesky(A)
        sol = torch.cholesky_solve(phi_T_y.unsqueeze(1), L).squeeze(1)
        t1 = (sum_y2 - (phi_T_y @ sol)) / s2
        logdetA = 2 * torch.sum(torch.log(torch.diag(L)))
        t2 = (n - m) * torch.log(s2) + logdetA
        t3 = torch.sum(torch.log((Lambda)))
        return t1 + t2 + t3
    
    if train:
        print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
        log_params = init.clone().detach().requires_grad_(True)
        optimizer = optim.Adam([log_params], lr=lr)
        prev_loss = None
        for iter_ in range(max_iter):
            optimizer.zero_grad()
            loss = nlik(log_params)
            loss.backward()
            optimizer.step()
            print(f"Iter {iter_}: estimate = {log_params.data}, loss = {loss.item():.4f}", end="\r")
            if prev_loss is not None and torch.abs(loss - prev_loss) < tol*n:
                print(f"\nConverged after {iter_} iterations")
                break
            prev_loss = loss.item()
        out = log_params.detach()
        print("\nOptimized parameters (log-space):", out)
        print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
    else:
        out = init
    eps  = torch.exp(out[0])
    s2   = torch.exp(out[1])
    Lambda = lambdafunc(m, eps, beta)
    print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
    A_final = s2 * torch.diag(1.0 / Lambda) + phi_T_phi + 1e-10 * torch.eye(MM, device=device, dtype=dtype)
    L_final = torch.linalg.cholesky(A_final)
    sol_final = torch.cholesky_solve(phi_T_y.unsqueeze(1), L_final).squeeze(1)
    K_inv_y = (1.0 / s2) * (y - Phi @ sol_final)
    print(torch.cuda.memory_allocated() / (1024 ** 2), "MB allocated")
    Phi_new = phifunc(m, x_new)
    y_new = Phi_new @ (torch.diag(Lambda) @ (Phi.t() @ K_inv_y))
    log_memory()
    return y_new


# Step 3: Execting the benchmark

In [4]:
import torch
import time
import psutil
import pandas as pd
init=torch.tensor([0.0,0.0],dtype=torch.float32)
m=50
beta=1.5
lr=0.1
num_repeats=n_replicates
results=[]
process=psutil.Process()
for i in range(num_repeats):
    print(f"\n=== Replicate {i+1}/{num_repeats} ===")
    train_x,train_y,test_x,test_y=splitter(all_x,all_y,n_train=300000,n_test=20000,random_state=i, move_to_gpu=gpu)
    if test_y.is_cuda:test_y=test_y.cpu()
    ram_before=process.memory_info().rss/(1024**2)
    start_time=time.time()
    y_pred=hssvd_predict_2d(train_x,train_y,test_x,init=init,m=m,beta=beta,train=True,lr=lr)
    elapsed_time=time.time()-start_time
    if y_pred.is_cuda:y_pred=y_pred.cpu()
    if test_y.is_cuda:test_y=test_y.cpu()
    mse=torch.mean((y_pred-test_y)**2).item()
    ram_after=process.memory_info().rss/(1024**2)
    ram_used=ram_after-ram_before
    results.append((elapsed_time,mse,ram_used))
    print(f"Time = {elapsed_time:.4f}s, MSE = {mse:.6f}, RAM Used = {ram_used:.2f}MB")
df=pd.DataFrame(results,columns=["Time (s)","MSE","RAM Used (MB)"])
display(df)



=== Replicate 1/2 ===
cuda:0
torch.float32
3.5869140625 MB allocated
2897.564453125 MB allocated
2897.564453125 MB allocated
Iter 44: estimate = tensor([ 1.4892, -0.3780], device='cuda:0'), loss = 195257.0000
Converged after 44 iterations

Optimized parameters (log-space): tensor([ 1.4892, -0.3780], device='cuda:0')
2906.00830078125 MB allocated
2906.01904296875 MB allocated
2955.17333984375 MB allocated
[GPU] Used: 14537.05 MB
[PyTorch] Max Allocated: 3169.77 MB
[System RAM] Used: 35.16 GB
Time = 1.5324s, MSE = 0.707767, RAM Used = 316.16MB

=== Replicate 2/2 ===
cuda:0
torch.float32
20.1533203125 MB allocated
2906.16357421875 MB allocated
2906.16357421875 MB allocated
Iter 44: estimate = tensor([ 1.4747, -0.3776], device='cuda:0'), loss = 195444.7812
Converged after 44 iterations

Optimized parameters (log-space): tensor([ 1.4747, -0.3776], device='cuda:0')
2906.166015625 MB allocated
2906.1767578125 MB allocated
2955.015625 MB allocated
[GPU] Used: 14519.54 MB
[PyTorch] Max Allocat

Unnamed: 0,Time (s),MSE,RAM Used (MB)
0,1.532364,0.707767,316.160156
1,0.682261,0.711889,0.183594


In [5]:
import numpy as np

results_array = np.array(results)

time_values = results_array[:, 0]
mse_values = results_array[:, 1]


time_mean = np.mean(time_values)
time_sd = np.std(time_values, ddof=1)

mse_mean = np.mean(mse_values)
mse_sd = np.std(mse_values, ddof=1)

print(f"Time: Mean = {time_mean:.4f}s, SD = {time_sd:.4f}s")
print(f"MSE: Mean = {mse_mean:.6f}, SD = {mse_sd:.6f}")


Time: Mean = 1.1073s, SD = 0.6011s
MSE: Mean = 0.709828, SD = 0.002914
