# Important Note
RAM and VRAM measurements depend on the state of the computer and should only be interpreted relative to each other. To obtain RAM and VRAM measurements, perform the following steps as normal, but use only the first RAM and VRAM measurements. Additionally, the Datagenerator.rmd file must be run first to generate the data.

# Instructions

This notebook provides code to implement and benchmark the HSSVD method using the compact Matern kernel in 1D. To run this benchmark, perform the following steps:

Step 1: Run the following cell to import the required packages and helper functions. Set the number of replicates desired.

Step 2: Load the Data and implement the method

Step 3: Execute the benchmark

# Step 1: Loading Required Packages and Helper Functions

In [1]:
# Device toggle for the notebook. Note that the Step 3 benchmark cell assigns `gpu` again.
gpu = True
# Requested number of replicates.
# NOTE(review): this name is not referenced anywhere else in the visible notebook;
# the Step 3 benchmark loops over its own `num_repeats` instead.
n_replicates = 2

import torch, numpy, matplotlib, psutil, gpytorch, time, pynvml, statistics, gc, numpy as np, tqdm, os, urllib.request, pandas as pd
from matplotlib import pyplot as plt
from math import floor, ceil
from scipy.io import loadmat
from torch.utils.data import TensorDataset, DataLoader

# Report the library versions so the benchmark numbers can be tied to an environment.
for _lib in (torch, numpy, matplotlib, psutil, gpytorch):
    print(_lib.__version__)

def log_memory():
    """Print and return a snapshot of GPU, PyTorch and system memory usage.

    Returns:
        Tuple ``(g, t, s)`` where ``g`` is the memory NVML reports as used on
        GPU index 0 (in MiB), ``t`` is PyTorch's peak allocated CUDA memory
        (in MiB) and ``s`` is the system-wide used RAM (in GiB).
    """
    bytes_per_mib = 1024**2
    bytes_per_gib = 1024**3

    # NVML query for the first GPU (index 0).
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    g = pynvml.nvmlDeviceGetMemoryInfo(handle).used / bytes_per_mib

    # Peak allocation tracked by the PyTorch caching allocator.
    t = torch.cuda.max_memory_allocated() / bytes_per_mib

    # System-wide RAM usage, not just this process.
    s = psutil.virtual_memory().used / bytes_per_gib

    print(f"[GPU] Used: {g:.2f} MB")
    print(f"[PyTorch] Max Allocated: {t:.2f} MB")
    print(f"[System RAM] Used: {s:.2f} GB")
    return g, t, s

def get_mem():
    """Return the resident set size (RSS) of the current process in MiB."""
    this_process = psutil.Process(os.getpid())
    rss_bytes = this_process.memory_info().rss
    return rss_bytes / 1024**2

# Running maximum (in bytes) of the CUDA memory allocated by PyTorch, updated by vram_usage().
max_vram = 0
def vram_usage():
    """Raise the module-level ``max_vram`` to the current allocation if that is larger."""
    global max_vram
    allocated_now = torch.cuda.memory_allocated()
    if allocated_now > max_vram:
        max_vram = allocated_now


2.6.0+cu118
1.26.4
3.9.2
5.9.0
1.13


# Step 2: Load the Data and implement the method

In [2]:
import pandas as pd
import torch

# Read the benchmark data and place each column on the compute device as a float64 tensor.
data = pd.read_csv("Data/data_1D_100k_full.csv")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x, y, y_true = (
    torch.tensor(data[column].values, dtype=torch.float64).to(device)
    for column in ('x', 'y', 'y_true')
)

print("Data loaded and transferred to device:", device)


Data loaded and transferred to device: cuda


In [3]:
import pandas as pd
import torch
import numpy as np


# Reload the dataset as CPU tensors; this rebinds the `data`, `x` and `y` names set by the previous cell.
data = pd.read_csv("Data/data_1D_100k_full.csv")
# unsqueeze(1) adds a trailing dimension so the inputs form an (N, 1) column tensor.
x = torch.tensor(data['x'].values, dtype=torch.float64).unsqueeze(1)
y = torch.tensor(data['y'].values, dtype=torch.float64)


def splitter(x_cpu, y_cpu, n_train=80000, n_test=20000, move_to_gpu=True, seed=None):
    """Randomly split paired samples into disjoint train and test subsets.

    A single random permutation of the row indices is drawn; the first
    ``n_train`` indices form the training set and the next ``n_test`` indices
    form the test set, so the two subsets never share a row.

    Args:
        x_cpu: Tensor of inputs, indexed along its first dimension.
        y_cpu: Tensor of targets with the same number of rows as ``x_cpu``.
        n_train: Number of rows drawn for the training subset.
        n_test: Number of rows drawn for the test subset.
        move_to_gpu: When True and CUDA is available, the four returned
            tensors are moved to the GPU with ``.cuda()``.
        seed: Optional seed forwarded to ``np.random.default_rng``. The default
            ``None`` keeps the original behaviour (a fresh, non-reproducible
            permutation on every call); pass an integer for a repeatable split.

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y)`` of contiguous tensors.

    Raises:
        AssertionError: If ``x_cpu`` and ``y_cpu`` have different row counts, or
            if ``n_train + n_test`` exceeds the number of available rows.
    """
    assert x_cpu.shape[0] == y_cpu.shape[0], "Mismatch in number of samples"
    total_samples = x_cpu.shape[0]
    assert n_train + n_test <= total_samples, "Not enough samples to split"

    rng = np.random.default_rng(seed)
    indices = rng.permutation(total_samples)
    train_idx = indices[:n_train]
    test_idx = indices[n_train:n_train + n_test]

    # Advanced indexing can yield non-contiguous results; normalise the layout.
    train_x = x_cpu[train_idx].contiguous()
    train_y = y_cpu[train_idx].contiguous()
    test_x = x_cpu[test_idx].contiguous()
    test_y = y_cpu[test_idx].contiguous()

    if move_to_gpu and torch.cuda.is_available():
        train_x = train_x.cuda()
        train_y = train_y.cuda()
        test_x = test_x.cuda()
        test_y = test_y.cuda()
    return train_x, train_y, test_x, test_y

# Draw a CPU-resident split (300000 training rows, 20000 test rows) and report the tensor shapes.
train_x, train_y, test_x, test_y = splitter(
    x, y, n_train=300000, n_test=20000, move_to_gpu=False
)
for _label, _tensor in (
    ("Train x shape:", train_x),
    ("Train y shape:", train_y),
    ("Test x shape:", test_x),
    ("Test y shape:", test_y),
):
    print(_label, _tensor.shape)



Train x shape: torch.Size([300000, 1])
Train y shape: torch.Size([300000])
Test x shape: torch.Size([20000, 1])
Test y shape: torch.Size([20000])


## Implementing the method

In [4]:
import torch
import torch.optim as optim

def phifunc(m, x):
    """Evaluate the first ``m`` sine basis functions at the points ``x``.

    Args:
        m: Number of basis functions (frequencies 1..m).
        x: 1-D tensor of evaluation points.

    Returns:
        Tensor of shape ``(len(x), m)`` whose ``(i, k)`` entry is
        ``sin(pi * (k + 1) * x[i])``, with the dtype and device of ``x``.
    """
    # Angular frequencies pi * k for k = 1..m, created alongside x.
    angular = torch.pi * torch.arange(1, m + 1, dtype=x.dtype, device=x.device)
    # The outer product gives an (m, n) grid of phases; transpose to (n, m).
    return torch.sin(torch.outer(angular, x)).t()

def lambdafunc(m, alpha, beta):
    """Compute the normalised spectral weights for the first ``m`` frequencies.

    For k = 1..m the weight is ``(pi^2 * k^2 + alpha^2) ** (-beta)``; the
    vector is then divided by its largest entry so the maximum equals 1.

    Args:
        m: Number of frequencies.
        alpha: Scalar tensor; its dtype and device determine those of the result.
        beta: Exponent applied (negated) to the shifted squared frequencies.

    Returns:
        1-D tensor of length ``m``.
    """
    k = torch.arange(1, m + 1, dtype=alpha.dtype, device=alpha.device)
    shifted = (torch.pi**2) * k.pow(2) + alpha.pow(2)
    weights = shifted.pow(-beta)
    return weights / weights.max()

def hssvd_predict_1d(x, y, x_new, init=torch.tensor([0.0, 0.0]), m=30, beta=1, 
                     train=True, lr=0.01, max_iter=1000, tol=1e-7):
    """Fit the 1-D HSSVD model on ``(x, y)`` and predict at ``x_new``.

    The inputs are expanded in ``m`` sine basis functions (``phifunc``) weighted
    by the spectral weights from ``lambdafunc``. The two hyper-parameters are
    handled in log-space: ``exp(params[0])`` is passed to ``lambdafunc`` as its
    shift and ``exp(params[1])`` is the noise variance ``s2``.

    Args:
        x: 1-D tensor of training inputs.
        y: 1-D tensor of training targets, same length as ``x``.
        x_new: 1-D tensor of prediction inputs.
        init: Length-2 tensor with the initial log-parameters.
        m: Number of basis functions.
        beta: Exponent forwarded to ``lambdafunc``.
        train: If True, optimise the log-parameters with Adam; otherwise use
            ``init`` unchanged.
        lr: Adam learning rate.
        max_iter: Maximum number of Adam steps.
        tol: Early-stopping threshold on the absolute change of the objective
            between consecutive iterations. Pass ``0`` to always run all
            ``max_iter`` steps.

    Returns:
        1-D tensor of predictions at ``x_new``, on the device of ``x``.
    """
    device = x.device
    dtype = x.dtype

    init = init.to(device=device, dtype=dtype)
    n = x.shape[0]
    Phi = phifunc(m, x)
    MM = Phi.shape[1]
    phi_T_phi = Phi.t() @ Phi
    phi_T_y = Phi.t() @ y
    sum_y2 = torch.sum(y**2)

    # Loop-invariant pieces, built once instead of on every objective evaluation.
    jitter = 1e-10 * torch.eye(MM, device=device, dtype=dtype)
    phi_T_y_cpu = phi_T_y.cpu().unsqueeze(1)

    def _factor_and_solve(s2, Lambda):
        """Cholesky-factor A = s2*diag(1/Lambda) + Phi^T Phi + jitter on the CPU and solve A sol = Phi^T y."""
        A = s2 * torch.diag(1.0 / Lambda) + phi_T_phi + jitter
        L_cpu = torch.linalg.cholesky(A.cpu())
        sol = torch.cholesky_solve(phi_T_y_cpu, L_cpu).squeeze(1).to(device)
        return L_cpu, sol

    def nlik(log_params):
        """Objective minimised during training, as a function of the log-parameters."""
        eps = torch.exp(log_params[0])
        s2 = torch.exp(log_params[1])
        Lambda = lambdafunc(m, eps, beta)
        L_cpu, sol = _factor_and_solve(s2, Lambda)

        t1 = (sum_y2 - (phi_T_y @ sol)) / s2
        # log det(A) from the Cholesky factor: 2 * sum(log(diag(L))).
        logdetA = 2 * torch.sum(torch.log(torch.diag(L_cpu).to(device)))
        t2 = (n - m) * torch.log(s2) + logdetA
        t3 = torch.sum(torch.log(Lambda))
        return t1 + t2 + t3

    if train:
        log_params = init.clone().detach().requires_grad_(True)
        optimizer = optim.Adam([log_params], lr=lr)
        prev_loss = None
        for iter_ in range(max_iter):
            optimizer.zero_grad()
            loss = nlik(log_params)
            loss.backward()
            optimizer.step()
            loss_value = loss.item()
            # The threshold used to be `tol * 0`, which can never be satisfied and
            # silently disabled early stopping; honour the caller's `tol` instead.
            if prev_loss is not None and abs(loss_value - prev_loss) < tol:
                print(f"\nConverged after {iter_} iterations")
                break
            prev_loss = loss_value
        out = log_params.detach()
        print("\nOptimized parameters (log-space):", out)
    else:
        out = init

    eps = torch.exp(out[0])
    s2 = torch.exp(out[1])
    Lambda = lambdafunc(m, eps, beta)
    _, sol_final = _factor_and_solve(s2, Lambda)

    # Woodbury identity: (s2*I + Phi diag(Lambda) Phi^T)^{-1} y = (y - Phi sol) / s2.
    K_inv_y = (1.0 / s2) * (y - Phi @ sol_final)
    Phi_new = phifunc(m, x_new)
    y_new = Phi_new @ (torch.diag(Lambda) @ (Phi.t() @ K_inv_y))

    return y_new


# Step 3: Execute the benchmark
To use GPU, set gpu = True. To use CPU only, set gpu = False.

In [5]:
# Benchmark device toggle: True requests CUDA (used only if it is available), False forces CPU.
gpu = False


import torch
import time
import psutil
import pandas as pd
from memory_profiler import memory_usage
# Make float64 the default dtype for floating-point tensors created from here on.
torch.set_default_dtype(torch.float64)

# Resolve the benchmark device; falls back to CPU when CUDA is requested but unavailable.
device = torch.device("cuda" if gpu and torch.cuda.is_available() else "cpu")

def get_vram_usage():
    """Return PyTorch's peak allocated CUDA memory in MiB, or None when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return None
    return torch.cuda.max_memory_allocated() / 1024**2

def to_device(t):
    """Cast tensor ``t`` to float64 and move it to the notebook-level ``device``."""
    as_float64 = t.double()
    return as_float64.to(device)


# Load the full dataset onto the benchmark device as float64 tensors.
data = pd.read_csv("Data/data_1D_100k_full.csv")
all_x = to_device(torch.tensor(data['x'].values)[:, None])  # column tensor of inputs
all_y = to_device(torch.tensor(data['y'].values))

# Split sizes for every benchmark repetition.
train_n = 400_000
test_n = 100_000

# Optimiser start point (log-space) and model hyper-parameters.
init_vec = to_device(torch.tensor([0.0, 0.0]))
m = 100
beta = 2.0
lr = 1.0

# Number of measured repetitions and a handle on this process for RSS readings.
num_repeats = 10
proc = psutil.Process()

def prepare_split():
    """Draw a fresh random train/test split and lazily map each tensor through ``to_device``.

    Returns:
        A ``map`` iterator yielding ``train_x, train_y, test_x, test_y`` in that order.
    """
    split = splitter(all_x, all_y, n_train=train_n, n_test=test_n, move_to_gpu=gpu)
    return map(to_device, split)

def single_run(is_warmup=False):
    """Run one benchmark repetition on a fresh random split.

    The model is fitted twice on the same split: once under
    ``memory_profiler.memory_usage`` to capture the peak process RAM, and once
    more under a wall-clock timer.

    Args:
        is_warmup: When True, only print the memory figures and return None.

    Returns:
        For a measured run, the tuple
        ``(elapsed_s, mse, ram_before_mb, ram_peak_mb, ram_delta_mb, vram_delta_mb)``;
        ``vram_delta_mb`` is None when CUDA is unavailable. For a warm-up run, None.
    """
    tx, ty, vx, vy = prepare_split()
    on_cuda = tx.is_cuda
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    # Baseline must be read BEFORE the workload. Right after the reset the peak
    # counter equals the current allocation. Reading it after the run (as before)
    # made the reported VRAM delta always zero.
    vram_before = get_vram_usage()

    ram_before = proc.memory_info().rss / (1024**2)
    def target():
        return hssvd_predict_1d(
            tx.squeeze(), ty.squeeze(), vx.squeeze(),
            init=init_vec, m=m, beta=beta, train=True, lr=lr, max_iter=10
        )
    peak_mem, y_pred = memory_usage(
        (target, ),
        max_usage=True,
        retval=True,
        interval=0.01
    )
    ram_delta = peak_mem - ram_before
    vram_peak   = get_vram_usage()
    vram_delta  = (vram_peak - vram_before) if vram_before is not None else None

    # CUDA kernels are launched asynchronously: synchronise on both sides of the
    # timed region so the measurement covers the actual GPU work.
    if on_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()
    y_pred = target()
    if on_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    if not is_warmup:
        y_pred = y_pred.cpu()
        mse = torch.mean((y_pred - vy.cpu())**2).item()
        return elapsed, mse, ram_before, peak_mem, ram_delta, vram_delta
    else:
        print(f"Warm‑up RAM before={ram_before:.1f} MB, peak={peak_mem:.1f} MB (Δ={ram_delta:.1f} MB)")
        if vram_delta is not None:
            print(f"Warm‑up VRAM Δ={vram_delta:.1f} MB")
        print("Warm‑up done.\n")
# One untimed warm-up repetition before the measured runs; its return value is discarded.
print("\n=== WARM‑UP ===")
single_run(is_warmup=True)

# Column labels follow the order of the tuple returned by single_run().
columns = ["Time (s)", "MSE", "RAM Before (MB)", "RAM Peak (MB)", "RAM Δ (MB)", "VRAM Δ (MB)"]
# Each repetition draws its own random train/test split.
results = [single_run() for _ in range(num_repeats)]


# Persist the raw per-run measurements next to the notebook.
df = pd.DataFrame(results, columns=columns)
df.to_csv("hssvd_ram_peak_benchmark.csv", index=False)

print("\n=== Results ===")
print(df)
print("\nSummary:")
# pandas' .std() is the sample standard deviation (ddof=1).
print(f"Avg Time: {df['Time (s)'].mean():.4f}s ± {df['Time (s)'].std():.4f}s")
print(f"Avg MSE:  {df['MSE'].mean():.6f} ± {df['MSE'].std():.6f}")
print(f"Avg RAM Δ: {df['RAM Δ (MB)'].mean():.1f} MB ± {df['RAM Δ (MB)'].std():.1f} MB")
# VRAM figures are None when CUDA is unavailable; only summarise them when every run has a value.
if df["VRAM Δ (MB)"].notnull().all():
    print(f"Avg VRAM Δ: {df['VRAM Δ (MB)'].mean():.1f} MB ± {df['VRAM Δ (MB)'].std():.1f} MB")

print("\nResults saved to 'hssvd_ram_peak_benchmark.csv'")



=== WARM‑UP ===

Optimized parameters (log-space): tensor([ 6.2060, -1.3394])

Optimized parameters (log-space): tensor([ 6.2060, -1.3394])
Warm‑up RAM before=600.0 MB, peak=1122.2 MB (Δ=522.2 MB)
Warm‑up VRAM Δ=0.0 MB
Warm‑up done.


Optimized parameters (log-space): tensor([ 6.2059, -1.3429])

Optimized parameters (log-space): tensor([ 6.2059, -1.3429])

Optimized parameters (log-space): tensor([ 6.2064, -1.3410])

Optimized parameters (log-space): tensor([ 6.2064, -1.3410])

Optimized parameters (log-space): tensor([ 6.2051, -1.3402])

Optimized parameters (log-space): tensor([ 6.2051, -1.3402])

Optimized parameters (log-space): tensor([ 6.2064, -1.3362])

Optimized parameters (log-space): tensor([ 6.2064, -1.3362])

Optimized parameters (log-space): tensor([ 6.2051, -1.3432])

Optimized parameters (log-space): tensor([ 6.2051, -1.3432])

Optimized parameters (log-space): tensor([ 6.2060, -1.3382])

Optimized parameters (log-space): tensor([ 6.2060, -1.3382])

Optimized parameters

In [6]:
import numpy as np
# Stack the per-run result tuples; the first two columns are time and MSE.
results_array = np.array(results)

# Cast the columns to float explicitly. When the VRAM entry is None (no CUDA),
# np.array(results) has dtype=object, and np.std fails on object arrays.
time_values = np.asarray(results_array[:, 0], dtype=float)
mse_values = np.asarray(results_array[:, 1], dtype=float)


# Sample statistics (ddof=1) across the repetitions.
time_mean = np.mean(time_values)
time_sd = np.std(time_values, ddof=1)

mse_mean = np.mean(mse_values)
mse_sd = np.std(mse_values, ddof=1)

print(f"Time: Mean = {time_mean:.4f}s, SD = {time_sd:.4f}s")
print(f"MSE: Mean = {mse_mean:.6f}, SD = {mse_sd:.6f}")


Time: Mean = 0.0733s, SD = 0.0061s
MSE: Mean = 0.299603, SD = 0.001278


In [7]:
# Display the per-run MSE values extracted in the previous cell.
mse_values

array([0.29907616, 0.29867476, 0.29755316, 0.29863851, 0.30143735,
       0.29959526, 0.29946414, 0.2996606 , 0.30177706, 0.30015483])