# Instructions

To run this experiment, first run the Setup chunk, then run the RFF section.

# Setup

In [1]:
gpu = True
n_replicates = 2

%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import time
import math
import gc
import statistics
import urllib.request

import numpy as np
import pandas as pd
from scipy.io import loadmat
import psutil
import torch
from tqdm import tqdm, trange
import gpytorch
import pynvml
from matplotlib import pyplot as plt

from gpytorch.models import ApproximateGP
from gpytorch.variational import (
    NNVariationalStrategy,
    CholeskyVariationalDistribution,
    VariationalStrategy,
)
from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.mlls import DeepApproximateMLL
from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel, InducingPointKernel
from gpytorch.distributions import MultivariateNormal

from torch.utils.data import TensorDataset, DataLoader


def log_memory():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    max_allocated = torch.cuda.max_memory_allocated() / 1024**2  # MB
    max_reserved = torch.cuda.max_memory_reserved() / 1024**2    # MB
    gpu_used = meminfo.used / 1024**2                            # MB
    sys_used = psutil.virtual_memory().used / 1024**3            # GB
    print(f"[PyTorch] Max Allocated: {max_allocated:.2f} MB | Max Reserved: {max_reserved:.2f} MB")
    print(f"[GPU VRAM] Used (nvidia-smi): {gpu_used:.2f} MB | [System RAM]: {sys_used:.2f} GB")
    return max_allocated, max_reserved, gpu_used, sys_used


max_vram = 0
max_ram = 0
def get_mem():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss/(1024**2)

max_vram = 0
def vram_usage():
    global max_vram
    if gpu:
        max_vram = max(max_vram, torch.cuda.memory_allocated())


print("GPU availability: ", torch.cuda.is_available())
print(psutil.virtual_memory().used / (1024 ** 2))


import pandas as pd
import torch

x = pd.read_csv('Data/x_100k.csv', header=None).values.squeeze()
y = pd.read_csv('Data/y_100k.csv', header=None).values.squeeze()
all_x = torch.tensor(x, dtype=torch.float32).unsqueeze(1)
all_y = torch.tensor(y, dtype=torch.float32)
all_x = all_x.contiguous()
all_y = all_y.contiguous()
print("all_x shape:", all_x.shape)
print("all_y shape:", all_y.shape)


def splitter(x_cpu, y_cpu, n_train=80000, n_test=20000, random_state=42, move_to_gpu=True):
    assert x_cpu.shape[0] == y_cpu.shape[0], "Mismatch in number of samples"
    total_samples = x_cpu.shape[0]
    assert n_train + n_test <= total_samples, "Not enough samples to split"
    rng = np.random.default_rng(seed=random_state)
    indices = rng.permutation(total_samples)
    train_idx = indices[:n_train]
    test_idx  = indices[n_train:n_train + n_test]
    train_x = x_cpu[train_idx].contiguous()
    train_y = y_cpu[train_idx].contiguous()
    test_x  = x_cpu[test_idx].contiguous()
    test_y  = y_cpu[test_idx].contiguous()
    if move_to_gpu and torch.cuda.is_available():
        train_x = train_x.cuda()
        train_y = train_y.cuda()
        test_x = test_x.cuda()
        test_y = test_y.cuda()
    return train_x, train_y, test_x, test_y

train_x, train_y, test_x, test_y = splitter(all_x, all_y, n_train=80000, n_test=20000)
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

GPU availability:  True
28549.08984375
all_x shape: torch.Size([1000000, 1])
all_y shape: torch.Size([1000000])
torch.Size([80000, 1]) torch.Size([80000])
torch.Size([20000, 1]) torch.Size([20000])


# RFF

In [None]:
import tqdm, time, gc
import torch, gpytorch
from memory_profiler import memory_usage

class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RFFKernel(
                num_samples=200, # Adjust this line to choose the number of Fourier Features
                num_dims=1
            )
        )

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x)
        )

n_replicates = 10
training_iterations = 64
n_train, n_test = 40_000, 10_000
random_state = 42
mse_l_rff, time_l_rff = [], []

for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")
    train_x, train_y, test_x, test_y = splitter(
        all_x, all_y,
        n_train=n_train, n_test=n_test,
        random_state=random_state + rep,
        move_to_gpu=torch.cuda.is_available()
    )
    ram_before = get_mem() / (1024**2)
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)
    model.train(); likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.25)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    if torch.cuda.is_available():
        mll = mll.cuda()

    def train_fn():
        progress = trange(training_iterations, desc=f"Training (rep {rep+1})", leave=False)
        for _ in progress:
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()
            progress.set_postfix(loss=loss.item())
        return None

    start_time = time.time()
    peak_ram = memory_usage(
        (train_fn,),
        max_usage=True,
        retval=False,
        interval=0.01
    )
    elapsed = time.time() - start_time
    vram_peak = torch.cuda.max_memory_allocated() / (1024**2) if torch.cuda.is_available() else None
    ram_delta = peak_ram - ram_before
    model.eval(); likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        pred = likelihood(model(test_x)).mean.cpu()
    mse = torch.mean((pred - test_y.cpu()) ** 2).item()
    mse_l_rff.append(mse)
    time_l_rff.append(elapsed)
    print(
        f"Rep {rep+1}: MSE={mse:.4f}, Time={elapsed:.2f}s, "
        f"RAM before={ram_before:.1f}MB, peak={peak_ram:.1f}MB (Δ={ram_delta:.1f}MB)"
        + (f", VRAM peak={vram_peak:.1f}MB" if vram_peak is not None else "")
    )
    del model, likelihood
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()



=== Replicate 1/10 ===


                                                                             

Rep 1: MSE=0.3005, Time=10.83s, RAM before=0.0MB, peak=790.7MB (Δ=790.7MB), VRAM peak=12689.3MB

=== Replicate 2/10 ===


                                                                             

Rep 2: MSE=0.2941, Time=10.62s, RAM before=0.0MB, peak=935.7MB (Δ=935.7MB), VRAM peak=12814.4MB

=== Replicate 3/10 ===


                                                                             

Rep 3: MSE=0.3066, Time=10.59s, RAM before=0.0MB, peak=936.3MB (Δ=936.3MB), VRAM peak=12814.4MB

=== Replicate 4/10 ===


                                                                             

Rep 4: MSE=0.3028, Time=10.63s, RAM before=0.0MB, peak=936.3MB (Δ=936.3MB), VRAM peak=12814.4MB

=== Replicate 5/10 ===


                                                                             

Rep 5: MSE=0.2973, Time=10.66s, RAM before=0.0MB, peak=936.3MB (Δ=936.3MB), VRAM peak=12814.4MB

=== Replicate 6/10 ===


                                                                             

Rep 6: MSE=0.3048, Time=10.66s, RAM before=0.0MB, peak=936.3MB (Δ=936.3MB), VRAM peak=12814.4MB

=== Replicate 7/10 ===


                                                                             

Rep 7: MSE=0.3002, Time=10.71s, RAM before=0.0MB, peak=936.2MB (Δ=936.2MB), VRAM peak=12814.4MB

=== Replicate 8/10 ===


                                                                            

Rep 8: MSE=0.4594, Time=10.49s, RAM before=0.0MB, peak=936.2MB (Δ=936.2MB), VRAM peak=12814.4MB

=== Replicate 9/10 ===


                                                                             

Rep 9: MSE=0.2972, Time=10.64s, RAM before=0.0MB, peak=936.2MB (Δ=936.2MB), VRAM peak=12814.4MB

=== Replicate 10/10 ===


                                                                             

Rep 10: MSE=0.9770, Time=10.53s, RAM before=0.0MB, peak=936.2MB (Δ=936.2MB), VRAM peak=12814.4MB


In [5]:
print(statistics.mean(mse_l_rff[1:]))
print(statistics.stdev(mse_l_rff[1:]))

print(statistics.mean(time_l_rff[1:]))
print(statistics.stdev(time_l_rff[1:]))


0.393264373143514
0.22516770588452842
10.61574641863505
0.06786572604004579
