# 

# Instructions

To run this experiment, first run the Setup chunk, then run the RFF section.

In [1]:
gpu = True
import os
import gc
import time
import math
import urllib.request
import statistics
import numpy as np
import pandas as pd
import torch
import gpytorch
import psutil
import tqdm
import faiss
from math import floor
from matplotlib import pyplot as plt
from scipy.io import loadmat
from torch.utils.data import DataLoader, TensorDataset
from memory_profiler import memory_usage
import pynvml


from gpytorch.models import ApproximateGP
from gpytorch.variational.nearest_neighbor_variational_strategy import NNVariationalStrategy
from gpytorch.models.deep_gps import DeepGPLayer, DeepGP
from gpytorch.mlls import DeepApproximateMLL
from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel, InducingPointKernel
from gpytorch.distributions import MultivariateNormal


def clear_gpu():
    for obj in ['model', 'likelihood', 'observed_pred', 'preds', 'output']:
        if obj in globals():
            del globals()[obj]
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

def get_mem():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 ** 2)  # Return MB

max_vram = 0
def vram_usage():
    global max_vram
    max_vram = max(max_vram, torch.cuda.memory_allocated())

def log_memory():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    max_allocated = torch.cuda.max_memory_allocated() / 1024**2  # MB
    max_reserved = torch.cuda.max_memory_reserved() / 1024**2    # MB
    gpu_used = meminfo.used / 1024**2                            # MB
    sys_used = psutil.virtual_memory().used / 1024**3            # GB
    print(f"[PyTorch] Max Allocated: {max_allocated:.2f} MB | Max Reserved: {max_reserved:.2f} MB")
    print(f"[GPU VRAM] Used (nvidia-smi): {gpu_used:.2f} MB | [System RAM]: {sys_used:.2f} GB")
    return max_allocated, max_reserved, gpu_used, sys_used


try:
    get_ipython().run_line_magic('matplotlib', 'inline')
    get_ipython().run_line_magic('load_ext', 'autoreload')
    get_ipython().run_line_magic('autoreload', '2')
except:
    pass

csvfile = pd.read_csv('Data/data_2d.csv', header = None, dtype=float, delimiter=',')
all_data = torch.tensor(np.array(csvfile)).float()

def splitter(all_data, n_train=80_000, n_test=20_000, random_state=42, move_to_gpu=True):
    assert all_data.ndim == 2 and all_data.shape[1] == 3, \
        "all_data must be [N,3]"
    total_samples = all_data.shape[0]
    assert n_train + n_test <= total_samples, "Not enough samples to split"
    rng = np.random.default_rng(seed=random_state)
    indices = rng.permutation(total_samples)
    train_idx = indices[:n_train]
    test_idx  = indices[n_train:n_train + n_test]
    train = all_data[train_idx]
    test  = all_data[test_idx]

    train_x = train[:, :2].contiguous()
    train_y = train[:,  2].contiguous()
    test_x  = test[:,  :2].contiguous()
    test_y  = test[:,   2].contiguous()

    if move_to_gpu and torch.cuda.is_available():
        train_x = train_x.cuda()
        train_y = train_y.cuda()
        test_x  = test_x.cuda()
        test_y  = test_y.cuda()

    return train_x, train_y, test_x, test_y

train_x, train_y, test_x, test_y = splitter(all_data, n_train= 80_000, n_test = 20_000, random_state=42, move_to_gpu=True)
print(train_x.shape)
print(train_x.size(-1))
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)


torch.Size([80000, 2])
2
torch.Size([80000])
torch.Size([20000, 2])
torch.Size([20000])


# RFF

In [None]:
import tqdm, time, gc
import torch, gpytorch
from memory_profiler import memory_usage

class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RFFKernel(
                num_samples=200,  # Adjust this line to choose the number of Fourier Features
                num_dims=2
            )
        )

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x)
        )

n_replicates = 11
training_iterations = 64
n_train, n_test = 40_000, 20_000
random_state = 42
mse_l_rff, time_l_rff = [], []

for rep in range(n_replicates):
    print(f"\n=== Replicate {rep + 1}/{n_replicates} ===")
    train_x, train_y, test_x, test_y = splitter(
        all_data, n_train=n_train, n_test=n_test,
        random_state=42 + rep, move_to_gpu=gpu
    )
    ram_before = get_mem() / (1024**2)
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = GPRegressionModel(train_x, train_y, likelihood)
    model.train(); likelihood.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.25)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    if torch.cuda.is_available():
        mll = mll.cuda()

    def train_fn():
        progress = tqdm.trange(training_iterations, desc=f"Training (rep {rep+1})", leave=False)
        for _ in progress:
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()
            progress.set_postfix(loss=loss.item())
        return None

    start_time = time.time()
    peak_ram = memory_usage(
        (train_fn,),
        max_usage=True,
        retval=False,
        interval=0.01
    )
    elapsed = time.time() - start_time
    vram_peak = torch.cuda.max_memory_allocated() / (1024**2) if torch.cuda.is_available() else None
    ram_delta = peak_ram - ram_before
    model.eval(); likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        pred = likelihood(model(test_x)).mean.cpu()
    mse = torch.mean((pred - test_y.cpu()) ** 2).item()
    mse_l_rff.append(mse)
    time_l_rff.append(elapsed)
    print(
        f"Rep {rep+1}: MSE={mse:.4f}, Time={elapsed:.2f}s, "
        f"RAM before={ram_before:.1f}MB, peak={peak_ram:.1f}MB (Δ={ram_delta:.1f}MB)"
        + (f", VRAM peak={vram_peak:.1f}MB" if vram_peak is not None else "")
    )
    del model, likelihood
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()



=== Replicate 1/11 ===


                                                                             

Rep 1: MSE=0.1001, Time=10.69s, RAM before=0.0MB, peak=828.4MB (Δ=828.4MB), VRAM peak=12689.6MB

=== Replicate 2/11 ===


                                                                             

Rep 2: MSE=0.1009, Time=10.50s, RAM before=0.0MB, peak=978.1MB (Δ=978.1MB), VRAM peak=12814.9MB

=== Replicate 3/11 ===


                                                                             

Rep 3: MSE=0.1017, Time=10.50s, RAM before=0.0MB, peak=978.3MB (Δ=978.3MB), VRAM peak=12814.9MB

=== Replicate 4/11 ===


                                                                             

KeyboardInterrupt: 

In [5]:
print(statistics.mean(mse_l_rff[1:]))
print(statistics.stdev(mse_l_rff[1:]))

print(statistics.mean(time_l_rff[1:]))
print(statistics.stdev(time_l_rff[1:]))


0.10186869353055954
0.000967189484150289
10.747753953933715
0.12633148455706028
