# Gaussian Process Surrogate Modeling for Molecular Dynamics Optimization

## Imports and Functions

In [1]:
# Math Packages
import numpy as np
import torch as torch
from scipy import interpolate
from scipy.optimize import minimize
import time as time
import pandas as pd

# Plotting
import matplotlib.pyplot as plt  
from matplotlib.pyplot import figure

# Data saving packages
from pickle import dump, load

# Parallelization setup
from multiprocessing import Pool
import multiprocessing as mp
import os
# force=True keeps this cell re-runnable: without it, a second call to
# set_start_method in the same kernel raises
# "RuntimeError: context has already been set".
mp.set_start_method('fork', force=True)
# Ask OpenMP-backed libraries to use a single thread per process.
# NOTE(review): this is set after numpy/torch were imported above, so it may
# not affect thread pools that are already initialised -- confirm, or move
# this line above those imports.
os.environ["OMP_NUM_THREADS"] = "1"

In [2]:
def se_kernel(x1, x2, l, width):
    """
    Squared exponential kernel matrix between two sets of points.

    Every input dimension is divided by its own lengthscale before the
    pairwise Euclidean distance is taken, i.e.

        K[i, j] = width**2 * exp(-0.5 * ||x1[i]/l - x2[j]/l||**2)

    Parameters
    ----------

    x1: Tensor [N,D]
        Feature vectors for N samples with D dimensions each

    x2: Tensor [M,D]
        Feature vectors for M samples with D dimensions each

    l: Tensor [D]
        Lengthscale hyper parameter (one entry per input dimension)

    width: Float
        Width hyper parameter; enters the kernel as width**2

    Returns
    -------

    K: Tensor [N,M]
        Kernel value for every (x1 row, x2 row) pair
    """
    scaled_dist = torch.cdist(x1 / l, x2 / l, p=2)
    sq_dist = scaled_dist ** 2
    return width**2 * torch.exp(-sq_dist / 2)

def surrogate(Xi, Xd, l, width, y, KddInv):
    """
    Gaussian process posterior mean at the query points Xi, using a constant
    prior mean of 1 (the training targets are centred with y - 1 and 1 is
    added back to the prediction).

    Parameters
    ----------

    Xi: Tensor [η*M,D]
        Query points: M potential samples at η r evaluations with D dimensions each.

    Xd: Tensor [N,D]
        Training inputs: N samples with D dimensions each.

    l: Tensor [D]
        Lengthscale hyper parameter.

    width: Float
        Width hyper parameter.

    y: Tensor [N,1]
        Training targets corresponding to the rows of Xd.

    KddInv: Tensor [N,N]
        Inverse of the kernel matrix of the training set Xd.

    Returns
    -------

    μ: Tensor [η*M,1]
        Mean estimate for every row of Xi (one row per query point, one
        column per column of y).

    """
    cross_cov = se_kernel(Xi, Xd, l, width)
    centred_targets = y - 1
    weights = cross_cov @ KddInv
    return 1 + (weights @ centred_targets)


def subset_surrogate(Xi, Xd, l, width, y, KddInv):
    """
    Subset Gaussian process posterior mean: one kernel over the potential
    parameters is shared by every output column of y, with a constant prior
    mean of 1 (targets are centred with y - 1 and 1 is added back).

    Parameters
    ----------

    Xi: Tensor [M,D]
        Query points: M samples with D dimensions each.

    Xd: Tensor [N,D]
        Training inputs: N samples with D dimensions each.

    l: Tensor [D]
        Lengthscale hyper parameter.

    width: Float
        Width hyper parameter.

    y: Tensor [N,η]
        Training targets; row n holds the η output values for Xd[n].

    KddInv: Tensor [N,N]
        Inverse of the kernel matrix of the training set Xd.

    Returns
    -------

    μ: Tensor [η,M]
        Mean estimate for every query point. Each column holds the η output
        values predicted for one row of Xi.

    """
    cross_cov = se_kernel(Xi, Xd, l, width)
    weights = cross_cov @ KddInv
    centred_mean = weights @ (y - 1)
    return 1 + centred_mean.T

## Importing the training data

In [3]:
# Import the training data generated by 1_sample_gen.ipynb

rmin = 0.0118331810091873
rmax = 15.512161254882812
rnum = 73

# NOTE: unpickling can execute arbitrary code -- only load files you generated yourself.
# `with` closes each file handle as soon as its contents have been read.
with open('training_data/samples.p', 'rb') as samples_file:
    input_dict = load(samples_file)
xd = input_dict['xs']

with open('training_data/training_rdf.p', 'rb') as rdf_file:
    input_dict = load(rdf_file)
r_raw = input_dict['r']
model_rdf_raw = input_dict['model_rdf']

model_rdf = torch.zeros(len(model_rdf_raw),rnum)
r = np.linspace(rmin,rmax,rnum)

# print("Old length: ", len(r_raw))
# print("New length: ", len(r))

# Interpolate every training RDF onto the common r grid defined above
# (s=0 gives an interpolating spline, i.e. no smoothing).
for i in range(len(model_rdf_raw)):
    rdf_i = interpolate.splrep(r_raw, model_rdf_raw[i], s=0)
    model_rdf[i] = torch.from_numpy(interpolate.splev(r, rdf_i, der=0))

#model_rdf_μ = torch.sum(model_rdf,dim=0)/len(model_rdf)

#plt.plot(r,model_rdf_μ)
#plt.show()

print(rmax-0.02)
print(np.pi/15)

15.492161254882813
0.20943951023931953


In [4]:
experimentalCSVFilename = 'exp_data/ne_42K_rdf_new.csv'

# Read the experimental RDF; note that the g column header carries a leading space.
data = pd.read_csv(experimentalCSVFilename)
r_exp_raw = data['r'].to_numpy(copy=True)
rdf_exp_raw = data[' g'].to_numpy(copy=True)

# Same r grid as the one used for the simulated training RDFs
rmin, rmax, rnum = 0.0118331810091873, 15.512161254882812, 73

# Interpolate the experimental data onto that grid so that it is consistent
# with the simulations (s=0: interpolating spline, no smoothing).
r = torch.tensor(np.linspace(rmin, rmax, num=rnum))
rdf_exp_i = interpolate.splrep(x=r_exp_raw, y=rdf_exp_raw, s=0)
rdf_exp_on_grid = interpolate.splev(r, rdf_exp_i, der=0)
rdf_exp = torch.from_numpy(rdf_exp_on_grid)

# print("Old length: ", len(r_exp_raw))
# print("New length: ", len(r))

# figure(figsize = (12,10),dpi=80)
# plt.title("Experimental")
# plt.scatter(r_exp_raw,rdf_exp_raw,alpha=0.4)
# plt.plot(r, rdf_exp)
# plt.xlim(rmin,rmax)
# plt.xlabel("$\AA$")
# plt.show()

In [5]:
# Visualization of training set

# figure(figsize = (12,10),dpi=80)
# plt.title("GP Training Set")
# for i in range(len(model_rdf_raw)):
#     plt.plot(r,model_rdf[i],alpha=0.4)
# plt.xlim(rmin,rmax)
# plt.xlabel("$\AA^{-1}$")
# plt.show()

# fig, axs = plt.subplots(2, 2, figsize=(14, 8))
# plt.suptitle('Potential Parameter Distributions')
# axs[0,0].scatter(xd[:, 0], xd[:, 1],label="Samples")
# axs[0,0].set_xlabel('n')
# axs[0,0].set_ylabel('σ')
# axs[0,1].scatter(xd[:, 0], xd[:, 2],label="Samples")
# axs[0,1].set_xlabel('n')
# axs[0,1].set_ylabel('ϵ')
# axs[1,0].scatter(xd[:, 1], xd[:, 2],label="Samples")
# axs[1,0].set_xlabel('σ')
# axs[1,0].set_ylabel('ϵ')
# plt.show()

## Basic Matrices for GP Calculations

In [6]:
n = len(xd)   # number of training potentials
η = len(r)    # number of r grid points per RDF

# Classic GP: one training row per (potential, r) pair.
XdClassic = torch.zeros(n * η, 4)
yClassic = torch.zeros(n * η)

k = 0  # running row index into XdClassic / yClassic
for i in range(n):
    potential_i = xd[i]
    for j in range(η):
        # Row layout: (n, σ, ϵ, r)
        XdClassic[k] = torch.tensor([potential_i[0], potential_i[1], potential_i[2], r[j]])
        yClassic[k] = model_rdf[i][j]
        k += 1

# Turn the flat target vector into a column vector of shape [n*η, 1]
yClassic = yClassic.reshape(-1, 1)

# Subset GP: inputs are only the potential parameters, targets are whole RDFs
XdSubset = torch.tensor(xd).float()
ySubset = model_rdf.float()

# Index vector 0..n-1, used later to mask out one training point at a time
indexSubset = torch.arange(n)

## Choosing the hyperparameters based on leave-one-out (LOO) error and log marginal likelihood (LMLH)

In [7]:
def neg_log_marginal_LH_subset(arr, Xd, y):
    """
    Computes the NEGATIVE log marginal likelihood of one gaussian process in a
    subset of gaussian processes for a set of hyper parameters. A constant
    prior mean of 1 is assumed (the targets are centred as y - 1).

    Parameters
    ----------

    arr: Array-like [5]
        Hyper parameters (l_0, l_1, l_2, width, σn): three lengthscales, the
        kernel width, and the value added to the diagonal of the kernel
        matrix. May be a list, numpy array or tensor.

    Xd: Tensor [N,D]
        Feature vector for N samples with D dimensions each. This corresponds to the points you trained on.

    y: Tensor [N,1]
        Output column corresponding to the Xd training set. Must be a single
        column, because the result is reduced to one scalar with .item().

    Returns
    -------

    nll: Float
        The negative log marginal likelihood for the set of hyper parameters.
    """
    # as_tensor accepts lists, arrays and tensors alike and avoids the
    # "copy construct from a tensor" UserWarning that torch.tensor(tensor)
    # emits when arr is a row of a tensor.
    hyper = torch.as_tensor(arr, dtype=torch.float32)
    l, w, σn = hyper[:3], hyper[3], hyper[4]

    Kdd = w**2 * torch.exp(-(torch.cdist(Xd/l,Xd/l,p=2)**2)/2) + σn*torch.eye(len(Xd))
    centred = y - 1

    # Only log|det Kdd| is used; the sign is ignored, so a numerically
    # non-positive-definite Kdd is not detected here.
    _sign, AbsKddLogDet = torch.slogdet(Kdd)

    # Solve Kdd @ alpha = (y - 1) instead of forming the explicit inverse:
    # cheaper and numerically better behaved for the quadratic form.
    alpha = torch.linalg.solve(Kdd, centred)

    out = (0.5 * centred.T @ alpha + 0.5 * AbsKddLogDet + 0.5*len(y)*np.log(2*np.pi))
    return out.item()

In [8]:
# Random search over the subset-GP hyper parameters (l_0, l_1, l_2, width, σn).
# For every candidate we store the leave-one-out predictions and the log
# marginal likelihood of each per-r GP. Results are cached on disk because the
# search is expensive.
hyperParamCachePath = 'training_data/hyperParamTrainingSubsetRDF.p'

try:
    print("Attempting to load previously calculated hyper parameters")

    # NOTE: unpickling can execute arbitrary code -- only load caches you generated yourself.
    with open(hyperParamCachePath, 'rb') as cacheFile:
        input_dict = load(cacheFile)
    μArrSubset = input_dict['μArrSubset']
    logMarginalLHArrSubset = input_dict['logMarginalLHArrSubset']
    hyperParamOptionsSubset = input_dict['hyperParamOptionsSubset']

    print("Success!!!")

# Only a missing / unreadable / incomplete cache triggers a recomputation.
# A bare `except:` would also swallow KeyboardInterrupt and genuine bugs.
except (OSError, EOFError, KeyError) as cacheErr:
    print("Could not load cached hyper parameters (" + repr(cacheErr) + "), recomputing...")

    trials = 1_000
    nTrain = len(XdSubset)        # number of training potentials (was hard-coded as 480)
    nPoints = ySubset.shape[1]    # number of r grid points, i.e. number of per-r GPs

    # Uniform samples within hand-picked bounds for each hyper parameter
    hyperParamOptionsSubset = torch.zeros((trials,5))
    hyperParamOptionsSubset[:,0] = (4 - 0.5) * torch.rand(trials) + 0.5
    hyperParamOptionsSubset[:,1] = (1.3 - 0.01) * torch.rand(trials) + 0.01
    hyperParamOptionsSubset[:,2] = (0.1 - 0.01) * torch.rand(trials) + 0.01
    hyperParamOptionsSubset[:,3] = (0.08 - 0.0001) * torch.rand(trials) + 0.0001
    hyperParamOptionsSubset[:,4] = (0.01 - 0.00000001) * torch.rand(trials) + 0.00000001

    μArrSubset = torch.zeros((trials,nTrain,nPoints))
    logMarginalLHArrSubset = torch.zeros((trials,nPoints))

    for j in range(trials):

        if j % 50 == 0:
            print("Starting Iteration:", j)

        # Calculate Kdd for Subset GP with hyper parameter index j.
        # Slicing the row avoids torch.tensor(tensor) copy-construct warnings.
        arrSubset = hyperParamOptionsSubset[j]
        lSubset = arrSubset[:3]
        wSubset = arrSubset[3]
        σnSubset = arrSubset[4]
        KddSubset = se_kernel(XdSubset,XdSubset,lSubset,wSubset) + torch.eye(nTrain)*σnSubset

        # Leave-one-out: drop training point i, refit, and predict it back
        for i in range(nTrain):

            keep = indexSubset != i

            # Remove row i and column i from the kernel matrix, then invert
            KddSubset_i = KddSubset[keep][:, keep]
            KddInvSubset_i = torch.linalg.inv(KddSubset_i)

            # Remove the same values from y
            ySubset_i = ySubset[keep]

            # Again for X data
            XdSubset_i = XdSubset[keep]
            XiSubset = XdSubset[i].unsqueeze(dim=0)

            # Compute the predictions after leaving one out; result is [nPoints, 1]
            μSubset = subset_surrogate(XiSubset,XdSubset_i,lSubset,wSubset,ySubset_i,KddInvSubset_i)
            μArrSubset[j, i] = μSubset[:, 0]

        # Log marginal likelihood of each per-r GP, trained on all points
        for k in range(nPoints):
            logMarginalLHArrSubset[j, k] = - neg_log_marginal_LH_subset(arrSubset,XdSubset,ySubset[:, k:k+1])

    output_dict = dict(μArrSubset = μArrSubset, hyperParamOptionsSubset = hyperParamOptionsSubset, logMarginalLHArrSubset = logMarginalLHArrSubset)
    with open(hyperParamCachePath, 'wb') as cacheFile:
        dump(output_dict, cacheFile)

Attempting to load previously calculated hyper parameters
Success!!!


In [9]:
# Compute the leave one out error (sum of squared residuals over all training
# examples and r points) for each hyper parameter set. The array is sized from
# the data rather than a hard-coded 1000 so that every candidate is scored and
# no unfilled zero entry can win the argmin.
numHyperParams = len(hyperParamOptionsSubset)
LooErr = torch.zeros(numHyperParams)
for i in range(numHyperParams):
    LooErr[i] = torch.sum((μArrSubset[i] - ySubset)**2)
# Grab the one with the minimum error
LooIndex = torch.argmin(LooErr)

# Sum the log marginal likelihood contributions over each GP (one per r point).
lmlhTotals = torch.sum(logMarginalLHArrSubset,dim=1)

print("Hyper parameters corresponding to the minimum leave one out error: ", hyperParamOptionsSubset[LooIndex])
print("Average error per training example per point corresponding to the minimum leave one out error: ", LooErr[LooIndex].item()/len(model_rdf)/len(r))
print("LMLH of minimum LOO error:",lmlhTotals[LooIndex].item())
print()
# Grab the one with the largest sum, aka the largest probability
LMLHIndex = torch.argmax(lmlhTotals)
print("Hyper parameters corresponding to the maximum LMLH: ", hyperParamOptionsSubset[LMLHIndex])
print("Maxium LMLH sum: ", lmlhTotals[LMLHIndex].item())
print("Leave one out error corresponding to the LMLH: ", LooErr[LMLHIndex].item()/len(model_rdf)/len(r))

Hyper parameters corresponding to the minimum leave one out error:  tensor([3.3331e+00, 1.1413e-01, 7.7208e-02, 5.7238e-02, 2.1267e-04])
Average error per training example per point corresponding to the minimum leave one out error:  0.2325908364770619
LMLH of minimum LOO error: -3269756.0

Hyper parameters corresponding to the maximum LMLH:  tensor([3.6515, 0.2952, 0.0621, 0.0799, 0.0095])
Maxium LMLH sum:  -894538.875
Leave one out error corresponding to the LMLH:  0.5639863593393264


In [10]:
# Define these variables for later use when timing and validating.
# The values are the minimum-leave-one-out-error hyper parameters printed by
# the selection cell above, copied by hand (so they are rounded to the printed
# precision and will NOT update automatically if the search is re-run).
lSubset = torch.tensor([3.3331e+00, 1.1413e-01, 7.7208e-02])  # lengthscales, one per potential parameter
wSubset =  5.7238e-02  # kernel width (enters the kernel as width**2)
σnSubset = 2.1267e-04  # value added to the diagonal of Kdd

## Timing Code

In [11]:
# Timing of the classic GP (one training row per (potential, r) pair).
# Results are cached on disk because a single inversion is slow.
try:
    # NOTE: unpickling can execute arbitrary code -- only load caches you generated yourself.
    with open('training_data/ClassicGPTimesRDF', 'rb') as timesFile:
        input_dict = load(timesFile)
    evaluationTimesClassic = input_dict['evaluationTimesClassic']
    inversionTimesClassic = input_dict['inversionTimesClassic']

    print("Loaded previously computed times..")

    print("Average inversion time for classic GP:", np.mean(inversionTimesClassic))
    print()

    # Label fixed: these are the classic GP evaluation times, not the subset GP ones.
    print("Average evaluation time for classic GP:", np.mean(evaluationTimesClassic))
    print()

# Get time taken for regular GP. Only a missing / unreadable / incomplete cache
# triggers the measurement; a bare `except:` would also swallow
# KeyboardInterrupt and genuine bugs.
except (OSError, EOFError, KeyError):
    evaluationTimesClassic = []
    inversionTimesClassic = []
    N_trialsClassic = 20

    # No need to apply good hps here, we only care about the time
    lClassic = torch.ones(len(XdClassic[0]))

    print("Timing Kdd inversion...")

    # The loop variable is deliberately not called `n`: that name holds the
    # number of training potentials and must not be overwritten.
    for trial in range(N_trialsClassic):

        # perf_counter is a monotonic, high-resolution clock meant for intervals
        t1 = time.perf_counter()

        KddClassic = se_kernel(XdClassic,XdClassic,lClassic,1) + 2*torch.eye(len(XdClassic))
        KddInvClassic = torch.linalg.inv(KddClassic)

        t2 = time.perf_counter()

        inversionTimesClassic.append(t2-t1)
    print("Average inversion time for classic GP:", np.mean(inversionTimesClassic))
    print()

    print("Timing surrogate evaluation...")

    # One full RDF: the first len(r) rows all belong to the first potential
    Xi = XdClassic[:len(r)]

    for trial in range(N_trialsClassic):

        t1 = time.perf_counter()

        μ = surrogate(Xi,XdClassic,lClassic,1,yClassic,KddInvClassic)

        t2 = time.perf_counter()

        evaluationTimesClassic.append(t2-t1)

    print("Average evaluation time for classic GP:", np.mean(evaluationTimesClassic))
    print()

    output_dict = dict(evaluationTimesClassic = evaluationTimesClassic, inversionTimesClassic = inversionTimesClassic)
    with open('training_data/ClassicGPTimesRDF', 'wb') as timesFile:
        dump(output_dict, timesFile)

Loaded previously computed times..
Average inversion time for classic GP: 87.47960150241852

Average evaluation time for subset GP: 1.0622711539268495



In [12]:
# Timing of the subset GP (one training row per potential, whole RDF as target).
# Results are cached on disk so the numbers stay comparable between runs.
try:
    # NOTE: unpickling can execute arbitrary code -- only load caches you generated yourself.
    with open('training_data/SubsetGPTimesRDF', 'rb') as timesFile:
        input_dict = load(timesFile)
    evaluationTimesSubset = input_dict['evaluationTimesSubset']
    inversionTimesSubset = input_dict['inversionTimesSubset']

    print("Loaded previously computed times..")
    print()

    print("Average inversion time for subset GP:", np.mean(inversionTimesSubset))
    print()

    print("Average evaluation time for subset GP:", np.mean(evaluationTimesSubset))
    print()

# Get time taken for subset GP. Only a missing / unreadable / incomplete cache
# triggers the measurement; a bare `except:` would also swallow
# KeyboardInterrupt and genuine bugs.
except (OSError, EOFError, KeyError):

    evaluationTimesSubset = []
    inversionTimesSubset = []

    N_trials = 1000

    print("Timing Kdd inversion...")
    # The loop variable is deliberately not called `n`: that name holds the
    # number of training potentials and must not be overwritten.
    for trial in range(N_trials):

        # perf_counter is a monotonic, high-resolution clock meant for intervals
        t1 = time.perf_counter()

        KddSubset = se_kernel(XdSubset,XdSubset,lSubset,wSubset) + σnSubset*torch.eye(len(XdSubset))
        KddInvSubset = torch.linalg.inv(KddSubset)

        t2 = time.perf_counter()

        inversionTimesSubset.append(t2-t1)
    print("Average inversion time for subset GP:", np.mean(inversionTimesSubset))
    print()

    # Single query point: the first three entries of the first training potential, shape [1,3]
    XiSubset = XdSubset[0, :3].unsqueeze(0)

    print("Timing surrogate evaluation...")
    for trial in range(N_trials):

        t1 = time.perf_counter()

        μ = subset_surrogate(XiSubset,XdSubset,lSubset,wSubset,ySubset,KddInvSubset)

        t2 = time.perf_counter()

        evaluationTimesSubset.append(t2-t1)


    print("Average evaluation time for subset GP:", np.mean(evaluationTimesSubset))
    print()

    output_dict = dict(evaluationTimesSubset = evaluationTimesSubset, inversionTimesSubset = inversionTimesSubset)
    with open('training_data/SubsetGPTimesRDF', 'wb') as timesFile:
        dump(output_dict, timesFile)

Loaded previously computed times..

Average inversion time for subset GP: 0.02091193389892578

Average evaluation time for subset GP: 0.00029948973655700685



In [13]:
numMCMCSamps = 12_000
numGridSamps = 50**4

# Mean wall-clock time of one call, in seconds, as measured by the timing cells
meanEvalSubset = np.mean(evaluationTimesSubset)
meanEvalClassic = np.mean(evaluationTimesClassic)
meanInvSubset = np.mean(inversionTimesSubset)
meanInvClassic = np.mean(inversionTimesClassic)

# NOTE(review): presumably the wall-clock time of one simulation in seconds -- confirm the source.
simTime = 86.2483875498

# Note these assume no overhead.
mcmcMinsSubset = np.round(numMCMCSamps*meanEvalSubset/60,4)
mcmcMinsClassic = np.round(numMCMCSamps*meanEvalClassic/60,4)
print(f"Expected time for model calls during MCMC with subset: {mcmcMinsSubset!s} Mins")
print(f"Expected time for model calls during MCMC with regular gp: {mcmcMinsClassic!s} Mins")
print()

gridHoursSubset = np.round(numGridSamps*meanEvalSubset/60/60,4)
gridHoursClassic = np.round(numGridSamps*meanEvalClassic/60/60,4)
print(f"Expected time for model calls for grid with subset: {gridHoursSubset!s} Hours")
print(f"Expected time for model calls for grid with regular gp: {gridHoursClassic!s} Hours")
print()

print("Evaluation Speed up Classic vs Subset:", meanEvalClassic/meanEvalSubset)
print("Inversion Speed up Classic vs Subset:", meanInvClassic/meanInvSubset)
print("Evaluation Speed up Classic vs Sim:", simTime /meanEvalClassic)
print("Evaluation Speed up Subset vs Sim:", simTime /meanEvalSubset)
print()

Expected time for model calls during MCMC with subset: 0.0599 Mins
Expected time for model calls during MCMC with regular gp: 212.4542 Mins

Expected time for model calls for grid with subset: 0.5199 Hours
Expected time for model calls for grid with regular gp: 1844.2208 Hours

Evaluation Speed up Classic vs Subset: 3546.9367536227733
Inversion Speed up Classic vs Subset: 4183.238237325923
Evaluation Speed up Classic vs Sim: 81.19244058446802
Evaluation Speed up Subset vs Sim: 287984.4516253829

