## An optimization test bed.

First, I'll define the usual functions.

In [1]:
def is_pos_def(X):
    '''Report whether every eigenvalue of the matrix X is strictly positive.

    The eigenvalues are obtained with np.linalg.eigvals, so X has to be a
    square matrix. Symmetry of X is not checked here.

    Returns the result of np.all over the comparison "eigenvalue > 0",
    i.e. a truthy value only when all eigenvalues are positive.
    '''
    eigenvalues = np.linalg.eigvals(X)
    all_positive = np.all(eigenvalues > 0)
    return all_positive

def batch_data(data, batch_size):
    '''Shuffle the rows of data and split them into equal-size mini-batches.

    Only the first (n // batch_size) * batch_size rows are used; any
    trailing rows that would not fill a complete batch are dropped before
    shuffling.

    The inputs are:
    data = 2-D array of shape (n, p), one observation per row
    batch_size = number of rows in each mini-batch

    The return is:
    A tuple (batches, n_batches). batches has shape
    (batch_size, p, n_batches) and batches[:, :, b] is the b-th mini-batch,
    each row of which is an intact row of data.
    '''
    n = data.shape[0]
    p = data.shape[1]
    n_batches = n // batch_size
    # keep only as many rows as fill whole batches
    n = n_batches * batch_size
    ind = np.arange(n)
    np.random.shuffle(ind)
    # Group the shuffled rows batch-by-batch first and only then move the
    # batch axis to the end. Reshaping the (n, p) array straight to
    # (batch_size, p, n_batches) would reinterpret the flat buffer and mix
    # values from different rows and columns whenever p > 1.
    grouped = data[ind].reshape(n_batches, batch_size, p)
    return (np.transpose(grouped, (1, 2, 0)), n_batches)

def sghmc(gradU, eta, niter, alpha, theta_0, V_hat, dat, batch_size):
    '''Stochastic Gradient Hamiltonian Monte Carlo sampler, following
    Tianqi Chen, Emily B. Fox, Carlos Guestrin,
    "Stochastic Gradient Hamiltonian Monte Carlo", ICML 2014.

    The inputs are:
    gradU = gradient of U, called as gradU(theta, batch, n, batch_size)
    eta = eps^2 M^(-1)
    niter = number of passes (epochs) over the data; one sample is stored
            per mini-batch, i.e. niter * (n // batch_size) samples in total
    alpha = eps M^(-1) C
    theta_0 = initial val of parameter(s) to be sampled
    V_hat = estimated covariance matrix of stoch grad noise
    dat = data array; dat.shape[0] is the number of observations n
    batch_size = number of observations per mini-batch
    See paper for more details

    The return is:
    A np.array of shape (p, niter * (n // batch_size)) whose columns are
    the successive positions of theta. If one of the checks fails, a
    message is printed and None is returned.'''

    ### Set-up and sanity checks ###
    # dimension of the parameter being sampled
    dim = len(theta_0)
    n_obs = dat.shape[0]
    # storage: one column per stored draw
    draws = np.zeros((dim, niter * (n_obs // batch_size)))
    # beta_hat fixed as described on pg. 6 of the paper
    beta_hat = 0.5 * V_hat @ eta
    # The injected noise is N(0, 2(alpha - beta_hat) @ eta), so this
    # matrix has to be positive definite.
    Sigma = 2 * (alpha - beta_hat) @ eta
    if not is_pos_def(Sigma):
        print("Error: (alpha - beta_hat) eta not pos def")
        return None
    # a mini-batch cannot hold more rows than the data set has
    if batch_size > n_obs:
        print("Error: batch_size must be <= number of data points")
        return None

    zero_mean = np.zeros(dim)

    # starting momentum and position
    nu = np.random.multivariate_normal(zero_mean, eta).reshape(dim, -1)
    theta = theta_0

    # run niter epochs, storing theta after every mini-batch update
    col = 0
    for _ in range(niter):
        batches, n_batches = batch_data(dat, batch_size)

        # fresh momentum at the start of every epoch
        nu = np.random.multivariate_normal(zero_mean, eta).reshape(dim, -1)

        for b in range(n_batches):
            grad = gradU(theta, batches[:, :, b], n_obs, batch_size).reshape(dim, -1)
            noise = np.random.multivariate_normal(zero_mean, Sigma).reshape(dim, -1)
            nu = nu - eta @ grad - alpha @ nu + noise
            theta = theta + nu
            draws[:, col] = theta.reshape(-1, dim)
            col += 1

    return draws

Now import some optimization things.

In [2]:
import numba
from numba import jit
import time

For now, I only look at optimizing the code for the mixture-of-normals problem. I don't care how well the sampler itself performs right now, so I reduce n and niter to make my tests run relatively quickly.

In [3]:
import matplotlib.pyplot as plt
import autograd.numpy as np
from autograd import jacobian
import seaborn as sns

## Example #1:
## Sampling from a mixture of normals in 1-D
## SAMPLING MODEL: x ~ 0.5 * N(mu1, 1) + 0.5 * N(mu2, 1)
## PRIORS: p(mu1) = p(mu2) = N(0,10)

def log_prior(theta):
    '''Log prior of theta up to an additive constant: -(1/20) * theta' theta,
    which is the same independent N(0, 10) term for every entry of theta.'''
    coef = -(1/(2*10))
    scaled = coef * theta.T
    return scaled @ theta
      
def log_lik(theta, x):
    '''Elementwise log-likelihood of x (up to an additive constant) under an
    equal-weight mixture of two unit-variance normals with means theta[0]
    and theta[1].'''
    first_component = 0.5 * np.exp(-0.5 * (theta[0] - x) ** 2)
    second_component = 0.5 * np.exp(-0.5 * (theta[1] - x) ** 2)
    return np.log(first_component + second_component)

def U(theta, x, n, batch_size):
    '''Mini-batch potential energy: the negative log prior minus the
    log-likelihood summed over the batch x, with the likelihood term scaled
    by n / batch_size.'''
    neg_log_prior = -log_prior(theta)
    scale = n/batch_size
    batch_log_lik = sum(log_lik(theta, x))
    return neg_log_prior - scale*batch_log_lik
       
# Automatic differentiation to get the gradient:
# jacobian of U with respect to its first argument (theta)
gradU = jacobian(U, argnum=0)

# Set random seed so the simulated data below are reproducible
np.random.seed(1234)
# Set up the data
p = 2 #dimension of theta
# "true" component means, stored as a (p, 1) column
theta = np.array([-3.0, 3.0]).reshape(p,-1)
n = 100 # draws per mixture component; kept small for testing
# n unit-variance normal draws around each mean, stacked into one
# (2n, 1) column -- so the data set has 2n rows, not n
x = np.array([np.random.normal(theta[0], 1, (n,1)),
              np.random.normal(theta[1], 1, (n,1))]).reshape(-1,1)

## Initialize parameters and sample

# Initialize mean parameters
# (random start, currently disabled:)
#theta_0 = np.random.normal(size=(p,1))
theta_0 = theta # initialize at "true" value for testing

# Initialize tuning parameters:
# learning rate (scaled by 1/n, where n is the per-component count above)
eta = 0.01/n * np.eye(p)
# Friction rate
alpha = 0.1 * np.eye(p)

# Arbitrary guess at covariance of noise from mini-batching the data
V = np.eye(p)*1
# number of passes over the data made by the samplers
niter = 50
batch_size=20 # make this smallish

# Don't actually run sampling algorithm here
# samps = sghmc(gradU, eta, niter, alpha, theta_0, V, x, batch_size)

Try cleaning up the multivariate normal sampling. Pre-calculate the Cholesky decompositions, and use the Cholesky-based sampling method.

In [44]:
def sghmc_cleaned(gradU, eta, niter, alpha, theta_0, V_hat, dat, batch_size):
    '''Define SGHMC as described in the paper
    Tianqi Chen, Emily B. Fox, Carlos Guestrin 
    Stochastic Gradient Hamiltonian Monte Carlo 
    ICML 2014.

    Same algorithm as sghmc, but every multivariate normal draw is made as
    (Cholesky factor) @ (vector of standard normals), with the Cholesky
    factors computed once up front.

    The inputs are:
    gradU = gradient of U, called as gradU(theta, batch, n, batch_size)
    eta = eps^2 M^(-1)
    niter = number of passes (epochs) over the data; one sample is stored
            per mini-batch, i.e. niter * (n // batch_size) samples in total
    alpha = eps M^(-1) C
    theta_0 = initial val of parameter(s) to be sampled
    V_hat = estimated covariance matrix of stoch grad noise
    dat = data array; dat.shape[0] is the number of observations n
    batch_size = number of observations per mini-batch
    See paper for more details

    The return is:
    A np.array of shape (p, niter * (n // batch_size)) of positions of
    theta. If one of the checks fails, a message is printed and None is
    returned.'''

    ### Initialization and checks ###
    # get dimension of the thing you're sampling
    p = len(theta_0)
    # set up matrix of 0s to hold samples
    n = dat.shape[0]
    theta_samps = np.zeros((p, niter*(n // batch_size)))
    # fix beta_hat as described on pg. 6 of paper
    beta_hat = 0.5 * V_hat @ eta
    # We're sampling from a N(0, 2(alpha - beta_hat) @ eta)
    # so this must be a positive definite matrix
    Sigma = 2 * (alpha - beta_hat) @ eta
    if not is_pos_def(Sigma):
        print("Error: (alpha - beta_hat) eta not pos def")
        return
    # Need batch size to be <= the amount of data
    if batch_size > n:
        print("Error: batch_size must be <= number of data points")
        return

    # Cholesky factors for the MV normal draws. These are computed only
    # after Sigma has passed the check above: factoring first would raise
    # LinAlgError on a non-positive-definite Sigma and the error message
    # above could never be reached.
    Sig_chol = np.linalg.cholesky(Sigma)
    eta_chol = np.linalg.cholesky(eta)

    # initialize theta; nu is drawn at the start of every epoch below, so
    # no separate initial draw of nu is needed
    theta = theta_0

    # loop through algorithm to get niter epochs of samples
    it = 0
    for i in range(niter):
        dat_resh, nbatches = batch_data(dat, batch_size)

        # Resample momentum every epoch
        nu = eta_chol @ np.random.normal(size=p).reshape(p,-1) # sample from MV normal

        for batch in range(nbatches):
            gradU_batch = gradU(theta, dat_resh[:,:,batch], n, batch_size).reshape(p,-1)
            nu = nu - eta @ gradU_batch - alpha @ nu + \
                 Sig_chol @ np.random.normal(size=p).reshape(p,-1) # sample from MV normal
            theta = theta + nu
            theta_samps[:,it] = theta.reshape(-1,p)
            it = it + 1

    return theta_samps

Just try throwing `@jit` in front of things and see how much improvement we get.

In [45]:
@jit
def batch_data_numba(data, batch_size):
    '''Shuffle the rows of data and split them into equal-size mini-batches
    (the @jit-decorated counterpart of batch_data).

    Only the first (n // batch_size) * batch_size rows are used; any
    trailing rows that would not fill a complete batch are dropped before
    shuffling.

    The inputs are:
    data = 2-D array of shape (n, p), one observation per row
    batch_size = number of rows in each mini-batch

    The return is:
    A tuple (batches, n_batches). batches has shape
    (batch_size, p, n_batches) and batches[:, :, b] is the b-th mini-batch,
    each row of which is an intact row of data.
    '''
    n = data.shape[0]
    p = data.shape[1]
    n_batches = n // batch_size
    # keep only as many rows as fill whole batches
    n = n_batches * batch_size
    ind = np.arange(n)
    np.random.shuffle(ind)
    # Group the shuffled rows batch-by-batch first and only then move the
    # batch axis to the end. Reshaping the (n, p) array straight to
    # (batch_size, p, n_batches) would reinterpret the flat buffer and mix
    # values from different rows and columns whenever p > 1.
    grouped = data[ind].reshape(n_batches, batch_size, p)
    return (np.transpose(grouped, (1, 2, 0)), n_batches)

In [46]:
@jit
def sghmc_numba(gradU, eta, niter, alpha, theta_0, V_hat, dat, batch_size):
    '''Define SGHMC as described in the paper
    Tianqi Chen, Emily B. Fox, Carlos Guestrin 
    Stochastic Gradient Hamiltonian Monte Carlo 
    ICML 2014.

    @jit-decorated version of the Cholesky-based sampler; mini-batches
    come from batch_data_numba.

    The inputs are:
    gradU = gradient of U, called as gradU(theta, batch, n, batch_size)
    eta = eps^2 M^(-1)
    niter = number of passes (epochs) over the data; one sample is stored
            per mini-batch, i.e. niter * (n // batch_size) samples in total
    alpha = eps M^(-1) C
    theta_0 = initial val of parameter(s) to be sampled
    V_hat = estimated covariance matrix of stoch grad noise
    dat = data array; dat.shape[0] is the number of observations n
    batch_size = number of observations per mini-batch
    See paper for more details

    The return is:
    A np.array of shape (p, niter * (n // batch_size)) of positions of
    theta. If one of the checks fails, a message is printed and None is
    returned.'''

    ### Initialization and checks ###
    # get dimension of the thing you're sampling
    p = len(theta_0)
    # set up matrix of 0s to hold samples
    n = dat.shape[0]
    theta_samps = np.zeros((p, niter*(n // batch_size)))
    # fix beta_hat as described on pg. 6 of paper
    beta_hat = 0.5 * V_hat @ eta
    # We're sampling from a N(0, 2(alpha - beta_hat) @ eta)
    # so this must be a positive definite matrix
    Sigma = 2 * (alpha - beta_hat) @ eta
    if not is_pos_def(Sigma):
        print("Error: (alpha - beta_hat) eta not pos def")
        return
    # Need batch size to be <= the amount of data
    if batch_size > n:
        print("Error: batch_size must be <= number of data points")
        return

    # Cholesky factors for the MV normal draws. These are computed only
    # after Sigma has passed the check above: factoring first would raise
    # LinAlgError on a non-positive-definite Sigma and the error message
    # above could never be reached.
    Sig_chol = np.linalg.cholesky(Sigma)
    eta_chol = np.linalg.cholesky(eta)

    # initialize theta; nu is drawn at the start of every epoch below, so
    # no separate initial draw of nu is needed
    theta = theta_0

    # loop through algorithm to get niter epochs of samples
    it = 0
    for i in range(niter):
        dat_resh, nbatches = batch_data_numba(dat, batch_size)

        # Resample momentum every epoch
        nu = eta_chol @ np.random.normal(size=p).reshape(p,-1) # sample from MV normal

        for batch in range(nbatches):
            gradU_batch = gradU(theta, dat_resh[:,:,batch], n, batch_size).reshape(p,-1)
            nu = nu - eta @ gradU_batch - alpha @ nu + \
                 Sig_chol @ np.random.normal(size=p).reshape(p,-1) # sample from MV normal
            theta = theta + nu
            theta_samps[:,it] = theta.reshape(-1,p)
            it = it + 1

    return theta_samps

Maybe the batch_data_numba version won't actually help things... It may just add overhead.

In [47]:
@jit
def sghmc_numba2(gradU, eta, niter, alpha, theta_0, V_hat, dat, batch_size):
    '''Define SGHMC as described in the paper
    Tianqi Chen, Emily B. Fox, Carlos Guestrin 
    Stochastic Gradient Hamiltonian Monte Carlo 
    ICML 2014.

    @jit-decorated version of the Cholesky-based sampler; mini-batches
    come from the plain (undecorated) batch_data.

    The inputs are:
    gradU = gradient of U, called as gradU(theta, batch, n, batch_size)
    eta = eps^2 M^(-1)
    niter = number of passes (epochs) over the data; one sample is stored
            per mini-batch, i.e. niter * (n // batch_size) samples in total
    alpha = eps M^(-1) C
    theta_0 = initial val of parameter(s) to be sampled
    V_hat = estimated covariance matrix of stoch grad noise
    dat = data array; dat.shape[0] is the number of observations n
    batch_size = number of observations per mini-batch
    See paper for more details

    The return is:
    A np.array of shape (p, niter * (n // batch_size)) of positions of
    theta. If one of the checks fails, a message is printed and None is
    returned.'''

    ### Initialization and checks ###
    # get dimension of the thing you're sampling
    p = len(theta_0)
    # set up matrix of 0s to hold samples
    n = dat.shape[0]
    theta_samps = np.zeros((p, niter*(n // batch_size)))
    # fix beta_hat as described on pg. 6 of paper
    beta_hat = 0.5 * V_hat @ eta
    # We're sampling from a N(0, 2(alpha - beta_hat) @ eta)
    # so this must be a positive definite matrix
    Sigma = 2 * (alpha - beta_hat) @ eta
    if not is_pos_def(Sigma):
        print("Error: (alpha - beta_hat) eta not pos def")
        return
    # Need batch size to be <= the amount of data
    if batch_size > n:
        print("Error: batch_size must be <= number of data points")
        return

    # Cholesky factors for the MV normal draws. These are computed only
    # after Sigma has passed the check above: factoring first would raise
    # LinAlgError on a non-positive-definite Sigma and the error message
    # above could never be reached.
    Sig_chol = np.linalg.cholesky(Sigma)
    eta_chol = np.linalg.cholesky(eta)

    # initialize theta; nu is drawn at the start of every epoch below, so
    # no separate initial draw of nu is needed
    theta = theta_0

    # loop through algorithm to get niter epochs of samples
    it = 0
    for i in range(niter):
        dat_resh, nbatches = batch_data(dat, batch_size) # use original batch_data

        # Resample momentum every epoch
        nu = eta_chol @ np.random.normal(size=p).reshape(p,-1) # sample from MV normal

        for batch in range(nbatches):
            gradU_batch = gradU(theta, dat_resh[:,:,batch], n, batch_size).reshape(p,-1)
            nu = nu - eta @ gradU_batch - alpha @ nu + \
                 Sig_chol @ np.random.normal(size=p).reshape(p,-1) # sample from MV normal
            theta = theta + nu
            theta_samps[:,it] = theta.reshape(-1,p)
            it = it + 1

    return theta_samps

Look at the run times for the above methods:

In [48]:
%timeit sghmc(gradU, eta, niter, alpha, theta_0, V, x, batch_size)

1.69 s ± 26.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [49]:
%timeit sghmc_cleaned(gradU, eta, niter, alpha, theta_0, V, x, batch_size)

1.52 s ± 18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [50]:
%timeit sghmc_numba(gradU, eta, niter, alpha, theta_0, V, x, batch_size) # use batch_data_numba

1.5 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [51]:
%timeit sghmc_numba2(gradU, eta, niter, alpha, theta_0, V, x, batch_size) # use batch_data

1.51 s ± 9.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


The next step should be something fancier, such as parallelizing the loop over batches or converting things to C++/Cython.