In [1]:
from collections import defaultdict
from scipy import stats
from sklearn.datasets import load_digits
import numpy as np

In [2]:
class MCMCSampler():
    """Common state and helpers shared by the samplers defined in this file.

    Holds the log-density callable, the proposal covariance, the data and any
    extra keyword arguments that are forwarded to ``log_pstar``.  Subclasses
    are expected to populate ``self.samples`` (``get_samples`` reads it).
    """

    def __init__(self, log_pstar, covariance, data, **kwargs):
        """Store the target log-density, proposal covariance, data and extras."""
        self.log_pstar, self.covariance = log_pstar, covariance
        self.data, self.kwargs          = data, kwargs

    def transition(self, theta):
        """Draw one proposal from a multivariate normal centred on ``theta``."""
        proposal = stats.multivariate_normal(mean = theta, cov = self.covariance)
        return proposal.rvs()

    def get_samples(self, burn_period = 0.2):
        """Return the stored samples with the leading burn-in rows removed.

        A float ``burn_period`` is read as a fraction of the number of stored
        samples (one extra row is always dropped); any other value is used
        directly as the slice start.
        """
        start = burn_period
        if isinstance(start, float):
            # fraction of the chain, plus one row
            start = int(start * len(self.samples) + 1)
        return self.samples[start:]

def check_verbose(N, verbose):
    """Resolve ``verbose`` into an integer reporting interval for ``N`` steps.

    Parameters
    ----------
    N : int
        Total number of iterations.
    verbose : 'auto', float or int
        * ``'auto'`` picks an interval of the form 1, 2 or 5 times a power of
          ten such that roughly ten progress lines are produced.
        * a float is read as a fraction of ``N``.
        * an int is used as-is.

    Returns
    -------
    int
        The interval, guaranteed to satisfy ``0 < interval < N``.

    Raises
    ------
    ValueError
        If ``verbose`` is a string other than ``'auto'``.
    AssertionError
        If the resolved interval is not an int in ``(0, N)``.
    """
    if isinstance(verbose, str):
        if verbose != 'auto':
            raise ValueError(
                f"verbose must be 'auto', a float fraction or an int, got {verbose!r}"
            )
        # order of magnitude one below N, then choose the 1/2/5 multiple
        # that gives the number of reports closest to ten
        temp    = 10 ** np.floor(np.log10(N) - 1)
        k       = np.array([1, 2, 5])
        arg     = np.fabs(N // temp / k - 10).argmin()
        # for N < 10 the product rounds down to 0; report every step instead
        verbose = max(int(temp * k[arg]), 1)
    elif isinstance(verbose, float):
        # fraction of N, rounded to the nearest integer
        verbose = int(N * verbose + 0.5)

    assert isinstance(verbose, int)
    assert 0 < verbose < N

    return verbose

def softmax(z):
    """Softmax of ``z`` along its last axis.

    The per-row maximum is subtracted before exponentiating so that the
    largest exponent is ``exp(0)``, which avoids overflow.
    """
    shifted = z - z.max(axis = -1, keepdims = True)
    weights = np.exp(shifted)
    total   = weights.sum(axis = -1, keepdims = True)
    return weights / total

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

class Verbose():
    """Prints a two-column progress table: iteration number and log pstar.

    The header is printed on construction; each call to ``print`` emits one
    row.  The iteration column is as wide as the longer of the word
    ``'iteration'`` and ``N`` formatted with thousands separators.
    """

    def __init__(self, N, verbose):
        """Record ``N`` and ``verbose`` and print the table header.

        Parameters
        ----------
        N : int
            Total number of iterations; determines the column width.
        verbose : int
            Reporting interval (stored, not used by this class itself).
        """
        label        = 'iteration'

        self.N       = N
        self.verbose = verbose
        self.num     = max(len(f'{N:,d}'), len(label))

        # left padding needed so the header lines up with the right-aligned
        # iteration numbers when N is wider than the word 'iteration'
        space = self.num - len(label)

        print(' ' * space + 'iteration | log pstar')
        print('-' * space + '----------+----------')

    def print(self, i, val, end = '\n'):
        """Print one row, starting with a carriage return.

        Passing ``end = ''`` keeps the cursor on the same line so the next
        row overwrites this one.
        """
        print(f'\r{i:>{self.num},d} | {val:+.2e}', end = end)

class MetropolisHastingsSampler(MCMCSampler):
    """Sampler that proposes a full parameter vector at every step.

    Each proposal comes from ``MCMCSampler.transition`` and is accepted or
    rejected by comparing the change in ``log_pstar`` against the log of a
    uniform random number.
    """

    def __init__(self, log_pstar, covariance, data, **kwargs):
        super().__init__(log_pstar, covariance, data, **kwargs)

    def fit(self, n_samples, theta0, verbose = 'auto', random_state = None):
        """Run the chain for ``n_samples`` steps starting from ``theta0``.

        Parameters
        ----------
        n_samples : int
            Number of new samples to draw (must be positive).
        theta0 : 1-D numpy.ndarray
            Initial parameter vector; stored as row 0 of ``self.samples``.
        verbose : 'auto', float or int
            Progress interval, resolved by ``check_verbose``.
        random_state : optional
            If not None, passed to ``np.random.seed`` before sampling.

        Returns
        -------
        self
            With ``samples`` (n_samples + 1 rows), ``log_pstars`` and the
            boolean ``acceptance`` vector (one entry per new sample) set.
        """
        if random_state is not None:
            np.random.seed(random_state)

        # one extra row so that theta0 is part of the chain
        N = n_samples + 1
        m = len(theta0)

        assert n_samples > 0
        assert isinstance(theta0, np.ndarray) and (theta0.ndim == 1)

        verbose = check_verbose(N, verbose)

        # chain of parameter vectors
        self.samples       = np.empty((N, m))
        self.samples[0]    = theta0

        # log pstar of every row in the chain
        self.log_pstars    = np.empty(N)
        self.log_pstars[0] = self.log_pstar(theta0, self.data, **self.kwargs)

        # True where the proposal for that step was accepted
        self.acceptance    = np.zeros(N - 1, dtype = bool)

        if verbose:
            message = Verbose(N, verbose)
            message.print(0, self.log_pstars[0])

        for i in range(1, N):
            prev = i - 1

            # propose, then score the proposal
            candidate      = self.transition(self.samples[prev])
            candidate_logp = self.log_pstar(candidate, self.data, **self.kwargs)
            log_ratio      = candidate_logp - self.log_pstars[prev]

            if np.log(np.random.uniform()) < log_ratio:
                # accepted: the proposal becomes the next state
                self.samples[i], self.log_pstars[i] = candidate, candidate_logp
                self.acceptance[prev] = True
            else:
                # rejected: repeat the previous state
                self.samples[i]    = self.samples[prev]
                self.log_pstars[i] = self.log_pstars[prev]

            if verbose:
                # end the line only on reporting steps; other rows get overwritten
                line_end = '\n' if i % verbose == 0 else ''
                message.print(i, self.log_pstars[i], line_end)

        return self

class GibbsSampler(MCMCSampler):
    """Coordinate-wise sampler: perturbs and tests one parameter at a time.

    Within every iteration each parameter ``j`` is, in order, perturbed with
    a univariate normal of scale ``deviation[j]`` and the move is accepted or
    rejected by comparing the change in ``log_pstar`` against the log of a
    uniform random number.
    """

    def __init__(self, log_pstar, deviation, data, **kwargs):
        super().__init__(log_pstar, deviation, data, **kwargs)

        # the base class stores its second argument as `covariance`;
        # expose it under the name this sampler actually uses
        self.deviation = self.__dict__.pop('covariance')

    def transition(self, value, j):
        """Draw a new value for parameter ``j`` from a normal centred on ``value``."""
        return stats.norm(value, self.deviation[j]).rvs()

    def fit(self, n_samples, theta0, verbose = 'auto', random_state = None):
        """Run the chain for ``n_samples`` sweeps starting from ``theta0``.

        Parameters
        ----------
        n_samples : int
            Number of new samples (full sweeps) to draw; must be positive.
        theta0 : 1-D numpy.ndarray
            Initial parameter vector; stored as row 0 of ``self.samples``.
        verbose : 'auto', float or int
            Progress interval, resolved by ``check_verbose``.
        random_state : optional
            If not None, passed to ``np.random.seed`` before sampling.

        Returns
        -------
        self
            With ``samples``, ``log_pstars`` and ``acceptance`` set.
            ``acceptance[i - 1, j]`` is True when the move of parameter ``j``
            in sweep ``i`` was accepted.
        """
        if random_state is not None:
            np.random.seed(random_state)

        N                  = n_samples + 1 # add 1 to include theta0
        m                  = len(theta0)   # no. of parameters

        assert n_samples > 0
        assert isinstance(theta0, np.ndarray) and (theta0.ndim == 1)

        verbose = check_verbose(N, verbose)

        # samples
        self.samples       = np.empty((N, m))
        self.samples[0]    = theta0

        # record of all log_pstar evaluations
        self.log_pstars    = np.empty(N)
        self.log_pstars[0] = self.log_pstar(theta0, self.data, **self.kwargs)

        # acceptance indicator for all new samples (per parameter)
        self.acceptance    = np.zeros((N - 1, m), dtype = bool)

        if verbose:
            message = Verbose(N, verbose)
            message.print(0, self.log_pstars[0])

        for i in range(1, N):

            # theta and logp baseline
            theta_baseline = self.samples[i - 1]
            logp_baseline  = self.log_pstars[i - 1]

            # loop through each parameter in theta
            for j in range(m):

                # copy most recent theta baseline so a rejected move leaves it intact
                theta    = theta_baseline.copy()

                # sample the j-th element
                theta[j] = self.transition(theta[j], j)

                # compute log_pstar for theta
                logp     = self.log_pstar(theta, self.data, **self.kwargs)

                # accept with p_new / p_old probability
                accepted = np.log(np.random.uniform()) < (logp - logp_baseline)

                if accepted:
                    theta_baseline = theta
                    logp_baseline  = logp

                # record the outcome; previously accepted moves were never
                # marked, leaving the whole array False
                self.acceptance[i - 1, j] = accepted

            # append new sample and log_pstar values
            self.samples[i]    = theta_baseline
            self.log_pstars[i] = logp_baseline

            if verbose:
                # end the line only on reporting steps; other rows get overwritten
                message.print(i, self.log_pstars[i], '' if i % verbose else '\n')

        return self


class AdaptiveGibbsSampler(MCMCSampler):
    """Coordinate-wise sampler that chooses which parameter to move at random.

    Instead of sweeping the parameters in order, each inner step draws a
    Dirichlet vector (concentration ``softmax(log_alpha) + 1e-8``) and moves
    the parameter with the largest component.  ``log_alpha[j]`` grows by the
    change in ``log_pstar`` whenever a move of parameter ``j`` is accepted,
    and the whole vector is multiplied by ``rate`` after every inner step.
    """

    def __init__(self, log_pstar, deviation, data, rate = 0.8, **kwargs):

        assert isinstance(rate, float) and 0 < rate < 1

        super().__init__(log_pstar, deviation, data, **kwargs)

        # the base class stores its second argument as `covariance`;
        # move it to the name used by this sampler
        self.__dict__['deviation'] = self.__dict__.pop('covariance')
        self.rate = rate

    def transition(self, value, j):
        """Draw a new value for parameter ``j`` from a normal centred on ``value``."""
        proposal = stats.norm(value, self.deviation[j])
        return proposal.rvs()

    def fit(self, n_samples, theta0, verbose = 'auto', random_state = None):
        """Run the chain for ``n_samples`` iterations starting from ``theta0``.

        Every iteration performs ``len(theta0)`` single-parameter moves.

        Returns
        -------
        self
            With ``samples``, ``log_pstars`` and ``acceptance`` set.
            ``acceptance[i][j]`` is the list of accept/reject outcomes for
            parameter ``j`` during iteration ``i``.
        """
        if random_state is not None:
            np.random.seed(random_state)

        # one extra row so that theta0 is part of the chain
        N = n_samples + 1
        m = len(theta0)

        assert n_samples > 0
        assert isinstance(theta0, np.ndarray) and (theta0.ndim == 1)

        verbose = check_verbose(N, verbose)

        # chain of parameter vectors
        self.samples       = np.empty((N, m))
        self.samples[0]    = theta0

        # log pstar of every row in the chain
        self.log_pstars    = np.empty(N)
        self.log_pstars[0] = self.log_pstar(theta0, self.data, **self.kwargs)

        # iteration -> {parameter index -> [outcomes]}
        self.acceptance    = {}

        # unnormalised log-weights behind the Dirichlet concentration
        log_alpha          = np.zeros(m)

        if verbose:
            message = Verbose(N, verbose)
            message.print(0, self.log_pstars[0])

        for i in range(1, N):

            current      = self.samples[i - 1]
            current_logp = self.log_pstars[i - 1]

            outcomes = self.acceptance[i] = defaultdict(list)

            for _ in range(m):

                # concentration must be strictly positive, hence the offset
                alpha = softmax(log_alpha) + 1e-8

                # one Dirichlet draw; the largest component picks the parameter
                j = stats.dirichlet(alpha).rvs().argmax()

                # perturb only the chosen parameter on a copy of the state
                candidate    = current.copy()
                candidate[j] = self.transition(candidate[j], j)

                candidate_logp = self.log_pstar(candidate, self.data, **self.kwargs)
                gain           = candidate_logp - current_logp

                accepted = bool(np.log(np.random.uniform()) < gain)

                if accepted:
                    # credit the chosen parameter with the change in log pstar
                    log_alpha[j] += gain
                    current, current_logp = candidate, candidate_logp

                outcomes[j].append(accepted)

                # shrink every log-weight towards 0 after each inner step
                log_alpha *= self.rate

            self.samples[i]    = current
            self.log_pstars[i] = current_logp

            if verbose:
                # end the line only on reporting steps; other rows get overwritten
                line_end = '\n' if i % verbose == 0 else ''
                message.print(i, self.log_pstars[i], line_end)

        return self


In [3]:
# Load the scikit-learn digits dataset as a (features, labels) pair.
# These module-level names are used by the setup cell further down.
X, y = load_digits(return_X_y = True)
    
def log_pstar_classification(theta, data, **kwargs):
    """Log-likelihood of a multinomial logistic (softmax) classifier.

    Parameters
    ----------
    theta : numpy.ndarray
        Flat parameter vector.  It is reshaped to ``(-1, p)`` where ``p`` is
        the number of distinct labels; row 0 is the bias vector and the
        remaining rows are the weight matrix.
    data : tuple
        ``(X, y)`` with ``y`` holding integer class indices, which are used
        directly to index the class axis.
    **kwargs
        Accepted for interface compatibility and ignored.

    Returns
    -------
    float
        Sum over samples of the log-probability assigned to the true class.
    """
    X, y = data
    p    = len(np.unique(y))
    bw   = theta.reshape(-1, p)
    b, w = bw[0], bw[1:]

    # log-softmax computed directly (shift by the row maximum, then subtract
    # the log of the summed exponentials).  Taking np.log of a softmax
    # output yields -inf as soon as a probability underflows to 0.
    logits  = X @ w + b
    logits  = logits - logits.max(axis = -1, keepdims = True)
    log_hat = logits - np.log(np.exp(logits).sum(axis = -1, keepdims = True))

    return log_hat[np.arange(len(log_hat)), y].sum()

def log_pstar_binary_classification(theta, data, **kwargs):
    """Log-likelihood of a logistic-regression classifier.

    Parameters
    ----------
    theta : numpy.ndarray
        Parameter vector with ``X.shape[1] + 1`` entries: the bias first,
        followed by one weight per feature.
    data : tuple
        ``(X, y)``; samples with ``y == 1`` are treated as positives and
        samples with ``y == 0`` as negatives.
    **kwargs
        Accepted for interface compatibility and ignored.

    Returns
    -------
    float
        ``sum(log p_i)`` over positives plus ``sum(log(1 - p_i))`` over
        negatives, where ``p_i`` is the logistic function of the score.
    """
    X, y = data
    bw   = theta.reshape(X.shape[1] + 1)
    b, w = bw[0], bw[1:]
    z    = X @ w + b

    # log(sigmoid(z)) = -log(1 + exp(-z)) and log(1 - sigmoid(z)) = -log(1 + exp(z));
    # logaddexp evaluates both without overflow or log(0)
    log_pos = -np.logaddexp(0, -z)
    log_neg = -np.logaddexp(0, z)

    # both terms are added: the negative-class term was previously
    # subtracted, which rewarded confident wrong predictions on y == 0
    return log_pos[y == 1].sum() + log_neg[y == 0].sum()

def log_pstar_regression(theta, data, noise_deviation = 0.2, **kwargs):
    """Gaussian log-likelihood of a linear regression model.

    ``theta[0]`` is the intercept and ``theta[1:]`` the weights.  Each target
    is scored under a normal distribution centred on the linear prediction
    with standard deviation ``noise_deviation``; the per-sample log-densities
    are summed.  Extra keyword arguments are ignored.
    """
    X, y       = data
    intercept  = theta[0]
    weights    = theta[1:]
    prediction = X @ weights + intercept
    likelihood = stats.norm(prediction, noise_deviation)
    return likelihood.logpdf(y).sum()

In [4]:
# number of new samples requested from each sampler's fit()
N         = 200
# flat start vector: 10 biases plus 10 weights per feature, matching the
# (-1, p) reshape in log_pstar_classification (presumably p == 10 digit
# classes; confirm against np.unique(y))
theta0    = np.zeros((X.shape[1] * 10 + 10))
deviation = np.ones_like(theta0) * 0.1 # per-parameter proposal scale, one entry per element of theta0
# (features, labels) pair handed to the log_pstar callables
data      = (X, y)

In [5]:
# Metropolis-Hastings run on the digits classification log-likelihood.
# `deviation` is passed in the `covariance` slot, so MCMCSampler.transition
# hands the vector to stats.multivariate_normal as its covariance argument.
# NOTE(review): confirm whether variances (deviation ** 2) were intended here.
mh = MetropolisHastingsSampler(log_pstar_classification, deviation, data)

mh.fit(N, theta0)

# compare the average of the samples to the true values
# print(mcmc.get_samples(500).mean(axis = 0), np.append(b, w))

# # norm distance
# np.linalg.norm(mcmc.get_samples(500).mean(axis = 0) - np.append(b, w))

iteration | log pstar
----------+----------
        0 | -4.14e+03
       20 | -4.14e+03
       21 | -4.14e+03

KeyboardInterrupt: 

In [None]:
# Coordinate-wise run: deviation[j] is the scale of the normal proposal
# used for parameter j (see GibbsSampler.transition).
gibbs = GibbsSampler(log_pstar_classification, deviation, data)

gibbs.fit(N, theta0)

# compare the average of the samples to the true values
# print(mcmc.get_samples(500).mean(axis = 0), np.append(b, w))

# # norm distance
# np.linalg.norm(mcmc.get_samples(500).mean(axis = 0) - np.append(b, w))

iteration | log pstar
----------+----------
        0 | -4.14e+03
       20 | -2.04e+02
       40 | -1.34e+02
       60 | -1.07e+02
       80 | -1.01e+02
      100 | -8.69e+01
      120 | -8.50e+01
      140 | -7.70e+01
      160 | -6.04e+01
      180 | -6.72e+01
      200 | -7.20e+01


<__main__.GibbsSampler at 0x7f1964c5e890>

In [None]:
# Adaptive coordinate-wise run with the default rate (0.8) for shrinking
# the parameter-selection weights.
adaptive = AdaptiveGibbsSampler(log_pstar_classification, deviation, data)

adaptive.fit(N, theta0)

# compare the average of the samples to the true values
# print(mcmc.get_samples(500).mean(axis = 0), np.append(b, w))

# # norm distance
# np.linalg.norm(mcmc.get_samples(500).mean(axis = 0) - np.append(b, w))

iteration | log pstar
----------+----------
        0 | -4.14e+03
       20 | -2.12e+02
       25 | -1.83e+02

KeyboardInterrupt: 