In [16]:
%run helpers.ipynb
%run constrained_distributions.ipynb
%run processors.ipynb

In [6]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal, multinomial, invwishart, rv_continuous
from scipy.linalg import sqrtm, inv
from numpy.random import uniform, dirichlet
from numpy.linalg import slogdet
from copy import deepcopy
import statistics

In [18]:
def make_gibbs_update(dat, param, hyp, alpha, labels):
    """
    Make one Gibbs update of the cluster means, class labels and mixing weights

    * dat is an n x dm pandas DataFrame of observations
    * param holds one entry per walker, each indexable as [NIW, prop, z]:
        NIW is a list with one dict per cluster holding "mu" and "Sigma",
        prop holds the mixing weights and z the class labels coded 1, ..., nm
    * hyp = (kappa0, mu0, Psi0); only kappa0 and mu0 are used here
    * alpha holds the Dirichlet hyperparameters for the mixing weights
    * labels[m][j] is the sign (-1, 0, 1) required of the mean of cluster m
        in dimension j; 0 means the sign is unrestricted

    Returns (zout, NIW_out, propout)
    * zout holds the new class labels coded 0, ..., nm-1 (one less than the
        coding of z on input)
    * NIW_out is an nw x nm object array of dicts with the updated "mu"
    * propout holds the new mixing weights
    The dictionaries inside `param` are left untouched.
    """
    NIW = [x[0] for x in param]
    prop = [x[1] for x in param]
    z = [x[2] for x in param]

    kappa0, mu0, _ = hyp

    nw = len(prop)
    nm = len(prop[0])
    n, dm = np.shape(dat)

    # np.copy is shallow for object arrays, so deep copy the dictionaries first;
    # otherwise assigning "mu" below would overwrite the caller's estimates
    NIW_out = np.copy(deepcopy(NIW))

    # Sample new mean estimates in each cluster
    for i in range(nw):
        z_i = np.asarray(z[i])

        # Number of observations currently assigned to each cluster
        present, counts = np.unique(z_i, return_counts=True)
        nz = np.zeros(nm)
        nz[present - 1] = counts

        # Per-cluster sample means, computed from this walker's own labels
        xbar = dat.groupby(z_i).mean()
        for m in range(nm):
            # Extract the current estimate for Sigma for this cluster
            Sigma_hat = NIW[i][m]["Sigma"]

            if nz[m] >= dm:
                # Sample mean of cluster m+1, one entry per column of dat
                xbarmap = np.array([xbar[x][m + 1] for x in xbar])

                # Pre-compute some inverses we use multiple times
                iS0 = np.linalg.inv(Sigma_hat / max(kappa0[m], dm))
                iS = np.linalg.inv(Sigma_hat)

                # Conjugate normal posterior for the mean
                post_cov = np.linalg.inv(iS0 + nz[m] * iS)
                post_cov = (post_cov + post_cov.T) / 2  # remove round-off asymmetry
                post_mean = post_cov.dot(iS0.dot(mu0[m, :]) + nz[m] * iS.dot(xbarmap))

                # Check whether the posterior mean agrees with the required signs
                required = np.sign(np.asarray(labels[m])[:dm])
                sign_clash = (required != 0) & (np.sign(post_mean[:dm]) != required)

                if np.any(sign_clash):
                    # Sample from the prior for mu
                    mu = np.random.multivariate_normal(mu0[m, :], np.round(Sigma_hat, decimals=8), 1)
                else:
                    # post_mean already contains the inverse; do not apply it a second time
                    mu = np.random.multivariate_normal(post_mean, post_cov, 1)
            else:
                # Too few observations in the cluster: draw from the prior
                mu = np.random.multivariate_normal(mu0[m, :], np.round(Sigma_hat, decimals=8), 1)

            NIW_out[i][m]["mu"] = mu[0]

    # Sample new cluster labels
    zout = np.copy(z)
    for i in range(nw):
        p = np.zeros((n, nm))
        for m in range(nm):
            component = multivariate_normal(NIW_out[i][m]["mu"], np.round(NIW_out[i][m]["Sigma"], decimals=8))
            p[:, m] = component.pdf(dat) * prop[i][m]

        zout[i] = np.apply_along_axis(lambda x: np.random.choice(np.arange(nm), p=x / np.sum(x)), axis=1, arr=p)

    # Sample new cluster weights
    propout = np.copy(prop)
    for i in range(nw):
        present, counts = np.unique(zout[i], return_counts=True)
        nz = np.zeros(nm)
        nz[present] = counts
        propout[i] = np.random.dirichlet(np.add(alpha, nz))

    return zout, NIW_out, propout

In [19]:
def log_likelihood(dat, param, nz, walker_num, cluster_num, curr_Sigma, Sigma_star, mu_hat):
    """
    Log likelihood of the proposal minus log likelihood of the current estimate

    Two pieces are added together
        1. the normal log density of the observations currently labelled
            cluster_num + 1, under Sigma_star minus under curr_Sigma (both
            centred at mu_hat); taken to be 0 when nz < dm
        2. the multinomial log probability of the current class labels under
            the membership probabilities implied by the mixture, with
            Sigma_star in place of the covariance of cluster_num minus with
            the covariances stored in param
    If the second piece is NaN only the first piece is returned.
    """
    walker = param[walker_num]
    components = walker[0]
    weights = walker[1]
    assignments = walker[2]

    n_obs, n_dim = np.shape(dat)
    n_comp = np.shape(weights)[0]
    members = dat[np.array(assignments) == cluster_num + 1].values

    # Normal part: only evaluated when the cluster holds enough observations
    ll_normal = 0.0
    if nz >= n_dim:
        proposal_fit = multivariate_normal(mu_hat, Sigma_star)
        current_fit = multivariate_normal(mu_hat, curr_Sigma)
        ll_normal = np.sum(proposal_fit.logpdf(members)) - np.sum(current_fit.logpdf(members))

    # Mixture components under the current estimates, and the one component that
    # changes under the proposal
    current_components = [multivariate_normal(np.array(c['mu']), np.array(c["Sigma"])) for c in components]
    proposed_component = multivariate_normal(np.array(components[cluster_num]["mu"]), Sigma_star)

    # Weighted density of every observation in every component
    dens_curr = np.empty((n_obs, n_comp))
    for k, component in enumerate(current_components):
        dens_curr[:, k] = component.pdf(dat) * weights[k]

    dens_star = dens_curr.copy()
    dens_star[:, cluster_num] = proposed_component.pdf(dat) * weights[cluster_num]

    def _normalise(row):
        # Rows that are zero everywhere are replaced by a row of ones
        if np.all(row == 0.0):
            return np.ones_like(row)
        return row / np.sum(row)

    def _label_loglik(probabilities):
        # Multinomial log pmf of each observation's current label
        frozen = [multinomial(1, row) for row in probabilities]
        out = np.zeros(n_obs)
        for obs in range(n_obs):
            onehot = np.zeros(n_comp, dtype=int)
            onehot[assignments[obs]-1] = 1
            out[obs] = frozen[obs].logpmf(onehot)
        return out

    prob_curr = np.apply_along_axis(_normalise, axis=1, arr=dens_curr)
    prob_star = np.apply_along_axis(_normalise, axis=1, arr=dens_star)

    ll_labels_curr = _label_loglik(prob_curr)
    ll_labels_star = _label_loglik(prob_star)

    label_diff = np.sum(ll_labels_star) - np.sum(ll_labels_curr)
    if np.isnan(label_diff):
        return ll_normal
    return label_diff + ll_normal

In [20]:
def logprior(curr_Sigma, Sigma_star, mu_hat, hyp, cluster_num):
    """
    Log prior of the proposal minus log prior of the current estimate

    The difference is the sum of
        1. the inverse Wishart log density of Sigma_star minus that of
            curr_Sigma (each divided by df - dm - 1 before evaluation)
        2. the log density of mu_hat under a normal centred at the prior mean
            of this cluster with covariance Sigma_star, minus the same with
            covariance curr_Sigma
    """
    kappa0, mu0, Psi0 = hyp
    dim = curr_Sigma.shape[0]

    # Degrees of freedom are floored at dim + 2, so the divisor below is at least 1
    df = max(kappa0[cluster_num], dim + 2)
    divisor = df - dim - 1
    prior_mean = mu0[cluster_num,:]

    mean_prior_curr = multivariate_normal(mean=prior_mean, cov=curr_Sigma)
    mean_prior_star = multivariate_normal(mean=prior_mean, cov=Sigma_star)
    sigma_prior = invwishart(df=df, scale=Psi0[:,:,cluster_num])

    log_iw_star = sigma_prior.logpdf(Sigma_star / divisor)
    log_iw_curr = sigma_prior.logpdf(curr_Sigma / divisor)
    log_mvn_star = mean_prior_star.logpdf(mu_hat)
    log_mvn_curr = mean_prior_curr.logpdf(mu_hat)

    return (log_iw_star - log_iw_curr) + (log_mvn_star - log_mvn_curr)

In [21]:
def get_lhastings(curr_Sigma, Sigma_star, tune_df):
    """
    Log Hastings correction for the Wishart proposal (all indicators cancel out)

    Sum of a log-determinant part, weighted by (2 * tune_df - dm - 1) / 2, and
    a trace part, weighted by tune_df / 2.
    """
    dim = Sigma_star.shape[0]

    _, logdet_curr = slogdet(curr_Sigma)
    _, logdet_star = slogdet(Sigma_star)
    det_part = ((2*tune_df-dim-1) / 2) * (logdet_curr - logdet_star)

    trace_forward = np.trace(inv(curr_Sigma).dot(Sigma_star))
    trace_backward = np.trace(inv(Sigma_star).dot(curr_Sigma))
    trace_part = (trace_forward - trace_backward) * tune_df / 2

    return det_part + trace_part

In [22]:
def propose_Sigma(curr_Sigma, lab, tune_df):
    """
    Propose a covariance: take a draw from the constrained Wishart sampler
    and divide it by the tuning degrees of freedom
    """
    wishart_draw = rand_constrained_Wish(curr_Sigma, tune_df, lab)
    return wishart_draw / tune_df

In [23]:
def make_mcmc_move(dat, param, hyp, alpha, labels, tune_df):
    """
    Make proposals cluster-at-a-time
    For current parameter estimates and current cluster,
    propose a new covariance according to the constrained Wishart
    and accept or reject it with a Metropolis-Hastings step

    * param holds one entry per walker, each indexable as [NIW, prop, z]
    * labels[m] and tune_df[m] are passed to the proposal for cluster m
    * alpha is accepted for a uniform signature but is not used here

    Accepted proposals are written into the "Sigma" entry of the cluster
    dictionaries inside `param`, i.e. `param` is updated IN PLACE.

    Returns (NIW, acpt)
    * NIW is the list of per-walker cluster lists taken from `param`
        (the same objects, not copies)
    * acpt tracks acceptances in each cluster
        (1 => accepted in at least one walker, 0 => rejected)
    """
    NIW = [x[0] for x in param]
    z = [x[2] for x in param]

    nw = len(param)
    nm = np.array(param[0][1]).shape[0]

    # Tracking the acceptance rate
    acpt = np.zeros(nm)

    # For each walker, for each cluster, sample a covariance from the cWISH
    for i in range(nw):
        # Cluster sizes for this walker's own labels (coded 1, ..., nm)
        present, counts = np.unique(z[i], return_counts=True)
        nz = np.zeros(nm)
        nz[present - 1] = counts

        mu_hats = [x['mu'] for x in NIW[i]]
        curr_Sigmas = [x['Sigma'] for x in NIW[i]]
        for m in range(nm):

            # Draw Sigma star from constrained Wishart distribution
            Sigma_star = propose_Sigma(curr_Sigmas[m], labels[m], tune_df[m])
            lhaste = get_lhastings(curr_Sigmas[m], Sigma_star, tune_df[m])

            # Compute the log prior for this proposal - log prior for the current estimate
            lp = logprior(curr_Sigmas[m], Sigma_star, mu_hats[m], hyp, m)

            # Compute the log likelihood for this proposal - log likelihood for the current estimate
            ll = log_likelihood(dat, param, nz[m], i, m, curr_Sigmas[m], Sigma_star, mu_hats[m])

            # If random uniform small enough, update Sigma to Sigma_star
            if np.log(np.random.uniform(0, 1)) < (ll + lp + lhaste):
                NIW[i][m]["Sigma"] = Sigma_star
                acpt[m] = 1

    return (NIW, acpt) #tuple

In [24]:
def run_mcmc(dat, param, hyp, alpha, nstep, labels, tune_df, opt_rate=0.3):
    """
    Run the sampler for `nstep` steps and return `(chain, acpt_chain, tune_df_chain)`
    * chain is a list of length nstep + 1: chain[0] is a copy of `param` and
        chain[k] is the state after k steps. Every state is a list with one
        `[NIW, prop, z]` entry per walker, z coded 1, ..., nm
    * acpt_chain (nm x nstep) tracks the running acceptance rate for each cluster
    * tune_df_chain (nm x nstep) tracks the tuning degrees of freedom in the
        wishart proposal across the chain
    * dat is an n x nd array of observations
    * alpha is an nm array of hyperparameters for the mixing proportions
    * nstep = the number of steps to run
    * labels holds, per cluster, a sequence of digits; each digit is shifted
        down by one before use (so 0/1/2 become -1/0/1)
    * tune_df holds one tuning degrees of freedom per cluster; after the first
        50 steps it is adapted towards `opt_rate` and is updated IN PLACE
    * param contains current parameter estimates for each walker; each
        param[j] must be a mutable sequence (a list)
    """

    labels = [[int(digit) - 1 for digit in x] for x in labels] #list(13)

    nw = len(param)
    nm = param[0][1].shape[0]

    # chain[0] is the starting state of every walker
    chain = [[deepcopy(param[j]) for j in range(nw)]]

    acpt_tracker = np.zeros(nm)

    # For tracking the acceptance rate, per cluster, for the adaptive tuning variance
    win_len = min(nstep, 50)
    acpt_win = np.zeros((nm, win_len))
    acpt_chain = np.zeros((nm, nstep))

    tune_df_chain = np.zeros((nm, nstep))

    for i in range(nstep):
        # The moves below write into the state they are given, so work on a copy
        # and keep the stored history in chain[i] intact
        state = deepcopy(chain[i])

        # Proposes IW cluster at a time, accepts/rejects; accepted covariances
        # are written into `state`
        _, acpt = make_mcmc_move(dat, state, hyp, alpha, labels, tune_df)
        acpt_tracker += acpt
        acpt_win[:, i % win_len] = acpt

        # Makes Gibbs updates of means, mixing weights, and class labels
        newz, newNIW, newprop = make_gibbs_update(dat, state, hyp, alpha, labels)

        if i > 49:
            # Update tuning parameter per cluster
            gamma1 = 10 / (i ** 0.8)
            for m in range(nm):
                tune_df[m] = update_tune_df(tune_df[m], np.mean(acpt_win[m,:]), opt_rate, gamma1)

        # Record the new state of every walker as ONE link of the chain
        for j in range(nw):
            state[j][0] = list(newNIW[j])
            state[j][1] = np.array(newprop[j])
            state[j][2] = newz[j] + 1  # Gibbs labels are 0-based; the chain stores 1-based

        chain.append(state)
        acpt_chain[:, i] = acpt_tracker / (i + 1)
        tune_df_chain[:, i] = tune_df

    return chain, acpt_chain, tune_df_chain