In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import os
import pickle
import scipy
import time
import tempfile
from tqdm import tqdm

from pyabc import (ABCSMC, RV,
                   PercentileDistanceFunction, DistanceFunction, sampler)
from pyabc import Distribution as abcDis

import sys 
sys.path.append('../../')
from model_comparison.utils import *
from model_comparison.mdns import *
from model_comparison.models import PoissonModel, NegativeBinomialModel

from delfi.distribution.mixture import MoG
%matplotlib inline

In [None]:
# Matplotlib defaults (font sizes, legend frame, figure size) for all figures below.
mpl_params = {
    'legend.fontsize': 14,
    'legend.frameon': False,
    'axes.titlesize': 20,
    'axes.labelsize': 17,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.figsize': (18, 5),
}

mpl.rcParams.update(mpl_params)

## Set up models

In [None]:
# Fixed seed; the same value is handed to the simulator models further down.
seed = 3
rng = np.random.RandomState(seed)

# Minute-resolution time stamp (note the trailing underscore), used as a file-name prefix.
time_stamp = time.strftime('%Y%m%d%H%M_')
figure_folder = '../figures/'

In [None]:
sample_size = 10  # forwarded as `sample_size` to both simulator models
ntest = 500       # number of test data sets; half are drawn per model

# Gamma hyper-parameters (shape k, scale theta) of the two negative-binomial priors.
k2, theta2 = 2., 1.0
k3, theta3 = 2., 2.

# The scale theta1 of the Gamma prior for the Poisson rate is fixed; its shape
# k1 is then derived such that k1 * theta1 == (k2 * theta2) * (k3 * theta3).
theta1 = 2.0
k1 = (k2 * theta2 * k3 * theta3) / theta1
print(k1)

# Both simulators share the sample size and the seed.
model_poisson = PoissonModel(sample_size=sample_size, seed=seed, n_workers=1)
model_nb = NegativeBinomialModel(sample_size=sample_size, seed=seed, n_workers=1)

In [None]:
# Frozen SciPy Gamma priors (shape `a`, `scale`):
#   prior_lam   -> parameter of the Poisson model
#   prior_k     -> first parameter of the negative-binomial model
#   prior_theta -> second parameter of the negative-binomial model
prior_lam = scipy.stats.gamma(a=k1, scale=theta1)
prior_k = scipy.stats.gamma(a=k2, scale=theta2)
prior_theta = scipy.stats.gamma(a=k3, scale=theta3)

## Test data set 

In [None]:
# Draw the test parameters from the priors: n // 2 parameter sets per model.
# `random_state=rng` ties the draws to the notebook seed; without it SciPy
# falls back to NumPy's global (unseeded) state and the test set changes on
# every run.
n = ntest
params_poi_test = prior_lam.rvs(size=n // 2, random_state=rng)
# One row per parameter set, columns (k, theta).
params_nb_test = np.vstack((prior_k.rvs(size=n // 2, random_state=rng),
                            prior_theta.rvs(size=n // 2, random_state=rng))).T

In [None]:
# Simulate the test data from the sampled test parameters with each model.
# NOTE(review): the result is reduced with `.mean(axis=1)` / `.var(axis=1)` in
# the next cell, so `gen` presumably returns one row per parameter set — confirm.
data_poi_test = model_poisson.gen(params_poi_test)
data_nb_test = model_nb.gen(params_nb_test)

In [None]:
# Summary statistics of the test data: mean and variance along axis 1,
# arranged so that the two statistics end up on the last axis.
stats_poi_test = np.transpose([data_poi_test.mean(axis=1), data_poi_test.var(axis=1)])
stats_nb_test = np.transpose([data_nb_test.mean(axis=1), data_nb_test.var(axis=1)])

## PyABC SMC

In [None]:
# Define the models in pyABC style
def model_1(parameters):
    """Poisson simulator in pyABC form.

    Generates data with ``model_poisson`` for the single parameter
    ``parameters.lam`` and returns a dict whose ``'y'`` entry holds the mean
    and the variance of the generated data.
    """
    simulated = model_poisson.gen([parameters.lam])
    summary = np.array([simulated.mean(), simulated.var()])
    return {'y': summary}

def model_2(parameters):
    """Negative-binomial simulator in pyABC form.

    Generates data with ``model_nb`` for the parameter pair
    ``(parameters.k, parameters.theta)`` and returns a dict whose ``'y'``
    entry holds the mean and the variance of the generated data.
    """
    simulated = model_nb.gen([[parameters.k, parameters.theta]])
    summary = np.array([simulated.mean(), simulated.var()])
    return {'y': summary}

# Parameter priors in pyABC format, using the same Gamma hyper-parameters as
# the SciPy priors defined above.
prior1 = abcDis.from_dictionary_of_dictionaries({
    'lam': {'type': 'gamma', 'kwargs': {'a': k1, 'scale': theta1}},
})

prior2 = abcDis.from_dictionary_of_dictionaries({
    'k': {'type': 'gamma', 'kwargs': {'a': k2, 'scale': theta2}},
    'theta': {'type': 'gamma', 'kwargs': {'a': k3, 'scale': theta3}},
})

# Position in these lists is the model index: 0 = Poisson, 1 = negative binomial.
models = [model_1, model_2]
parameter_priors = [prior1, prior2]

class MyDist(DistanceFunction):
    """Distance between two summary dicts: mean squared difference of their ``'y'`` entries."""

    def __call__(self, x, y):
        residual = x['y'] - y['y']
        return np.power(residual, 2).mean()
    
    
class MyModelPrior(RV):
    """Model prior for two models that converts between model indices and
    the vector-valued samples of the wrapped distribution."""

    def rvs(self, *args, **kwargs):
        """Draw from the wrapped distribution and return the position of the
        first non-zero entry of the first sample."""
        first_draw = self.distribution.rvs(*args, **kwargs)[0]
        nonzero_positions = np.nonzero(first_draw)[0]
        return nonzero_positions[0]

    def pmf(self, x, *args, **kwargs):
        """Evaluate the wrapped pmf at the two-element indicator vector that
        has a 1 at position ``x``."""
        indicator = [0] * 2
        indicator[x] = 1
        return self.distribution.pmf(indicator, *args, **kwargs)
        

## SMC with single round = rejection sampling

In [None]:
# Rejection ABC: one ABC-SMC run with a single population per test data set.
test_set = np.vstack((stats_poi_test, stats_nb_test))
phat_rej = np.zeros((ntest, 2))
model_prior = MyModelPrior.from_dictionary({'type': 'multinomial', 'kwargs': {'n': 1, 'p': [0.5, 0.5]}})

# Where the results are stored; the path is the same for every run, so it is
# built once outside the loop.
db_path = ("sqlite:///" +
           os.path.join(tempfile.gettempdir(), "test.db"))

# `tqdm` is imported as the progress-bar callable itself
# (`from tqdm import tqdm`), so it must be called directly;
# `tqdm.tqdm(...)` raises AttributeError.
for ii in tqdm(range(ntest)):
    sxo = test_set[ii, ]

    # We plug all the ABC options together
    abc = ABCSMC(
        models, parameter_priors, MyDist(), model_prior=model_prior,
        sampler=sampler.SingleCoreSampler(), population_size=75)

    abc_id = abc.new(db_path, {"y": sxo})
    history = abc.run(minimum_epsilon=0.05, max_nr_populations=1)
    model_probabilities = history.get_model_probabilities()
    print(history.total_nr_simulations)
    # Rows are populations, columns are model indices. Take the (only)
    # population and default to 0 if there is no column for model 0 —
    # presumably the case when no Poisson particle was accepted; the plain
    # `[0][0]` lookup would raise KeyError then.
    ppoi = model_probabilities.iloc[-1].get(0, 0.0)
    phat_rej[ii, 0] = ppoi
    phat_rej[ii, 1] = 1 - ppoi

## SMC with multiple rounds

In [None]:
# ABC-SMC with up to `n_rounds` populations per test data set.
test_set = np.vstack((stats_poi_test, stats_nb_test))
phat_smc = np.zeros((ntest, 2))
n_rounds = 3
n_simulations = 0  # running total of simulator calls over all test data sets

# Where the results are stored; the path is the same for every run, so it is
# built once outside the loop.
db_path = ("sqlite:///" +
           os.path.join(tempfile.gettempdir(), "test.db"))

# `tqdm` is imported as the progress-bar callable itself
# (`from tqdm import tqdm`), so it must be called directly;
# `tqdm.tqdm(...)` raises AttributeError.
for ii in tqdm(range(ntest)):
    sxo = test_set[ii, ]

    # We plug all the ABC options together
    abc = ABCSMC(
        models, parameter_priors,
        MyDist(), population_size=20)

    abc_id = abc.new(db_path, {"y": sxo})
    history = abc.run(minimum_epsilon=0.05, max_nr_populations=n_rounds)
    model_probabilities = history.get_model_probabilities()
    n_simulations += history.total_nr_simulations
    print(n_simulations)
    # Rows are populations, columns are model indices. Use the LAST available
    # population rather than the label `n_rounds - 1`: a run that stops early
    # because `minimum_epsilon` was reached has fewer rows, and the label
    # lookup would raise KeyError. Default to 0 if model 0 has no column
    # (presumably: no Poisson particle left).
    ppoi = model_probabilities.iloc[-1].get(0, 0.0)
    phat_smc[ii, 0] = ppoi
    phat_smc[ii, 1] = 1 - ppoi
    print(model_probabilities)

## Generate data sets with similar number of samples as used by SMC

In [None]:
# Match the simulation budget spent by SMC: n // 2 parameter sets per model.
# `random_state=rng` ties the draws to the notebook seed; without it SciPy
# falls back to NumPy's global (unseeded) state and the training set changes
# on every run.
n = n_simulations
params_poi = prior_lam.rvs(size=n // 2, random_state=rng)
# One row per parameter set, columns (k, theta).
params_nb = np.vstack((prior_k.rvs(size=n // 2, random_state=rng),
                       prior_theta.rvs(size=n // 2, random_state=rng))).T

In [None]:
# Simulate the training data from the sampled training parameters with each model.
# NOTE(review): the result is reduced with `.mean(axis=1)` / `.var(axis=1)` in
# the next cell, so `gen` presumably returns one row per parameter set — confirm.
data_poi = model_poisson.gen(params_poi)
data_nb = model_nb.gen(params_nb)

## Calculate stats 

In [None]:
# Summary statistics of the training data: mean and variance along axis 1,
# arranged so that the two statistics end up on the last axis.
stats_poi = np.transpose([data_poi.mean(axis=1), data_poi.var(axis=1)])
stats_nb = np.transpose([data_nb.mean(axis=1), data_nb.var(axis=1)])

## Calculate true posterior probs

In [None]:
# Raw test data, stacked Poisson first and negative binomial second — the same
# order in which the test summary statistics are stacked into `test_set`.
xtest = np.vstack((data_poi_test, data_nb_test))

In [None]:
# Posterior probability of the Poisson model for every test data set, computed
# from the log evidences of both models.
ppoi_exact = []
# `tqdm` is imported as the progress-bar callable itself
# (`from tqdm import tqdm`), so it must be called directly;
# `tqdm.tqdm(...)` raises AttributeError.
for xi in tqdm(xtest):
    nb_logevi = calculate_nb_evidence(xi, k2, theta2, k3, theta3, log=True)
    poi_logevi = poisson_evidence(xi, k=k1, theta=theta1, log=True)
    # NOTE(review): the log evidences are exponentiated before being combined;
    # this can underflow for very unlikely data — confirm this is acceptable
    # for the sample sizes used here.
    ppoi_exact.append(calculate_pprob_from_evidences(np.exp(poi_logevi), np.exp(nb_logevi)))

## Do density estimation with same training set 

In [None]:
# shuffle and set up model index target vector 
x_all = np.vstack((data_poi, data_nb))

# define model indices
m_all = np.hstack((np.zeros(data_poi.shape[0]), np.ones(data_nb.shape[0]))).squeeze().astype(int).tolist()

# get shuffled indices 
# shuffle_indices = np.arange(n)
# np.random.shuffle(shuffle_indices)

x, x_test = x_all[:ntrain, :], x_all[ntrain:, :]
m, m_test = m_all[:ntrain], m_all[ntrain:]

# calculate summary stats
sx = calculate_stats_toy_examples(x)
# sx_test = calculate_stats_toy_examples(xtest)
# use training norm to normalize test data 
sx_zt, training_norm = normalize(sx)
# sx_test_zt, training_norm = normalize(sx_test, training_norm)

In [None]:
# Classifier that maps the two summary statistics to model probabilities.
model = ClassificationMDN(n_input=2, n_hidden_units=10, n_hidden_layers=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
trainer = Trainer(model, optimizer, verbose=True, classification=True)

n_epochs = 10
# Derive the mini-batch size from the actual number of training labels instead
# of the global `ntrain`, which no earlier cell defines on a fresh kernel.
n_minibatch = int(len(m) / 100)

# train with training data
loss_trace = trainer.train(sx_zt, m, n_epochs=n_epochs, n_minibatch=n_minibatch)
plt.plot(loss_trace)
plt.ylabel('loss')
plt.xlabel('iterations');

## Prior checks

In [None]:
# Prior check with ABC-SMC: for each prior probability pp of the Poisson
# model, build a test set with that fraction of Poisson data and average the
# estimated posterior probability of the Poisson model over it.
prior_probs = np.arange(0.1, 1., 0.2)
post_probs_mean = np.zeros_like(prior_probs)
sx_test_m1 = stats_poi_test[:100]
sx_test_m2 = stats_nb_test[:100]
n_rounds = 3

# Where the results are stored; the path is the same for every run, so it is
# built once outside the loops.
db_path = ("sqlite:///" +
           os.path.join(tempfile.gettempdir(), "test.db"))

for ii, pp in enumerate(prior_probs):
    idx = int(pp * 100)
    # up to idx for m1
    d1 = sx_test_m1[:idx, ]
    # from idx to end for m2
    d2 = sx_test_m2[idx:, ]
    test_set = np.vstack((d1, d2))
    model_prior = MyModelPrior.from_dictionary({'type': 'multinomial', 'kwargs': {'n': 1, 'p': [pp, 1 - pp]}})

    # ABC-SMC with up to n_rounds populations per test data set
    ppoi = np.zeros(test_set.shape[0])
    n_simulations = 0
    # `tqdm` is imported as the progress-bar callable itself
    # (`from tqdm import tqdm`), so it must be called directly;
    # `tqdm.tqdm(...)` raises AttributeError.
    for jj in tqdm(range(test_set.shape[0])):
        sxo = test_set[jj, ]

        # We plug all the ABC options together
        abc = ABCSMC(
            models, parameter_priors,
            MyDist(), model_prior=model_prior)

        abc_id = abc.new(db_path, {"y": sxo})
        history = abc.run(minimum_epsilon=0.05, max_nr_populations=n_rounds)
        model_probabilities = history.get_model_probabilities()
        n_simulations += history.total_nr_simulations
        # Rows are populations, columns are model indices: take model 0 in the
        # last population. This replaces `.as_matrix()` (removed in pandas 1.0)
        # and a bare try/except that swapped the two indices on failure and
        # silently returned a wrong entry whenever the first attempt happened
        # to be in bounds (e.g. two populations -> model 1 at t=0).
        # Default to 0 if model 0 has no column (presumably: no Poisson
        # particle left).
        ppoi[jj] = model_probabilities.iloc[-1].get(0, 0.0)

    print(n_simulations)

    post_probs_mean[ii] = ppoi.mean()

In [None]:
# Quick look at the shapes of the training summary statistics and raw training data.
stats_poi.shape, data_poi.shape

In [None]:
# Prior check with the density-estimation approach: for each prior probability
# pp of the Poisson model, train a fresh classifier on a training set with
# that class balance and average its Poisson probability over a test set with
# the same balance.
sx_test_m1 = stats_poi_test[:100]
sx_test_m2 = stats_nb_test[:100]
prior_probs = np.arange(0.1, 1., 0.1)
post_probs_mean_de = np.zeros_like(prior_probs)

for ii, pp in enumerate(prior_probs):
    # test set: the first idx rows of m1 followed by the rows of m2 from idx on
    idx = int(pp * 100)
    d1 = sx_test_m1[:idx]
    d2 = sx_test_m2[idx:]
    test_set = np.vstack((d1, d2))

    # training set with the same class balance; labels are 0 (m1) and 1 (m2)
    ntrain = stats_poi.shape[0]  # training size
    training_set_idx = int(pp * ntrain)  # set prior index
    sx_loop = np.vstack((stats_poi[:training_set_idx], stats_nb[training_set_idx:]))
    m_loop = np.repeat([0, 1], [training_set_idx, ntrain - training_set_idx]).tolist()

    sx_loop_zt, loop_norm = normalize(sx_loop)

    # learn a new classifier for the given prior
    loop_model = ClassificationMDN(n_input=2, n_hidden_units=10, n_hidden_layers=1)
    optimizer = torch.optim.Adam(loop_model.parameters(), lr=0.01)
    trainer = Trainer(loop_model, optimizer, verbose=True, classification=True)

    n_epochs = 10
    n_minibatch = int(ntrain / 100)

    loss_trace = trainer.train(sx_loop_zt, m_loop, n_epochs=n_epochs, n_minibatch=n_minibatch)

    # predict on the test set, normalized with the training norm
    test_data_zt, _ = normalize(test_set, loop_norm)
    p = loop_model.predict(test_data_zt)[:, 0]
    post_probs_mean_de[ii] = p.mean()

In [None]:
# Mean posterior probability of the Poisson model against its prior probability.
# The SMC means were computed on the 0.2-step grid; `prior_probs` has since been
# re-bound to the 0.1-step grid of the density-estimation check, hence the
# explicit np.arange for the first curve.
plt.plot(np.arange(.1, 1., .2), post_probs_mean, 'o-')
plt.plot(prior_probs, post_probs_mean_de, 'o-')
# identity line for reference
plt.plot(prior_probs, prior_probs)

## Posterior checks

In [None]:
# Test summary statistics (Poisson first, then negative binomial), normalized
# with the norm obtained from the training statistics.
sx_test = np.vstack((stats_poi_test, stats_nb_test))
# NOTE(review): this re-binds `training_norm` to the second return value of
# `normalize`; presumably that is the norm that was passed in — confirm.
sx_test_zt, training_norm = normalize(sx_test, training_norm)

In [None]:
# Column 0 of the classifier prediction, i.e. the Poisson model (label 0 in training).
ppoi_hat = model.predict(sx_test_zt)[:, 0]

In [None]:
# Posterior probability of the Poisson model per test data set, one curve per method.
plt.figure(figsize=(20, 5))
plt.plot(ppoi_exact, '-o', label='exact')
plt.plot(phat_rej[:, 0], '-o', label='rejection abc')
plt.plot(phat_smc[:, 0], '-o', label='SMC abc')
plt.plot(ppoi_hat, '-o', label='Density abc')
plt.legend(fontsize=16)
plt.tight_layout()

In [None]:
# Mean absolute error of rejection ABC with respect to the exact posterior probabilities.
np.mean(np.abs(ppoi_exact - phat_rej[:, 0]))

In [None]:
# Mean absolute error of ABC-SMC with respect to the exact posterior probabilities.
np.mean(np.abs(ppoi_exact - phat_smc[:, 0]))

In [None]:
# Mean absolute error of the classifier estimate with respect to the exact posterior probabilities.
np.mean(np.abs(ppoi_exact - ppoi_hat))

In [None]:
# Collect everything that is written to disk.
test_set = np.vstack((stats_poi_test, stats_nb_test))
d = dict(x_test=xtest, sx_test=test_set, x=x, sx=sx,
         ppoi_exact=ppoi_exact, ppoi_hat=ppoi_hat,
         ppoi_smc=phat_smc[:, 0], ppoi_rej=phat_rej[:, 0],
         # grid of the density-estimation prior check (step 0.1)
         prior_probs=prior_probs,
         post_probs_mean_hat=post_probs_mean_de,
         post_probs_mean_smc=post_probs_mean,
         # The SMC prior check ran on a coarser grid (step 0.2) before
         # `prior_probs` was re-bound; store that grid as well so that
         # `post_probs_mean_smc` can be matched to its prior probabilities
         # when the file is loaded.
         prior_probs_smc=np.arange(0.1, 1., 0.2))

In [None]:
# File name: time stamp plus the sample size and number of test data sets.
fn = time_stamp + '_modelposterior_comparison_exact_DE_SMC_ns{}_ntest{}.p'.format(sample_size, ntest)
# Make sure the target folder exists so that the results of the long ABC runs
# are not lost to a FileNotFoundError at the very end.
data_folder = '../data'
os.makedirs(data_folder, exist_ok=True)
with open(os.path.join(data_folder, fn), 'wb') as outfile:
    pickle.dump(d, outfile, protocol=pickle.HIGHEST_PROTOCOL)