## Model index posterior prior checks 

Generate several test data sets with different model priors. For each sample in a data set, predict the model index and check whether the relative frequencies of the predicted models match the prior probabilities. Because the data are simulated, the ground-truth model index of every sample is known, so we can also check the predictions against the ground truth. 

This check is not well suited to our task, because it tests whether the relative frequencies of the predicted models (model 0, model 1) match the prior. However, the objective of the method is to predict the posterior probability of each model, not the single correct model. The test therefore relies on the assumption that enough data is available for the true posterior probabilities to be very close to 0 or 1 for the respective model. 

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import os 
import pickle
import sys 
import time
import tqdm

sys.path.append('../../')
from model_comparison.utils import *
from model_comparison.mdns import *
from model_comparison.models import PoissonModel, NegativeBinomialModel
%matplotlib inline

In [None]:
# Plotting defaults applied to every figure created after this cell:
# frameless legends, larger title/label/tick fonts and a wide default
# figure size.
mpl_params = {
    'legend.fontsize': 15,
    'legend.frameon': False,
    'axes.titlesize': 20,
    'axes.labelsize': 17,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.figsize': (18, 5),
}

# Push the settings into matplotlib's global runtime configuration.
mpl.rcParams.update(mpl_params)

In [None]:
# Pickled results file to load.
folder = '../data/'
fn = '201804052103__poisson_posterior_trained_N100000M100_k2.p'
# The time stamp is everything before the first underscore of the file name.
# partition() returns the whole name if there is no underscore, whereas
# slicing with find() would silently cut off the last character (find() == -1).
time_stamp = fn.partition('_')[0]

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(os.path.join(folder, fn), 'rb') as f:
    d = pickle.load(f)

# set the seed for generating new test data
seed = 5
np.random.seed(seed)

In [None]:
# Select the 'd_model_post' entry of the loaded dictionary.
d_model = d['d_model_post']
# Last expression of the cell: shows the available keys in the notebook output.
d_model.keys()

In [None]:
# The priors below need scipy.stats; import it explicitly instead of relying
# on one of the wildcard imports to have brought `scipy` into scope.
import scipy.stats

# Unpack the stored entries into notebook-level names.
# `model` is the object whose predict() method is called in the prior check.
model = d_model['model_models']
x = d_model['x']
sx = d_model['sx']
xtest = d_model['xtest']
mtest = d_model['mtest']
ppoi_exact = d_model['ppoi_exact']
sx_test = d_model['sx_test']
sample_size = d_model['sample_size']

# Normalisation that is passed to normalize() for newly generated test data.
training_norm = d_model['training_norm']
# Shape (k*) and scale (theta*) hyper-parameters of the three gamma priors.
k1, k2, k3 = d_model['k1'], d_model['k2'], d_model['k3']
theta1, theta2, theta3 = d_model['theta1'], d_model['theta2'], d_model['theta3']

# priors
# prior_lambda is sampled for the Poisson model; prior_k and prior_theta are
# sampled together as the parameter pair of the negative binomial model.
prior_lambda = scipy.stats.gamma(a=k1, scale=theta1)
prior_k = scipy.stats.gamma(a=k2, scale=theta2)
prior_theta = scipy.stats.gamma(a=k3, scale=theta3)

# Data generators for the two candidate models, built with the stored sample
# size and the notebook seed.
model_poisson = PoissonModel(sample_size=sample_size, seed=seed)
model_nb = NegativeBinomialModel(sample_size=sample_size, seed=seed)

In [None]:
# Number of test samples generated per prior setting.
ntest = 1000
# Prior probabilities of the Poisson model to check: 0.1, 0.2, ..., 0.9.
model_priors_poisson = np.arange(0.1, 1., 0.1)
# Mean predicted probability of the Poisson model, one entry per prior.
model_freq_poisson = np.zeros_like(model_priors_poisson)

for ii, model_prior_poisson in enumerate(model_priors_poisson):

    # generate sampled indices from the prior:
    # 0 -> Poisson (with probability model_prior_poisson), 1 -> negative binomial
    mi_test = (np.random.rand(ntest) > model_prior_poisson).astype(float)
    n_nb = int(mi_test.sum())
    n_poi = ntest - n_nb

    # generate data: draw parameters from the priors, then simulate
    params_poi = prior_lambda.rvs(size=n_poi)
    # one row per sample, columns (k, theta)
    params_nb = np.vstack((prior_k.rvs(size=n_nb),
                           prior_theta.rvs(size=n_nb))).T

    data_poi = model_poisson.gen(params_poi)
    data_nb = model_nb.gen(params_nb)

    # Poisson samples first, negative binomial samples second.
    x_test = np.vstack((data_poi, data_nb))
    sx_test = calculate_stats_toy_examples(x_test)
    # normalise the summary statistics with the stored training normalisation
    sx_test_zt, _ = normalize(sx_test, training_norm)

    # predict
    posteriors = model.predict(sx_test_zt)
    # get prob of poisson: column 0, averaged over all test samples
    model_freq_poisson[ii] = posteriors[:, 0].sum() / ntest

    # TODO: get true posterior probs. The original cell ended in an incomplete
    # assignment to `model_freq_poisson_true[ii]`, which made the whole cell a
    # SyntaxError; that array is also not allocated in this cell. Allocate it
    # next to `model_freq_poisson` and fill it here once the exact posterior
    # computation is available.

In [None]:
# Plot the mean predicted Poisson probability against the Poisson prior,
# together with the identity line as the reference the predictions are
# checked against.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
ax.plot(model_priors_poisson, model_freq_poisson, '-o', label='predictions')
ax.plot(model_priors_poisson, model_priors_poisson, label='identity')
ax.set(
    xlabel=r'prior $p(M_{Poisson})$',
    ylabel=r'mean $p(M_{Poisson} | x_o)$',
)
ax.legend()
ax.grid()
# Trailing semicolon suppresses the cell's return value in the notebook output.
fig.tight_layout();

In [None]:
# Build the output name from the time stamp of the loaded file and the
# integer part of k2, then write the figure as a 300 dpi PNG.
figure_name = 'prior_checks_k2_{}.png'.format(int(k2))
fn = time_stamp + figure_name
fig.savefig(os.path.join('../figures', fn), dpi=300)