In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import scipy
import torch
import torch.nn as nn
import os
import time

from random import shuffle
from scipy.stats import gamma, beta, nbinom, poisson
from scipy.special import gammaln, betaln
from torch.autograd import Variable
import sys 
sys.path.append('../')
from utils import *

%matplotlib inline

# Notebook-wide matplotlib defaults: larger fonts and a wide default figure.
mpl.rcParams.update({
    'axes.titlesize': 20,
    'axes.labelsize': 17,
    'ytick.labelsize': 12,
    'xtick.labelsize': 12,
    'legend.fontsize': 16,
    'figure.figsize': (15, 5),
})

## Generate Poisson data, compare Poisson vs. negBin model

I will train an MDN to approximate the posterior over model indices. As models I use a Poisson model with a Gamma prior on its rate, and a negative binomial (NB) model, sampled as a Poisson-Gamma mixture with Gamma priors on the shape and scale parameters of the mixing Gamma. 

As training data I will generate a large data set containing samples from both models generated with the corresponding priors. The MDN gets the mean and variance of the data set as input and outputs a probability vector over models. 

### Controlling difficulty via over / under dispersion 
The prior parameters of the conjugate Gamma prior of the Poisson model and the two Gamma priors of the NB model are chosen such that on average the data sets generated by the models have identical sample means. The difficulty therefore arises from the amount of overdispersion in the NB model: the model comparison problem becomes easier to solve the more the variance of the samples from the NB model exceeds their mean. 

In [None]:
# number of observations per simulated data set
sample_size = 100
# number of simulated data sets used for training
n_samples = 10000
# Set to an int for a reproducible run; None leaves the global RNGs unseeded.
seed = None
if seed is not None:
    # seed NumPy's and PyTorch's global generators
    np.random.seed(seed)
    torch.manual_seed(seed)

# set prior parameters 
# shape of the Gamma prior for the Poisson model (its scale, theta1, is derived below)
k1 = 9.0 
# set the shape and scale of the prior on the shape of the Gamma for the mixture to be broad 
theta2 = 2.0
k2 = 5.
# set the shape and scale of the prior on the scale of the Gamma for the mixture to be small 
# NOTE(review): presumably this prior is the knob for tuning the amount of
# overdispersion / difficulty -- confirm
theta3 = 1.0 
k3 = 1

# then the scale of the Gamma prior for the Poisson is given by matching the
# analytical means of both models: k1 * theta1 == k2 * theta2 * k3 * theta3
theta1 = (k2 * theta2 * k3 * theta3) / k1
print(theta1, k1)

# get analytical means 
mean_ana_poi = k1 * theta1
mean_ana_nb = k2 * k3 * theta2 * theta3

# set the priors 
prior1 = scipy.stats.gamma(a=k1, scale=theta1)
prior2 = scipy.stats.gamma(a=k2, scale=theta2)
prior3 = scipy.stats.gamma(a=k3, scale=theta3)

In [None]:
# Simulate the training set: for each data set draw a model index, sample
# from that model, and reduce the sample to (mean, variance).

X, thetas, m = [], [], []

for sample_idx in range(n_samples):

    # model index: 0 -> Poisson, 1 -> Poisson-Gamma mixture (negative binomial)
    model_idx = int(np.round(np.random.rand()))
    m.append(model_idx)

    if model_idx == 0:
        # Poisson model with rate drawn from prior1
        theta, x = sample_poisson(prior1, 1, sample_size)
    else:
        # Poisson-Gamma mixture with Gamma shape / scale drawn from prior2 / prior3
        theta, x = sample_poisson_gamma_mixture(prior2, prior3, 1, sample_size)

    # summary statistics of the sample: mean and variance
    summary_stats = [np.mean(x), np.var(x)]
    X.append(summary_stats)
    thetas.append(theta)

# only the summary statistics are converted; thetas and m stay Python lists
X = np.array(X)
#thetas = np.array(thetas)
#m = np.array(m)

## Define network for fitting the model posterior

In [None]:
class MDN_psi(nn.Module):
    
    def __init__(self, ndim_input=2, ndim_output=2, n_hidden=5, n_components=1):
        super(MDN_psi, self).__init__()
        self.fc_in = nn.Linear(ndim_input, n_hidden)
        self.tanh = nn.Tanh()
        self.m_out = nn.Linear(n_hidden, ndim_output)

    def forward(self, x):
        out = self.fc_in(x)
        act = self.tanh(out)
        out_m = self.m_out(act)
        return out_m
    
def train_psi(X, Y, model, optim, lossfun, n_epochs=500, n_minibatch=50):
    """Train `model` on (X, Y) with mini-batch gradient steps.

    Args:
        X: sequence of input rows.
        Y: sequence of integer targets, paired element-wise with X.
        model: network called on a float tensor built from each input batch.
        optim: optimizer over the model's parameters.
        lossfun: callable taking (model output, long target tensor).
        n_epochs: number of passes over the data.
        n_minibatch: batch size handed to `batch_generator`.

    Returns:
        (model, optim, losses), where `losses` is a list with one Python
        float per optimization step.
    """
    dataset_train = list(zip(X, Y))

    losses = []

    for epoch in range(n_epochs):
        bgen = batch_generator(dataset_train, n_minibatch)

        # loss of the most recent step in this epoch (None if no batch was produced)
        last_loss = None

        for x_batch, y_batch in bgen:
            x_var = torch.Tensor(x_batch)
            # Flatten with -1 instead of n_minibatch so a shorter final
            # batch does not make the reshape fail.
            y_var = torch.LongTensor(y_batch).view(-1)

            out_act = model(x_var)
            loss = lossfun(out_act, y_var)

            optim.zero_grad()
            loss.backward()
            optim.step()

            # .item() extracts the scalar; indexing a 0-dim tensor with [0]
            # raises on current PyTorch versions.
            last_loss = loss.item()
            losses.append(last_loss)

        if (epoch + 1) % 100 == 0 and last_loss is not None:
            print("[epoch %04d] loss: %.4f" % (epoch + 1, last_loss))

    return model, optim, losses

## Generate a large data set of triplets (m, theta, sx)

Then separate it into sets for model 1 and model 2 and train the phi networks separately. 

In [None]:
# Normalize the summary statistics and keep the returned normalization
# parameters in `norm`, which is passed back to normalize() for new inputs later.
# NOTE(review): this rebinds X, so re-running this cell alone would normalize
# already-normalized data -- re-run the data generation cell first.
X, norm = normalize(X)

In [None]:
# two inputs per data set: sample mean and sample variance
n_inputs = 2
model = MDN_psi(n_hidden=10, ndim_input=n_inputs)
lossfun = nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.001)

# train on the summary statistics X with the model indices m as class labels
model_psi, optim_psi, losses = train_psi(
    X, m, model, optim, lossfun,
    n_minibatch=20,
    n_epochs=500,
)

In [None]:
# per-step training loss, one point per optimizer update
fig_loss, ax_loss = plt.subplots(figsize=(15, 5))
ax_loss.plot(losses)
ax_loss.set_xlabel('iterations')
ax_loss.set_ylabel('loss')
ax_loss.set_title('Loss');

## Visualize input-output function of the network

In [None]:
# log-spaced evaluation grid: x holds the mean (columns), y the variance (rows)
mean_axis = np.logspace(-1, 2, 100)
var_axis = np.logspace(-1, 3, 100)
x, y = np.meshgrid(mean_axis, var_axis)

In [None]:
# P(Poisson | mean, variance) predicted by the network at every grid point.
# Sized from the grid itself so it stays correct if the grid resolution changes.
ppoi_mat = np.zeros(x.shape)
softmax = nn.Softmax(dim=0)

n_rows, n_cols = x.shape
for i in range(n_rows): 
    for j in range(n_cols): 
        # Apply the training normalization to this grid point. The returned
        # normalization is discarded so the training `norm` is not overwritten.
        stats_o, _unused_norm = normalize(np.array([x[i, j], y[i, j]]), norm)        
        (out_act) = model(Variable(torch.Tensor(stats_o)))
    
        # in this vector, index 0 is Poi, index 1 is NB
        posterior_probs = softmax(out_act).data.numpy()
        ppoi_mat[i, j] = posterior_probs[0]

In [None]:
# imshow draws equally spaced pixels, but the grid is log-spaced, so a linear
# extent would put wrong values on the ticks. Express both axes in log10 units.
log_extent = [np.log10(x.min()), np.log10(x.max()), np.log10(y.min()), np.log10(y.max())]
plt.imshow(ppoi_mat, origin='lower', extent=log_extent, aspect='auto')
plt.xlabel('log10(Mean)')
plt.ylabel('log10(Variance)')
plt.colorbar(label='P(Poisson)');

## Compare network predictions to analytical posterior probabilities