# Calculate posterior predictions
$$
\newcommand{\given}{\vert}
\newcommand{\text}{\mathrm{text}}
\newcommand{\xtext}{w_1, w_2 \ldots w_n}
$$    

The following calculates the posterior predictive distribution over words, conditioned on a text.

Given a text $\text$, the posterior predictive distribution is, informally speaking, the distribution over words that are consistent with the discourse topics of $\text$. It is calculated as follows:
$$
\begin{align}
\mathrm{P}(w \given \phi, \text, a, m) &= \int \mathrm{P}(w \given \phi, \pi) \mathrm{P}(\pi \given \text, a, m) d\pi,\\
&= \int \big[ \sum_{\{x\}} \mathrm{P}(w \given \phi, x)\mathrm{P}(x \given \pi) \big] \mathrm{P}(\pi \given \text, a, m) d\pi
\end{align}
$$
where $\mathrm{P}(\pi \given \text, a, m)$ is the posterior distribution over topic distributions of text $\text$, $\phi$ is the set of $K$ component topics, and $a$ and $m$ are the hyperparameters of the Dirichlet prior over the per-document mixing distribution.
 

In [139]:
import os
import errno
import utils
import numpy

from utils import topicmodels, utils

from itertools import cycle

import cPickle as pickle

import datetime
from random import shuffle

## Download files for the topic model

Get the data and some MCMC state samples for an HDPMM topic model. For each file, we provide its sha256 hash so that its integrity can be checked. If the files have already been downloaded, `curl` will not try to download them again, but will just check their integrity. If the downloaded files are bz2 compressed, they will be uncompressed unless uncompressed versions already exist in the `cache_directory`.

In [2]:
# Remote root the shared files are served from, and the local directory
# they are cached in.
url_root = 'http://www.lawsofthought.org/shared'
cache_directory = '_cache'

# Files needed by this notebook, grouped by role.  Every entry is a
# (filename, sha256 hex digest) pair.
filenames = {
    'experiment_cfg': [
        ('Brismo.cfg',
         '909d9f8de483c4547f26fb4c34b91e12908ab5c144e065dc0fe6c1504b1f22c9'),
    ],
    'corpus': [
        ('bnc_78723408_250_500_49328.npz.bz2',
         'b9d828f7697871e01a263b8f3978911c70ff45cab9af4c86fbb43c3baef969d9'),
    ],
    'mcmc_samples': [
        ('hdptm_061216085831_7090_state_12946.npz.bz2',
         '9ba9850ff51fd60b679fd2af85cbaa4b3d69a2f31f4a0705475c0fffe3374330'),
    ],
}

# Fetch the config, corpus and MCMC sample files (in that order) into the
# cache directory, passing each file's sha256 digest along with its name.
utils.curl(
    url_root,
    filenames['experiment_cfg'] + filenames['corpus'] + filenames['mcmc_samples'],
    cache=cache_directory,
    verbose=False
)

Now load up the corpus and one of the state samples.

In [3]:
# Corpus archive: first (and only) entry of the 'corpus' group.
corpus_data = utils.loadnpz(
    filenames['corpus'][0][0],
    cache=cache_directory,
    verbose=False
)

# MCMC state sample: first entry of the 'mcmc_samples' group.
state = utils.loadnpz(
    filenames['mcmc_samples'][0][0],
    cache=cache_directory,
    verbose=False
)

In [4]:
# Read the experiment texts from the downloaded configuration file.  The file
# name is taken from `filenames` rather than repeated as a literal, so it stays
# in step with the download manifest (same pattern as the loadnpz calls).
texts = topicmodels.get_experiment_texts(filenames['experiment_cfg'][0][0],
                                         cache=cache_directory)

In [5]:
# Build the posterior-predictive object from the loaded corpus and MCMC state.
# NOTE(review): `model` is not referenced in the visible cells below, which
# load cached predictions instead -- presumably it produced that cache; confirm.
model = topicmodels.PosteriorPredictive(corpus_data, state, verbose=True)

In [6]:
# NOTE(review): `use_cached_result` is set but not consulted in this cell; the
# cached predictions are always loaded.  Confirm whether a recompute path
# (presumably via `model`) was meant to be selected by this flag.
use_cached_result = True
cached_result = 'posterior_predictions.2017.01.20.1484887827.pkl'

# Resolve the pickle inside the configured cache directory instead of
# hard-coding '_cache/', so changing `cache_directory` moves every cached file.
# NOTE(review): this file is not in the `filenames` download manifest, so it
# must already exist locally.  pickle.load can execute arbitrary code -- only
# load a cache file you produced yourself.
with open(os.path.join(cache_directory, cached_result), 'rb') as f:
    posterior_predictions = pickle.load(f)

In [161]:
def _posteriorprediction2str(text_name, vocab, K=10, f=lambda arg: arg):
    """Render a text and its top predicted words as a LaTeX figure string.

    Reads the notebook globals `texts` and `posterior_predictions`, both
    indexed by `text_name`.

    Parameters
    ----------
    text_name : key into `texts` and `posterior_predictions`.
    vocab : indexable mapping a word index to its word string.
    K : number of highest-scoring word indices to include.
    f : function applied to each selected score; its result is used as
        the LaTeX font size of the corresponding word.

    Returns
    -------
    A LaTeX `figure` environment (as a string) holding the text followed
    by the selected words in random order.  Words that do not occur in the
    tokenized text are italicised.
    """
    words = utils.tokenize(texts[text_name])

    # Indices of the K largest scores, largest first, mapped to font sizes.
    prediction = posterior_predictions[text_name]
    top_indices = numpy.flipud(prediction.argsort())[:K]
    p = dict((k, f(prediction[k])) for k in top_indices)

    # Second \fontsize argument (baseline skip): one more than the largest
    # font size, truncated to an integer.
    max_val = int(max(p.values())) + 1

    present_fmt = r'{\fontsize{%2.2f}{%d}\selectfont %s}'
    absent_fmt = r'{\fontsize{%2.2f}{%d}\selectfont \textit{%s}}'

    results = [
        (present_fmt if vocab[k] in words else absent_fmt) % (pk, max_val, vocab[k])
        for k, pk in p.items()
    ]

    # Randomise the word order so that position carries no information.
    shuffle(results)

    return r'''
    \begin{figure}
    \begin{center}
    \fbox{\begin{minipage}[t]{0.45\textwidth}
    {\footnotesize 
    %s
    }
    \centerline{\adfast{3}\adfast{3}\adfast{3}\adfast{3}\adfast{3}\adfast{3}\adfast{3}\adfast{3}\adfast{3}\adfast{3}}
    \begin{center}
    %s
    \end{center}
    \end{minipage}
    }
    \end{center}
    \end{figure}
    ''' % (texts[text_name], '\n'.join(results))

def posteriorprediction2str(text_name):
    """Return the LaTeX figure for `text_name` using the notebook's settings.

    Delegates to `_posteriorprediction2str` with the corpus vocabulary
    (`corpus_data['vocabulary']`), the 50 highest-scoring words, and a font
    size of 8 * ln(1000 * score) for each word.
    """
    # NOTE(review): the constants 8 and 1000 look hand-tuned for legible font
    # sizes; scores below 0.001 would give a negative size -- confirm.
    def font_size(arg):
        return 8*numpy.log(1000*arg)

    return _posteriorprediction2str(text_name,
                                    corpus_data['vocabulary'],
                                    K=50,
                                    f=font_size)

In [162]:
# Write one LaTeX fragment per text, named '<text_name>.tex', into the current
# working directory.  The file is opened (and truncated) before the fragment is
# rendered, so a failure while rendering leaves an empty file behind.
for text_name in texts:
    with open('%s.tex' % text_name, 'w') as f:
        f.write(posteriorprediction2str(text_name))

In [163]:
# Move the generated .tex files into the paper's include directory.
# NOTE(review): the destination is an absolute path on one machine, and the
# glob only matches files whose names start with "text" -- confirm both before
# running this elsewhere.
! mv text*tex /home/andrews/gitdev/papers/aubin/TeX/include/