# Calculate predicted probabilities based on co-occurrence probabilities

We aim to calculate 
$$
\mathrm{P}(w_k \vert w_l) \triangleq \frac{ \mathrm{P}(w_k,w_l) }{\mathrm{P}(w_l)}, 
$$
which is the empirical probability of observing word $w_k$ in some linguistic context, e.g., a short text, given that we've observed $w_l$ there.

What is the probability of finding word $w_k$ and word $w_l$ in the same context, e.g. the same text? If we were to choose a text at random, and then choose a pair of words at random from that text, what is the probability of choosing the pair $w_k$ and $w_l$? If we call the randomly chosen text $j$, the probability of choosing the pair $w_k$ and $w_l$ at random is 
$$
\mathrm{P}(w_k,w_l\vert \textrm{text}=j) = \frac{2 n_{jk} n_{jl}}{n_j (n_j-1)} 
$$
where $n_{jk}$ is the number of occurrences of word $w_k$ in text $j$, $n_{jl}$ is the number of occurrences of word $w_l$ in text $j$, and $n_j$ is the total number of words in text $j$. If $w_k = w_l$ then the numerator above is $n_{jk} (n_{jk}-1)$. The total number of (ordered) pairs in the text is $n_j (n_j-1)$ and, of these, $2 n_{jk} n_{jl}$ consist of the words $w_k$ and $w_l$.  The total number of pairs in the corpus of $J$ documents is
$$
\sum_{j=1}^J n_j (n_j-1)
$$
and the total number of pairs in the corpus that are $w_k$ and $w_l$ is
$$
\sum_{j=1}^J 2 n_{jk} n_{jl} 
$$
and so
$$
\mathrm{P}(w_k,w_l) = \frac{ \sum_{j=1}^J 2 n_{jk} n_{jl} }{ \sum_{j=1}^J n_j (n_j-1) } 
$$

The total number of word pairs in text $j$ that contain $w_k$ is given below; summing this count over all texts and dividing by the total number of pairs in the corpus gives $\mathrm{P}(w_k)$:
$$
\sum_{l\neq k} 2 n_{jk} n_{jl} + n_{jk} (n_{jk}-1) = 2 n_{jk} (n_j-n_{jk}) + n_{jk} (n_{jk}-1)
$$


In [2]:
import configobj
import pandas
import cPickle as pickle
from utils import utils
from utils import datautils
from utils.datautils import tokenize

In [3]:
# Remote location of the shared data files and the local directory used as
# their download cache.
url_root = 'http://www.lawsofthought.org/shared'
cache_directory = '_cache'

# Files needed by this notebook, grouped by role. Each entry is a list of
# (filename, hex digest) pairs; the digests are presumably checksums verified
# by utils.curl -- confirm against that helper.
filenames = {
    'experiment_cfg': [
        ('Brismo.cfg',
         '909d9f8de483c4547f26fb4c34b91e12908ab5c144e065dc0fe6c1504b1f22c9'),
    ],
    'text-corpus': [
        ('bnc_texts_78723408_250_500.txt.bz2',
         'dd8806f51088f7c8ad6c1c9bfadb6680c44bc5fd411e52970ea9c63596c83d34'),
    ],
    'vocabulary': [
        ('bnc_vocab_49328.txt',
         '55737507ea9a2c18d26b81c0a446c074c6b8c72dedfa782c763161593e6e3b97'),
    ],
}

# Hand all three groups to utils.curl in a single call.
required_files = (filenames['experiment_cfg']
                  + filenames['text-corpus']
                  + filenames['vocabulary'])
utils.curl(url_root, required_files, cache=cache_directory, verbose=False)

In [4]:
# Parse the experiment configuration and keep only its 'text_memoranda'
# section. Later cells index it by text name and read the 'inwords',
# 'outwords' and 'text' entries of each item.
experiment_cfg = configobj.ConfigObj('_cache/Brismo.cfg')
memoranda = experiment_cfg['text_memoranda']

In [5]:
# Read the whitespace-separated vocabulary list. The context manager closes
# the file handle as soon as the contents are read instead of leaving it open
# until garbage collection.
with open('_cache/bnc_vocab_49328.txt') as vocabulary_file:
    vocabulary = vocabulary_file.read().split()

# Project helper built from the word list; later cells rely on its
# ``word2index`` attribute for membership tests.
vocab = datautils.Vocab(vocabulary)

In [6]:
# Data frames used by this notebook, keyed by name; only the recall results
# are loaded here.
Df = {'recall': pandas.read_pickle('_cache/brisbane_06b643a_recall_results.pkl')}

# Distinct entries of the 'word' column that also occur in the vocabulary,
# in sorted order.
recalled_in_vocabulary = set(Df['recall']['word'].values).intersection(vocabulary)
recalled_words = sorted(recalled_in_vocabulary)

In [7]:
def text_to_words(text):
    """Tokenize ``text`` and keep only the tokens found in the vocabulary.

    The module-level ``tokenize`` (imported from ``utils.datautils``) is used
    rather than ``utils.tokenize``: ``tokenize`` is the name pushed to the
    ipyparallel engines alongside this function, whereas ``utils`` is not, so
    looking it up there would raise ``NameError`` when this function runs
    remotely.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list
        The tokens produced by ``tokenize(text)``, in the same order and with
        duplicates kept, restricted to those contained in
        ``vocab.word2index``.
    """
    # Look the mapping up once rather than once per token.
    word2index = vocab.word2index
    return [word for word in tokenize(text) if word in word2index]

In [8]:
# Pool every word of interest: the recalled words plus, for each memoranda
# text, its 'inwords', its 'outwords' and the in-vocabulary words of the text
# itself. Duplicates are kept.
all_words = list(recalled_words)
for text_name in memoranda:
    memorandum = memoranda[text_name]
    word_groups = (
        memorandum['inwords'].split(','),
        memorandum['outwords'].split(','),
        text_to_words(memorandum['text']),
    )
    for word_group in word_groups:
        all_words += word_group

In [9]:
# Co-occurrence helper over the cached BNC text corpus, set up for the full
# vocabulary and the pooled list of target words.
corpus_filename = 'bnc_texts_78723408_250_500.txt.bz2'
cooccurrences = datautils.Cooccurrences(
    corpus_filename,
    cache='_cache',
    vocabulary_list=vocabulary,
    target_words=all_words,
)

In [10]:
def get_conditional_probabilities(text_name):
    """Compute conditional probabilities for one memoranda text.

    For every in-vocabulary word of the text (the "prime" words) and every
    target word (the text's 'inwords' and 'outwords' plus ``recalled_words``),
    store the value of
    ``cooccurrences.conditional_probability(target_word, prime_word)``.

    Parameters
    ----------
    text_name : str
        Key of the text in ``memoranda``.

    Returns
    -------
    dict
        Maps ``(target_word, prime_word)`` tuples to the value returned by
        ``cooccurrences.conditional_probability``.
    """
    memorandum = memoranda[text_name]
    target_words = (memorandum['inwords'].split(',')
                    + memorandum['outwords'].split(',')
                    + recalled_words)
    prime_words = text_to_words(memorandum['text'])

    conditional_probabilities = {}

    cooccurrences.init()
    try:
        for prime_word in prime_words:
            for target_word in target_words:
                pair = (target_word, prime_word)
                # Words can repeat in the text and across the target lists;
                # a repeated pair would only overwrite the same dict entry,
                # so skip the redundant lookup. This assumes
                # conditional_probability is deterministic for a given pair.
                if pair in conditional_probabilities:
                    continue
                conditional_probabilities[pair] = \
                    cooccurrences.conditional_probability(target_word, prime_word)
    finally:
        # Always pair init() with deinit(), even when a lookup raises, so
        # whatever init() set up is not left behind on the engine.
        cooccurrences.deinit()

    return conditional_probabilities

In [None]:
from ipyparallel import Client

# Connect to the ipyparallel cluster and switch the client to blocking mode.
clients = Client()
clients.block = True

# Globals that get_conditional_probabilities and text_to_words refer to;
# they are copied into the namespace of every engine.
engine_namespace = {
    'memoranda': memoranda,
    'recalled_words': recalled_words,
    'text_to_words': text_to_words,
    'tokenize': tokenize,
    'vocab': vocab,
    'cooccurrences': cooccurrences,
}
clients[:].push(engine_namespace);

# Load-balanced view used to distribute one task per text.
view = clients.load_balanced_view()

Note: The following parallel processing is *very* memory hungry. It requires around 6GB per processor. I am limited to 64GB, so I only use 8 cores to avoid any memory overflow. The whole thing takes around 3hrs to complete.

In [None]:
# Run get_conditional_probabilities once per memoranda text on the
# load-balanced view; the result holds one dict per text.
text_names = memoranda.keys()
_conditional_probabilities = view.map(get_conditional_probabilities, text_names)

In [None]:
# Merge the per-text dictionaries into a single lookup table keyed by
# (target_word, prime_word); later entries overwrite earlier ones that share
# a key.
conditional_probabilities = {}
for per_text_probabilities in _conditional_probabilities:
    for word_pair, probability in per_text_probabilities.items():
        conditional_probabilities[word_pair] = probability

In [None]:
# Persist the merged conditional probabilities in the cache directory using
# pickle protocol 2.
conditional_probabilities_path = '_cache/conditional_probabilities.pkl'
with open(conditional_probabilities_path, 'wb') as output_file:
    pickle.dump(conditional_probabilities, output_file, protocol=2)

In [None]:
# p[text_name][target_word] is the mean, over the in-vocabulary words of the
# text (its prime words, repeats included), of the stored conditional
# probability for the (target_word, prime_word) pair.
p = {}
for text_name in memoranda:
    memorandum = memoranda[text_name]
    target_words = (memorandum['inwords'].split(',')
                    + memorandum['outwords'].split(',')
                    + recalled_words)
    prime_words = text_to_words(memorandum['text'])
    prime_count = len(prime_words)

    text_averages = {}
    p[text_name] = text_averages
    for target_word in target_words:
        total = 0.0
        for prime_word in prime_words:
            total += conditional_probabilities[(target_word, prime_word)]
        text_averages[target_word] = total / prime_count

In [None]:
# Flatten p into a single dict keyed by '<text number>-<word>'.
cooccurrence_predictions = {}
for text_name in memoranda:

    # The text name must split into exactly two parts on '_'; the second is
    # an integer, and the key uses that integer plus one.
    _prefix, text_index = text_name.split('_')
    text_label = str(int(text_index) + 1)

    memorandum = memoranda[text_name]
    target_words = (memorandum['inwords'].split(',')
                    + memorandum['outwords'].split(',')
                    + recalled_words)

    text_probabilities = p[text_name]
    for word in target_words:
        cooccurrence_predictions[text_label + '-' + word] = text_probabilities[word]

In [None]:
# Persist the flattened predictions in the cache directory using pickle
# protocol 2.
cooccurrence_predictions_path = '_cache/cooccurrence_predictions.pkl'
with open(cooccurrence_predictions_path, 'wb') as output_file:
    pickle.dump(cooccurrence_predictions, output_file, protocol=2)