In text $j$, if there are $n_j$ words, there are $\frac{n_j \cdot (n_j - 1)}{2}$ pairs of words in total (where the pairs need not be consecutive words in the text). Of all these pairs, there are $n_{jk} \cdot n_{jl}$ pairs consisting of words $w_k$ and $w_l$, and $\frac{n_{jk} \cdot (n_{jk} - 1)}{2}$ pairs consisting of word $w_k$ repeated twice, where $n_{jk}$ and $n_{jl}$ are the numbers of times that words $w_k$ and $w_l$, respectively, occur in text $j$.

Across all of the texts in the corpus, the total number of cooccurrences of the words $w_k$ and $w_l$ is 
$$
C_{kl} = \begin{cases}
\sum_{j=1}^J n_{jk} \cdot n_{jl},\quad\text{if $k\neq l$}\\
\tfrac{1}{2} \sum_{j=1}^J n_{jk} \cdot (n_{jk}-1),\quad\text{if $k = l$}
\end{cases}
$$

In [46]:
%matplotlib inline
from __future__ import division

import configobj
import pandas
import numpy
import os
import cPickle as pickle
from scipy import sparse
from scipy.special import digamma
from utils import utils
from utils import datautils
from collections import defaultdict
from utils.datautils import tokenize
from matplotlib import pyplot

from sklearn.preprocessing import normalize

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Remote location of the shared data files and the local directory they are cached in.
url_root = 'http://www.lawsofthought.org/shared'

cache_directory = '../_cache'


def cache_fullpath(path):
    """Return `path` joined onto `cache_directory`."""
    return os.path.join(cache_directory, path)


# Files to fetch, grouped by role. Each entry is a (filename, hex digest) pair.
# NOTE(review): the 64-character digests are presumably SHA-256 checksums that
# utils.curl verifies after download -- confirm against utils.curl.
filenames = {
    'experiment_cfg' : [('Brismo.cfg',
                         '909d9f8de483c4547f26fb4c34b91e12908ab5c144e065dc0fe6c1504b1f22c9')],
    'text-corpus' : [('bnc_texts_78723408_250_500.txt.bz2', 
                      'dd8806f51088f7c8ad6c1c9bfadb6680c44bc5fd411e52970ea9c63596c83d34')],
    'vocabulary' : [('bnc_vocab_49328.txt',
                     '55737507ea9a2c18d26b81c0a446c074c6b8c72dedfa782c763161593e6e3b97')]
}

utils.curl(url_root,
           filenames['experiment_cfg'] + filenames['text-corpus'] + filenames['vocabulary'],
           cache=cache_directory,
           verbose=False)

# Per-text experiment materials, keyed by text name.
memoranda = configobj.ConfigObj(cache_fullpath('Brismo.cfg'))['text_memoranda']

# Read the whitespace-separated vocabulary inside a `with` block so the file
# handle is closed as soon as the contents have been read.
with open(cache_fullpath('bnc_vocab_49328.txt')) as vocab_file:
    vocabulary = vocab_file.read().split()
vocab = datautils.Vocab(vocabulary)

Df = {}
# NOTE(review): this pickle is not among the files fetched by the utils.curl
# call above, so it must already be present in the cache directory. Unpickling
# executes arbitrary code; only load this file if its origin is trusted.
Df['recall'] = pandas.read_pickle(cache_fullpath('brisbane_06b643a_recall_results.pkl'))

# Distinct recalled words that also appear in the vocabulary, in sorted order.
recalled_words = sorted(set(Df['recall']['word'].values).intersection(vocabulary))

In [3]:
def text_to_words(text):
    """Tokenize `text` and keep only the tokens found in `vocab.word2index`.

    Tokens are returned in the order `utils.tokenize` yields them; repeated
    tokens are kept.
    """
    # NOTE(review): the import cell brings in `tokenize` from `utils.datautils`,
    # while this calls `utils.tokenize` -- confirm this is the intended tokenizer.
    kept = []
    for token in utils.tokenize(text):
        if token in vocab.word2index:
            kept.append(token)
    return kept

In [4]:
def get_random_words(K=5000, seed=10101, words=None):
    """Return up to `K` distinct entries drawn pseudo-randomly from a word list.

    Parameters
    ----------
    K : int
        Maximum number of words to return. If the word list has fewer than
        `K` entries, every entry is returned (in shuffled order).
    seed : int
        Seed for the private `numpy.random.RandomState`, so that repeated
        calls with the same arguments return the same words.
    words : indexable sequence, optional
        Sequence to sample from. Defaults to the module-level `vocab.vocab`.

    Returns
    -------
    list
        The entries of `words` at the first `K` positions of a seeded random
        permutation of its indices.
    """
    if words is None:
        words = vocab.vocab
    rng = numpy.random.RandomState(seed)
    # Slice by K: the original hard-coded 5000 here, which ignored the argument.
    return [words[i] for i in rng.permutation(len(words))[:K]]

# Start from a copy of the recalled words, then add every word tied to a memorandum.
all_words = list(recalled_words)  # + get_random_words()

for text_name in memoranda:
    memorandum = memoranda[text_name]

    # 'inwords' and 'outwords' are stored as comma-separated strings.
    inwords = memorandum['inwords'].split(',')
    outwords = memorandum['outwords'].split(',')
    # In-vocabulary tokens of the memorandum text itself.
    text_words = text_to_words(memorandum['text'])

    all_words.extend(inwords)
    all_words.extend(outwords)
    all_words.extend(text_words)

# De-duplicate, drop anything outside the vocabulary, and sort.
all_words = sorted(set(all_words).intersection(vocab.vocab))

In [5]:
# Co-occurrence object for the BNC text corpus file, restricted to `vocab`.
# NOTE(review): presumably the counts are computed once and then stored under
# `cache_directory` -- confirm against datautils.Cooccurrences.
cooccurrences = datautils.Cooccurrences(
    'bnc_texts_78723408_250_500.txt.bz2',
    vocab=vocab,
    cache=cache_directory
)

In [6]:
# Scale each row of the co-occurrence matrix by its L1 norm.
P = normalize(cooccurrences.C, axis=1, norm='l1')

# Sanity check: every row of P must sum to one.
assert numpy.allclose(P.sum(axis=1), 1.0)

In [9]:
# Per-column mean of P and of its element-wise square, each flattened to a
# one-dimensional array.
p_mean = numpy.asarray(P.mean(axis=0)).flatten()
p_squared_mean = numpy.asarray(P.power(2).mean(axis=0)).flatten()

In [47]:
# Per-column ratio (E[p] - E[p^2]) / (E[p^2] - E[p]^2) built from the moments above.
# NOTE(review): this has the form of the method-of-moments estimate of a
# Dirichlet concentration (sum of parameters) -- confirm that is the intent.
a = (p_mean - p_squared_mean) / (p_squared_mean - numpy.square(p_mean))

In [59]:
# Geometric mean of the per-column estimates: exp of the mean of the logs.
numpy.exp(numpy.mean(numpy.log(a)))

2102.8007198751156