In [1]:
import cPickle as pickle

import configobj
import numpy
import pandas

from utils import utils
from utils import datautils
from utils.datautils import tokenize

In [6]:
# Base URL passed to utils.curl as the location of the shared files.
url_root = 'http://www.lawsofthought.org/shared'

# Directory name passed to utils.curl as its `cache` argument.
cache_directory = '_cache'

# Files needed by this notebook, grouped by role. Each value is a list of
# (filename, hex string) pairs; the hex string is presumably a checksum that
# utils.curl verifies -- TODO confirm against utils.curl.
filenames = {}
filenames['experiment_cfg'] = [(
    'Brismo.cfg',
    '909d9f8de483c4547f26fb4c34b91e12908ab5c144e065dc0fe6c1504b1f22c9',
)]
filenames['text-corpus'] = [(
    'bnc_texts_78723408_250_500.txt.bz2',
    'dd8806f51088f7c8ad6c1c9bfadb6680c44bc5fd411e52970ea9c63596c83d34',
)]
filenames['vocabulary'] = [(
    'bnc_vocab_49328.txt',
    '55737507ea9a2c18d26b81c0a446c074c6b8c72dedfa782c763161593e6e3b97',
)]

# Flatten the groups into a single list, in a fixed order, for one curl call.
requested_files = []
for label in ('experiment_cfg', 'text-corpus', 'vocabulary'):
    requested_files.extend(filenames[label])

utils.curl(url_root, requested_files, cache=cache_directory, verbose=False)

In [7]:
# Parse the experiment configuration file and keep only its
# 'text_memoranda' section. Later cells index each entry of this section by
# the keys 'inwords', 'outwords' and 'text'.
experiment_cfg = configobj.ConfigObj('_cache/Brismo.cfg')
memoranda = experiment_cfg['text_memoranda']

In [8]:
# Read the vocabulary file as a list of whitespace-separated tokens. The
# `with` block closes the file handle as soon as the contents are read,
# instead of leaving it open until garbage collection.
with open('_cache/bnc_vocab_49328.txt') as vocabulary_file:
    vocabulary = vocabulary_file.read().split()

# Wrap the word list in the project's Vocab helper; text_to_words() relies on
# its `word2index` attribute for membership tests.
vocab = datautils.Vocab(vocabulary)

In [9]:
# Load the recall results table from the cache directory.
# NOTE(review): this pickle is not among the files requested through
# utils.curl in this notebook -- presumably it must already be present in
# '_cache'; confirm where it comes from.
Df = {'recall': pandas.read_pickle('_cache/brisbane_06b643a_recall_results.pkl')}

# Distinct entries of the 'word' column that also appear in `vocabulary`,
# in sorted order.
distinct_recalled = set(Df['recall']['word'].values)
recalled_words = sorted(distinct_recalled.intersection(vocabulary))

In [10]:
def text_to_words(text):
    """Tokenize `text` and keep only the tokens known to `vocab`.

    Tokens are produced by ``utils.tokenize(text)``; a token is kept when it
    is a member of ``vocab.word2index``. The order of the tokens, and any
    repeats among them, are preserved.

    NOTE(review): ``tokenize`` is also imported directly from
    ``utils.datautils`` at the top of this notebook but is not used here --
    confirm that ``utils.tokenize`` is the intended function.

    Returns:
        list of the retained tokens.
    """
    known_tokens = []
    for token in utils.tokenize(text):
        if token in vocab.word2index:
            known_tokens.append(token)
    return known_tokens

In [11]:
# Start from a copy of the recalled words so `recalled_words` is not mutated,
# then append, for every memorandum, its comma-separated 'inwords' and
# 'outwords' plus the in-vocabulary tokens of its 'text'.
# Duplicates are kept; no de-duplication is performed here.
all_words = list(recalled_words)
for text_name in memoranda:
    memorandum = memoranda[text_name]

    inwords = memorandum['inwords'].split(',')
    outwords = memorandum['outwords'].split(',')
    text_words = text_to_words(memorandum['text'])

    all_words.extend(inwords)
    all_words.extend(outwords)
    all_words.extend(text_words)

In [12]:
# Build the project's Cooccurrences object from the cached corpus file,
# passing the full vocabulary and the collected target words.
# NOTE(review): `cooccurrences` is not referenced again in the visible cells;
# the conditional probabilities used later are read from a pickle instead --
# presumably they were derived from this object in a cell that is no longer
# present. Confirm.
corpus_filename = 'bnc_texts_78723408_250_500.txt.bz2'
cooccurrences = datautils.Cooccurrences(
    corpus_filename,
    cache='_cache',
    vocabulary_list=vocabulary,
    target_words=all_words
)

In [14]:
# Load the precomputed conditional probabilities; later cells look values up
# with (target_word, prime_word) tuples as keys.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source. It is also not produced by any visible cell of this
# notebook -- confirm how it is generated.
with open('_cache/conditional_probabilities.pkl', 'rb') as pickled:
    conditional_probabilities = pickle.load(pickled)

In [68]:
# For every memorandum text, score each candidate word (its 'inwords', its
# 'outwords' and all recalled words) by the arithmetic mean, over the text's
# in-vocabulary tokens, of conditional_probabilities[(candidate, token)].
# Result layout: p[text_name][target_word] -> mean value.
p = {}
for text_name in memoranda:
    memorandum = memoranda[text_name]

    inwords = memorandum['inwords'].split(',')
    outwords = memorandum['outwords'].split(',')
    prime_words = text_to_words(memorandum['text'])

    scores = p[text_name] = {}
    for target_word in inwords+outwords+recalled_words:
        # Start the sum at 0.0 so the accumulation is carried out in floats,
        # in the same left-to-right order as an explicit running total.
        total = sum((conditional_probabilities[(target_word, prime_word)]
                     for prime_word in prime_words), 0.0)
        # Raises ZeroDivisionError if the text has no in-vocabulary tokens.
        scores[target_word] = total / len(prime_words)

In [160]:
# Export the per-text scores of the recalled words as a CSV table.
#
# This cell needs `numpy`, which must be imported in the notebook's import
# cell for a fresh-kernel run to succeed. Distinct names are used for the
# matrix rows and the output file handle so no name is rebound to an
# unrelated kind of object within the cell.

# Order the texts by the integer that follows the first underscore in their name.
text_names = sorted(p.keys(), key=lambda arg: int(arg.split('_')[1]))

# One row per text (in `text_names` order), one column per recalled word.
F = numpy.array([[p[text_name][word] for word in recalled_words]
                 for text_name in text_names])

# Append a final column holding 1 minus each row's sum, labelled
# 'ALTERNATIVE_WORD' in the header below.
F = numpy.c_[F, 1-F.sum(1)]

header = ','.join(recalled_words + ['ALTERNATIVE_WORD'])

# Assemble the CSV text: a header line, then one line per text consisting of
# the text name followed by its row of values. Lines are joined with '\n'
# and no trailing newline is written.
M = [header]
for text_name, row in zip(text_names, F):
    M.append(text_name + ',' + ','.join(map(str, row)))
M = '\n'.join(M)

with open('_cache/cooccurrences_predictions_of_recalled_words.csv', 'w') as csv_file:
    csv_file.write(M)