# Experimental data preprocessing - second stage

Integrate the recall and recognition data sets with the predictions of the three models (Bayesian language model, word associations from cooccurrences model, word associations from word association norms model).

This process is quick. Total running time is around 30 seconds.

In [1]:
from __future__ import division

# Standard library imports
import os
import cPickle as pickle

# Third party imports
import pandas
import numpy
import configobj

# Local imports
from utils import datautils
from utils import topicmodels
from utils import utils

In [2]:
# Directory holding every input and output file used in this notebook.
cache_directory = '../cache'


def cache_fullpath(path):
    '''Return `path` joined onto `cache_directory`.'''
    return os.path.join(cache_directory, path)

In [3]:
# Each entry maps a logical name to a list of (cache file name, checksum)
# pairs, the form handed to `utils.verify_cache_files` below.
filenames = {
    'experiment_cfg': [
        ('Brismo.cfg',
         '909d9f8de483c4547f26fb4c34b91e12908ab5c144e065dc0fe6c1504b1f22c9'),
    ],
    'vocabulary': [
        ('bnc_vocab_49324.txt',
         'ecf66c77121cf67e416580cf5cc0853bd1813dcfd946298723134e547324cb6b'),
    ],
    'recall_results': [
        ('brisbane_06b643a_recall_results.pkl',
         'a94d812373123b9a8b1eac848276e8ffc6a563ebca71ff2bf5adc97c825cbc14'),
    ],
    'recognition_results': [
        ('brisbane_06b643a_recognition_results.pkl',
         'e5680ff9853133af8f4d6d7d96382ee7d1698748289b0c77a2ca20fb123c71c3'),
    ],
    'cooccurrence_predictions': [
        ('word_associates_from_cooccurrence_statistics.pkl',
         'efbcb9bae13142296ed164335313f69e380ec49811e885ce3bc4351a10cd2889'),
    ],
    'posterior_predictions': [
        ('posterior_predictions.2290.20202.pkl',
         'e0941816a08af95379a291af9df93885fcae613e068cc6f6e051e39b78cf2742'),
    ],
    'association_predictions': [
        ('word_associates_from_association_norms.pkl',
         'd40823d2ed0527b3164703ec828869af4e6613dc43a4645df8ee0124f70fc364'),
    ],
    'corpus_data': [
        ('bnc_texts_78639361_183975_251_499.npz',
         '976d2ba53ecbacd092df21c4c04adf47d033fec3901e884cce69ca66ec280831'),
    ],
}

# Flatten the entries into a single list, in a fixed order, and verify
# them all against the cache directory in one call.
files_to_verify = []
for cache_key in ('experiment_cfg',
                  'vocabulary',
                  'recall_results',
                  'recognition_results',
                  'cooccurrence_predictions',
                  'posterior_predictions',
                  'association_predictions',
                  'corpus_data'):
    files_to_verify.extend(filenames[cache_key])

utils.verify_cache_files(files_to_verify,
                         cache=cache_directory,
                         verbose=False)

## Load up vocab

In [4]:
# Read the whitespace-delimited vocabulary file. The `with` block closes
# the file handle as soon as the contents have been read, instead of
# leaving it open until garbage collection.
with open(cache_fullpath(filenames['vocabulary'][0][0])) as vocabulary_file:
    vocabulary = vocabulary_file.read().split()

# Wrap the word list in the project's Vocab helper.
vocab = datautils.Vocab(vocabulary)

## Load up recall data

In [5]:
# `Df` holds the behavioural data frames, keyed by task name.
recall_results_file = filenames['recall_results'][0][0]
Df = {'recall': pandas.read_pickle(cache_fullpath(recall_results_file))}

In [6]:
# Preview the first rows of the recall data as loaded.
Df['recall'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,accuracy,response
0,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,apparently,True,11-apparently
1,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,there,True,11-there
2,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,is,True,11-is
3,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,no,True,11-no
4,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,case,True,11-case


Filter the recall data frame, keeping only the responses where the recalled word is in the vocabulary.

In [7]:
# Boolean mask: True where the recalled word is in the vocabulary.
# `isin` hashes the vocabulary once rather than scanning the whole word
# list for every row.
I = Df['recall']['word'].isin(vocabulary)

# Keep only in-vocabulary responses. The explicit copy makes the result an
# independent frame, so the prediction columns added in later cells are not
# written into a filtered slice of the original.
Df['recall'] = Df['recall'][I].copy()

In [8]:
# Preview the recall data after the vocabulary filter.
Df['recall'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,accuracy,response
0,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,apparently,True,11-apparently
5,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,law,True,11-law
9,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,directive,True,11-directive
11,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,faith,True,11-faith
12,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,family,True,11-family


## Load up recognition data

In [9]:
# Load the pickled recognition results next to the recall results in `Df`.
recognition_results_file = filenames['recognition_results'][0][0]
Df['recognition'] = pandas.read_pickle(cache_fullpath(recognition_results_file))

Df['recognition'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,expected,order,hit,response,correct,rt,stimulus
0,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,purple,True,0,True,True,True,1.002,45-purple
1,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,tastefully,False,1,True,False,True,0.917,45-tastefully
2,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,cataract,True,2,True,True,True,1.199,45-cataract
3,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,sack,True,3,True,True,True,0.71,45-sack
4,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,relic,False,4,True,False,True,1.04,45-relic


## Load up cooccurrence probabilities


In [10]:
# Pickle files hold binary data, so open in 'rb' mode, as the other
# pickle loads in this notebook do.
# NOTE: unpickling executes code embedded in the file; this file is among
# those passed to `utils.verify_cache_files` at the top of the notebook.
with open(cache_fullpath(filenames['cooccurrence_predictions'][0][0]), 'rb') as f:
    cooccurrence_predictions = pickle.load(f)

Confirm that for every recalled word or recognition stimulus, we have a predicted probability. We ignore here the word 'dhow', which was one of the recognition memory test items for text 23. I'm not sure how that got in, given that we aimed to restrict the test items to words in a standard vocabulary test.

In [11]:
def check_predictions_items(Df, predictions, ignored_words=(u'dhow',)):

    '''Confirm that every recalled word and every recognition test word
    has a predicted probability.

    Df : dict with 'recall' and 'recognition' entries, each having a
        'word' column.
    predictions : dict keyed by '<text id>-<word>' strings.
    ignored_words : recognition words allowed to have no prediction.
        Defaults to 'dhow', one of the recognition memory test items for
        text 23, which is not covered by the predictions.

    Only the word part of each key is compared; the text id is not
    matched against the text the word was recalled from or tested on.

    Returns True when every word is covered. Raises AssertionError, with
    the offending word as its message, otherwise. Raises ValueError if a
    key has no hyphen or its text id is not an integer.'''

    predicted_words = set()
    for key in predictions:
        # Split on the first hyphen only, so that hyphenated words keep
        # their full form instead of breaking the two-way unpacking.
        text_id, word = key.split('-', 1)
        int(text_id)  # validation only: rejects non-integer text ids
        predicted_words.add(word)

    # Explicit raises (rather than `assert` statements) keep the checks
    # active when Python runs with -O.
    for word in Df['recall']['word']:
        if word not in predicted_words:
            raise AssertionError(word)

    for word in Df['recognition']['word']:
        if word not in predicted_words and word not in ignored_words:
            raise AssertionError(word)

    return True
            
# Stop here if any recalled word or recognition test word (other than
# 'dhow') has no cooccurrence prediction.
assert check_predictions_items(Df, cooccurrence_predictions)

Add the cooccurrence predictions to the recall data frame.

In [12]:
# Look up each recall response's 'text-word' key; keys with no prediction
# yield None.
recall_responses = Df['recall']['response']
Df['recall']['cooccurrence.predictions'] = [
    cooccurrence_predictions.get(response) for response in recall_responses
]

Df['recall'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,accuracy,response,cooccurrence.predictions
0,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,apparently,True,11-apparently,0.000236
5,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,law,True,11-law,0.002569
9,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,directive,True,11-directive,0.001037
11,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,faith,True,11-faith,0.000398
12,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,family,True,11-family,0.001191


Do the same for the recognition results.

In [13]:
# Look up each recognition stimulus's 'text-word' key; keys with no
# prediction yield None.
recognition_stimuli = Df['recognition']['stimulus']
Df['recognition']['cooccurrence.predictions'] = [
    cooccurrence_predictions.get(stimulus) for stimulus in recognition_stimuli
]

Df['recognition'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,expected,order,hit,response,correct,rt,stimulus,cooccurrence.predictions
0,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,purple,True,0,True,True,True,1.002,45-purple,9e-05
1,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,tastefully,False,1,True,False,True,0.917,45-tastefully,1e-05
2,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,cataract,True,2,True,True,True,1.199,45-cataract,1.1e-05
3,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,sack,True,3,True,True,True,0.71,45-sack,0.000287
4,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,relic,False,4,True,False,True,1.04,45-relic,1.3e-05


## Load the model's posterior predictions

In [14]:
# Unpickle the Bayesian model's posterior predictions from the cache.
posterior_predictions_path = cache_fullpath(filenames['posterior_predictions'][0][0])
with open(posterior_predictions_path, 'rb') as posterior_file:
    posterior_predictions = pickle.load(posterior_file)

Now, we have to extract the posterior probability of each 'text-word' stimulus item from the `posterior_predictions` matrix. Remember, the `posterior_predictions` matrix is $J \times V$, where $J$ and $V$ are the number of texts and words in the vocabulary, respectively. We'll then check that we have posterior predictions for all items in the behavioural data.

In [15]:
corpus_data = numpy.load(cache_fullpath(filenames['corpus_data'][0][0]))

# Read the vocabulary array from the archive once and build both lookup
# directions from it.
corpus_vocabulary = corpus_data['vocabulary']
word2index = {w:i for i,w in enumerate(corpus_vocabulary)}
index2word = {i:w for i,w in enumerate(corpus_vocabulary)}

# Every distinct 'text-word' item in the behavioural data.
behavioural_items = list(Df['recall']['response'].unique())\
                  + list(Df['recognition']['stimulus'].unique())

posterior_predictions_items = {}
for item in behavioural_items:

    # Split on the first hyphen only, so that hyphenated words keep their
    # full form instead of breaking the two-way unpacking.
    n, word = item.split('-', 1)

    try:

        w_i = word2index[word]
        # Items number their texts from 1; the prediction keys
        # ('text_0', 'text_1', ...) are shifted down by one.
        text_id = 'text_%d' % (int(n)-1)
        posterior_predictions_items[item] = posterior_predictions[text_id][w_i]

    except KeyError:
        # The only item allowed to be missing is 'dhow' for text 23.
        assert item == '23-dhow', item

assert check_predictions_items(Df, posterior_predictions_items)

Add the posterior predictions to the recall memory data set.

In [16]:
# Attach the posterior prediction for each recall response; items with no
# prediction yield None.
recall_responses = Df['recall']['response']
Df['recall']['posterior.predictions'] = [
    posterior_predictions_items.get(response) for response in recall_responses
]

Df['recall'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,accuracy,response,cooccurrence.predictions,posterior.predictions
0,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,apparently,True,11-apparently,0.000236,0.000285
5,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,law,True,11-law,0.002569,0.019769
9,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,directive,True,11-directive,0.001037,0.0037
11,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,faith,True,11-faith,0.000398,0.00077
12,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,family,True,11-family,0.001191,0.001051


And do the same for the recognition memory data.

In [17]:
# Attach the posterior prediction for each recognition stimulus; items
# with no prediction yield None.
recognition_stimuli = Df['recognition']['stimulus']
Df['recognition']['posterior.predictions'] = [
    posterior_predictions_items.get(stimulus) for stimulus in recognition_stimuli
]

Df['recognition'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,expected,order,hit,response,correct,rt,stimulus,cooccurrence.predictions,posterior.predictions
0,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,purple,True,0,True,True,True,1.002,45-purple,9e-05,0.000665
1,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,tastefully,False,1,True,False,True,0.917,45-tastefully,1e-05,1e-06
2,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,cataract,True,2,True,True,True,1.199,45-cataract,1.1e-05,5.9e-05
3,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,sack,True,3,True,True,True,0.71,45-sack,0.000287,0.000239
4,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,relic,False,4,True,False,True,1.04,45-relic,1.3e-05,4e-06


## Load up the associations data

In [18]:
# Unpickle the association-norms predictions from the cache.
association_predictions_path = cache_fullpath(filenames['association_predictions'][0][0])
with open(association_predictions_path, 'rb') as associations_file:
    associations = pickle.load(associations_file)

In [19]:
# Stop here if any recalled word or recognition test word (other than
# 'dhow') has no association prediction.
assert check_predictions_items(Df, associations)

Add the association predictions to the recall and recognition data frames.

In [20]:
# Attach the association prediction for each recall response; keys with
# no prediction yield None.
recall_responses = Df['recall']['response']
Df['recall']['association.predictions'] = [
    associations.get(response) for response in recall_responses
]

Df['recall'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,accuracy,response,cooccurrence.predictions,posterior.predictions,association.predictions
0,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,apparently,True,11-apparently,0.000236,0.000285,3e-06
5,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,law,True,11-law,0.002569,0.019769,0.003213
9,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,directive,True,11-directive,0.001037,0.0037,4e-06
11,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,faith,True,11-faith,0.000398,0.00077,0.000123
12,186a069,4ba33f7,29,Male,96a7502,True,11,60.165,family,True,11-family,0.001191,0.001051,0.001264


In [21]:
# Attach the association prediction for each recognition stimulus; keys
# with no prediction yield None.
recognition_stimuli = Df['recognition']['stimulus']
Df['recognition']['association.predictions'] = [
    associations.get(stimulus) for stimulus in recognition_stimuli
]

Df['recognition'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,expected,order,hit,response,correct,rt,stimulus,cooccurrence.predictions,posterior.predictions,association.predictions
0,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,purple,True,0,True,True,True,1.002,45-purple,9e-05,0.000665,0.00108761
1,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,tastefully,False,1,True,False,True,0.917,45-tastefully,1e-05,1e-06,4.035587e-07
2,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,cataract,True,2,True,True,True,1.199,45-cataract,1.1e-05,5.9e-05,2.421352e-06
3,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,sack,True,3,True,True,True,0.71,45-sack,0.000287,0.000239,0.0002474474
4,186a069,4ba33f7,29,Male,d69884d,True,45,62.805,relic,False,4,True,False,True,1.04,45-relic,1.3e-05,4e-06,6.658718e-06


## Write new data sets 

In [22]:
# Write the recall data frame to the cache as CSV, then confirm that the
# written file has the expected checksum.
recall_results_filename = 'experiment_brisbane_recall_memory_tests_results.csv'
recall_results_path = cache_fullpath(recall_results_filename)
Df['recall'].to_csv(recall_results_path)
recall_results_checksum = utils.checksum(recall_results_path)
assert recall_results_checksum == '0efd9b0f02fe0e963864ab798113a033a7f3ea6058cf8e77540d4d12f6fe1f74'

# Same again for the recognition data frame.
recognition_results_filename = 'experiment_brisbane_recognition_memory_tests.csv'
recognition_results_path = cache_fullpath(recognition_results_filename)
Df['recognition'].to_csv(recognition_results_path)
recognition_results_checksum = utils.checksum(recognition_results_path)
assert recognition_results_checksum == 'e8094c1d5fea02a4be66342ab4f83390d13004c41afdf607cdfafcb745fb0a6f'