# Preprocessing experimental data prior to analysis

Download raw data for experiment *Brisbane* and process it to create two data-frames: one for recognition data and one for recall data.

In [1]:
import os
import string
import cPickle as pickle

from utils import processing, utils

In [2]:
# Remote location of the shared files and the local directory they are
# cached in.
url_root = 'http://www.lawsofthought.org/shared'

cache_directory = '_cache'

# (file name, hex digest) pairs handed to utils.curl.
# NOTE(review): the 64-character digest is presumably a SHA-256 checksum used
# to validate the download -- confirm against utils.curl.
fake_subject_file = [('fake_subject_uids.txt',
                      '04bfa8c11b999b371f24ca907c314d43064e42c23a1e0aa2025c797a4d454b66')]

utils.curl(url_root,
           fake_subject_file,
           cache=cache_directory,
           verbose=False)

# Derive the cached file's path from the values passed to utils.curl instead
# of repeating them as a literal, so that changing cache_directory or the
# file name cannot leave this path pointing at a stale location.
fake_subject_path = os.path.join(cache_directory, fake_subject_file[0][0])

processing.fake_subject_uids = processing.get_fake_subject_uids(fake_subject_path)

In [3]:
# Retrieve the raw data for the experiment through the project's processing
# helper; the result is indexed like a nested dict/list structure below.
data = processing.get_data('https://data.cognitionexperiments.org/06b643a')

In [4]:
# Take the sessions recorded under the first entry of 'ExperimentVersions'.
# NOTE(review): this assumes the first experiment version is the only one of
# interest -- confirm if the data can contain several versions.
sessions = data['ExperimentVersions'][0]['Sessions']

In [5]:
# One data-frame per task, keyed by task name, both built from the same
# sessions.
Df = {
    'recognition': processing.get_textrecognition_data(sessions),
    'recall': processing.get_textrecall_data(sessions),
}

In [6]:
# Preview the first rows of the raw recall data-frame.
Df['recall'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word
0,186a069,4ba33f7,29,Male,a961164,True,11,60.165,Apparently
1,186a069,4ba33f7,29,Male,a961164,True,11,60.165,There
2,186a069,4ba33f7,29,Male,a961164,True,11,60.165,is
3,186a069,4ba33f7,29,Male,a961164,True,11,60.165,no
4,186a069,4ba33f7,29,Male,a961164,True,11,60.165,case


In [7]:
# Preview the first rows of the raw recognition data-frame.
Df['recognition'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word,expected,order,hit,response,correct,rt
0,186a069,4ba33f7,29,Male,9e+182,True,45,62.805,purple,True,0,True,True,True,1.002
1,186a069,4ba33f7,29,Male,9e+182,True,45,62.805,tastefully,False,1,True,False,True,0.917
2,186a069,4ba33f7,29,Male,9e+182,True,45,62.805,cataract,True,2,True,True,True,1.199
3,186a069,4ba33f7,29,Male,9e+182,True,45,62.805,sack,True,3,True,True,True,0.71
4,186a069,4ba33f7,29,Male,9e+182,True,45,62.805,relic,False,4,True,False,True,1.04


## Process the recognition data

In [8]:
# Keep only the rows where 'hit' is True.  Copy the result so that the column
# added below is written to an independent frame rather than to a possible
# view of the raw data (avoids pandas' SettingWithCopyWarning).
recognition = Df['recognition'].query('hit == True').copy()

# Sanity check: on every remaining row, 'correct' must equal
# (expected == response).  Compared column-wise instead of row by row.
is_consistent = (recognition['expected'] == recognition['response']) == recognition['correct']
assert is_consistent.all()

# Label each row '<text>-<word>', e.g. '45-purple'.
recognition['stimulus'] = recognition['text'].astype(str) + '-' + recognition['word']

# Retain only the columns used by the downstream analysis, in this order.
Df['recognition'] = recognition[['subject', 'slide', 'stimulus', 'text', 'word',
                                 'expected', 'response', 'correct', 'rt']]

In [9]:
# Preview the processed recognition data-frame.
Df['recognition'].head()

Unnamed: 0,subject,slide,stimulus,text,word,expected,response,correct,rt
0,4ba33f7,9e+182,45-purple,45,purple,True,True,True,1.002
1,4ba33f7,9e+182,45-tastefully,45,tastefully,False,False,True,0.917
2,4ba33f7,9e+182,45-cataract,45,cataract,True,True,True,1.199
3,4ba33f7,9e+182,45-sack,45,sack,True,True,True,0.71
4,4ba33f7,9e+182,45-relic,45,relic,False,False,True,1.04


In [10]:
# Save the processed recognition data in the cache directory.  The path is
# built from cache_directory so it stays in step with where the downloads are
# cached.
Df['recognition'].to_pickle(
    os.path.join(cache_directory, 'brisbane_06b643a_recognition_results.pkl'))

## Process the recall data

In [11]:
# Preview the recall data-frame before processing.
Df['recall'].head()

Unnamed: 0,session,subject,age,sex,slide,completed,text,readingtime,word
0,186a069,4ba33f7,29,Male,a961164,True,11,60.165,Apparently
1,186a069,4ba33f7,29,Male,a961164,True,11,60.165,There
2,186a069,4ba33f7,29,Male,a961164,True,11,60.165,is
3,186a069,4ba33f7,29,Male,a961164,True,11,60.165,no
4,186a069,4ba33f7,29,Male,a961164,True,11,60.165,case


In [12]:
# Retain only the columns used by the downstream analysis; copy so the
# assignment below does not write into the raw frame.
recall = Df['recall'][['session', 'subject', 'slide', 'text', 'word']].copy()

# Lower-case every recalled word.  Series.str.lower() replaces
# map(string.lower, ...), which only runs on Python 2 (string.lower was
# removed in Python 3, where map() also returns an iterator, not a list).
# Note: non-string entries become NaN here instead of raising.
recall['word'] = recall['word'].str.lower()

Df['recall'] = recall

In [13]:
# Preview the processed recall data-frame.
Df['recall'].head()

Unnamed: 0,session,subject,slide,text,word
0,186a069,4ba33f7,a961164,11,apparently
1,186a069,4ba33f7,a961164,11,there
2,186a069,4ba33f7,a961164,11,is
3,186a069,4ba33f7,a961164,11,no
4,186a069,4ba33f7,a961164,11,case


In [14]:
# Save the processed recall data in the cache directory.  The path is built
# from cache_directory so it stays in step with where the downloads are
# cached.
Df['recall'].to_pickle(
    os.path.join(cache_directory, 'brisbane_06b643a_recall_results.pkl'))