# Data Preprocessing for Experiment

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Behavioural data path
path = r'C:\Users\L\Google Drive\PhD\experiments\ATTMEM\ATTMEM_MRI\data'

In [3]:
start_time = datetime.now() # start clock

files = [f'{path}\\{f}' for f in os.listdir(path) if f.split('.')[-1] == 'log'] # files in list

### Aggregation

In [4]:
print("Aggregating data...")
encoding_files = []
retrieval_files = []

for f in files:
    
    # Parse the filename
    fname = f.split('\\')[-1].split('.')[0]
    fparts = fname.split('_')
    subject_number, exp_version = [int(p) for p in fparts[0].split('-')]
    experiment_phase = fparts[2]
    block_number = int(fparts[-1])
    
    # Get the file write date & time
    with open(f) as fp:
        for i, line in enumerate(fp):
            if i == 1:
                if 'Logfile written' in line:
                    date, time = line.strip().split(' ')[-2:]
                break
             
    df = pd.read_csv(f, sep='\t', skiprows=3)
    
    df['write_time'] = date + ' ' + time
    df['write_time'] = pd.to_datetime(df['write_time'])
    df['version'] = exp_version
    df['phase'] = experiment_phase
    df['block'] = block_number
    
    # Strip out first few rows until first stimulus event
    start_idx = df.loc[(df['Event Type']=='Picture') & (df['Code']!='GETTING READY')].index[0]
    
    # We also need to subtract out the first pulse if it is MRI
    if 'Pulse' in df['Event Type'].unique():
        first_pulse_time = df.loc[df['Event Type'] == 'Pulse']['Time'].values[0]
        df['onset'] = df['Time'] - first_pulse_time
    else:
        df['onset'] = df['Time'] - df.loc[df.index==start_idx]['Time'].values[0]

    
    df = df.iloc[start_idx:]
    
    
    if experiment_phase == 'encoding':
        encoding_files.append(df)
    elif experiment_phase == 'retrieval':
        retrieval_files.append(df)

# Concatenate all files together
df1 = pd.concat(encoding_files)
df2 = pd.concat(retrieval_files)

Aggregating data...


In [5]:
df1.head()

Unnamed: 0,Subject,Trial,Event Type,Code,CONDITION(num),SCREEN_POSITION(num),IMAGE_ID(num),TARGET_NAME(str),DIST_1_NAME(str),DIST_2_NAME(str),...,Uncertainty.1,ReqTime,ReqDur,Stim Type,Pair Index,write_time,version,phase,block,onset
22,4.0,23.0,Picture,"1,3,11,penguin,cardboardbox,jewelrybox01b,straw",1.0,3.0,11.0,penguin,cardboardbox,jewelrybox01b,...,1.0,10000.0,30000.0,hit,25.0,2017-07-17 12:06:58,1,encoding,1,15920.0
23,4.0,23.0,Pulse,99,,,,,,,...,,,,,,2017-07-17 12:06:58,1,encoding,1,25120.0
24,4.0,23.0,Response,1,,,,,,,...,,,,,,2017-07-17 12:06:58,1,encoding,1,40440.0
25,4.0,23.0,Pulse,99,,,,,,,...,,,,,,2017-07-17 12:06:58,1,encoding,1,50080.0
26,4.0,24.0,Pulse,99,,,,,,,...,,,,,,2017-07-17 12:06:58,1,encoding,1,75040.0


In [6]:
df2.head()

Unnamed: 0,Subject,Trial,Event Type,Code,CONDITION(num),IMAGE_NUMBER(num),IMAGE_NAME(str),Time,TTime,Uncertainty,...,Uncertainty.1,ReqTime,ReqDur,Stim Type,Pair Index,write_time,version,phase,block,onset
15,4.0,16.0,Picture,"3,12,cane",3.0,12.0,cane,283597.0,10164.0,1.0,...,1.0,10000.0,25000.0,other,0.0,2017-07-17 12:42:21,1,retrieval,1,15371.0
16,4.0,16.0,Pulse,99,,,,293185.0,19752.0,0.0,...,,,,,,2017-07-17 12:42:21,1,retrieval,1,24959.0
17,4.0,16.0,Pulse,99,,,,318306.0,44872.0,0.0,...,,,,,,2017-07-17 12:42:21,1,retrieval,1,50080.0
18,4.0,17.0,Pulse,99,,,,343265.0,9180.0,0.0,...,,,,,,2017-07-17 12:42:21,1,retrieval,1,75039.0
19,4.0,17.0,Picture,"1,11,bench01",1.0,11.0,bench01,344249.0,10164.0,1.0,...,1.0,10000.0,25000.0,false_alarm,21.0,2017-07-17 12:42:21,1,retrieval,1,76023.0


### Data cleaning

In [7]:
# Fix up the column names
df1.columns = [c.replace(' ', '_').replace('(', '_').replace(')', '').lower() for c in df1.columns]
df2.columns = [c.replace(' ', '_').replace('(', '_').replace(')', '').lower() for c in df2.columns]
df1.columns = [c[:-4] if c[-4:]=='_str' else c for c in df1.columns]
df2.columns = [c[:-4] if c[-4:]=='_str' else c for c in df2.columns]

# Drop the MRI pulse trigs
df1 = df1.loc[df1['event_type'] != 'Pulse']
df2 = df2.loc[df2['event_type'] != 'Pulse']

Extract participant responses and relevant info

In [8]:
# Responses for df1 (encoding)
df1rs = df1.loc[df1['event_type'] == 'Response'][
    ['subject', 'phase', 'block', 'trial', 'code', 'onset', 'ttime']
    ].rename(columns={'onset':'response_time', 'ttime':'response_ttime'})
# Events for df1 (encoding)
df1es = df1.loc[df1['event_type'] == 'Picture']

# Merge them on common columns
df1 = pd.merge(
    df1es, df1rs, how='left', 
    left_on=['subject', 'phase', 'block', 'trial'],
    right_on=['subject', 'phase', 'block', 'trial']).sort_values(by=['subject', 'write_time'])

# Responses for df2 (retrieval)
df2rs = df2.loc[df2['event_type'] == 'Response'][
    ['subject', 'phase', 'block', 'trial', 'code', 'onset', 'ttime']
    ].rename(columns={'onset':'response_time', 'ttime':'response_ttime'})
# Events for df2 (retrieval)
df2es = df2.loc[df2['event_type'] == 'Picture']
# Merge them
df2 = pd.merge(df2es, df2rs, how='left', 
                left_on=['subject', 'phase', 'block', 'trial'],
                right_on=['subject', 'phase', 'block', 'trial']).sort_values(by=['subject', 'write_time'])


Re-reference trial numbers to start from 1

In [9]:
for s in df1['subject'].unique():
    sdf = df1.loc[df1['subject']==s]
    for b in sdf['block'].unique():
        sbdf = sdf.loc[sdf['block']==b]
        to_change = (df1['subject']==s)&(df1['block']==b)
        first = df1.loc[to_change]['trial'].iloc[0]
        df1.loc[to_change, 'trial'] = df1.loc[to_change]['trial'] - (first-1)

for s in df2['subject'].unique():
    sdf = df2.loc[df2['subject']==s]
    for b in sdf['block'].unique():
        sbdf = sdf.loc[sdf['block']==b]
        to_change = (df2['subject']==s)&(df2['block']==b)
        first = df2.loc[to_change]['trial'].iloc[0]
        df2.loc[to_change, 'trial'] = df2.loc[to_change]['trial'] - (first-1)


Re-code condition numbers

In [10]:
df1['condition_num'] = df1['condition_num'].astype(int)
df2['condition_num'] = df2['condition_num'].astype(int)

# Encoding phase
df1['attention_mode'] = np.where(df1['condition_num'] < 3, 'search', 'capture')
df1['target_category'] = np.where(df1['condition_num'] % 2 == 0, 'in', 'out')

# Retrieval phase
# Easier to do a different approach
temp = df2.replace({'condition_num' : { 1:'old_search_target', 2:'old_search_distractor',
                                        3:'old_capture_target', 4:'old_capture_distractor', 5:'new__'}})['condition_num']
temp = temp.str.split('_', expand=True).replace('', np.nan).rename(columns={0:'item_category', 
                                                                            1:'attention_mode', 
                                                                            2:'item_type'})
df2 = df2.merge(right=temp, left_index=True, right_index=True)

df2['response'] = df2['code_y'].astype(float)
df2['response'] = np.where(df2['response'].isin([1,2,3,4,5,6]), df2['response'], np.nan)

# df2['judgement'] = df2['response'].copy()
df2['judgement'] = df2['response'].replace({1: 'high_new',
                                  2: 'mid_new',
                                  3: 'low_new',
                                  4: 'low_old',
                                  5: 'mid_old',
                                  6: 'high_old'})

split = df2['judgement'].str.split('_', expand=True)
df2['confidence'] = split[0]
df2['judgement'] = split[1]

df2.loc[(df2['item_category'] == 'old') & (df2['judgement']=='old'), 'response_class'] = 'hit'
df2.loc[(df2['item_category'] == 'old') & (df2['judgement']=='new'), 'response_class'] = 'miss'
df2.loc[(df2['item_category'] == 'new') & (df2['judgement']=='old'), 'response_class'] = 'false_alarm'
df2.loc[(df2['item_category'] == 'new') & (df2['judgement']=='new'), 'response_class'] = 'correct_reject'

# Dummify response_class
dummies = pd.get_dummies(df2['response_class'])
df2 = pd.concat([df2, dummies], axis=1)

In [11]:
# # Set the block name
df1['block_name'] = df1.replace({'block':{1:'animals', 2:'foods', 3:'techs', 4:'tools'}})['block']
# Sort by write time to get correct order
df1 = df1.sort_values(by=['subject', 'write_time', 'trial']).reset_index(drop=True)
df2 = df2.sort_values(by=['subject', 'write_time', 'trial']).reset_index(drop=True)

Correct the capture time onsets:

The code for the experiment incorrectly logged the onset of capture trials
as the return of the flicker stimulus. This flicker period
began at 150 ms post trial onset, and lasted for 150 ms. Therefore the 
following columns need correcting by 300 ms:
    onset: stimulus onset time relative to start of experiment
    response_time: response time relative to start of experiment
    response_ttime: response time relative to start of trial
    duration: duration of the trials (increase them by 300 ms)

In [12]:
search_mask = df1['attention_mode'] == 'search'
capture_mask = df1['attention_mode'] == 'capture'

example_search = df1.loc[search_mask]['duration'].iloc[0]
example_capture = df1.loc[capture_mask]['duration'].iloc[0]
tdiff = example_search - example_capture
# Fix attention times here
# Correct onset column
df1.loc[capture_mask, 'onset'] = df1.loc[capture_mask]['onset'] - tdiff # 3000 microseconds.
# Correct the duration col
df1.loc[capture_mask, 'duration'] = df1.loc[capture_mask]['duration'] + tdiff
# Correct response_time col
df1.loc[capture_mask, 'response_time'] = df1.loc[capture_mask]['response_time'] - tdiff
# Correct the response_ttime col
df1.loc[capture_mask, 'response_ttime'] = df1.loc[capture_mask]['response_ttime'] - tdiff # INCORRECT!!! MUST BE `+ tdiff`!!!

Estimate the time taken to perform the visual search and store important cols

In [13]:
df1.loc[capture_mask, 'processing_capture'] = df1.loc[capture_mask]['response_ttime']-tdiff
df1['processing_capture'] = df1['processing_capture'].astype(float)
df1['median_capture_rt'] = df1.groupby(
    ['subject', 'block', 'attention_mode']
    )['processing_capture'].transform(np.median)
df1['median_capture_rt'] = df1.groupby(
    ['subject', 'block'])['median_capture_rt'].transform('median')

# Do the same for search trials
df1.loc[search_mask, 'processing_search'] = df1.loc[search_mask]['response_ttime']
df1['processing_search'] = df1['processing_search'].astype(float)
df1['median_search_rt'] = df1.groupby(
    ['subject', 'block', 'attention_mode']
    )['processing_search'].transform(np.median)
df1['median_search_rt'] = df1.groupby(
    ['subject', 'block'])['median_search_rt'].transform('median')

# Collapse them
df1['median_rts'] = np.where(search_mask, df1['median_search_rt'], df1['median_capture_rt'])

# Subtract out the capture response time
df1['search_duration_estimate'] = df1['median_search_rt'] - df1['median_capture_rt']
# and correct the search processing duration
df1['processing_search'] = df1['processing_search'] - df1['search_duration_estimate']
# So now we have the PROCESSING time upon detection of the target...
#   as well as the estimate of how long (median) it takes to search for the target.
df1['corrected_rt'] =  df1['processing_search'].fillna(0) + df1['processing_capture'].fillna(0)
df1 = df1.replace({'corrected_rt': {0: np.nan}})

Rename and then recode temporal columns to be in both ms and s formats

In [14]:
# default should be ms, so no need to append since we will never need μs...
df1_timecols = ['onset', 'time', 'ttime', 'uncertainty', 'duration',
                'uncertainty.1', 'reqtime', 'reqdur', 'response_time', 
                'response_ttime', 'processing_capture', 'median_capture_rt',
                'processing_search', 'median_search_rt', 'median_rts', 
                'search_duration_estimate', 'corrected_rt']
df2_timecols = df1_timecols[:10]
df1_to_seconds = ['onset', 'time', 'response_ttime', 'corrected_rt']
df2_to_seconds = df1_to_seconds[:-1]

# Apply the changes
df1[df1_timecols] = df1[df1_timecols].apply(lambda x: x.astype(float) / 10)
df2[df2_timecols] = df2[df2_timecols].apply(lambda x: x.astype(float) / 10)

for c in df1_to_seconds:
    df1[f'{c}_s'] = df1[c].astype(float) / 1000
for c in df2_to_seconds:
    df2[f'{c}_s'] = df2[c].astype(float) / 1000


Collect the subsequent memory data from the retrieval dataframe and attach it to encoding dataframe

In [15]:
enc_stim = df1[['subject', 'block_name', 'trial', 'attention_mode', 
            'target_name', 'dist_1_name', 'dist_2_name', 
            'dist_3_name', 'response_ttime', 'corrected_rt']]
enc_stim = pd.melt(frame=enc_stim,
                   id_vars=['subject', 'block_name', 'attention_mode', 'trial', 'response_ttime', 'corrected_rt'],
                   value_vars=['target_name', 'dist_1_name', 'dist_2_name', 'dist_3_name'],
                   var_name='image_type',
                   value_name='image_name')
# merge retrieval to encoding
ret_stim = df2[['subject', 'block', 'item_type', 'image_name', 'response_class', 'hit', 'miss', 'response_ttime']]

enc_stim = enc_stim.rename(columns={'response_ttime':'encoding_rt'})
ret_stim = ret_stim.rename(columns={'response_ttime':'retrieval_rt'})

smdf = enc_stim.merge(right=ret_stim, on=['subject', 'image_name'])

all_subject_encoding_data = []
# loop over each subject
for s in df1['subject'].unique():
    # create subject views to update - this has to be on a subject-by-subject basis
    # because subjects see the same stimuli, so it won't work otherwise.
    sdf1 = df1.loc[df1['subject']==s].copy()
    sdf2 = df2.loc[df2['subject']==s].copy()
    # create subsequently dropped columns and populate it with the dummy.
    sdf1.loc[~sdf1['target_name'].isin(sdf2['image_name']), 'subsequently_dropped'] = 1
    sdf1.loc[sdf1['target_name'].isin(sdf2['image_name']), 'subsequently_dropped'] = 0
    # Now, in the encoding df, if that encoding target is present in the 
    # retrieval df - AND if it was subsequently a hit, put subsequently remembered.
    sdf1.loc[sdf1['target_name'].isin(sdf2[(sdf2['item_type']=='target') &
             (sdf2['hit']==1)]['image_name']), 'subsequently_remembered'] = 1
    sdf1.loc[sdf1['target_name'].isin(sdf2[(sdf2['item_type']=='target') &
             (sdf2['miss']==1)]['image_name']), 'subsequently_remembered'] = 0 # didn't remember
    
    sdf1.loc[sdf1['target_name'].isin(sdf2[(sdf2['item_type']=='target') &
             (sdf2['hit']==1)]['image_name']), 'subsequently_forgot'] = 0 # didn't forget
    sdf1.loc[sdf1['target_name'].isin(sdf2[(sdf2['item_type']=='target') &
             (sdf2['miss']==1)]['image_name']), 'subsequently_forgot'] = 1
    
    all_subject_encoding_data.append(sdf1)

df1 = pd.concat(all_subject_encoding_data)

# Create final column describing the subsequent status
df1.loc[df1['subsequently_dropped']==1, 'subsequent_status'] = 'dropped'
df1.loc[df1['subsequently_remembered']==1, 'subsequent_status'] = 'remembered'
df1.loc[df1['subsequently_forgot']==1, 'subsequent_status'] = 'forgotten'

Do some final clearing up of column order and names

In [16]:
# to_remove = ['event_type', 'code_x', 'image_id_num', 'uncertainty', 'uncertainty.1', 'reqtime', 'reqdur', 'pair_index','code_y']

# print(df1.columns[~df1.columns.isin(to_remove)]) # Print out cols for copy & paste for reordering
keep_cols = ['write_time','subject', 'version', 'phase', 'block', 'block_name', 'trial', 'condition_num', 
             'screen_position_num', 'attention_mode', 'target_category',
             'target_name', 'dist_1_name', 'dist_2_name', 'dist_3_name', 
             'subsequently_dropped', 'subsequently_remembered', 'subsequently_forgot', 'subsequent_status',
             'time', 'onset', 'ttime', 'duration', 'stim_type','response_time', 'response_ttime',
             'processing_capture', 'median_capture_rt', 'processing_search', 'median_search_rt',
             'median_rts', 'search_duration_estimate', 'corrected_rt', 'time_s', 'onset_s',
             'response_ttime_s', 'corrected_rt_s']
df1 = df1[keep_cols]
# Rename a couple cols
df1 = df1.rename(columns={'stim_type': 'response_class',
                          'screen_position_num': 'target_screen_pos'})

# Do the same stuff for the retrieval df
keep_cols = ['write_time', 'subject', 'version', 'phase', 'block', 'trial', 'condition_num',
             'item_category', 'attention_mode', 'item_type', 'time', 'time_s', 'onset', 'onset_s', 'ttime', 'duration',
             'response', 'judgement', 'confidence', 'response_class',
             'hit', 'miss', 'correct_reject', 'false_alarm', 
             'response_time', 'response_ttime', 'response_ttime_s']
df2 = df2[keep_cols]
# df2 = df2.rename()

end_time = datetime.now()
print(f"Finished preprocessing.\n\tDuration: {end_time - start_time}")


Finished preprocessing.
	Duration: 0:00:05.205409


In [17]:
df1.head()

Unnamed: 0,write_time,subject,version,phase,block,block_name,trial,condition_num,target_screen_pos,attention_mode,...,median_capture_rt,processing_search,median_search_rt,median_rts,search_duration_estimate,corrected_rt,time_s,onset_s,response_ttime_s,corrected_rt_s
0,2017-07-17 12:06:58,4.0,1,encoding,1,animals,1.0,1,3.0,search,...,1938.3,2444.7,2962.1,2962.1,1023.8,2444.7,56.7695,1.592,3.4685,2.4447
1,2017-07-17 12:06:58,4.0,1,encoding,1,animals,2.0,4,3.0,capture,...,1938.3,,2962.1,1938.3,1023.8,2475.5,63.1346,7.6572,2.7754,2.4755
2,2017-07-17 12:06:58,4.0,1,encoding,1,animals,3.0,2,2.0,search,...,1938.3,,2962.1,2962.1,1023.8,,69.8996,14.7221,,
3,2017-07-17 12:06:58,4.0,1,encoding,1,animals,4.0,4,3.0,capture,...,1938.3,,2962.1,1938.3,1023.8,2672.4,76.2647,20.7873,2.9723,2.6724
4,2017-07-17 12:06:58,4.0,1,encoding,1,animals,5.0,2,1.0,search,...,1938.3,1938.3,2962.1,2962.1,1023.8,1938.3,82.5299,27.3524,2.9621,1.9383


In [18]:
df2.head()

Unnamed: 0,write_time,subject,version,phase,block,trial,condition_num,item_category,attention_mode,item_type,...,judgement,confidence,response_class,hit,miss,correct_reject,false_alarm,response_time,response_ttime,response_ttime_s
0,2017-07-17 12:42:21,4.0,1,retrieval,1,1.0,3,old,capture,target,...,,,,0,0,0,0,,,
1,2017-07-17 12:42:21,4.0,1,retrieval,1,2.0,1,old,search,target,...,old,high,hit,1,0,0,0,9981.9,3396.0,3.396
2,2017-07-17 12:42:21,4.0,1,retrieval,1,3.0,1,old,search,target,...,old,mid,hit,1,0,0,0,16287.9,3136.9,3.1369
3,2017-07-17 12:42:21,4.0,1,retrieval,1,4.0,4,old,capture,distractor,...,new,high,miss,0,1,0,0,24139.9,2924.3,2.9243
4,2017-07-17 12:42:21,4.0,1,retrieval,1,5.0,1,old,search,target,...,old,mid,hit,1,0,0,0,31121.9,3341.2,3.3412
