In [10]:
import pandas as pd
import glob
from ast import literal_eval

In [11]:
# Dataset selector: set to "experiment1_rest" or "experiment2_norest".
folder_name = "experiment2_norest"

# The rest condition is derived from the folder name: any name containing
# "norest" is the no-rest experiment, everything else is treated as rest.
rest_condition = "norest" if "norest" in folder_name else "rest"

# One CSV per participant lives under full_dataset/.
data_path = f"../data/{folder_name}/full_dataset/*.csv"

# Lookup table of participant ages (read below via its 'subject_id' and 'Age' columns).
exact_ages = pd.read_csv(f"../data/{folder_name}/exact_ages.csv")

# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# the participant order -- and therefore the row order of every exported
# file -- is the same on every run and every machine.
csv_files = sorted(glob.glob(data_path))

In [12]:
def reformat_response(x):
    """
    Normalise one raw `response` value into a list.

    - A value that does not contain '[' is wrapped as a single-element list: [x].
    - A value that contains '[' is parsed with `literal_eval` (e.g. the string
      "['a', 'b']" becomes ['a', 'b']).
    - Anything that cannot be handled (for example a NaN float, which does not
      support the `in` test, or a malformed literal) yields [None, None].

    Only ordinary errors are converted to the [None, None] fallback;
    KeyboardInterrupt / SystemExit are allowed to propagate so a long
    preprocessing run can still be interrupted.
    """
    try:
        if '[' not in x:
            return [x]
        return literal_eval(str(x))
    except Exception:
        # Best-effort parsing: unparseable responses become an empty pair.
        return [None, None]

def is_optimal_stage(x, stage):
    """
    Report whether the choice made at the given stage matches the highest-reward entry of the trial's reward map.

    x: a row exposing `rewardMap` (a string holding a dict literal whose keys are
       comma-separated strings), `isCatch`, `task_part`, and the per-stage
       choices `response0` / `response1` as attributes.
    stage: 0 for first stage, 1 for second stage. Selects both the
       `response<stage>` attribute and the matching component of the best key.

    Returns True/False for non-catch 'value training' rows, and None for catch
    rows, rows from any other task part, or rows that cannot be evaluated
    (unparseable reward map, missing response attribute, key with too few parts).

    NOTE(review): the second-stage target is the second component of the single
    best key overall, not the best option conditional on the first-stage choice
    actually made -- confirm this is the intended definition.
    """
    try:
        rewardMap = literal_eval(x.rewardMap)
        if x.isCatch or x.task_part != 'value training':
            return None
        # Key with the largest reward, split into its per-stage components.
        best_key = max(rewardMap, key=rewardMap.get)
        # strip() tolerates optional whitespace after the comma separator.
        best_choice = best_key.split(',')[stage].strip()
        return getattr(x, f"response{stage}") == best_choice
    except Exception:
        # Best-effort: rows that cannot be scored are left unscored.
        return None

# Exclude participants and join data

In [13]:
# Exclusion thresholds; a participant is dropped if any single criterion is violated.
MAX_INTERACTIONS = 20            # most "event" occurrences allowed in the last non-null browser_interaction entry
MAX_TIMEOUT_RATE = 0.15          # ceiling on the mean timeout rate (task trials and memory trials, checked separately)
MAX_COMPREHENSION_RETRIES = 4    # most incorrect comprehension-question rows allowed
MIN_CATCH_CORRECT = 11           # fewest correct catch trials (outside catch practice) required
MIN_LATE_ACCURACY = 0.75         # floor on mean accuracy over the final learning trials of each block
LATE_TRIAL_WINDOW = 21           # number of final non-catch value-training trials scored per block
MAX_DOTS_MISSED = 4              # rest condition only: most missed dots allowed

filtered_data = []   # one cleaned dataframe per retained participant
excluded = []        # one summary dict per excluded participant
for f in csv_files:
    df = pd.read_csv(f)
    subject = df.loc[0]["subject_id"]
    condition = df.loc[0]["assigned_condition"]

    # calculate exclusion criteria for each subject
    # count of "event" substrings in the last non-null browser_interaction entry
    interactions = df['browser_interaction'][df['browser_interaction'].notnull()].iloc[-1].count("event")
    comprehension_retries = df.loc[(df['task_part'] == 'comp_question') & (df['correct'] == False)].shape[0]
    dots_missed = 0
    if rest_condition == "rest":
        dots_missed = df['numberMissed'].sum()
    catch_correct = df[(df['isCatch'] == True) & (df['task_part'] != "catch practice")]["correct"].sum()
    task_timeouts = df[(df['trial_type'] == "revaluation") & (df['environment'] != 'space')]['timeout'].mean()
    memory_timeouts = df.loc[(df['trial_type'] == "memory-trial")]['timeout'].mean()
    # NOTE(review): test_timeouts is computed but is not part of the exclusion rule below.
    test_timeouts = df[(df['task_part'] == "first stage test")]['timeout'].mean()
    block1_lasthalf_accuracy = df[(df['order'] == 1) & (df['task_part'] == 'value training') & (df['isCatch'] == False)].tail(LATE_TRIAL_WINDOW)['correct'].mean()
    block2_lasthalf_accuracy = df[(df['order'] == 2) & (df['task_part'] == 'value training') & (df['isCatch'] == False)].tail(LATE_TRIAL_WINDOW)['correct'].mean()

    # exclude subjects based on criteria
    # NOTE(review): comparisons against NaN are False, so a metric with no
    # matching trials (mean of an empty selection) never triggers exclusion.
    if (
        interactions > MAX_INTERACTIONS
        or task_timeouts > MAX_TIMEOUT_RATE
        or memory_timeouts > MAX_TIMEOUT_RATE
        or comprehension_retries > MAX_COMPREHENSION_RETRIES
        or catch_correct < MIN_CATCH_CORRECT
        or block1_lasthalf_accuracy < MIN_LATE_ACCURACY
        or block2_lasthalf_accuracy < MIN_LATE_ACCURACY
        or (rest_condition == "rest" and dots_missed > MAX_DOTS_MISSED)
    ):
        excluded.append({'subject':subject,'catch_correct':catch_correct, 'block1_second_half_accuracy':block1_lasthalf_accuracy, 'block2_second_half_accuracy':block2_lasthalf_accuracy, 'task_timeouts':task_timeouts, 'memory_timeouts':memory_timeouts, 'comprehension_retries':comprehension_retries, 'interactions':interactions, 'dots_missed': dots_missed})
    else:
        # keep only relevant columns
        filtered = df[['subject_id','task_part', 'rt', 'timeout', 'stimulus', 'response','environment','score', 'isCatch', 'correct', 'condition', 'order', 'rewardMap','trial_num', 'ground_truth', 'age']]
        # keep only the analysed task parts; take an explicit copy so the column
        # assignments below write to an independent frame rather than to a
        # slice of df (avoids pandas' SettingWithCopyWarning / chained assignment)
        filtered = filtered[filtered['task_part'].isin(['value training', 'revaluation','first stage test','second stage test', 'memory'])].copy()

        # add task condition and age info
        filtered['subject_condition'] = condition
        filtered['rest'] = rest_condition
        # overwrite the recorded age with the value from the exact_ages lookup table
        filtered['age'] = float(exact_ages[exact_ages['subject_id'] == int(subject)].iloc[0]['Age'])

        # rename values for block_condition to Revaluation and Control
        filtered['condition'] = filtered['condition'].replace({'original': 'Control', 'revaluation': 'Revaluation'})

        # reformatting trial_num and response data
        filtered['trial_num'] = filtered['trial_num'].astype('Int64') // 2
        # expand each parsed response list into positional columns response0, response1, ...
        response_list = filtered['response'].apply(reformat_response).values.tolist()
        response_df = pd.DataFrame(response_list).add_prefix('response')
        # reset_index gives filtered a 0..n-1 index so it lines up row-for-row with response_df
        filtered = filtered.reset_index().join(response_df).drop('response', axis=1)

        # compute is_optimal_first, is_optimal_second for each trial
        first = filtered.apply(lambda x: is_optimal_stage(x,0), axis=1)
        second = filtered.apply(lambda x: is_optimal_stage(x,1), axis=1)
        filtered = filtered.merge(first.rename("is_optimal_first"),left_index=True, right_index=True)
        filtered = filtered.merge(second.rename("is_optimal_second"),left_index=True, right_index=True)

        # add participants data to filtered_data
        filtered_data.append(filtered)

# Stack every retained participant into one long frame, derive the categorical
# age group from the numeric age, and give the block-level columns
# ("order", "condition") their explicit "block_" names.
filtered_data = (
    pd.concat(filtered_data, ignore_index=True)
    .assign(
        categorical_age=lambda frame: pd.cut(
            frame['age'],
            bins=[7,13,18,24],
            labels=['Children','Adolescents','Adults'],
        )
    )
    .rename(columns={'order':'block_order', 'condition':'block_condition'})
)

# Turn the per-participant exclusion records into a table and write it out.
excluded = pd.DataFrame(excluded)
excluded.to_csv(f'../data/{folder_name}/preprocessed/excluded.csv')

# Clean data to assess optimal choices during learning

In [14]:
# Learning trials are the value-training rows that are not catch trials.
learning_data = filtered_data.loc[
    filtered_data['task_part'].eq('value training') & filtered_data['isCatch'].eq(False)
].copy()

# With catch trials gone, renumber trials from 1 within each subject and block condition.
learning_data['trial_num'] = (
    learning_data.groupby(['subject_id', 'block_condition']).cumcount().add(1)
)

# Bin trials in groups of five (0, 5, 10, ...) within each subject and block
# order, for plotting smooth learning curves.
learning_data['trial_bin'] = (
    learning_data.groupby(['subject_id','block_order']).cumcount().floordiv(5).mul(5)
)

# Drop timed-out trials only after numbering, so they still occupy a trial slot.
learning_data = learning_data.loc[learning_data['timeout'].eq(False)]

# Export the cleaned learning trials.
learning_data.to_csv(f'../data/{folder_name}/preprocessed/learning_data.csv')

# To assess age differences in first-stage vs second-stage choices

In [15]:
# Reshape to long format: one row per trial and stage, with the stage's
# optimality flag in a single 'optimal' column.
learning_data_by_stage = pd.melt(
    learning_data,
    id_vars=['subject_id','age','trial_num','block_condition','block_order'],
    value_vars=['is_optimal_first', 'is_optimal_second'],
    var_name='stage',
    value_name='optimal',
)

# Replace the source column names with readable stage labels.
stage_labels = {'is_optimal_first': 'First', 'is_optimal_second': 'Second'}
learning_data_by_stage['stage'] = learning_data_by_stage['stage'].replace(stage_labels)

learning_data_by_stage.to_csv(f'../data/{folder_name}/preprocessed/learning_data_by_stage.csv')

# To assess optimal choices during relearning


In [16]:
# Relearning trials are the rows from the revaluation task part.
relearning_data = filtered_data.loc[filtered_data['task_part'].eq('revaluation')].copy()

# Zero-based running count of presentations of each stimulus, per subject.
relearning_data['trial'] = (
    relearning_data.groupby(['subject_id', 'stimulus'])['stimulus'].cumcount()
)

# Timed-out trials are removed after counting, so they still advance the counter.
relearning_data = relearning_data.loc[relearning_data['timeout'].eq(False)]

relearning_data.to_csv(f'../data/{folder_name}/preprocessed/relearning_data.csv')

# To assess choice updating at test

In [17]:
def get_reval_magnitude(condition):
    """
    Compute, per subject, the revaluation magnitude for one block condition.

    The score is (mean `is_optimal_first` over the subject's last 10 rows of
    `learning_data` in that condition) minus (mean `correct` over the subject's
    non-timeout 'first stage test' rows of `filtered_data` in that condition).
    For the 'Revaluation' condition the test mean is flipped (1 - mean) first.

    Reads the notebook-level frames `learning_data` and `filtered_data`.
    Returns a Series indexed by subject_id and named after `condition`.
    """
    # Learning side: last 10 learning rows per subject (learning_data already
    # has catch trials and timeouts removed).
    condition_learning = learning_data[learning_data['block_condition']==condition]
    final_trials = condition_learning.groupby('subject_id').tail(10)
    correct_before = final_trials.groupby('subject_id')['is_optimal_first'].mean()

    # Test side: first-stage test rows for this condition, timeouts excluded.
    is_test_row = (
        (filtered_data['task_part'] == 'first stage test')
        & (filtered_data['block_condition'] == condition)
        & (filtered_data['timeout'] == False)
    )
    match_after = filtered_data[is_test_row].groupby('subject_id')['correct'].mean()

    # In the revaluation condition the 'correct' test choice is not the choice
    # that was correct during learning, so convert to "matches learning".
    if condition == 'Revaluation':
        match_after = 1 - match_after

    return (correct_before - match_after).rename(condition)

# Revaluation magnitude per subject, computed separately for each block
# condition, then reshaped to one row per subject and condition.
reval_reval_magnitude = get_reval_magnitude('Revaluation')
control_reval_magnitude = get_reval_magnitude('Control')
reval_results = (
    pd.concat([reval_reval_magnitude, control_reval_magnitude], axis=1)
    .reset_index()
    .melt(
        id_vars=['subject_id'],
        value_vars=['Revaluation', 'Control'],
        var_name='block_condition',
        value_name='reval_score',
    )
)

# Attach age, block order and environment: one metadata row per subject and
# block condition (the first one encountered), left-joined onto the scores.
metadata = (
    filtered_data[['age','categorical_age','block_order','environment','subject_id','block_condition']]
    .drop_duplicates(subset=['subject_id', 'block_condition'])
)
reval_results = reval_results.merge(metadata, on=['subject_id', 'block_condition'], how='left')
reval_results['rest'] = "No Rest" if rest_condition == "norest" else "Rest"

# Mean accuracy on non-timeout second-stage test trials, per subject and condition.
is_second_stage_test = (filtered_data['task_part'] == 'second stage test') & (filtered_data['timeout'] == False)
second_stage_test = (
    filtered_data[is_second_stage_test]
    .groupby(["subject_id","block_condition"])['correct']
    .mean()
    .reset_index()
    .rename(columns={'correct': 'second_stage_test'})
)
# Default (inner) merge: only subject/condition pairs present on both sides are kept.
reval_results = reval_results.merge(second_stage_test, on=['subject_id','block_condition'])

# Mean accuracy over the last 10 learning trials, per subject and condition.
last_10_accuracy = (
    learning_data.groupby(['subject_id','block_condition'])
    .tail(10)
    .groupby(['subject_id','block_condition'])['correct']
    .mean()
    .reset_index()
    .rename(columns={'correct': 'last_10_accuracy'})
)
reval_results = reval_results.merge(last_10_accuracy, on=['subject_id','block_condition'])

reval_results.to_csv(f'../data/{folder_name}/preprocessed/data.csv')

# To assess memory for first-stage images

In [18]:
# Memory trials that did not time out.
is_memory_trial = (filtered_data['task_part']=='memory') & (filtered_data['timeout'] == False)
memory_data = filtered_data[is_memory_trial].copy()

# Take the environment from the stimulus path (fourth '/'-separated component)
# and discard the 'space' environment.
memory_data['environment'] = memory_data['stimulus'].str.split('/').str.get(3)
memory_data = memory_data[memory_data['environment'].ne('space')]

# Replace the block columns carried on the memory rows with the block info
# recorded for the same subject and environment elsewhere in the dataset.
memory_data = memory_data.drop(columns=['block_condition', 'block_order'])
metadata = filtered_data[['block_order','environment','subject_id','block_condition']].drop_duplicates()
memory_data = memory_data.merge(metadata, on=['subject_id', 'environment'], how='left')

memory_data.to_csv(f'../data/{folder_name}/preprocessed/memory_data.csv')