In [1]:
import pandas as pd
import glob
from ast import literal_eval
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import numpy as np

In [18]:
rest_condition = "norest"
folder_name = "experiment2_norest"
data_path = "../data/" + folder_name + "/full_dataset/*.csv"
exact_ages = pd.read_csv("../data/" + folder_name + "/exact_ages.csv")
csv_files = glob.glob(data_path)

In [19]:
def is_optimal_first(x):
    try:
        rewardMap = literal_eval(x.rewardMap)
        if x.isCatch or x.task_part != 'value training': return None
        return x.response0 == max(rewardMap, key=rewardMap.get).split(',')[0]
    except:
        return None
    
def optimal_score(x):
    try:
        rewardMap = literal_eval(x.rewardMap)
        if x.isCatch or x.task_part != 'value training': return None
        return max(rewardMap.values())
    except:
        return None

def is_optimal_second(x):
    try:
        rewardMap = literal_eval(x.rewardMap)
        subset = {k: v for k, v in rewardMap.items() if x.response0 in k}
        if x.isCatch or x.task_part != 'value training': return None
        return x.response1 == max(subset, key=subset.get).split(',')[1]
    except:
        return None
    
def reformat_rt(x):
    try:
        if type(literal_eval(str(x)))==int:
            return [literal_eval(str(x))]
        else:
            return literal_eval(str(x)) 
    except:
        return [None,None]   
    
def reformat_response(x):
    try:
        if '[' not in x:
            return [x]
        else:
            return literal_eval(str(x))
    except:
        return [None,None]  

In [20]:
filtered_data = []
excluded = []

for f in csv_files:
    df = pd.read_csv(f)
    subject = df.loc[0]["subject_id"]
    condition = df.loc[0]["assigned_condition"]
    if 'bonus' in df:
        # calculate exclusion criteria for each subject
        interactions = df['browser_interaction'][df['browser_interaction'].notnull()].iloc[-1].count("event")
        comprehension_retries = df.loc[(df['task_part'] == 'comp_question') & (df['correct'] == False)].shape[0]
        dots_missed = 0
        if rest_condition == "rest":
            dots_missed = df['numberMissed'].sum()
        catch_correct = df[(df['isCatch'] == True) & (df['task_part']!="catch practice")]["correct"].sum()
        task_timeouts = df[(df['trial_type'] == "revaluation") & (df['environment'] != 'space')]['timeout'].mean()
        memory_timeouts = df.loc[(df['trial_type'] == "memory-trial")]['timeout'].mean()
        test_timeouts = df[(df['task_part'] == "first stage test")]['timeout'].mean()
        block1_lasthalf_accuracy = df[(df['order'] == 1) & (df['task_part'] == 'value training') & (df['isCatch']==False)].tail(21)['correct'].mean()
        block2_lasthalf_accuracy = df[(df['order'] == 2) & (df['task_part'] == 'value training') & (df['isCatch']==False)].tail(21)['correct'].mean()
       
        if interactions > 20 or task_timeouts > 0.15 or memory_timeouts>0.15 or comprehension_retries > 4 or catch_correct<11 or block1_lasthalf_accuracy<0.75 or block2_lasthalf_accuracy<0.75 or (rest_condition == "rest" and dots_missed > 4):
            excluded.append({'subject':subject,'catch_correct':catch_correct, 'block1_second_half_accuracy':block1_lasthalf_accuracy, 'block2_second_half_accuracy':block2_lasthalf_accuracy, 'task_timeouts':task_timeouts, 'memory_timeouts':memory_timeouts, 'comprehension_retries':comprehension_retries, 'interactions':interactions, 'dots_missed': dots_missed})
        else:
            # keep only relevant columns
            filtered = df[['subject_id','task_part', 'rt', 'timeout', 'stimulus', 'response','environment','score', 'isCatch', 'correct', 'condition', 'order', 'rewardMap','trial_num', 'ground_truth', 'age', 'gender']]
            filtered = filtered[filtered['task_part'].isin(['value training', 'revaluation','first stage test','second stage test', 'memory'])]
            
            # add task condition and age info
            filtered['subject_condition'] = condition
            filtered['rest'] = rest_condition
            filtered['age'] = float(exact_ages[exact_ages['subject_id'] == int(subject)].iloc[0]['Age'])

            # reformatting trial_num, RT, and response data
            filtered['trial_num'] = filtered['trial_num'].astype('Int64') // 2
            rt_list = filtered['rt'].apply(lambda x: reformat_rt(x)).values.tolist()
            rt_df = pd.DataFrame(rt_list).add_prefix('rt')  
            filtered = filtered.reset_index().join(rt_df).drop('rt', axis=1)        
            response_list = filtered['response'].apply(lambda x: reformat_response(x)).values.tolist()
            response_df = pd.DataFrame(response_list).add_prefix('response')  
            filtered = filtered.reset_index().join(response_df).drop('response', axis=1)
            
            # compute is_optimal_first, is_optimal_second, and optimal_score for each trial
            first = filtered.apply(lambda x: is_optimal_first(x), axis=1)
            second = filtered.apply(lambda x: is_optimal_second(x), axis=1)
            filtered = filtered.merge(first.rename("is_optimal_first"),left_index=True, right_index=True)
            filtered = filtered.merge(second.rename("is_optimal_second"),left_index=True, right_index=True)
            filtered['optimal_score'] = filtered.apply(optimal_score, axis=1)
            
            filtered_data.append(filtered)
    else:
        pass

filtered_data = pd.concat(filtered_data, ignore_index=True)
filtered_data['categorical_age'] = pd.cut(filtered_data['age'], bins=[7,13,18,24], labels=['Children','Adolescents','Adults'])
# rename columns "order" to "block_order" and "condition" to "block_condition"
filtered_data = filtered_data.rename(columns={'order':'block_order', 'condition':'block_condition'})

excluded = pd.DataFrame(excluded)
excluded.to_csv('../data/' + folder_name + '/preprocessed/excluded.csv')

In [21]:
# To assess optimal choices during learning
learning_data = filtered_data[(filtered_data['task_part']=='value training')&(filtered_data['isCatch']==False)].copy() # keep only non-catch learning trials
learning_data['trial_num'] = learning_data.groupby(['subject_id', 'block_condition']).cumcount() + 1                  # reset trial number after removing catch 
learning_data['trial_bin'] = learning_data.groupby(['subject_id','block_order']).cumcount() // 5 * 5                  # get binned trial num for graphing smooth learning curve
learning_data = learning_data[learning_data['timeout']==False]                                                  # keep only trials where participants did not time out
learning_data['is_optimal'] = learning_data['score'] == learning_data['optimal_score']                          # binary optimal score metric for performance
learning_data.to_csv('../data/' + folder_name + '/preprocessed/learning_data1.csv')

# To assess age differences in first-stage vs second-stage choices
learning_data_2 = learning_data.copy().melt(
    id_vars=['subject_id','age','trial_num','block_condition','block_order'],
    value_vars=['is_optimal_first', 'is_optimal_second'],
    var_name='stage',
    value_name='optimal'
)
learning_data_2['stage'] = learning_data_2['stage'].replace({'is_optimal_first': 'First', 'is_optimal_second': 'Second'})
learning_data_2.to_csv('../data/' + folder_name + '/preprocessed/learning_data2.csv')

# To assess optimal choices during relearning
relearning_data = filtered_data[filtered_data['task_part']=='revaluation'].copy()
relearning_data['trial'] = relearning_data.groupby(['subject_id', 'stimulus'])['stimulus'].cumcount()   # Calculate trial number per second stage state
relearning_data.to_csv('../data/' + folder_name + '/preprocessed/relearning_data.csv')

In [22]:
# get metadata about each subject block (order, stimuli)
block_conditions = filtered_data.groupby(['subject_id','block_condition'],as_index=False).first()[['subject_id','block_condition','environment','block_order']]
block_conditions['block_condition'] = block_conditions['block_condition'].apply(lambda x: "Control" if x=='original' else "Revaluation")

# get age info for each subject
demographics = filtered_data.groupby('subject_id').first()[['age','gender','categorical_age']]

learning_first_stage_accuracy = learning_data.groupby(['subject_id','block_condition']).tail(10).groupby(['subject_id','block_condition'])['is_optimal_first'].mean().reset_index()
last_10_accuracy = learning_data.groupby(['subject_id','block_condition']).tail(10).groupby(['subject_id','block_condition'])['is_optimal'].mean().reset_index()

In [23]:
memory_data = filtered_data[filtered_data['task_part']=='memory'].copy()
memory_data['environment'] = memory_data['stimulus'].str.split('/').str[3]
memory_data = memory_data[memory_data['environment'] != 'space']
memory_data = memory_data.drop(['block_condition', 'block_order'], axis=1)
memory_data = memory_data.merge(block_conditions,on=['subject_id','environment'])
memory_data.to_csv('../data/' + folder_name + '/preprocessed/memory_data.csv')


In [24]:
def get_reval_magnitude(filtering, name):
    before = filtered_data[(filtered_data['task_part'] == 'value training') & filtering & (filtered_data['isCatch']==False)].groupby('subject_id').tail(10).copy()
    after = filtered_data[(filtered_data['task_part'] == 'first stage test') & filtering].copy()
    reward_map_before = before.groupby('subject_id').first()['rewardMap'].apply(lambda x: literal_eval(x))
    correct_response = reward_map_before.apply(lambda x: max(x, key=x.get).split(",")[0])
    true_accuracy_before = before.groupby('subject_id')['correct'].mean()
    before['correct'] = before.apply(lambda x: x.response0 == correct_response[x.subject_id], axis=1)
    after['correct'] = after.apply(lambda x: x.response0 == correct_response[x.subject_id], axis=1)
    correct_before = before.groupby('subject_id')['correct'].mean()
    match_after = after.groupby('subject_id')['correct'].mean()
    reval_magnitude = correct_before - match_after
    single_reval = after.groupby('subject_id').head(1).apply(lambda x: x.response0 != correct_response[x.subject_id], axis=1)
    single_reval.index = after.groupby('subject_id').head(1)['subject_id']
    return reval_magnitude.rename(name), true_accuracy_before.rename(name+'_correct_before'), single_reval.rename(name+'_single')

reval_reval_magnitude, reval_correct_before, reval_single = get_reval_magnitude(filtered_data['block_condition']=='revaluation', 'Revaluation')
control_reval_magnitude, control_correct_before, control_single = get_reval_magnitude(filtered_data['block_condition']=='original', 'Control')
first_reval_magnitude, _, _ = get_reval_magnitude(filtered_data['block_order']==1, 'first')
second_reval_magnitude, _, _ = get_reval_magnitude(filtered_data['block_order']==2, 'second')
reval_results = pd.concat([reval_reval_magnitude, control_reval_magnitude, first_reval_magnitude, second_reval_magnitude, reval_correct_before, control_correct_before, reval_single, control_single], axis=1)
reval_results = pd.merge(reval_results,demographics, on="subject_id")

In [25]:
reval_results['block_condition'] = 'Revaluation'
participant_data_processed = reval_results[['age', 'block_condition','categorical_age']]
participant_data_processed['reval_score'] = reval_results['Revaluation']
participant_data_processed['reval_single'] = reval_results['Revaluation_single']
participant_data_processed['last_10_accuracy'] = reval_results['Revaluation_correct_before']
participant_data_processed = participant_data_processed.reset_index()
control = reval_results.reset_index()
control['block_condition'] = 'Control'
control['reval_score'] = control['Control']
control['reval_single'] = control['Control_single']
control['last_10_accuracy'] = control['Control_correct_before']
participant_data_processed = pd.concat([participant_data_processed,control[['subject_id', 'age', 'block_condition', 'reval_score','last_10_accuracy','categorical_age','reval_single']]])
participant_data_processed = participant_data_processed.merge(block_conditions,on=['subject_id','block_condition'])
rest_name = "No Rest"
if rest_condition == "rest":
    rest_name = "Rest"
participant_data_processed['rest'] = rest_name
participant_data_processed['second_stage_accuracy'] = filtered_data.where(filtered_data['task_part'] == 'second stage test').groupby('subject_id')['correct'].mean()
participant_data_processed['revaluation_accuracy'] = filtered_data.where(filtered_data['task_part'] == 'revaluation').groupby('subject_id')['correct'].mean()
participant_data_processed = participant_data_processed.set_index('subject_id')
participant_data_processed['second_stage_accuracy'] = filtered_data.where(filtered_data['task_part'] == 'second stage test').groupby('subject_id')['correct'].mean()
participant_data_processed['revaluation_accuracy'] = filtered_data.where(filtered_data['task_part'] == 'revaluation').groupby('subject_id')['correct'].mean()

second_stage_test = filtered_data[filtered_data['task_part'] == 'second stage test'].groupby(["subject_id","block_condition"])['correct'].mean().reset_index()
second_stage_test['block_condition'] = second_stage_test['block_condition'].apply(lambda x: "Control" if x=='original' else "Revaluation")
pd.merge(participant_data_processed,second_stage_test,on=['subject_id','block_condition'])

participant_data_processed.to_csv('../data/' + folder_name + '/preprocessed/data.csv')
participant_data_processed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  participant_data_processed['reval_score'] = reval_results['Revaluation']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  participant_data_processed['reval_single'] = reval_results['Revaluation_single']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  participant_data_processed['last_10_accuracy'] = re

Unnamed: 0_level_0,age,block_condition,categorical_age,reval_score,reval_single,last_10_accuracy,environment,block_order,rest,second_stage_accuracy,revaluation_accuracy
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
8171,10.58,Revaluation,Children,0.50,False,1.0,canyon,1.0,No Rest,0.9375,0.888889
8175,11.64,Revaluation,Children,0.25,False,1.0,ocean,1.0,No Rest,0.4375,0.694444
8191,15.44,Revaluation,Adolescents,-0.10,False,0.9,canyon,1.0,No Rest,0.7500,0.944444
8200,19.77,Revaluation,Adults,0.50,False,1.0,ocean,1.0,No Rest,0.4375,0.805556
8201,19.34,Revaluation,Adults,0.00,False,1.0,ocean,1.0,No Rest,1.0000,0.888889
...,...,...,...,...,...,...,...,...,...,...,...
13778,18.21,Control,Adults,0.00,False,1.0,ocean,1.0,No Rest,0.9375,0.888889
13782,18.60,Control,Adults,0.00,False,1.0,canyon,2.0,No Rest,1.0000,0.916667
14425,18.51,Control,Adults,0.25,False,1.0,ocean,1.0,No Rest,1.0000,0.888889
14427,22.42,Control,Adults,0.00,False,1.0,canyon,1.0,No Rest,1.0000,0.944444
