In [4]:
import pandas as pd
import glob
import json
import ast

data_path = "../data/sailing/full_dataset/*.csv"
exact_ages = pd.read_csv("../data/sailing/exact_ages.csv")
csv_files = glob.glob(data_path)

In [10]:
# Parse data from CSVs into joined dataframe
data_df = []
data_quality_df = pd.DataFrame(columns = ["browser_interactions","missed_trials", "avg_rt","comp_retries","categorical_age", "excluded"])
comprehension_df = []

for f in csv_files:
    df = pd.read_csv(f)
    completed = 'survey-debrief' in df['trial_type'].unique()
    if completed:
        metadata = json.loads(df.iloc[1]['metadata'])
        df['sub'] = metadata['subject_id'] 
        df['age'] = float(exact_ages[exact_ages['Subject ID'] == int(metadata['subject_id'])].iloc[0]['Age'])
        # assign categorical age if age <=12 'kids, 13-18 'teens', 19-64 'adults', 65+ 'seniors'
        df['categorical_age'] = pd.cut(df['age'], bins=[7,13,18,23], labels=['Children','Adolescents','Adults'])
        
        browser_interact = df['browser_interaction'].count()
        comp_errors = df['num_errors'].sum()
        timeouts = df['timeout'].sum()
        
        comprehension_df.append(df[df['trial_type'] == 'sailing-comprehension'][['sub','responses','num_errors']])
       
        df = df[(df.index > df[df['trial_type'] == 'sailing-instructions'].index[-1]) & (df['trial_type'] == 'sailing-trial')]
        df['reward_probabilities'] = df.apply(lambda x: ast.literal_eval(x.reward_probabilities), axis=1)
        df = df.drop(['trial_type','trial_index','time_elapsed','internal_node_id','value','metadata','success','failed_images','failed_audio','failed_video','browser_interaction','low_quality','stimulus','response','rt','responses','num_errors','view_history'], axis=1)
        full_trials = df.iloc[2::2].reset_index()
        boat_trials = df.iloc[1:-1:2].reset_index()
        full_trials['prior_timeout'] = boat_trials['timeout']
        full_trials['prior_boat'] = boat_trials['boat']
        full_trials['prior_reward'] = boat_trials['reward']
        full_trials['trial_num'] = full_trials.index
        
        mean_RT = pd.concat([df['boat_rt'], df['island_rt']], ignore_index=True).mean()
        low_quality = browser_interact > 20 or comp_errors > 3 or timeouts > 10 or mean_RT > 5000
        data_quality_df.loc[int(metadata['subject_id'])] = [browser_interact,timeouts,mean_RT, comp_errors, df['categorical_age'].iloc[0], low_quality]
        if not low_quality:
            data_df.append(full_trials)
        
data_df = pd.concat(data_df, ignore_index=True)
comprehension_df = pd.concat(comprehension_df, ignore_index=True)


In [3]:
data_df = data_df[(data_df['timeout']==False) & (data_df['prior_timeout']==False)]
data_df['action1TowardsPrevEnd'] = (2*data_df['island'] <= data_df['prior_boat']) & (data_df['prior_boat'] <= (2*data_df['island']+1))
data_df['action1TowardsPrevEnd'] = data_df['action1TowardsPrevEnd'].astype(int)

def is_congruent(before, current):
    if type(before) != type(current):
        return None
    if (before[0] < 0.5 and current[0] < 0.5) or (before[0] >= 0.5 and current[0] >= 0.5):
        return 1
    else:
        return 0

# create block and block_change_congruent columns 
data_df['block'] = data_df.groupby('sub')['reward_probabilities'].apply(lambda x: (x != x.shift()).cumsum())
x = data_df.groupby(['sub','block']).first().reset_index()
x['prior_reward_probabilities'] = x.groupby('sub')['reward_probabilities'].shift()
x['block_change_congruent'] = x.apply(lambda row: is_congruent(row['prior_reward_probabilities'], row['reward_probabilities']), axis=1)
data_df = data_df.merge(x[['sub', 'block', 'block_change_congruent']], on=['sub', 'block'], how='left')

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  data_df['block'] = data_df.groupby('sub')['reward_probabilities'].apply(lambda x: (x != x.shift()).cumsum())


In [4]:
def get_prior_boat_choice(row,df=data_df):
    # Filter the DataFrame to get rows with the same subject_id and accessible island
    subset = df[(df['sub'] == row['sub']) & (df['island'] == row['prior_boat']//2)]
    
    # Get the prior row (excluding the current row)
    prior_row = subset[subset['trial_num'] < row['trial_num']].tail(1)
    
    # If there is a prior row, return the prior boat choice, otherwise, return NaN
    return prior_row['boat'].values[0]==row['prior_boat'] if not prior_row.empty else None

def get_boat_neighbor_prior_reward(row, df=data_df):
    # Filter the DataFrame to get rows with the same subject_id and accessible island
    neighbor_boat_mapping = {1: 0, 0: 1, 3: 2, 2: 3}
    neighbor_boat = neighbor_boat_mapping[row['prior_boat']]
    subset = df[(df['sub'] == row['sub']) & ((df['boat'] == neighbor_boat) | (df['prior_boat'] == neighbor_boat))]
    
    # Get the prior row (excluding the current row)
    prior_row = subset[subset['trial_num'] < row['trial_num']].tail(1)

    rewarded = None
    if not prior_row.empty:
        if prior_row['prior_boat'].values[0] == neighbor_boat:
            rewarded = prior_row['prior_reward'].values[0]
        if prior_row['boat'].values[0] == neighbor_boat:
            rewarded = prior_row['reward'].values[0]
    return rewarded

In [5]:
data_df['prior_boat_choice'] = data_df.apply(get_prior_boat_choice, axis=1)
data_df['boat_neighbor_prior_reward'] = data_df.apply(get_boat_neighbor_prior_reward, axis=1)
data_df = data_df[(data_df['prior_boat_choice'].notnull()) & (data_df['boat_neighbor_prior_reward'].notnull()) & (data_df['prior_reward'].notnull())].copy()
data_df['prior_boat_choice'] = data_df['prior_boat_choice'].astype(int)
data_df['repeat_island'] = (data_df['island'] == data_df['island'].shift(1)).astype(int)
data_df['prior_island_reward'] = data_df['reward'].shift(1)
data_df = data_df[(data_df['prior_island_reward'].notnull())].copy()

In [10]:
data_df.to_csv('../data/sailing/preprocessed/sailing_data.csv')
data_quality_df.to_csv('../data/sailing/preprocessed/sailing_data_quality.csv')