In [1]:
#import modules 
import pandas as pd #dataframes 
import glob #for pattern checking 
import re  #regular expressions
import numpy as np #numerical computing 
import matplotlib.pyplot as plt #plotting
import seaborn as sns 
from sys import platform #for checking platform 

#pick the home directory for the current operating system
if platform.startswith('linux'):  #also matches the legacy 'linux2' value
    home = '/home/krista/'
elif platform == 'darwin':
    home = '/Users/67981492/'
else:
    #fail here with a clear message instead of a NameError on `home` below
    raise RuntimeError('unsupported platform %r: add a home directory for it' % platform)

#directory that is searched for the per-participant csv files
data_path = home + 'Dropbox/volatileValues/simple_rt_experiment_probabilityC/data'
#path prefix for aggregated csv output (keeps its trailing slash so that a
#file name can be appended directly)
print_data_path = home + 'Dropbox/volatileValues/simple_rt_experiment_probabilityC/analysis/aggregated_data/'

#condition codes as they appear in the file names
#cond0: lc
#cond1: hc
#cond2: hv
#cond3: lv

In [2]:
#expected file and trial counts, used later for error-checking the loaded data
n_conditions, n_participants = 4, 24
n_clean_trials = 600

#one file per participant per condition, n_clean_trials usable trials per file
total_n_files = n_participants * n_conditions
total_n_clean_trials = n_clean_trials * total_n_files
print(total_n_files, total_n_clean_trials, sep='\n')

96
57600


In [3]:
#search for the file-name pattern that identifies the dataset
all_data_list = glob.glob(data_path+'/07[0-9][0-9]_cond[0-9]_trialset*[0-9].csv')
#a bare comparison in the middle of a cell is silently discarded, so assert it
assert len(all_data_list) == total_n_files, (
    'expected %d data files, found %d' % (total_n_files, len(all_data_list)))

#compile regular expressions for ID and condition
ID_re = re.compile('07[0-9][0-9]')
condition_re = re.compile('cond[0-9]')

#find the matching expression in every path
#np.hstack needs a real sequence (list/tuple); a bare map object is rejected
#by current NumPy, hence the list comprehensions
IDs = np.hstack([ID_re.findall(fname) for fname in all_data_list])
cond_codes = np.hstack([condition_re.findall(fname) for fname in all_data_list])

#decode condition numbers: the last character of 'condN' indexes cond_names
cond_names = ['lc', 'hc', 'hv', 'lv']
decoded_conditions = np.array([cond_names[int(code[-1])] for code in cond_codes])

#check unique IDs and conditions
unique_IDs = np.unique(IDs)
unique_conditions = np.unique(decoded_conditions)

print(unique_IDs)
print(unique_conditions)

print(len(unique_IDs))
print(len(unique_conditions))

['0761' '0762' '0763' '0764' '0765' '0766' '0767' '0768' '0769' '0770'
 '0771' '0772' '0773' '0774' '0775' '0776' '0777' '0778' '0779' '0780'
 '0781' '0782' '0783' '0784']
['hc' 'hv' 'lc' 'lv']
24
4


In [8]:
#construct a dataframe, reading every file exactly once
per_file_dfs = [pd.read_csv(fname) for fname in all_data_list]
data_df = pd.concat(per_file_dfs)
#strip whitespace from col. names for indexing later
data_df.columns = data_df.columns.str.strip()

#label every row with the ID and condition of the file it came from;
#np.repeat raises if the labels and the files do not line up one-to-one
rows_per_file = [len(file_df) for file_df in per_file_dfs]
data_df['ID'] = np.repeat(IDs, rows_per_file)
data_df['condition'] = np.repeat(decoded_conditions, rows_per_file)

#strip leading zeros from IDs (original note: python3 cannot handle them)
data_df['ID'] = data_df['ID'].str.lstrip('0')

#n_trials will be longer than n_clean_trials if repeat trials [too fast/slow]
data_df.shape

(58379, 13)

In [None]:
#boolean row masks over data_df
#timing flags in cp_with_slow_fast: -2 selects the fast trials, -1 the slow ones
fast_trials = data_df['cp_with_slow_fast'].eq(-2)
slow_trials = data_df['cp_with_slow_fast'].eq(-1)

#one mask per condition label
lc = data_df['condition'].eq('lc')
hc = data_df['condition'].eq('hc')
lv = data_df['condition'].eq('lv')
hv = data_df['condition'].eq('hv')

In [22]:
#peek at the first reaction times flagged as fast and as slow
data_df.loc[fast_trials, 'rt'].head(), data_df.loc[slow_trials, 'rt'].head()

(4      0.048233
 34     0.006824
 88     0.047692
 142    0.024710
 151    0.099472
 Name: rt, dtype: float64, 35     1.800549
 125    1.000200
 128    1.532096
 152    1.292333
 163    1.649170
 Name: rt, dtype: float64)

In [24]:
#first rows of the 'lc' condition
data_df.loc[lc].head()

Unnamed: 0,choice,accuracy,solution,reward,cumulative_reward,rt,total_trial_time,iti,cp_with_slow_fast,obs_cp_with_slow_fast,high_val_cue,ID,condition
0,0.0,1.0,0.0,2.0,601.0,0.713345,2.081675,0.366715,0.0,0.0,112.0,784,lc
1,1.0,0.0,0.0,0.0,600.0,0.410072,2.149507,0.749229,0.0,0.0,112.0,784,lc
2,0.0,0.0,1.0,0.0,599.0,0.690292,2.287885,0.60391,0.0,0.0,112.0,784,lc
3,0.0,0.0,1.0,0.0,598.0,0.498906,1.84399,0.348191,0.0,0.0,112.0,784,lc
4,0.0,1.0,0.0,2.0,599.0,0.630659,2.245682,0.626968,0.0,0.0,112.0,784,lc


In [34]:
#clean df
#only keep trials whose cp_with_slow_fast indicator is 0 or 1
#(this drops the fast/slow trials, which are coded -2 and -1)
cleaned_df = data_df[data_df.cp_with_slow_fast.isin([0, 1])]
#should be equal; asserted because a bare comparison mid-cell is discarded
assert cleaned_df.shape[0] == total_n_clean_trials, (
    'expected %d clean trials, found %d' % (total_n_clean_trials, cleaned_df.shape[0]))
#should be empty df
cleaned_df[cleaned_df.rt > 1], cleaned_df[cleaned_df.rt < .1]

(Empty DataFrame
 Columns: [choice, accuracy, solution, reward, cumulative_reward, rt, total_trial_time, iti, cp_with_slow_fast, obs_cp_with_slow_fast, high_val_cue, ID, condition]
 Index: [], Empty DataFrame
 Columns: [choice, accuracy, solution, reward, cumulative_reward, rt, total_trial_time, iti, cp_with_slow_fast, obs_cp_with_slow_fast, high_val_cue, ID, condition]
 Index: [])

In [39]:
#median rt per condition, printed in the order lc, lv, hc, hv
for cond_name in ('lc', 'lv', 'hc', 'hv'):
    cond_rts = cleaned_df.rt[cleaned_df.condition == cond_name]
    print('median rt for ' + cond_name, np.median(cond_rts))

median rt for lc 0.304419517517
median rt for lv 0.301387429237
median rt for hc 0.304340004921
median rt for hv 0.311047911644


In [None]:
#Mean accuracies are not printed here because the automated accuracy calculation has an error; this is fixed in preprocess_df.

In [14]:
##print clean df 
# cleaned_df.to_csv(print_data_path + 'cleaned_probC_data.csv', index = False)
##print raw df 
# data_df.to_csv(print_data_path + 'raw_probC_data.csv', index = False)