In [1]:
import pandas as pd
import numpy as np
from glob import glob
from os import path
from scipy import stats
import pingouin as pg


import matplotlib.pyplot as plt
import seaborn as sns
from statannot import add_stat_annotation
from IPython.display import display


import os
from collections import defaultdict
from dual_data_utils import get_exp_id, update_correct_trial, standardize_conditions, add_inhib_acc

  **kwargs
  **kwargs


In [2]:
# Root of the mturk project data. Defaults to the original local checkout; set the
# R01_PROJECT_DIR environment variable to run this notebook on another machine.
project_dir = os.environ.get('R01_PROJECT_DIR', '/Users/henrymj/Documents/r01network/mturk/')
# The empty last component keeps a trailing separator, because later cells build
# glob patterns with `raw_dir + 's*/*'`.
raw_dir = os.path.join(project_dir, 'all_data', 'raw_data', '')

# Maps each task-condition column name to a short upper-case label.
# Note that 'cue_condition' and 'cued_condition' share the label 'CUE'.
ABBREV = {
    'directed_forgetting_condition': 'DF',
    'flanker_condition': 'FLANKER',
    'go_nogo_condition': 'GNG',
    'n_back_condition': 'NBACK',
    'delay_condition': 'DELAY',
    'predictable_condition': 'PREDICT',
    'shape_matching_condition': 'SHAPE',
    'stop_signal_condition': 'SS',
    'task_switch_condition': 'TSWITCH',
    'cue_condition': 'CUE',
    'cued_condition': 'CUE',
}

# Worker ids selected when `dataset == 'explore'` (33 subjects).
EXPLORE_IDS = [
    's264', 's419', 's010', 's320', 's142', 's025', 's066', 's069', 's205', 's341', 's380',
    's141', 's397', 's376', 's090', 's248', 's207', 's429', 's441', 's214', 's126', 's539',
    's369', 's044', 's454', 's005', 's396', 's365', 's135', 's360', 's295', 's490', 's350',
]

# Column renames applied to every raw subject file (old name -> new name).
CONDITION_RENAME_MAP = {
    'predictive_condition': 'predictable_condition',
    'predictive_dimension': 'predictable_dimension',
    'directed_condition': 'directed_forgetting_condition',
    'task_condition': 'task_switch_condition',
    'delay': 'delay_condition',
    'central_letter': 'center_letter',
}

# Columns removed from each subject frame by read_and_filter_df; missing ones are ignored there.
DROP_COLS = [
    'Unnamed: 0',
    'final_accuracy',
    'final_avg_rt',
    'final_credit_var',
    'final_missed_percent',
    'final_responses_ok',
    'internal_node_id',
    'responses',
    'stimulus',
    'text',
    'trial_index',
    'trial_type',
    'view_history',
]


In [3]:
def make_clean_concat_data(filter_exp='all', stop_subset=False, dataset='explore'):
    """Read, clean and stack every selected subject file, grouped by task.

    Parameters
    ----------
    filter_exp : str
        Row subset passed through to `read_and_filter_df`
        ('all', 'odd', 'even', 'first_half' or 'last_half').
    stop_subset : bool
        If truthy keep only files whose name contains 'stop'; otherwise keep
        only files whose name does not. Files whose name contains 'go_no_go'
        or 'demographics' are always skipped.
    dataset : str
        'explore' selects the workers in `EXPLORE_IDS`; any other value selects
        no workers, so the task-count assertion below fails.

    Returns
    -------
    defaultdict
        Maps exp_id -> DataFrame holding all matching subjects' rows stacked
        in glob order (columns sorted, each file's row index kept).

    Raises
    ------
    AssertionError
        If the number of distinct tasks found is not 7 (stop subset) or 36.
    """
    sub_list = EXPLORE_IDS if dataset == 'explore' else []

    def _wanted(file_path):
        # Test the file name only, so directory names can never trip the filters.
        name = os.path.basename(file_path)
        if name.split('_')[-1].replace('.csv', '') not in sub_list:
            return False
        if 'demographics' in name or 'go_no_go' in name:
            return False
        return ('stop' in name) == bool(stop_subset)

    explore_files = [f for f in glob(raw_dir + 's*/*') if _wanted(f)]

    # Collect the per-subject frames first and concatenate once per task,
    # instead of re-copying a growing frame (seeded with an empty DataFrame)
    # for every file.
    frames_by_task = defaultdict(list)
    for subj_file in explore_files:
        df, exp_id = read_and_filter_df(subj_file, filter_exp=filter_exp)
        frames_by_task[exp_id].append(df)

    task_dfs = defaultdict(pd.DataFrame)
    for exp_id, frames in frames_by_task.items():
        task_dfs[exp_id] = pd.concat(frames, axis=0, sort=True)

    expected_n_tasks = 7 if stop_subset else 36
    assert len(task_dfs) == expected_n_tasks, (
        'expected {} tasks but found {}'.format(expected_n_tasks, len(task_dfs)))
    return task_dfs

def read_and_filter_df(subj_file, filter_exp='all'):
    """Load one subject's task file, clean it, and optionally keep a row subset.

    Parameters
    ----------
    subj_file : str
        Path to a CSV; the text after the last underscore (minus '.csv') is
        used as the worker id when the file has no 'worker_id' column.
    filter_exp : str
        One of 'all', 'odd', 'even', 'first_half', 'last_half'. 'odd' keeps
        positions 1, 3, 5, ...; 'even' keeps positions 0, 2, 4, ...; the two
        halves split at len // 2. Any other value raises KeyError.

    Returns
    -------
    tuple
        (cleaned DataFrame with a fresh 0..n-1 index, exp_id from `get_exp_id`).
    """
    file_name = os.path.basename(subj_file)
    subject_id = subj_file.split('_')[-1].replace('.csv', '')

    df = pd.read_csv(subj_file, index_col=0)
    if 'worker_id' not in df.columns:
        # Fall back to the id embedded in the file name.
        df.loc[:, 'worker_id'] = subject_id
    exp_id = get_exp_id(df, file_name)

    df = update_correct_trial(df)
    df = df.rename(columns=CONDITION_RENAME_MAP)
    if 'stop' in file_name or 'go_no' in file_name:
        # Second argument: the file name with everything from the last '_' removed.
        df = add_inhib_acc(df, file_name.rpartition('_')[0])
    df = standardize_conditions(df, exp_id)
    df = df.drop(DROP_COLS, axis=1, errors='ignore').reset_index(drop=True)

    # Positional slices for each supported subset; the halves meet at len // 2,
    # so for an odd row count the extra row lands in 'last_half'.
    midpoint = len(df) // 2
    row_slices = {
        'all': slice(None),
        'odd': slice(1, None, 2),
        'even': slice(None, None, 2),
        'first_half': slice(None, midpoint),
        'last_half': slice(midpoint, None),
    }
    chosen_rows = row_slices[filter_exp]
    if filter_exp == 'all':
        return df, exp_id
    return df.iloc[chosen_rows].reset_index(drop=True), exp_id

In [4]:
# Load the raw directed-forgetting files for the explore subjects, grouped by task.
# Unlike read_and_filter_df, this only back-fills worker_id and renames columns.
sub_list = EXPLORE_IDS
explore_files = [i for i in glob(raw_dir + 's*/*') if (i.split('_')[-1].replace('.csv','') in sub_list) and ('directed_forgetting' in i)]

# Gather each task's per-subject frames, then concatenate once per task rather
# than re-copying a growing frame (seeded with an empty DataFrame) for every file.
frames_by_task = defaultdict(list)
for subj_file in explore_files:
    base_file = os.path.basename(subj_file)
    worker_id = subj_file.split('_')[-1].replace('.csv','')
    df = pd.read_csv(subj_file, index_col=0)
    if 'worker_id' not in df.columns:
        df.loc[:,'worker_id'] = worker_id
    exp_id = get_exp_id(df, base_file)
    df = df.rename(columns=CONDITION_RENAME_MAP)
    frames_by_task[exp_id].append(df)

# Keep the defaultdict(pd.DataFrame) type the later cells index into.
task_dfs = defaultdict(
    pd.DataFrame,
    {task: pd.concat(frames, axis=0, sort=True) for task, frames in frames_by_task.items()},
)

In [5]:
# Sanity check: show which tasks (exp_ids) were loaded.
task_dfs.keys()

dict_keys(['directed_forgetting_with_shape_matching', 'directed_forgetting_single_task_network', 'predictable_task_switching_with_directed_forgetting', 'cued_task_switching_with_directed_forgetting', 'n_back_with_directed_forgetting', 'go_nogo_with_directed_forgetting', 'directed_forgetting_with_flanker', 'stop_signal_with_directed_forgetting'])

In [6]:
# data_df = task_dfs['directed_forgetting_single_task_network'].query("worker_id=='s341'")

def _test_trial_metrics(data_df):
    """Return (mean rt of correct test trials, accuracy over test trials with rt > 0)."""
    test_df = data_df.query("trial_id=='test_trial'")
    assert len(test_df.query("rt==0")) == 0  # assume RT -1 or positive
    mean_corr_rt = test_df.query("correct_trial==True").rt.mean()
    resp_acc = test_df.query("rt>0").correct_trial.mean()
    return mean_corr_rt, resp_acc


def recode_df_iti_resps(data_df, return_metrics=True):
    """Move responses logged on the row after an unanswered test trial onto that trial.

    A "problem row" is the row directly after a test trial that has rt > 0.
    When the preceding test trial has rt == -1, the problem row's key press is
    copied onto the test trial, `correct_trial` is recomputed against
    `correct_response`, and rt is set to the problem row's rt plus the
    `time_elapsed` difference between the two rows.

    Parameters
    ----------
    data_df : DataFrame
        Rows in presentation order with columns trial_id, rt, key_press,
        correct_trial, correct_response, time_elapsed and exp_id. The input
        frame is not modified.
    return_metrics : bool
        If True return a dict of summary metrics, otherwise the recoded frame.

    Returns
    -------
    dict or DataFrame
        With `return_metrics`: nITIresps (number of problem rows), nMiscodes
        (number of test trials recoded) and meanCorrRT_* / meanRespACC_*
        computed before (`_pre`) and after (`_post`) recoding.
        Otherwise: the recoded copy of `data_df`, carrying the caller's index.

    Raises
    ------
    AssertionError
        If a test trial has rt == 0, or a trial about to be recoded does not
        look like a missed response (key_press != -1, or correct_trial not
        False outside stop_signal / go_nogo tasks).
    """
    original_index = data_df.index
    # reset_index returns a new frame with a 0..n-1 index, so labels and positions
    # coincide below. The original mixed label lookups with `.iloc`, which is only
    # right when the caller's index already is 0..n-1.
    data_df = data_df.reset_index(drop=True)

    test_pos = data_df.index[(data_df['trial_id'] == 'test_trial').values]
    next_pos = test_pos + 1
    # A test trial on the very last row has no following row to inspect.
    next_pos = next_pos[next_pos < len(data_df)]
    problem_rows = data_df.iloc[next_pos].query('rt > 0')

    if return_metrics:
        metric_dict = {}
        metric_dict['nITIresps'] = len(problem_rows)
        (metric_dict['meanCorrRT_pre'],
         metric_dict['meanRespACC_pre']) = _test_trial_metrics(data_df)

    def _describe(pos):
        # Only evaluated when an assertion fails.
        return 'worker {}, row {}'.format(data_df.worker_id.unique()[0], original_index[pos])

    n_miscodes = 0
    for iti_idx in problem_rows.index:
        prev_idx = iti_idx - 1
        if data_df.loc[prev_idx, 'rt'] != -1:
            # The test trial already has its own response; the following row
            # caught a second one, so there is nothing to recode.
            continue
        n_miscodes += 1
        assert data_df.loc[prev_idx, 'key_press'] == -1, _describe(iti_idx)
        if not any(inhib_reg in problem_rows.loc[iti_idx, 'exp_id']
                   for inhib_reg in ['stop_signal', 'go_nogo']):
            assert data_df.loc[prev_idx, 'correct_trial'] == False, _describe(iti_idx)
        data_df.loc[prev_idx, 'key_press'] = data_df.loc[iti_idx, 'key_press']
        data_df.loc[prev_idx, 'correct_trial'] = (
            data_df.loc[prev_idx, 'key_press'] == data_df.loc[prev_idx, 'correct_response'])
        # NOTE(review): this adds the time_elapsed gap between the two rows to the
        # later row's rt; confirm that gap is the right offset back to stimulus onset.
        data_df.loc[prev_idx, 'rt'] = data_df.loc[iti_idx, 'rt'] + (
            data_df.loc[iti_idx, 'time_elapsed'] - data_df.loc[prev_idx, 'time_elapsed'])

    if return_metrics:
        metric_dict['nMiscodes'] = n_miscodes
        (metric_dict['meanCorrRT_post'],
         metric_dict['meanRespACC_post']) = _test_trial_metrics(data_df)
        return metric_dict

    # Hand the frame back with the caller's original index.
    data_df.index = original_index
    return data_df

In [7]:
# Re-list the loaded task keys; the cells below summarise one key each.
task_dfs.keys()

dict_keys(['directed_forgetting_with_shape_matching', 'directed_forgetting_single_task_network', 'predictable_task_switching_with_directed_forgetting', 'cued_task_switching_with_directed_forgetting', 'n_back_with_directed_forgetting', 'go_nogo_with_directed_forgetting', 'directed_forgetting_with_flanker', 'stop_signal_with_directed_forgetting'])

In [8]:
# Per-subject recoding metrics for the single-task directed forgetting data,
# then summary statistics across subjects.
miscode_metrics = (
    task_dfs['directed_forgetting_single_task_network']
    .groupby('worker_id')
    .apply(recode_df_iti_resps)
)
miscode_metrics.apply(pd.Series).describe()

Unnamed: 0,nITIresps,meanCorrRT_pre,meanRespACC_pre,nMiscodes,meanCorrRT_post,meanRespACC_post
count,33.0,33.0,33.0,33.0,33.0,33.0
mean,11.242424,590.051945,0.94277,7.69697,608.710584,0.936203
std,17.374605,70.612217,0.046384,7.077627,81.508048,0.050055
min,0.0,440.15873,0.822222,0.0,440.15873,0.806283
25%,2.0,541.916667,0.927374,2.0,544.101064,0.90625
50%,6.0,583.559322,0.956989,6.0,601.873563,0.953125
75%,13.0,641.505435,0.973822,11.0,670.553672,0.973958
max,97.0,770.631579,0.99435,22.0,819.144737,0.994737


In [9]:
# Per-subject recoding metrics for directed forgetting + shape matching, summarised across subjects.
(
    task_dfs['directed_forgetting_with_shape_matching']
    .groupby('worker_id')
    .apply(recode_df_iti_resps)
    .apply(pd.Series)
    .describe()
)

Unnamed: 0,nITIresps,meanCorrRT_pre,meanRespACC_pre,nMiscodes,meanCorrRT_post,meanRespACC_post
count,33.0,33.0,33.0,33.0,33.0,33.0
mean,7.393939,589.857072,0.943161,5.909091,600.464376,0.939996
std,8.462932,68.091545,0.047144,6.231792,76.069082,0.050026
min,0.0,495.25431,0.792793,0.0,498.484979,0.780591
25%,2.0,536.703196,0.920168,2.0,539.095455,0.915254
50%,5.0,576.255411,0.96087,4.0,584.429787,0.962185
75%,9.0,623.147982,0.974895,8.0,644.309013,0.974895
max,45.0,772.837696,0.991597,29.0,816.305556,0.991667


In [10]:
# Per-subject recoding metrics for predictable task switching + directed forgetting, summarised across subjects.
(
    task_dfs['predictable_task_switching_with_directed_forgetting']
    .groupby('worker_id')
    .apply(recode_df_iti_resps)
    .apply(pd.Series)
    .describe()
)

Unnamed: 0,nITIresps,meanCorrRT_pre,meanRespACC_pre,nMiscodes,meanCorrRT_post,meanRespACC_post
count,33.0,32.0,32.0,33.0,32.0,32.0
mean,9.666667,568.696656,0.896864,7.69697,585.139642,0.892851
std,10.922072,67.144537,0.110608,7.935225,76.778709,0.110763
min,0.0,445.033755,0.573394,0.0,445.033755,0.567797
25%,3.0,518.707473,0.886311,1.0,528.339551,0.881534
50%,7.0,560.762549,0.933304,6.0,576.051642,0.929592
75%,12.0,618.587705,0.966305,11.0,633.572042,0.96267
max,53.0,729.842391,0.990476,30.0,747.326425,0.987755


In [11]:
# Per-subject recoding metrics for cued task switching + directed forgetting, summarised across subjects.
(
    task_dfs['cued_task_switching_with_directed_forgetting']
    .groupby('worker_id')
    .apply(recode_df_iti_resps)
    .apply(pd.Series)
    .describe()
)

Unnamed: 0,nITIresps,meanCorrRT_pre,meanRespACC_pre,nMiscodes,meanCorrRT_post,meanRespACC_post
count,33.0,33.0,33.0,33.0,33.0,33.0
mean,15.212121,554.466673,0.866219,7.909091,573.014604,0.860067
std,30.683625,74.552201,0.107672,9.152744,85.895637,0.109961
min,0.0,415.053097,0.562189,0.0,475.807175,0.552632
25%,2.0,494.738095,0.815385,2.0,509.651163,0.804444
50%,7.0,541.169312,0.913043,6.0,547.51269,0.895652
75%,12.0,597.630841,0.9375,9.0,605.995349,0.935065
max,138.0,730.276423,0.995614,33.0,824.158621,0.991304


In [12]:
# Per-subject recoding metrics for n-back + directed forgetting, summarised across subjects.
(
    task_dfs['n_back_with_directed_forgetting']
    .groupby('worker_id')
    .apply(recode_df_iti_resps)
    .apply(pd.Series)
    .describe()
)

Unnamed: 0,nITIresps,meanCorrRT_pre,meanRespACC_pre,nMiscodes,meanCorrRT_post,meanRespACC_post
count,33.0,33.0,33.0,33.0,33.0,33.0
mean,19.242424,530.623411,0.735287,2.0,536.52207,0.734032
std,15.043749,96.07707,0.106842,2.936835,97.286849,0.107681
min,4.0,313.392523,0.422925,0.0,326.657407,0.425197
25%,9.0,453.833333,0.686275,0.0,455.889447,0.686275
50%,14.0,535.520231,0.741176,1.0,544.979381,0.738281
75%,28.0,609.812183,0.773438,2.0,612.156977,0.771429
max,75.0,708.680233,0.953488,12.0,720.413793,0.953488


In [13]:
# Per-subject recoding metrics for go/no-go + directed forgetting, summarised across subjects.
(
    task_dfs['go_nogo_with_directed_forgetting']
    .groupby('worker_id')
    .apply(recode_df_iti_resps)
    .apply(pd.Series)
    .describe()
)

Unnamed: 0,nITIresps,meanCorrRT_pre,meanRespACC_pre,nMiscodes,meanCorrRT_post,meanRespACC_post
count,33.0,33.0,33.0,33.0,33.0,33.0
mean,7.272727,497.596875,0.930998,6.424242,514.938712,0.927336
std,8.765246,63.242774,0.056995,8.101043,76.393869,0.05785
min,0.0,386.079439,0.735294,0.0,386.079439,0.719577
25%,1.0,462.606796,0.909574,1.0,462.606796,0.909548
50%,5.0,485.38756,0.94898,3.0,510.849515,0.94359
75%,11.0,523.075117,0.968586,10.0,546.391089,0.963731
max,40.0,642.824121,0.994681,40.0,710.989691,0.989474


In [14]:
# Per-subject recoding metrics for directed forgetting + flanker, summarised across subjects.
(
    task_dfs['directed_forgetting_with_flanker']
    .groupby('worker_id')
    .apply(recode_df_iti_resps)
    .apply(pd.Series)
    .describe()
)

Unnamed: 0,nITIresps,meanCorrRT_pre,meanRespACC_pre,nMiscodes,meanCorrRT_post,meanRespACC_post
count,33.0,33.0,33.0,33.0,33.0,33.0
mean,7.666667,607.718052,0.944786,6.666667,620.422781,0.94078
std,7.385233,68.552012,0.05224,7.135592,75.990493,0.056138
min,0.0,489.484848,0.748879,0.0,489.484848,0.731092
25%,2.0,553.748918,0.941176,1.0,567.83908,0.937238
50%,5.0,595.113043,0.958159,4.0,603.560345,0.958333
75%,12.0,656.056872,0.978166,11.0,668.380952,0.975
max,23.0,779.432692,0.995833,23.0,804.754464,0.995833


In [15]:
# Per-subject recoding metrics for stop signal + directed forgetting, summarised across subjects.
(
    task_dfs['stop_signal_with_directed_forgetting']
    .groupby('worker_id')
    .apply(recode_df_iti_resps)
    .apply(pd.Series)
    .describe()
)

Unnamed: 0,nITIresps,meanCorrRT_pre,meanRespACC_pre,nMiscodes,meanCorrRT_post,meanRespACC_post
count,33.0,33.0,33.0,33.0,33.0,33.0
mean,8.212121,452.989265,0.714941,6.393939,471.769089,0.716019
std,10.62061,64.754953,0.047477,10.365023,84.644917,0.046456
min,0.0,345.596774,0.541667,0.0,351.117647,0.557895
25%,2.0,400.32948,0.68984,1.0,408.60989,0.69378
50%,5.0,443.65625,0.718593,3.0,454.423913,0.726829
75%,11.0,495.910112,0.751381,6.0,536.692708,0.75
max,55.0,591.882759,0.778351,55.0,740.139785,0.773869
