In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from convert_eprime import convert as ep

In [2]:
# Raw per-subject inputs and this stage's output folder, both resolved
# relative to the notebook's working directory.
source_dir = Path('..', '..', 'sourcedata')
derivs_dir = Path('..', '..', 'derivatives', '0.3.cleaned')

#### Define a function to clean the N-back data

Stack the blocks vertically instead of horizontally, label the trial rows properly, and tag each trial as a HIT, MISS, FA, or CR. We are also outputting a new, cleaned-up CSV data file in the sourcedata folder.

In [3]:
def nstack_score_label(infile):
    """Reshape one subject's wide N-back spreadsheet into scored long format.

    The spreadsheet is expected to have one column per block/field pair,
    named ``'<block>.<field>'``, with the fields ``Rsp``, ``CRsp`` and ``RT``
    present for every block. Any number of blocks is accepted.

    Parameters
    ----------
    infile : pathlib.Path
        Path to the Excel file. The subject label is taken from the file
        name: the first ``_``-delimited token, then the part after its
        first ``-`` (e.g. ``sub-101_task-nback_beh.xlsx`` -> ``'101'``).

    Returns
    -------
    pandas.DataFrame
        One row per trial, indexed by ``('sub', 'block', 'trial')`` where
        ``block`` is the second character of the block prefix and ``trial``
        is 1-based. Adds 0/1 indicator columns ``CR``, ``MISS``, ``FA`` and
        ``HIT``; an ``RT`` of 0 is replaced with NaN.

    Raises
    ------
    ValueError
        If any column name does not contain a ``'.'`` separator.

    Notes
    -----
    This function only returns the frame; writing it to disk is left to
    the caller.
    """
    df = pd.read_excel(infile)

    # Hierarchicalize the column index: '<block>.<field>' -> (block, field).
    # Built from every column so files with more (or fewer) blocks also work.
    split_names = [str(col).split('.') for col in df.columns]
    malformed = [
        col for col, parts in zip(df.columns, split_names) if len(parts) < 2
    ]
    if malformed:
        raise ValueError(
            "Expected '<block>.<field>' column names, got: {}".format(malformed)
        )
    df.columns = pd.MultiIndex.from_tuples(
        [(parts[0], parts[1]) for parts in split_names]
    )

    # Stack blocks, reset trial row index, and rename columns to be descriptive.
    # After reset_index the unnamed index levels appear as level_0 (row) and
    # level_1 (block prefix).
    df = df.stack(0).reset_index().rename(
        columns={'level_0': 'trial', 'level_1': 'block'}
    ).sort_values(['block', 'trial'])
    df['sub'] = infile.name.split('_')[0].split('-')[1]
    df['block'] = df['block'].str[1]
    df['trial'] = df['trial'] + 1  # 0-based row position -> 1-based trial number
    df = df.set_index(['sub', 'block', 'trial'])

    # Determine correct rejections, misses, false alarms and hits from the
    # given response (Rsp) crossed with the expected response (CRsp).
    responded = df['Rsp'] == 1
    withheld = df['Rsp'] == 0
    is_target = df['CRsp'] == 1
    is_foil = df['CRsp'] == 0
    df['CR'] = (withheld & is_foil).astype(int)
    df['MISS'] = (withheld & is_target).astype(int)
    df['FA'] = (responded & is_foil).astype(int)
    df['HIT'] = (responded & is_target).astype(int)

    # Convert RT 0 to RT NaN (np.nan: the np.NaN alias was removed in NumPy 2.0).
    df['RT'] = df['RT'].replace(0, np.nan)

    return df

#### Read all the subject data

Reading only data for the full sample (100-series YA & 200-series OA). Executing N-back data cleaning & EPrime text-to-csv conversion. Setting up for subject-level analysis.

In [4]:
# Accumulators: E-Prime (processing speed) frames and N-back frames.
ep_frames = []
ptb_frames = []

for sub_dir in source_dir.glob('sub-[1-2]*'):
    for infile in sub_dir.glob('*'):
        # The last '_'-delimited token of the file name decides the handling.
        kind = infile.name.split('_')[-1]
        if kind not in ('beh.txt', 'beh.xlsx'):
            continue
        print(infile)

        # Converted CSV lives next to its source file; an existing CSV is
        # reused rather than regenerated.
        outfile = infile.with_name(infile.stem + '.csv')
        if outfile.is_file():
            print(outfile.name, 'exists')
        elif kind == 'beh.txt':
            ep.text_to_csv(infile, outfile)
        else:
            nstack_score_label(infile).to_csv(outfile)
            print('Output file successfully created-', outfile)

        # Always load from the CSV so cached and fresh files look the same.
        target_frames = ep_frames if kind == 'beh.txt' else ptb_frames
        target_frames.append(pd.read_csv(outfile))
print('Done!')

..\..\sourcedata\sub-101\sub-101_task-nback_beh.xlsx
sub-101_task-nback_beh.csv exists
..\..\sourcedata\sub-101\sub-101_task-procspeed_run-1_beh.txt
sub-101_task-procspeed_run-1_beh.csv exists
..\..\sourcedata\sub-102\sub-102_task-nback_beh.xlsx
sub-102_task-nback_beh.csv exists
..\..\sourcedata\sub-102\sub-102_task-procspeed_run-1_beh.txt
sub-102_task-procspeed_run-1_beh.csv exists
..\..\sourcedata\sub-103\sub-103_task-nback_beh.xlsx
sub-103_task-nback_beh.csv exists
..\..\sourcedata\sub-103\sub-103_task-procspeed_run-1_beh.txt
sub-103_task-procspeed_run-1_beh.csv exists
..\..\sourcedata\sub-104\sub-104_task-nback_beh.xlsx
sub-104_task-nback_beh.csv exists
..\..\sourcedata\sub-104\sub-104_task-procspeed_run-1_beh.txt
sub-104_task-procspeed_run-1_beh.csv exists
..\..\sourcedata\sub-105\sub-105_task-nback_beh.xlsx
sub-105_task-nback_beh.csv exists
..\..\sourcedata\sub-105\sub-105_task-procspeed_run-1_beh.txt
sub-105_task-procspeed_run-1_beh.csv exists
..\..\sourcedata\sub-106\sub-106_ta

#### Output N-back trial-level data

In [5]:
# Stack every subject's N-back frame row-wise into one trial-level table.
nback_trials = pd.concat(ptb_frames, axis=0)

In [6]:
# The row index is not written: the subject/block/trial labels are ordinary columns.
fpath = derivs_dir.joinpath('nback_trial_level.csv')
nback_trials.to_csv(fpath, index=False)

### Group, expand, trim N-back data
Group by subjects, get the sum of all columns, the count of the trial column, and the mean of the RT column.

Establish Hit % `number of Hits / number of targets` and FA % `number of FAs / number of foils`. 

Corrected Recognition `HIT% - FA%`. 

In [7]:
# Per-subject groupings: one over all trials, one over HIT trials only.
grouped_trials = nback_trials.groupby('sub')
nback_hits = nback_trials.loc[nback_trials['HIT'].eq(1)]
grouped_hits = nback_hits.groupby('sub')

In [8]:
# Per-subject column sums: HIT/FA become counts, CRsp becomes the number of targets.
nback_subs = grouped_trials.sum()
# Select the column before aggregating, so only that column is computed and
# unrelated (possibly non-numeric) columns cannot make the aggregation fail.
nback_subs['trial'] = grouped_trials['trial'].count()
nback_subs['RT'] = grouped_hits['RT'].mean()  # mean RT over HIT trials only
# Hit rate = hits / targets; false-alarm rate = FAs / foils (trials minus targets).
nback_subs['HIT%'] = nback_subs['HIT'] / nback_subs['CRsp']
nback_subs['FA%'] = nback_subs['FA'] / (nback_subs['trial'] - nback_subs['CRsp'])
# Corrected recognition.
nback_subs['CoR'] = nback_subs['HIT%'] - nback_subs['FA%']
nback_subs = nback_subs[['RT', 'HIT%', 'FA%', 'CoR']]
nback_subs.head()

Unnamed: 0_level_0,RT,HIT%,FA%,CoR
sub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
101,685.285714,0.875,0.0625,0.8125
102,572.0,0.75,0.15625,0.59375
103,618.571429,0.875,0.0,0.875
104,505.571429,0.875,0.0,0.875
105,715.25,1.0,0.46875,0.53125


###### Output N-back subject-level data

In [9]:
# The index is kept here because it carries the subject label.
fpath = derivs_dir.joinpath('nback_subject_level.csv')
nback_subs.to_csv(path_or_buf=fpath)

### ProcSpd data
Cleanup: Rehomogenize subject column, concatenate all frames, drop unneeded columns and rows

In [10]:
# Overwrite each frame's Subject column with the value from its last row,
# then stack all frames into one table.
# NOTE(review): presumably only the last row is guaranteed to hold the subject ID — confirm.
for frame in ep_frames:
    last_subject = frame['Subject'].iloc[-1]
    frame['Subject'] = last_subject
procspd_trials = pd.concat(ep_frames)

In [11]:
# Keep only TrialProc rows and the columns needed to derive the final RT.
is_trial_row = procspd_trials['Procedure'] == 'TrialProc'
kept_columns = [
    'Subject',
    'TargetStimulus.RT',
    'TargetStimulus.OnsetToOnsetTime',
    'Buffer.RT',
]
procspd_trials = procspd_trials.loc[is_trial_row, kept_columns]

###### Combine response windows for final RT

In [12]:
def calculate_rt(row):
    """Combine the stimulus and buffer response windows into one RT.

    ``row`` must provide ``'TargetStimulus.RT'``, ``'Buffer.RT'`` and
    ``'TargetStimulus.OnsetToOnsetTime'``.

    Returns the stimulus-window RT when it is positive. When it is exactly 0
    and the buffer RT is positive, returns the buffer RT plus the
    stimulus onset-to-onset time. Otherwise returns NaN.
    """
    stim_rt = row['TargetStimulus.RT']
    buffer_rt = row['Buffer.RT']
    stim_window = row['TargetStimulus.OnsetToOnsetTime']

    if stim_rt > 0:
        return stim_rt
    if stim_rt == 0 and buffer_rt > 0:
        # Response landed in the buffer: offset it by the stimulus window.
        return buffer_rt + stim_window
    return np.nan

# Row-wise: derive one final RT per trial from the two response windows.
procspd_trials['RT'] = procspd_trials.apply(calculate_rt, axis='columns')

###### Output Procspd trial-level data

In [13]:
# Trial-level processing-speed table; the row index is not written.
fpath = derivs_dir.joinpath('procspd_trial_level.csv')
procspd_trials.to_csv(fpath, index=False)

#### Group and output ProcSpd subject-level data

In [14]:
grouped = procspd_trials.groupby('Subject')
# Select RT before averaging so only that column is aggregated; the double
# brackets keep the result a one-column DataFrame, as before.
procspd_subs = grouped[['RT']].mean()

In [15]:
# The index is kept here because it carries the Subject label.
fpath = derivs_dir.joinpath('procspd_subject_level.csv')
procspd_subs.to_csv(path_or_buf=fpath)

# Patching in corrected exclusion of outlier trials

Hard patch to correctly exclude individual outlier trials at 0.00135/0.99865 quantiles

In [16]:
from pathlib import Path

import pandas as pd
import numpy as np

import sklearn as skl
from scipy import stats

import seaborn as sns

In [17]:
# Render matplotlib figures inline and apply seaborn's 'white' style.
%matplotlib inline
sns.set_style('white')

In [18]:
# NOTE: from here on derivs_dir is the absolute 'derivatives' root (two levels
# above the working directory), without the '0.3.cleaned' stage folder.
derivs_dir = Path.cwd().parents[1].joinpath('derivatives')
cleaned_dir = derivs_dir.joinpath('0.3.cleaned')
nb_trials_fpath = cleaned_dir.joinpath('nback_trial_level.csv')
ps_trials_fpath = cleaned_dir.joinpath('procspd_trial_level.csv')

In [19]:
# Reload both trial-level tables from disk (N-back first, then processing speed).
nb_trials, ps_trials = (
    pd.read_csv(csv_path) for csv_path in (nb_trials_fpath, ps_trials_fpath)
)

Best method so far below: 

- quantile method should be appropriate but I cannot find the right value for 3 SD
- use hits only when calculating bounds, or use all trials? `base1` vs `base2`

In [20]:
from outliers import group_exclude

### N-back bound RT:

In [21]:
# RTbound is computed over all N-back trials, grouped by subject; the HIT-only
# subset is taken afterwards.
# NOTE(review): group_exclude comes from the local `outliers` module, not shown here — confirm what it returns.
nb_trials['RTbound'] = group_exclude(nb_trials, 'sub', 'RT')
nb_hits = nb_trials.loc[nb_trials['HIT'].eq(1)]

In [22]:
# Mean bounded RT per subject over HIT trials. RTbound is selected before
# aggregating, so no other column is averaged (or able to break the mean).
nb_rt = (nb_hits.groupby('sub')['RTbound']
                .mean()
                .rename('nback_RT'))

In [23]:
nb_sub_fpath = derivs_dir / '0.3.cleaned' / 'nback_subject_level.csv'
# Patch nback_RT into the subject-level file in place.
# - Any previously patched nback_RT column is dropped first, so re-running
#   this cell never joins on a float column read back from CSV.
# - The join key is explicit, and how='left' keeps subjects that have no
#   HIT trials (their nback_RT is NaN) instead of silently dropping them.
(pd.read_csv(nb_sub_fpath)
     .drop(columns='nback_RT', errors='ignore')
     .merge(nb_rt.reset_index(), on='sub', how='left')
     .to_csv(nb_sub_fpath, index=False))

### ProcSpeed bound RT:

In [24]:
# group_exclude comes from the local `outliers` module and is not shown here.
# NOTE(review): presumably returns RT with per-Subject outlier trials excluded — confirm.
ps_trials['RTbound'] = group_exclude(ps_trials, 'Subject', 'RT')

In [25]:
# Mean bounded RT per subject. RTbound is selected before aggregating, so no
# other column is averaged (or able to break the mean).
ps_RT = (ps_trials.groupby('Subject')['RTbound']
                  .mean()
                  .rename('procspd_RT'))

In [26]:
ps_sub_fpath = derivs_dir / '0.3.cleaned' / 'procspd_subject_level.csv'
# Patch procspd_RT into the subject-level file in place.
# - Any previously patched procspd_RT column is dropped first, so re-running
#   this cell never joins on a float column read back from CSV.
# - The join key is explicit, and how='left' keeps every subject already in
#   the file even if it has no bounded RT (procspd_RT is NaN).
(pd.read_csv(ps_sub_fpath)
     .drop(columns='procspd_RT', errors='ignore')
     .merge(ps_RT.reset_index(), on='Subject', how='left')
     .to_csv(ps_sub_fpath, index=False))

# Next step
## Join all subject-level data
Now that the subject-level data is cleaned & computed for ...
- Survey measures
- Comprehension
- N-back
- Processing speed

... we can combine all of that data into our final subject-level data set: [0.4.join_subject_level.ipynb](0.4.join_subject_level.ipynb)