In [1]:
import numpy as np
import pandas as pd
from os import path
from collections import defaultdict
from scipy.io import loadmat

# imports for data loading, processing and analysis
from utils.io import get_args, get_firstlevel_dir, save_args, load_data
from utils.preprocessing import set_get_timeseries, build_sub_run_df

  warn('The nilearn.glm module is experimental. '


In [2]:
# Set rng for replication
# Seeds NumPy's legacy global generator, so only code that draws from
# np.random.* is made reproducible by this call.
np.random.seed(2022)

In [3]:
# Build the run configuration from an explicit argument list (dataset '2')
args = get_args(['-dataset', '2'])

# Set up directories, load data
# results_dir is reused at the end of the notebook as the output folder for the CSV;
# save_args presumably records the configuration there -- confirm in utils.io
results_dir = get_firstlevel_dir(args)
save_args(args, results_dir)

# Row/column indices of the strictly lower triangle (offset -1 excludes the
# diagonal) of an nROI x nROI matrix
ridx, cidx = np.tril_indices(args.nROI, -1)
get_timeseries = set_get_timeseries(args, ridx, cidx)  # set the type of timeseries to use, ROI or edge

# timeseries_data and sub_file_map are later indexed by subject position (and run);
# sublist is the sequence of subjects iterated over
timeseries_data, sub_file_map, sublist, ntr_tossed = load_data(args)

In [4]:
def add_trial_types(sub_run_df):
    """Add event columns and a trial-type label to a run's trial table, in place.

    Four columns are prepended so that the frame starts with
    ``trial_type, onset, duration, modulation``. ``onset`` is a copy of the
    column labelled ``0``; ``duration`` and ``modulation`` are the constant 1.

    The rarer of conditions 1 and 2 (condition 2 on a tie) is treated as the
    probe condition. Labels assigned to ``trial_type``:

    * ``'probe_success'`` -- rare condition, ``acc == 1``
    * ``'probe_fail'``    -- rare condition, ``acc == -1``
    * ``'common_fail'``   -- common condition, ``acc == -1``

    All other trials keep ``trial_type`` as None.

    Parameters
    ----------
    sub_run_df : pandas.DataFrame
        Must contain the columns ``0``, ``condition`` and ``acc``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Each insert goes to position 0, so columns are added in reverse of their final order.
    for constant_col in ('modulation', 'duration'):
        sub_run_df.insert(0, constant_col, 1)
    sub_run_df.insert(0, 'onset', sub_run_df[0])
    sub_run_df.insert(0, 'trial_type', None)

    # The condition with strictly fewer trials is the rare one; a tie resolves to condition 2.
    n_cond1 = (sub_run_df['condition'] == 1).sum()
    n_cond2 = (sub_run_df['condition'] == 2).sum()
    if n_cond1 < n_cond2:
        rare_cond, common_cond = 1, 2
    else:
        rare_cond, common_cond = 2, 1

    is_rare = sub_run_df['condition'] == rare_cond
    is_common = sub_run_df['condition'] == common_cond
    is_correct = sub_run_df['acc'] == 1
    is_error = sub_run_df['acc'] == -1

    # The three masks are mutually exclusive, so assignment order does not matter.
    labelled_masks = (
        ('probe_success', is_rare & is_correct),
        ('probe_fail', is_rare & is_error),
        ('common_fail', is_common & is_error),
    )
    for label, mask in labelled_masks:
        sub_run_df.loc[mask, 'trial_type'] = label

    return sub_run_df

def get_preceding_trs(row, times):
    """Return the timepoints in the window that precedes a trial.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'onset'`` (end of the window, exclusive) and
        ``'first_preceding_onset'`` (start of the window, inclusive).
    times : numpy.ndarray
        Candidate timepoints, in the same unit as the two onsets.

    Returns
    -------
    numpy.ndarray or None
        The elements of ``times`` with
        ``first_preceding_onset <= t < onset``, or None when the trial has no
        window start.
    """
    window_start = row['first_preceding_onset']
    # pandas often turns None into NaN/NA when a column is re-typed; treat every
    # missing marker the same way. A NaN would otherwise compare False everywhere
    # and yield an empty window instead of None.
    if pd.isna(window_start):
        return None
    return times[(times < row['onset']) & (times >= window_start)]
    
def mean_across_preceding_trs(tr_arr, timeseries_df):
    """Average the timeseries over a set of timepoints.

    Parameters
    ----------
    tr_arr : array-like or None
        Index labels of ``timeseries_df`` to average over.
    timeseries_df : pandas.DataFrame
        Timeseries with one row per timepoint.

    Returns
    -------
    numpy.ndarray or None
        Column-wise mean of the selected rows, or None when ``tr_arr`` is None.
    """
    if tr_arr is None:
        return None

    window_rows = timeseries_df.loc[tr_arr, :]
    column_means = window_rows.mean(axis=0)
    return column_means.values

In [5]:
# PREPROCESS BEHAV DATA, FIND PRETRIAL DATA
# For every subject/run: label the trials, interpolate the timeseries at the trial
# onsets, and average the timeseries over the window preceding each labelled trial.
# Usable runs are collected per subject in des_mat_dict; skipped runs are left out.
des_mat_dict = defaultdict(list)
for sidx, sub in enumerate(sublist):
    print('sub: ', sidx)
    sub_files = sub_file_map[sidx]

    for runidx, sub_file in enumerate(sub_files):

        # NOTE(review): sub_trs (TR index scaled by args.t_r) is not used below; the
        # timeseries 'onset' column is built from raw TR indices instead. The two only
        # agree when args.t_r == 1 -- confirm which unit the trial onsets are in.
        sub_trs = np.arange(timeseries_data[sidx][runidx].shape[0])*args.t_r

        if (int(args.dataset)==1) and (sidx==22) and (runidx==2): # SKIP RUN WITH NO CORRECT OMISSIONS (see QA_trial_counts)
            print(f'skipping sub {sidx} run {runidx}')
            continue  # nothing from this run is kept, so do not load or process it

        sub_run_df = build_sub_run_df(loadmat(sub_file), ntr_tossed, args)
        curr_timeseries = get_timeseries(timeseries_data[sidx][runidx])

        if int(args.dataset)==2:
            # Timepoints whose every feature is NaN count towards the censoring threshold
            bad_trs = np.flatnonzero(np.isnan(curr_timeseries).all(axis=1))

            if len(bad_trs)/curr_timeseries.shape[0] >= args.dataset2_censor_thresh: # skip runs with too many censored timepoints
                print(f'skipping sub {sidx} run {runidx}')
                continue

        sub_run_df = add_trial_types(sub_run_df)

        # Use sub_run trial onsets to interpolate timeseries:
        # add one empty row per trial onset, sort by time, and fill the empty rows
        # by interpolating between the surrounding timepoints
        timeseries_df = pd.DataFrame(curr_timeseries)
        timeseries_df.insert(0, 'onset', np.arange(len(timeseries_df)))
        timeseries_df = pd.concat([timeseries_df, sub_run_df[['onset']]])
        timeseries_df = timeseries_df.sort_values(by='onset').set_index('onset')
        timeseries_df = timeseries_df.interpolate('slinear')

        # Timepoints of the extended timeseries (original TRs plus trial onsets)
        trs_ext = timeseries_df.index.values

        # Subset to relevant conditions, find preceding TRs, and mean across them
        # The window of a trial starts at the onset of the trial nTrials_preceding
        # rows earlier; the first nTrials_preceding trials have no window (None)
        sub_run_df['first_preceding_onset'] = None
        sub_run_df.loc[args.nTrials_preceding:, 'first_preceding_onset'] = sub_run_df.iloc[:-args.nTrials_preceding]['onset'].values

        # Keep only trials that received a trial_type label
        sub_run_df = sub_run_df[sub_run_df.trial_type.notnull()].reset_index(drop=True)

        sub_run_df.loc[:, 'preceding_trs'] = sub_run_df.apply(lambda x: get_preceding_trs(x, trs_ext), axis=1)
        sub_run_df.loc[:, 'edge_means_of_preceding_trs'] = sub_run_df.preceding_trs.apply(lambda x: mean_across_preceding_trs(x, timeseries_df))

        des_mat_dict[sub].append(sub_run_df)

sub:  0
sub:  1
sub:  2
sub:  3
sub:  4
sub:  5
sub:  6
sub:  7
sub:  8
sub:  9
sub:  10
sub:  11
sub:  12
sub:  13
sub:  14
sub:  15
sub:  16
sub:  17
sub:  18
sub:  19
sub:  20
sub:  21
sub:  22
skipping sub 22 run 2
sub:  23
sub:  24


In [6]:
# Per-subject difference between the mean pre-trial pattern of successful probes
# and that of failed probes. One row per entry of sublist; rows of subjects
# without usable data stay NaN.
# NOTE(review): the width is chosen by args.replicate while the output file name is
# chosen by args.use_rois -- confirm the two flags always agree with get_timeseries.
n_features = args.nROI if args.replicate else len(ridx)
Diff_matrix = np.full((len(sublist), n_features), np.nan)

# Iterate over sublist (not des_mat_dict) so that row sid always belongs to
# sublist[sid], even when every run of an earlier subject was skipped.
for sid, sub in enumerate(sublist):
    sub_runs = des_mat_dict.get(sub)  # .get() does not create a key in the defaultdict
    if not sub_runs:
        print(f'no usable runs for sub {sid}; row left as NaN')
        continue

    full_sub_df = pd.concat(sub_runs)
    full_sub_df = full_sub_df[full_sub_df.edge_means_of_preceding_trs.notnull()]
    CO_rows = full_sub_df.query('trial_type=="probe_success"').edge_means_of_preceding_trs.values
    CE_rows = full_sub_df.query('trial_type=="probe_fail"').edge_means_of_preceding_trs.values

    # np.vstack raises on an empty sequence; a difference is undefined without both trial types
    if len(CO_rows) == 0 or len(CE_rows) == 0:
        print(f'sub {sid} lacks probe_success or probe_fail trials; row left as NaN')
        continue

    CO_means = np.vstack(CO_rows)
    CE_means = np.vstack(CE_rows)

    Diff_matrix[sid, :] = np.nanmean(CO_means, 0) - np.nanmean(CE_means, 0) # sub7 (sidx=6) has one probe fail that occurred after scan had stopped (822s)

pd.DataFrame(Diff_matrix).to_csv(path.join(results_dir, f'model-trialPrecursor_datatype-{"roi" if args.use_rois else "edge"}.csv'), index=False)