In [1]:
import json
import numpy as np
import pandas as pd
from sympy.solvers import solve
from sympy import Symbol
import scipy.stats as sstats

from utils import SimulateData
from stopsignalmetrics import SSRTmodel, StopData

# __Preprocessing__

## 1. Prepare data for use by stopsignalmetrics

In [2]:
# Load the raw trial table; the preprocessing cells below add and modify columns on it.
abcd_data = pd.read_csv('abcd_data/minimal_abcd_raw.csv')

In [3]:
# Blank both RT columns on every row where no finger press was recorded.
no_press = abcd_data['finger_press'].isnull()
abcd_data.loc[no_press, ['go_rt_adjusted', 'stop_rt_adjusted']] = np.nan

In [4]:
# Label each trial with the block it belongs to ('1' or '2'), split on TrialNum.
# The column is created with object dtype so that string labels can be written
# into it without an incompatible-dtype warning; rows whose TrialNum is missing
# stay NaN. (`np.NaN` was removed in NumPy 2.0, hence `np.nan`.)
# NOTE(review): if TrialNum is 1-based with 180 trials per block, trial 180
# belongs to block 1 and the cutoff should be `<= 180` -- confirm against the task.
abcd_data['block'] = pd.Series(np.nan, index=abcd_data.index, dtype=object)
abcd_data.loc[abcd_data['TrialNum'] < 180, 'block'] = '1'
abcd_data.loc[abcd_data['TrialNum'] >= 180, 'block'] = '2'

In [5]:
# choice_accuracy: 1 when the pressed finger matches the correct response,
# 0 when it does not, and NaN when no press was recorded.
pressed = abcd_data['finger_press'].notnull()
matches_target = abcd_data['finger_press'] == abcd_data['correct_response']
abcd_data['choice_accuracy'] = np.where(pressed, np.where(matches_target, 1, 0), np.nan)

In [6]:
# Persist the preprocessed table (row index dropped); the next section reloads this file.
abcd_data.to_csv('abcd_data/minimal_abcd_with_issue_3.csv', index=False)

## 2. Drop Issue 3 people

In [7]:
# Reload the preprocessed table written at the end of step 1.
abcd_data_w_issue_3 = pd.read_csv('abcd_data/minimal_abcd_with_issue_3.csv')

In [8]:
# A subject is flagged when any of their trials with SSDDur == 50 has a
# stop-trial RT strictly between 0 and 50.
issue_3_mask = (
    (abcd_data_w_issue_3['stop_rt_adjusted'] < 50)
    & (abcd_data_w_issue_3['stop_rt_adjusted'] > 0)
    & (abcd_data_w_issue_3['SSDDur'] == 50)
)
issue_3_people = abcd_data_w_issue_3.loc[issue_3_mask, 'NARGUID'].unique()

# Derive the denominator from the data instead of hard-coding the sample size,
# so the proportion stays correct if the input file changes.
n_subjects = abcd_data_w_issue_3['NARGUID'].nunique()

print('n affected:', len(issue_3_people))
print('p affect:', len(issue_3_people)/ n_subjects)

# Keep only trials from subjects that were not flagged.
abcd_data = abcd_data_w_issue_3[~abcd_data_w_issue_3.NARGUID.isin(issue_3_people)].copy()
print('n remaining:', abcd_data.NARGUID.nunique())

n affected: 197
p affect: 0.027243811367722307
n remaining: 7034


In [9]:
# Save the table with the flagged subjects removed; later sections reload this file.
abcd_data.to_csv('abcd_data/minimal_abcd_clean.csv', index=False)

In [43]:
# Show every row for one participant (ID hard-coded) for manual inspection.
abcd_data.query("NARGUID == '8VA4L6RD'")

Unnamed: 0,NARGUID,go_rt_adjusted,stop_rt_adjusted,trial_type,SSDDur,correct_go_response,correct_stop,SSD.RESP,Fix.RESP,StopSignal.RESP,Go.RESP,TrialNum,correct_stimulus_mapping_1,correct_stimulus_mapping_2,block,finger_press,correct_response,choice_accuracy
703440,8VA4L6RD,,,GoTrial,,0.0,,,,,,1,4.0,1.0,1,,4.0,
703441,8VA4L6RD,920.0,,GoTrial,,0.0,,,,,1.0,2,3.0,2.0,1,4.0,3.0,0.0
703442,8VA4L6RD,1445.0,,GoTrial,,1.0,,,2.0,,,3,3.0,2.0,1,3.0,3.0,1.0
703443,8VA4L6RD,,,StopTrial,50.0,,1.0,,,,,4,3.0,2.0,1,,3.0,
703444,8VA4L6RD,725.0,,GoTrial,,1.0,,,,,1.0,5,4.0,1.0,1,4.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703795,8VA4L6RD,,,GoTrial,,0.0,,,,,,356,4.0,1.0,2,,4.0,
703796,8VA4L6RD,,,GoTrial,,0.0,,,,,,357,3.0,2.0,2,,3.0,
703797,8VA4L6RD,,,GoTrial,,0.0,,,,,,358,4.0,1.0,2,,4.0,
703798,8VA4L6RD,,,StopTrial,900.0,,1.0,,,,,359,4.0,1.0,2,,4.0,


# __Metrics for Simulation__

## 1. Find mu_go and mu_stop for each individual using their mean Go RT and their SSRT

In [10]:
# Mapping passed to StopData as `var_dict`: "columns" names the dataset columns
# that fill each role, and "key_codes" gives the cell values used for trial
# types and response accuracy.
variable_dict = {
   "columns": {
      "ID": "NARGUID", # subject identifier
      "condition": "trial_type", # col with trial types
      "correct_response": "correct_response", # col with correct response codes
      "response": "finger_press", # col with actual response codes
      "SSD": "SSDDur", # col with stop signal delay
      "block": "block", # col with which block a trial is occurring during
      "goRT": "go_rt_adjusted", # col with go reaction time recording
      "stopRT": "stop_rt_adjusted", # col with stop failure reaction time recording
      "choice_accuracy": "choice_accuracy" # col with whether a response was correct
   },
   "key_codes": {
      "go": "GoTrial", # cell values for go trials
      "stop": "StopTrial", # cell values for stop trials
      "correct": 1.0,
       "incorrect": 0.0,
       "noResponse": np.nan
   }
}

In [51]:
# Preprocess the trial table with StopData using the column/key-code mapping.
abcd_ssrt = StopData(var_dict=variable_dict, compute_acc_col=False)
abcd_proc = abcd_ssrt.fit_transform(abcd_data)

# Fit the 'replacement' SSRT model on the preprocessed trials.
ssrt_model = SSRTmodel(model='replacement')
ssrt_metrics = ssrt_model.fit_transform(abcd_proc, level='group')

# Subjects whose SSRT came back null are recorded and then removed.
missing_ssrt = ssrt_metrics.SSRT.isnull()
problem_subs = ssrt_metrics[missing_ssrt].index

print(f'dropping {len(problem_subs)} subs for having P(respond|signal) == 1 or 0')

ssrt_metrics = ssrt_metrics[~missing_ssrt].copy()

dropping 3 subs for having P(respond|signal) == 1 or 0


In [52]:
# Display the IDs of the subjects dropped for having a null SSRT.
problem_subs

Index(['8VA4L6RD', 'AF517NF3', 'U1THG28C'], dtype='object', name='ID')

In [12]:
def get_mus(sub_row):
    """Solve for one subject's go and stop drift rates.

    Each rate ``mu`` is the single root of ``100/mu + 50 - observed = 0``,
    i.e. ``observed = threshold/mu + nondecision_time`` with threshold = 100
    and nondecision_time = 50. The observed value is the first entry of the
    ``mean_go_RT`` column (go) or the ``SSRT`` column (stop).

    Parameters
    ----------
    sub_row : object supporting ``sub_row[column].values[0]``
        The subject's metrics; must provide ``mean_go_RT`` and ``SSRT``.

    Returns
    -------
    dict
        ``{'go': float, 'stop': float}``.

    Raises
    ------
    AssertionError
        If either equation does not have exactly one solution.
    """
    metrics = sub_row.copy()

    def _solve_mu(symbol_name, column):
        # The equation must be handed to sympy in "expression = 0" form.
        mu = Symbol(symbol_name)
        roots = solve(100/mu + 50 - metrics[column].values[0], mu)
        assert len(roots) == 1, f"{len(roots)} solutions found based on {metrics[column]}: {roots}"
        return float(roots[0])

    return {
        'go': _solve_mu('g', 'mean_go_RT'),
        'stop': _solve_mu('s', 'SSRT'),
    }

# Run get_mus once per subject ID. get_mus returns a dict, so despite its name
# mu_df is presumably a Series of dicts indexed by ID -- confirm.
mu_df = ssrt_metrics.groupby('ID').apply(get_mus)
mu_dict = mu_df.to_dict()
# Store the IDs of the dropped subjects next to the per-subject entries.
mu_dict['prob_subs'] = list(problem_subs)

In [56]:
# Write mu_dict to disk as JSON.
json_mu_path = 'abcd_data/individual_mus.json'
with open(json_mu_path, 'w') as jp:
    json.dump(mu_dict, jp)

## 1. get SSD distributions per subject

In [15]:
# Reload the cleaned trial table written earlier in the notebook.
abcd_data = pd.read_csv('abcd_data/minimal_abcd_clean.csv')

In [16]:
# Per-subject relative frequency of each SSDDur value, as a flat table whose
# value column is named 'proportion'.
SSD_dist = (
    abcd_data.groupby('NARGUID')['SSDDur']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)

In [17]:
# Write the per-subject SSD proportions to CSV without the row index.
SSD_dist.to_csv('abcd_data/SSD_dist_by_subj.csv', index=False)

## 2. P(guess|SSD) for mixture distributions

In [18]:
abcd_data = pd.read_csv('abcd_data/minimal_abcd_clean.csv')
# Distinct SSD values in ascending order; `ssd == ssd` is False only for NaN,
# which removes the missing SSDDur entries.
SSDs = sorted(ssd for ssd in abcd_data.SSDDur.unique() if ssd == ssd)

In [19]:
# Mean choice accuracy per subject, one column per condition: trials with
# correct_stop == 0.0 at each SSD, plus go trials under the column label -1.
# The Series are collected in a list and concatenated once, rather than
# growing a DataFrame inside the loop.
acc_columns = []
for ssd in SSDs:
    # Select 'choice_accuracy' before aggregating so that non-numeric columns
    # (e.g. trial_type) never reach .mean().
    curr_means = abcd_data.query(
        "SSDDur == %s and correct_stop==0.0" % ssd
    ).groupby('NARGUID')['choice_accuracy'].mean()
    curr_means.name = ssd
    acc_columns.append(curr_means)

go_accs = abcd_data.query(
        "trial_type == 'GoTrial' and correct_go_response in [1.0, 0.0]"
    ).groupby('NARGUID')['choice_accuracy'].mean()
go_accs.name = -1
acc_columns.append(go_accs)

# `axis` is passed by keyword; sort=True orders the union of subject IDs.
acc_per_SSD = pd.concat(acc_columns, axis=1, sort=True)

In [20]:
# Mixture model: the mean stop-failure accuracy at each SSD is treated as
#   p * guess_mean + (1 - p) * go_mean
# and solved for p. guess_mean is the accuracy at SSD 0.0 and go_mean is the
# go-trial accuracy (column -1). p is not clipped to [0, 1].
p = Symbol('p')
# Across-subject mean accuracy per condition, computed once for all lookups.
mean_acc = acc_per_SSD.mean()
guess_mean = mean_acc[0.0]
go_mean = mean_acc[-1]
p_guess_per_SSD = {}
for ssd in SSDs:
    curr_mean = mean_acc[ssd]
    solution = solve(p*guess_mean + (1-p)*go_mean - curr_mean, p)
    assert len(solution) == 1, f"{len(solution)} solutions found for SSD {ssd}: {solution}"
    p_guess_per_SSD[ssd] = solution[0]
# Single-row table (row label 'p_guess') with one column per SSD.
p_guess_df = pd.DataFrame(p_guess_per_SSD, index=['p_guess'])
p_guess_df.to_csv('abcd_data/p_guess_per_ssd.csv', index=False)

In [21]:
# The single row of p_guess_df as plain Python floats, keyed by column label.
{ssd: float(p_val) for ssd, p_val in p_guess_df.iloc[0].items()}

{0.0: 1.0,
 50.0: 0.8453013488292931,
 100.0: 0.6106228257832641,
 150.0: 0.45284195418821765,
 200.0: 0.29414701140670046,
 250.0: 0.20099792299390123,
 300.0: 0.14331766365641904,
 350.0: 0.07989464664779473,
 400.0: 0.05646833343690329,
 450.0: 0.0271712158267143,
 500.0: 0.01912138134256117,
 550.0: -0.0013543997871993934,
 600.0: 0.005733602200019834,
 650.0: -0.0034938217706060315,
 700.0: 0.07486036991100674,
 750.0: 0.035672034056798016,
 800.0: 0.18566915102381398,
 850.0: 0.15647354045123857,
 900.0: 0.2691110670854924}

In [22]:
# Read the saved p_guess table back from disk.
p_guess_df2 = pd.read_csv('abcd_data/p_guess_per_ssd.csv')

In [23]:
# Display the table as it was read back from the CSV.
p_guess_df2

Unnamed: 0,0.0,50.0,100.0,150.0,200.0,250.0,300.0,350.0,400.0,450.0,500.0,550.0,600.0,650.0,700.0,750.0,800.0,850.0,900.0
0,1.0,0.845301,0.610623,0.452842,0.294147,0.200998,0.143318,0.079895,0.056468,0.027171,0.019121,-0.001354,0.005734,-0.003494,0.07486,0.035672,0.185669,0.156474,0.269111


In [24]:
# Build a float-keyed lookup from the single row, then read the entry for SSD 0.0.
p_guess_by_ssd = {float(ssd): float(p_val) for ssd, p_val in p_guess_df.iloc[0].items()}
p_guess_by_ssd[0.0]

1.0

## 3. Inhibition function (p(respond|SSD))

In [25]:
def get_p_resp_per_SSD(data, SSDs):
    """Compute the proportion of trials with ``correct_stop == 0.0`` at each SSD.

    Parameters
    ----------
    data : pandas.DataFrame
        Trial table with ``SSDDur`` and ``correct_stop`` columns.
    SSDs : iterable
        SSD values to evaluate; each becomes a key of the result.

    Returns
    -------
    dict
        Maps each SSD to the fraction of its trials where ``correct_stop`` is
        0.0, or to NaN when ``data`` has no trials at that SSD.
    """
    trials = data.copy()

    def _p_respond(ssd):
        at_ssd = trials.query("SSDDur == %s" % ssd)
        n_trials = len(at_ssd)
        if n_trials == 0:
            # No trials at this delay: the proportion is undefined.
            return np.nan
        n_responded = len(at_ssd.query("correct_stop == 0.0"))
        return n_responded / n_trials

    return {ssd: _p_respond(ssd) for ssd in SSDs}

In [26]:
abcd_data = pd.read_csv('abcd_data/minimal_abcd_clean.csv')
# Distinct non-NaN SSD values, in order of first appearance (not sorted here).
SSDs = [ssd for ssd in abcd_data.SSDDur.unique() if ssd == ssd]
# One dict of per-SSD proportions per subject, expanded to one column per SSD.
trials_by_subject = abcd_data.groupby('NARGUID')
ssd_resp_dict = trials_by_subject.apply(get_p_resp_per_SSD, SSDs)
ssd_resp_df = ssd_resp_dict.apply(pd.Series)

In [27]:
# Reshape to long format: one row per (subject, SSD) pair with its p_respond.
ssd_resp_melt = pd.melt(
    ssd_resp_df.reset_index(),
    id_vars='NARGUID',
    value_vars=ssd_resp_df.columns,
    var_name='SSD',
    value_name='p_respond',
)
ssd_resp_melt['underlying distribution'] = 'ABCD data'
ssd_resp_melt.to_csv('abcd_data/abcd_inhib_func_per_sub.csv', index=False)

In [28]:
# Across-subject mean of p_respond at each SSD, as a two-column table
# ('SSD', 'p_respond') tagged with its source label.
abcd_inhib_func = (
    ssd_resp_df.mean()
    .rename('p_respond')
    .rename_axis('SSD')
    .reset_index()
)
abcd_inhib_func['underlying distribution'] = 'ABCD data'

In [29]:
# Write the across-subject inhibition function to CSV without the row index.
abcd_inhib_func.to_csv('abcd_data/abcd_inhib_func.csv', index=False)

## 4. build run_all_sims.sh

In [30]:
# Reload the cleaned trial table written earlier in the notebook.
abcd_data = pd.read_csv('abcd_data/minimal_abcd_clean.csv')

In [31]:
# Distinct subject IDs, in order of first appearance in the table.
narguids = abcd_data.NARGUID.unique()

In [32]:
# Number of subject IDs passed to each simulate_individuals.py invocation.
nsubs_per_job = 48

# One command line per batch of subjects. Slicing past the end of the array
# simply yields the shorter final batch, so no explicit clamp is needed.
with open('run_all_sims.sh', 'w') as f:
    for batch_start in range(0, len(narguids), nsubs_per_job):
        batch_ids = narguids[batch_start:batch_start + nsubs_per_job]
        substr = ' '.join(batch_ids)
        f.write(f'python simulate_individuals.py --subjects {substr}\n')

In [33]:
def file_len(fname):
    """Return the number of lines in the text file at ``fname``.

    An empty file yields 0. A final line without a trailing newline still
    counts as a line.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to read.

    Returns
    -------
    int
        Number of lines in the file.
    """
    with open(fname) as f:
        # Counting with a generator avoids relying on a loop variable that is
        # never bound when the file has no lines.
        return sum(1 for _ in f)
# Count the lines in the generated job script.
file_len('run_all_sims.sh')

147