In [1]:
import numpy as np
import pandas as pd
from sympy.solvers import solve
from sympy import Symbol
import scipy.stats as sstats

from utils import SimulateData

# __Preprocessing__

## 1. Add choice accuracy column

In [2]:
# Load the raw data CSV that the preprocessing cells below operate on.
abcd_data = pd.read_csv('abcd_data/minimal_abcd_raw.csv')

In [3]:
# Score each row's choice: 1 when the pressed finger equals the correct
# response, 0 when it does not, and NaN when no finger press was recorded.
made_response = abcd_data['finger_press'].notnull()
pressed_correct = abcd_data['finger_press'] == abcd_data['correct_response']
abcd_data['choice_accuracy'] = np.where(
    made_response,
    pressed_correct.astype(int),
    np.nan,
)

In [4]:
# Save the frame (now including `choice_accuracy`) before any participants are excluded.
abcd_data.to_csv('abcd_data/minimal_abcd_with_issue_3.csv', index=False)

## 2. Drop Issue 3 people

In [5]:
# Reload the saved file that still contains the Issue 3 participants.
abcd_data_w_issue_3 = pd.read_csv('abcd_data/minimal_abcd_with_issue_3.csv')

In [6]:
# A participant is flagged for Issue 3 if they have at least one row with
# SSDDur == 50 and 0 < stop_rt_adjusted < 50.
is_issue_3_row = (
    (abcd_data_w_issue_3['SSDDur'] == 50)
    & (abcd_data_w_issue_3['stop_rt_adjusted'] > 0)
    & (abcd_data_w_issue_3['stop_rt_adjusted'] < 50)
)
issue_3_people = abcd_data_w_issue_3.loc[is_issue_3_row, 'NARGUID'].unique()

# Derive the denominator from the data instead of hard-coding the sample size,
# so the reported proportion stays correct if the input file changes.
n_total = abcd_data_w_issue_3['NARGUID'].nunique()

print('n affected:', len(issue_3_people))
print('p affect:', len(issue_3_people)/ n_total)

# Keep only rows belonging to unaffected participants; .copy() detaches the
# result from the source frame.
abcd_data = abcd_data_w_issue_3[~abcd_data_w_issue_3.NARGUID.isin(issue_3_people)].copy()
print('n remaining:', abcd_data.NARGUID.nunique())

n affected: 197
p affect: 0.027243811367722307
n remaining: 7034


In [7]:
# Save the data with the Issue 3 participants removed.
abcd_data.to_csv('abcd_data/minimal_abcd_clean.csv', index=False)

# __Metrics for Simulation__

## 1. Get SSD distributions per subject

In [8]:
# Load the cleaned data (Issue 3 participants already removed).
abcd_data = pd.read_csv('abcd_data/minimal_abcd_clean.csv')

In [9]:
# For each subject, the share of rows at each SSDDur value, as a long-format
# frame with the share stored in a `proportion` column.
SSD_dist = (
    abcd_data
    .groupby('NARGUID')['SSDDur']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)

In [10]:
# Save the per-subject SSD proportions.
SSD_dist.to_csv('abcd_data/SSD_dist_by_subj.csv', index=False)

## 2. P(guess|SSD) for mixture distributions

In [11]:
abcd_data = pd.read_csv('abcd_data/minimal_abcd_clean.csv')
# Distinct SSD values in ascending order, with missing values (NaN) excluded.
SSDs = sorted(abcd_data.SSDDur.dropna().unique())

In [18]:
# Per-subject mean choice accuracy on stop trials where the subject responded
# (correct_stop == 0), one Series per SSD. The Series are collected in a list
# and concatenated once at the end instead of growing a DataFrame in the loop.
per_ssd_means = []
for ssd in SSDs:
    # Select the column before aggregating so only `choice_accuracy` is
    # averaged (averaging every column fails on non-numeric columns in
    # recent pandas and does unnecessary work).
    curr_means = (
        abcd_data
        .query("SSDDur == %s and correct_stop==0.0" % ssd)
        .groupby('NARGUID')['choice_accuracy']
        .mean()
    )
    curr_means.name = ssd
    per_ssd_means.append(curr_means)

# Per-subject mean choice accuracy on go trials with a recorded go outcome;
# stored under the sentinel column label -1.
go_accs = (
    abcd_data
    .query("trial_type == 'GoTrial' and correct_go_response in [1.0, 0.0]")
    .groupby('NARGUID')['choice_accuracy']
    .mean()
)
go_accs.name = -1

# `axis` must be passed by keyword: positional use is deprecated in pandas 1.x
# and rejected in pandas 2.x.
acc_per_SSD = pd.concat(per_ssd_means + [go_accs], axis=1, sort=True)

In [43]:
# For each SSD, solve for the mixture weight p such that
#     p * guess_mean + (1 - p) * go_mean == mean accuracy at that SSD,
# where guess_mean is the mean accuracy in the SSD == 0 column and go_mean is
# the mean accuracy in the go-trial column (label -1).
p = Symbol('p')

# The column means do not change inside the loop, so compute them once.
mean_acc = acc_per_SSD.mean()
# Use .loc so the keys are always treated as column labels, never positions.
guess_mean = mean_acc.loc[0.0]
go_mean = mean_acc.loc[-1]

p_guess_per_SSD = {}
for ssd in SSDs:
    curr_mean = mean_acc.loc[ssd]
    solution = solve(p*guess_mean + (1-p)*go_mean - curr_mean, p)
    # The equation is linear in p, so exactly one root is expected; anything
    # else means guess_mean and go_mean coincide.
    assert len(solution) == 1
    # Store a plain float rather than a sympy object so the frame has a
    # numeric dtype.
    p_guess_per_SSD[ssd] = float(solution[0])

# One row labelled 'p_guess', one column per SSD. The estimates are not
# clipped, so values slightly outside [0, 1] can occur.
p_guess_df = pd.DataFrame(p_guess_per_SSD, index=['p_guess'])
p_guess_df.to_csv('abcd_data/p_guess_per_ssd.csv', index=False)

In [42]:
# Display the column labels of p_guess_df (the SSD values) as floats.
list(p_guess_df.columns.astype(float))

[0.0,
 50.0,
 100.0,
 150.0,
 200.0,
 250.0,
 300.0,
 350.0,
 400.0,
 450.0,
 500.0,
 550.0,
 600.0,
 650.0,
 700.0,
 750.0,
 800.0,
 850.0,
 900.0]

In [41]:
# Display the single row of p_guess_df (the p_guess estimates) as floats,
# in the same order as the columns.
list(p_guess_df.values.astype(float)[0])


[1.0,
 0.8453013488292931,
 0.6106228257832641,
 0.45284195418821765,
 0.29414701140670046,
 0.20099792299390123,
 0.14331766365641904,
 0.07989464664779473,
 0.05646833343690329,
 0.0271712158267143,
 0.01912138134256117,
 -0.0013543997871993934,
 0.005733602200019834,
 -0.0034938217706060315,
 0.07486036991100674,
 0.035672034056798016,
 0.18566915102381398,
 0.15647354045123857,
 0.2691110670854924]

## 3. Inhibition function (p(respond|SSD))

In [23]:
def get_p_resp_per_SSD(data, SSDs):
    """Compute the proportion of rows with ``correct_stop == 0`` at each SSD.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``SSDDur`` and ``correct_stop``.
    SSDs : iterable
        SSD values to evaluate; each becomes a key of the result.

    Returns
    -------
    dict
        Maps each SSD to (rows with ``correct_stop == 0``) / (all rows at
        that SSD), or to NaN when ``data`` has no rows at that SSD.
    """
    p_respond = {}
    for delay in SSDs:
        at_delay = data[data['SSDDur'] == delay]
        n_rows = len(at_delay)
        if n_rows:
            n_responded = int((at_delay['correct_stop'] == 0.0).sum())
            p_respond[delay] = n_responded / n_rows
        else:
            # No rows at this delay: proportion is undefined.
            p_respond[delay] = np.nan
    return p_respond

In [24]:
abcd_data = pd.read_csv('abcd_data/minimal_abcd_clean.csv')
# Distinct non-missing SSD values, in order of first appearance.
SSDs = list(abcd_data.SSDDur.dropna().unique())
# One {SSD: p(respond)} dict per subject ...
ssd_resp_dict = abcd_data.groupby('NARGUID').apply(
    lambda subj_rows: get_p_resp_per_SSD(subj_rows, SSDs)
)
# ... expanded into a subjects x SSDs frame.
ssd_resp_df = ssd_resp_dict.apply(pd.Series)

In [25]:
# Average p(respond) across subjects at each SSD and reshape into a long
# frame with columns `SSD` and `p_respond`, tagged with the data source.
abcd_inhib_func = (
    ssd_resp_df.mean()
    .rename('p_respond')
    .rename_axis('SSD')
    .reset_index()
)
abcd_inhib_func['underlying distribution'] = 'ABCD data'

In [26]:
# Save the subject-averaged inhibition function.
abcd_inhib_func.to_csv('abcd_data/abcd_inhib_func.csv', index=False)