In [1]:
%matplotlib inline

# scientific computing and plotting
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns

# HDDM related packages
import pymc as pm
import hddm
import kabuki
import arviz as az
# Report the versions of the modelling stack so the results can be tied to an environment.
version_report = (
    ("The current HDDM version is: ", hddm),
    ("The current kabuki version is: ", kabuki),
    ("The current PyMC version is: ", pm),
    ("The current ArviZ version is: ", az),
)
for message, package in version_report:
    print(message, package.__version__)
from kabuki.analyze import check_geweke
from kabuki.analyze import gelman_rubin

# miscellaneous
from IPython.display import clear_output

The current HDDM version is:  1.0.1RC
The current kabuki version is:  0.6.5RC4
The current PyMC version is:  2.3.8
The current ArviZ version is:  0.15.1


# Load & Manipulate Data

In [7]:
# NOT RUN: one-off data preparation -- concatenate the per-site, per-study files and store the result in a new file

# Make sure the directory containing the data has been downloaded to the local machine first;
# otherwise reading the files fails with the error "resource deadlock avoided".

# data_UIC_CAR = hddm.load_csv("../all_cleaned/CAR_UIC_nback_cleaned.csv")
# data_USC_CAR = hddm.load_csv("../all_cleaned/CAR_USC_nback_cleaned.csv")
# data_UIC_PARC = hddm.load_csv("../all_cleaned/PARC_UIC_nback_cleaned.csv")
# data_USC_PARC = hddm.load_csv("../all_cleaned/PARC_USC_nback_cleaned.csv")

# data_UIC_CAR["school"] = "UIC"
# data_USC_CAR["school"] = "USC"
# data_UIC_PARC["school"] = "UIC"
# data_USC_PARC["school"] = "USC"

# data_CAR = pd.concat((data_UIC_CAR, data_USC_CAR), axis=0)
# MDD = pd.read_csv("../MDD_all.csv")
# MDD = MDD.rename(columns={"ID": "subject_id", "MDDnew": "group"})
# MDD.loc[MDD.group == 0, 'group'] = "HC"
# MDD.loc[MDD.group == 1, 'group'] = "rMDD"
# data_CAR = pd.merge(data_CAR, MDD, on = "subject_id")

# data_PARC = pd.concat((data_UIC_PARC, data_USC_PARC), axis=0)
# data_PARC["group"] = "SI"
# data = pd.concat((data_CAR, data_PARC), axis=0)

# data.to_csv("../all_cleaned/nback_all.csv", index=False)

In [9]:
# read the combined n-back trial file
nback_csv_path = "../all_cleaned/nback_all.csv"
data = hddm.load_csv(nback_csv_path)

In [8]:
# keep only rows where neither the trial-level nor the participant-level exclusion flag is set;
# reset_index() moves the previous row labels into an "index" column
keep_row = data.exclude_trial.eq(0) & data.exclude_part.eq(0)
data_clean = data.loc[keep_row, :].reset_index()

In [28]:
# summarise how much data survives the exclusion step
n_trials_before = len(data)
n_trials_after = len(data_clean)

# NOTE: "trials per person" is the trial count of the first participant in data_clean only;
# other participants may have a different number of trials after exclusion.
first_subject = data_clean.subject_id[0]
n_trials_first_subject = len(data_clean.loc[data_clean.subject_id == first_subject])

n_subjects_before = len(data.subject_id.unique())
n_subjects_after = len(data_clean.subject_id.unique())

print("Number of trials, before exclusion: ", n_trials_before)
print("Number of trials, after exclusion: ", n_trials_after)
print("Number of trials per person: ", n_trials_first_subject)
print("Number of variables: ", len(data_clean.columns))
print("Number of participants, before exclusion: ", n_subjects_before)
print("Number of participants, after exclusion: ", n_subjects_after)

Number of trials, before exclusion:  15630
Number of trials, after exclusion:  15398
Number of trials per person:  118
Number of variables:  16
Number of participants, before exclusion:  131
Number of participants, after exclusion:  131


In [29]:
# map the task's column names onto the names used by the HDDM model convention
# NOTE(review): the data_clean preview shows no "trial_type" column, so "trial_congruent_fac"
# appears to be absent from the file; rename() skips missing keys without raising -- confirm.
hddm_column_names = {
    "resp_rt": "rt",
    "resp_value": "response",
    "trial_congruent_fac": "trial_type",
    "subject_id": "subj_idx",
}
data_clean = data_clean.rename(columns=hddm_column_names)

In [31]:
# Restore the row labels that reset_index() stored in the "index" column.
# The guard keeps the cell re-runnable: once "index" has become the index the column is gone,
# and an unguarded set_index('index') would raise KeyError on a second execution.
if 'index' in data_clean.columns:
    data_clean = data_clean.set_index('index')

In [32]:
# preview the first five rows after renaming and re-indexing
data_clean.head(n=5)

Unnamed: 0_level_0,subj_idx,block_name,trial_location,trial_similarity,trial_corr_resp,resp_corr,rt,response,mean_rt,sd_rt,exclude_thresh,exclude_trial,exclude_part,school,group
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,1903,nb1TestingBlock,11,0,left,1,0.663306,left,0.773041,0.177657,3,0,0,UIC,rMDD
1,1903,nb1TestingBlock,7,4,right,1,0.606543,right,0.773041,0.177657,3,0,0,UIC,rMDD
2,1903,nb1TestingBlock,8,1,right,1,1.037526,right,0.773041,0.177657,3,0,0,UIC,rMDD
3,1903,nb1TestingBlock,12,4,right,1,0.682756,right,0.773041,0.177657,3,0,0,UIC,rMDD
4,1903,nb1TestingBlock,12,0,left,1,0.682443,left,0.773041,0.177657,3,0,0,UIC,rMDD


# Variable Coding

In [33]:
# Flip the RTs of incorrect trials (resp_corr == 0) in the negative direction.
# -abs() is used instead of multiplying by -1 so the cell is idempotent: re-running it
# leaves already-negative error RTs negative rather than flipping them back to positive.
# On a first run with non-negative RTs the result is the same as multiplying by -1.
is_error = data_clean.resp_corr == 0
data_clean.loc[is_error, 'rt'] = -data_clean.loc[is_error, 'rt'].abs()

In [34]:
# inspect the block labels
data_clean["block_name"]

index
0       nb1TestingBlock
1       nb1TestingBlock
2       nb1TestingBlock
3       nb1TestingBlock
4       nb1TestingBlock
             ...       
1675    nb2TestingBlock
1676    nb2TestingBlock
1677    nb2TestingBlock
1678    nb2TestingBlock
1679    nb2TestingBlock
Name: block_name, Length: 15398, dtype: object