In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
derivs_dir = os.path.join('..','derivatives')
scales_dir = os.path.join(derivs_dir,'qualtrics','2.subscaled')
output_dir = os.path.join(derivs_dir,'05.subject-level')

In [3]:
try:os.mkdir(output_dir)
except WindowsError as e: print(e)

[WinError 183] Cannot create a file when that file already exists: '..\\derivatives\\05.subject-level'


# Import data

In [4]:
fname=os.path.join(derivs_dir,'sub-all_task-all_VALUES.xlsx')
behav_data = pd.read_excel(fname).rename(columns={'subjnum':'ssid'})

In [5]:
fname=os.path.join(scales_dir,'all_subscales.csv')
scale_data = pd.read_csv(fname)

In [6]:
behav_data.head()

Unnamed: 0,study,ssid,block,trial,domain,binary-domain,estimation,trueprob,val-estdiff,val-estdiff-valid,...,facemem-split-123,facemem-median-123,choice-split-12,outcome-split-12,choice-split-123,outcome-split-123,choice-median-12,outcome-median-12,choice-median-123,outcome-median-123
0,1,100,1,1,LOSS,-1,50,0.3,0.2000001,0.2000001,...,-1.0,2,1.0,1.0,1.0,1.0,2.899986,2.378043,2.798163,2.296676
1,1,100,1,2,LOSS,-1,30,0.155172,0.1448277,0.1448277,...,1.0,2,,1.0,,1.0,,,2.798163,2.296676
2,1,100,1,3,LOSS,-1,30,0.3,1.038193e-07,1.038193e-07,...,,2,1.0,1.0,1.0,1.0,,,2.798163,2.296676
3,1,100,1,4,LOSS,-1,40,0.5,-0.1,-0.1,...,,2,1.0,1.0,1.0,1.0,,,2.798163,2.296676
4,1,100,1,5,LOSS,-1,60,0.7,-0.1000001,-0.1000001,...,-1.0,2,-1.0,1.0,-1.0,1.0,,,2.798163,2.296676


In [7]:
scale_data.head()

Unnamed: 0,ssid,ZIP,Relative Sleep,Relative Stress,Financial Difficulty,PAS,NAS,BAS Drive,BAS Fun Seeking,BAS Reward Responsiveness,BIS,Intuitive DMS,Rational DMS,Dependent DMS,Spontaneous DMS,Avoidant DMS,Financial Literacy
0,201,233,-1.0,1.0,1,32.0,17.0,12.0,8.0,20.0,20.0,18.0,22.0,21.0,11.0,16.0,1.0
1,202,34109,0.0,-1.0,0,39.0,10.0,13.0,15.0,20.0,16.0,24.0,25.0,8.0,15.0,9.0,1.0
2,203,33071,-1.0,0.0,0,33.0,13.0,13.0,13.0,20.0,25.0,18.0,20.0,25.0,15.0,23.0,1.0
3,146,32828,0.0,0.0,0,36.0,15.0,12.0,11.0,17.0,21.0,16.0,22.0,17.0,7.0,13.0,0.0
4,205,33027,-1.0,-1.0,1,26.0,25.0,14.0,12.0,19.0,24.0,23.0,23.0,22.0,14.0,14.0,2.0


# Normalize PANAS and BISBAS subscales

In [8]:
keys_to_normalize = [
    'PAS','NAS','BIS','BAS_fs','BAS_rr','BAS_dr','DMS_i','DMS_r','DMS_d','DMS_s','DMS_a',
]

In [9]:
df = scale_data.copy(deep=True)
df.columns=[
    'ssid','zip','sleep','stress','fin_dif','PAS','NAS','BAS_dr','BAS_fs','BAS_rr','BIS',
    'DMS_i','DMS_r','DMS_d','DMS_s','DMS_a','fin_lit'
]

In [10]:
df['study'] = df['ssid'].astype(str).str[0]

In [11]:
def normalize(row):
    study = row['study']
    val = row[key]
    group_mean = df.groupby('study').mean().loc[study,key]
    group_std = df.groupby('study').std().loc[study,key]
    zval = (val - group_mean) / group_std
    return zval

In [12]:
zkeys=[]
for key in keys_to_normalize:
    zkey = 'z_'+ key
    zkeys.append(zkey)
    df[zkey] = df.apply(normalize,axis=1)

In [13]:
df.head()

Unnamed: 0,ssid,zip,sleep,stress,fin_dif,PAS,NAS,BAS_dr,BAS_fs,BAS_rr,...,z_NAS,z_BIS,z_BAS_fs,z_BAS_rr,z_BAS_dr,z_DMS_i,z_DMS_r,z_DMS_d,z_DMS_s,z_DMS_a
0,201,233,-1.0,1.0,1,32.0,17.0,12.0,8.0,20.0,...,0.719199,-0.328343,-1.917315,0.954217,0.208641,-0.033542,0.434326,0.537998,-0.586269,0.494748
1,202,34109,0.0,-1.0,0,39.0,10.0,13.0,15.0,20.0,...,-0.830939,-1.429557,1.24916,0.954217,0.654561,1.653886,1.494209,-2.719881,0.396859,-0.839148
2,203,33071,-1.0,0.0,0,33.0,13.0,13.0,13.0,20.0,...,-0.166594,1.048173,0.344453,0.954217,0.654561,-0.033542,-0.272264,1.540423,0.396859,1.828644
3,146,32828,0.0,0.0,0,36.0,15.0,12.0,11.0,17.0,...,0.778937,0.103462,-0.924831,-0.601536,-0.004402,-0.521983,0.606033,-0.49336,-1.632707,-0.128144
4,205,33027,-1.0,-1.0,1,26.0,25.0,14.0,12.0,19.0,...,2.490786,0.77287,-0.107901,0.316121,1.100481,1.372648,0.78762,0.788605,0.151077,0.113635


In [14]:
columns = zkeys + ['ssid']
norms = df[columns]
norms.head()

Unnamed: 0,z_PAS,z_NAS,z_BIS,z_BAS_fs,z_BAS_rr,z_BAS_dr,z_DMS_i,z_DMS_r,z_DMS_d,z_DMS_s,z_DMS_a,ssid
0,-0.060217,0.719199,-0.328343,-1.917315,0.954217,0.208641,-0.033542,0.434326,0.537998,-0.586269,0.494748,201
1,0.87745,-0.830939,-1.429557,1.24916,0.954217,0.654561,1.653886,1.494209,-2.719881,0.396859,-0.839148,202
2,0.073735,-0.166594,1.048173,0.344453,0.954217,0.654561,-0.033542,-0.272264,1.540423,0.396859,1.828644,203
3,0.64158,0.778937,0.103462,-0.924831,-0.601536,-0.004402,-0.521983,0.606033,-0.49336,-1.632707,-0.128144,146
4,-0.863932,2.490786,0.77287,-0.107901,0.316121,1.100481,1.372648,0.78762,0.788605,0.151077,0.113635,205


### Natural Logarithm of subscales

Natural log of negative values is undefined. Ask KF why we want these and what to do?

` log(abs(x))*-1 ` ?

In [15]:
logs = np.log(norms)
logs.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,z_PAS,z_NAS,z_BIS,z_BAS_fs,z_BAS_rr,z_BAS_dr,z_DMS_i,z_DMS_r,z_DMS_d,z_DMS_s,z_DMS_a,ssid
0,,-0.329617,,,-0.046864,-1.567138,,-0.833961,-0.6199,,-0.703707,5.303305
1,-0.130735,,,0.222471,-0.046864,-0.42379,0.503128,0.401597,,-0.924173,,5.308268
2,-2.607274,,0.047049,-1.065799,-0.046864,-0.42379,,,0.432057,-0.924173,0.603575,5.313206
3,-0.443821,-0.249825,-2.26855,,,,,-0.50082,,,,4.983607
4,,0.912598,-0.257645,,-1.15163,0.095748,0.316742,-0.238739,-0.23749,-1.889965,-2.174767,5.32301


# 3sd trial exclusions

In [16]:
behav_data.columns

Index(['study', 'ssid', 'block', 'trial', 'domain', 'binary-domain',
       'estimation', 'trueprob', 'val-estdiff', 'val-estdiff-valid',
       'val-estdiff-valid-mc-12', 'val-estdiff-valid-mc-123', 'abs-estdiff',
       'abs-estdiff-valid', 'abs-estdiffvalid-mc-12',
       'abs-estdiff-valid-mc-123', 'choicert', 'choicert-3sd-12',
       'choicert-3sd-123', 'choicert-mc-12', 'choicert-mc-123', 'esttaskrt',
       'esttaskrt-3sd-12', 'esttaskrt-3sd-123', 'outcomert',
       'outcomert-3sd-12', 'outcomert-3sd-123', 'outcomert-mc-12',
       'outcomert-mc-123', 'stockchosen', 'waschoiceoptimal',
       'optimalchoicewas', 'stockvalue', 'abs-stockvalue',
       'bin-abs-stockvalue', 'bin-abs-stockvalue-mc-12',
       'bin-abs-stockvalue-mc-123', 'b4choiceprob', 'stockpic', 'bondpic',
       'facepic', 'bondmem', 'stockmem', 'facemem', 'facemem-clean-reverse',
       'facemem-mc-123', 'facemem-split-123', 'facemem-median-123',
       'choice-split-12', 'outcome-split-12', 'choice-split-12

In [17]:
df=behav_data[['ssid','block','trial','domain','estimation','trueprob','val-estdiff-valid']].rename(
    columns={'val-estdiff-valid':'val_estdiff_valid'}
)

In [18]:
subj_3sd = df.groupby('ssid').std()['val_estdiff_valid'] * 3
subj_3sd.head()

ssid
100    0.548469
101    0.262039
102    0.358415
103    0.620042
104    0.809025
Name: val_estdiff_valid, dtype: float64

In [19]:
subj_means = df.groupby('ssid').mean()['val_estdiff_valid']
subj_means.head()

ssid
100    0.046724
101   -0.014427
102    0.014467
103    0.036173
104    0.070082
Name: val_estdiff_valid, dtype: float64

In [20]:
def exclude_3sd(row):
    ssid,val = row['ssid'],row['val_estdiff_valid']
    mean,bound = subj_means[ssid],subj_3sd[ssid]
    diff = abs(val - mean)
    if diff < bound:
        return(val)
    else: return(np.nan)

In [21]:
df['valError_3sd'] = df.apply(exclude_3sd,axis=1)
df.head()

Unnamed: 0,ssid,block,trial,domain,estimation,trueprob,val_estdiff_valid,valError_3sd
0,100,1,1,LOSS,50,0.3,0.2000001,0.2000001
1,100,1,2,LOSS,30,0.155172,0.1448277,0.1448277
2,100,1,3,LOSS,30,0.3,1.038193e-07,1.038193e-07
3,100,1,4,LOSS,40,0.5,-0.1,-0.1
4,100,1,5,LOSS,60,0.7,-0.1000001,-0.1000001


In [22]:
subj_means = df.groupby('ssid').mean()['valError_3sd']
subj_means = pd.DataFrame(subj_means).reset_index()
subj_means.head()

Unnamed: 0,ssid,valError_3sd
0,100,0.046724
1,101,-0.014427
2,102,0.012664
3,103,0.036173
4,104,0.070082


# gender-judgment trial exclusions

# subject-level means

In [23]:
domain_means = df.groupby(['ssid','domain']).mean().reset_index()
domain_means.head()

Unnamed: 0,ssid,domain,block,trial,trueprob,val_estdiff_valid,valError_3sd
0,100,GAIN,6.666667,37.5,0.333548,0.102563,0.102563
1,100,LOSS,6.333333,35.5,0.473004,-0.009115,-0.009115
2,101,GAIN,7.5,42.5,0.552187,0.003091,0.003091
3,101,LOSS,5.5,30.5,0.5,-0.031944,-0.031944
4,102,GAIN,8.0,45.5,0.420043,0.034123,0.034123


In [24]:
gain_ave_val_error = domain_means[domain_means['domain'] == 'GAIN'][['ssid','valError_3sd']]
gain_ave_val_error = gain_ave_val_error.set_index('ssid')

In [25]:
loss_ave_val_error = domain_means[domain_means['domain'] == 'LOSS'][['ssid','valError_3sd']]
loss_ave_val_error = loss_ave_val_error.set_index('ssid')

### Framing Normalization
We want to normalize for the way the value estimation question is framed.

We're going to multiply valError means by `1` for subjects who were estimating the probability that the stock is *good*, and multiply means by `-1` for subjects who were estimating the probability that the stock is *bad*.

100s: `*  1`

200s: `* -1`

300s: `*  1`

In [26]:
def normalize_frame(row):
    ssid,val = row.name,row['valError_3sd']
    ssid = str(ssid)
    if ssid.startswith('2'):
        val = val * -1
    return(val)

In [27]:
means_df = df.groupby('ssid').mean()[['valError_3sd']]
means_df[85:91]

Unnamed: 0_level_0,valError_3sd
ssid,Unnamed: 1_level_1
188,-0.015131
190,0.011036
191,0.031606
202,-0.026935
203,0.015476
204,0.022662


In [28]:
nf_valerror = pd.DataFrame(means_df.apply(normalize_frame,axis=1))
nf_valerror = nf_valerror.rename(columns={0:'nf_valError'})
nf_valerror[85:91]

Unnamed: 0_level_0,nf_valError
ssid,Unnamed: 1_level_1
188,-0.015131
190,0.011036
191,0.031606
202,0.026935
203,-0.015476
204,-0.022662


In [29]:
gain_ave_val_error[85:91]

Unnamed: 0_level_0,valError_3sd
ssid,Unnamed: 1_level_1
188,-0.005284
190,0.06981
191,0.018383
202,-0.022684
203,-0.02562
204,0.003329


In [30]:
nf_valerr_gain = pd.DataFrame(gain_ave_val_error.apply(normalize_frame,axis=1))
nf_valerr_gain = nf_valerr_gain.rename(columns={0:'nf_gainValError'})
nf_valerr_gain[85:91]

Unnamed: 0_level_0,nf_gainValError
ssid,Unnamed: 1_level_1
188,-0.005284
190,0.06981
191,0.018383
202,0.022684
203,0.02562
204,-0.003329


In [31]:
loss_ave_val_error[85:91]

Unnamed: 0_level_0,valError_3sd
ssid,Unnamed: 1_level_1
188,-0.024704
190,-0.047737
191,0.044094
202,-0.031186
203,0.056573
204,0.041995


In [32]:
nf_valerr_loss = pd.DataFrame(loss_ave_val_error.apply(normalize_frame,axis=1))
nf_valerr_loss = nf_valerr_loss.rename(columns={0:'nf_lossValError'})
nf_valerr_loss[85:91]

Unnamed: 0_level_0,nf_lossValError
ssid,Unnamed: 1_level_1
188,-0.024704
190,-0.047737
191,0.044094
202,0.031186
203,-0.056573
204,-0.041995


# output

In [33]:
output = pd.DataFrame({
    'ssid':list(subj_means['ssid']),
    'valError':list(subj_means['valError_3sd']),
})
output.head()

Unnamed: 0,ssid,valError
0,100,0.046724
1,101,-0.014427
2,102,0.012664
3,103,0.036173
4,104,0.070082


In [34]:
output = output.merge(gain_ave_val_error.rename(columns={'valError_3sd':'gainValError'}).reset_index())
output.head()

Unnamed: 0,ssid,valError,gainValError
0,100,0.046724,0.102563
1,101,-0.014427,0.003091
2,102,0.012664,0.034123
3,103,0.036173,0.097953
4,104,0.070082,0.133293


In [35]:
output = output.merge(loss_ave_val_error.rename(columns={'valError_3sd':'lossValError'}).reset_index())
output.head()

Unnamed: 0,ssid,valError,gainValError,lossValError
0,100,0.046724,0.102563,-0.009115
1,101,-0.014427,0.003091,-0.031944
2,102,0.012664,0.034123,-0.010058
3,103,0.036173,0.097953,-0.025607
4,104,0.070082,0.133293,0.006871


In [36]:
output = output.merge(nf_valerror.reset_index())
output = output.merge(nf_valerr_gain.reset_index())
output = output.merge(nf_valerr_loss.reset_index())
output[85:91]

Unnamed: 0,ssid,valError,gainValError,lossValError,nf_valError,nf_gainValError,nf_lossValError
85,188,-0.015131,-0.005284,-0.024704,-0.015131,-0.005284,-0.024704
86,190,0.011036,0.06981,-0.047737,0.011036,0.06981,-0.047737
87,191,0.031606,0.018383,0.044094,0.031606,0.018383,0.044094
88,202,-0.026935,-0.022684,-0.031186,0.026935,0.022684,0.031186
89,203,0.015476,-0.02562,0.056573,-0.015476,0.02562,-0.056573
90,204,0.022662,0.003329,0.041995,-0.022662,-0.003329,-0.041995


In [37]:
output['valWedge'] = abs(output['gainValError'] - output['lossValError'])
output['nf_valWedge'] = abs(output['nf_gainValError'] - output['nf_lossValError'])
output.head()

Unnamed: 0,ssid,valError,gainValError,lossValError,nf_valError,nf_gainValError,nf_lossValError,valWedge,nf_valWedge
0,100,0.046724,0.102563,-0.009115,0.046724,0.102563,-0.009115,0.111678,0.111678
1,101,-0.014427,0.003091,-0.031944,-0.014427,0.003091,-0.031944,0.035036,0.035036
2,102,0.012664,0.034123,-0.010058,0.012664,0.034123,-0.010058,0.044181,0.044181
3,103,0.036173,0.097953,-0.025607,0.036173,0.097953,-0.025607,0.12356,0.12356
4,104,0.070082,0.133293,0.006871,0.070082,0.133293,0.006871,0.126422,0.126422


In [38]:
output = output.merge(norms)
output.head()

Unnamed: 0,ssid,valError,gainValError,lossValError,nf_valError,nf_gainValError,nf_lossValError,valWedge,nf_valWedge,z_PAS,z_NAS,z_BIS,z_BAS_fs,z_BAS_rr,z_BAS_dr,z_DMS_i,z_DMS_r,z_DMS_d,z_DMS_s,z_DMS_a
0,100,0.046724,0.102563,-0.009115,0.046724,0.102563,-0.009115,0.111678,0.111678,1.197616,-0.617778,1.375766,0.987888,1.074172,-0.004402,1.346013,0.907394,1.091196,1.334255,-1.20788
1,101,-0.014427,0.003091,-0.031944,-0.014427,0.003091,-0.031944,0.035036,0.035036,0.502571,-0.966957,-0.914381,0.987888,0.515603,0.796675,0.545443,-1.503493,-0.229267,1.562483,1.167539
2,102,0.012664,0.034123,-0.010058,0.012664,0.034123,-0.010058,0.044181,0.044181,-1.026528,-0.966957,0.612383,0.987888,-0.042967,-0.805478,-1.322553,0.003312,-0.229267,0.421344,-0.991933
3,103,0.036173,0.097953,-0.025607,0.036173,0.097953,-0.025607,0.12356,0.12356,0.780589,0.08058,0.612383,0.987888,1.074172,1.197213,1.612869,1.208755,0.56301,0.649572,0.735644
4,104,0.070082,0.133293,0.006871,0.070082,0.133293,0.006871,0.126422,0.126422,-0.192474,0.778937,0.103462,1.466068,0.515603,0.396137,0.278587,-1.503493,-0.49336,1.334255,-0.344092


In [39]:
fname=os.path.join(output_dir,'subject-level.csv')
#columns = ['ssid','valError','gainValError','lossValError','valWedge'] + zkeys
output.to_csv(fname,index=False)