In [None]:
import os
from pathlib import Path

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [None]:
from datetime import datetime
# Date stamp (two-digit year, month, day) used to build the dated input and output filenames.
date = f"{datetime.today():%y%m%d}"

In [None]:
from _utils.clean import normalize, normalize_frame, exclude_3sd

In [None]:
from config import derivatives_dir as derivs_dir
# Input and output locations, all relative to the project's derivatives directory.
bins_dir = derivs_dir / '04.binarized'
scales_dir = derivs_dir / 'qualtrics' / '2.subscaled'
output_dir = derivs_dir / '05.subject-level'
# Idempotent directory creation: no error when the folder already exists,
# and any missing parent folders are created as well.
output_dir.mkdir(parents=True, exist_ok=True)

# Import data

In [None]:
# The behavioral CSV is looked up by today's date stamp, so this cell only
# finds the file if it was written on the same day the notebook is run.
fname = bins_dir / f'econdec-all_task-all_{date}.csv'
behav_data = pd.read_csv(fname)
# Use `ssid` as the subject identifier, matching the questionnaire data.
behav_data = behav_data.rename(columns={'subjnum': 'ssid'})

In [None]:
# Replace hyphens in column names with underscores (literal replacement, not a regex).
behav_data.columns = behav_data.columns.str.replace('-', '_', regex=False)

In [None]:
# Questionnaire subscale scores, read from the subscaled Qualtrics folder.
fname = scales_dir.joinpath('all_subscales.csv')
scale_data = pd.read_csv(fname)

In [None]:
# Preview the first rows of the behavioral data to sanity-check the load and renames.
behav_data.head()

In [None]:
# Preview the first rows of the questionnaire subscale data.
scale_data.head()

# Normalize PANAS, BIS/BAS, and DMS subscales

In [None]:
# Short, identifier-friendly names for the questionnaire columns.
subscale_renames = {
    'ZIP': 'zip',
    'Relative Sleep': 'sleep',
    'Relative Stress': 'stress',
    'Financial Difficulty': 'fin_dif',
    'BAS Drive': 'BAS_dr',
    'BAS Fun Seeking': 'BAS_fs',
    'BAS Reward Responsiveness': 'BAS_rr',
    'Intuitive DMS': 'DMS_i',
    'Rational DMS': 'DMS_r',
    'Dependent DMS': 'DMS_d',
    'Spontaneous DMS': 'DMS_s',
    'Avoidant DMS': 'DMS_a',
    'Financial Literacy': 'fin_lit',
}
# Work on a deep copy so `scale_data` itself keeps its original column names.
df = scale_data.copy(deep=True).rename(columns=subscale_renames)

In [None]:
# The study label is the first character of the subject id rendered as a string.
df['study'] = df['ssid'].astype(str).str.get(0)

Split

In [None]:
# One frame per study. Each slice is explicitly copied so that the z-score
# columns added later are written to an independent frame rather than to a
# slice of `df` (which triggers pandas' SettingWithCopyWarning).
df1 = df.loc[df['study'] == '1'].copy()
df2 = df.loc[df['study'] == '2'].copy()
df3 = df.loc[df['study'] == '3'].copy()

Apply

In [None]:
# Subscale columns to standardize, and the names of their z-scored counterparts.
cols_to_normalize = [
    'PAS', 'NAS',
    'BIS', 'BAS_fs', 'BAS_rr', 'BAS_dr',
    'DMS_i', 'DMS_r', 'DMS_d', 'DMS_s', 'DMS_a',
]
z_cols = [f'z_{name}' for name in cols_to_normalize]

In [None]:
def z_score(df):
    """Standardize values to zero mean and unit population standard deviation.

    Parameters
    ----------
    df : pandas Series or DataFrame (anything exposing ``mean()`` and ``std(ddof=...)``)
        Values to standardize.

    Returns
    -------
    Same type as the input: ``(df - mean) / std`` with ``ddof=0`` (population SD).
    """
    centered = df - df.mean()
    spread = df.std(ddof=0)
    return centered / spread

In [None]:
# Standardize each subscale within its own study, so a z-score is relative to
# that study's sample rather than to the pooled data.
for study_df in (df1, df2, df3):
    study_df[z_cols] = study_df.loc[:, cols_to_normalize].apply(z_score)

Combine

In [None]:
# Stack the per-study frames back together and order the rows by subject id.
study_frames = [df1, df2, df3]
output_df = pd.concat(study_frames).sort_values('ssid')
# Random spot-check of nine rows.
output_df.sample(9)

### Natural Logarithm of subscales

We take the natural log of each subscale's RAW score.

*NOT* the normalized (z-scored) score: z-scores can be negative, and the logarithm of a negative number is undefined.

In [None]:
# Natural log of the raw (un-normalized) subscale scores, stored in `ln_<col>` columns.
ln_cols = ['ln_' + col for col in cols_to_normalize]
# np.log is vectorized over the whole frame; the element-wise
# `DataFrame.applymap` it replaces is deprecated since pandas 2.1.
output_df[ln_cols] = np.log(output_df[cols_to_normalize])

In [None]:
# Random spot-check of nine rows, now including the `ln_` columns.
output_df.sample(9)

# 3sd trial exclusions

In [None]:
from _utils.transform import group_exclude

In [None]:
# Store the result of the per-subject exclusion on `val_estdiff_valid` as a new column.
# NOTE(review): presumably `group_exclude` masks values more than 3 SD from each
# subject's own mean — confirm against _utils.transform.
behav_data['valError_3sd'] = group_exclude(
    behav_data,
    group_col='ssid',
    value_col='val_estdiff_valid',
)

# gender-judgment trial exclusions

**TODO:** Trials in which the gender judgment was wrong should be excluded here. The source file for this notebook does not yet contain the gender-judgment data, so that data first has to be added further upstream; once it is included in the source file, apply the gender-judgment exclusion in this section.

# subject-level means

In [None]:
# Average every numeric column within each subject x domain cell.
# numeric_only=True is required on pandas >= 2.0, where averaging a frame that
# still holds non-numeric columns raises TypeError instead of dropping them.
domain_means = (
    behav_data
    .groupby(['ssid', 'domain'], as_index=False)
    .mean(numeric_only=True)
)
domain_means.head(9)

In [None]:
# Per-subject mean of `valError_3sd` for rows whose domain is 'GAIN', indexed by ssid.
is_gain = domain_means['domain'] == 'GAIN'
gain_ave_val_error = domain_means.loc[is_gain, ['ssid', 'valError_3sd']].set_index('ssid')

In [None]:
# Per-subject mean of `valError_3sd` for rows whose domain is 'LOSS', indexed by ssid.
is_loss = domain_means['domain'] == 'LOSS'
loss_ave_val_error = domain_means.loc[is_loss, ['ssid', 'valError_3sd']].set_index('ssid')

### Framing Normalization
We want to normalize for the way the value estimation question is framed.

We're going to multiply valError means by `1` for subjects who were estimating the probability that the stock is *good*, and multiply means by `-1` for subjects who were estimating the probability that the stock is *bad*.

100s: `*  1`

200s: `* -1`

300s: `*  1`

In [None]:
# Per-subject mean of `valError_3sd` across all trials, indexed by ssid.
# The column is selected BEFORE aggregating: averaging the whole frame fails on
# pandas >= 2.0 because of string columns such as `domain`, and does needless work.
means_df = behav_data.groupby('ssid')[['valError_3sd']].mean()
# Show a six-row slice for visual comparison with the frame-normalized values.
means_df[85:91]

In [None]:
# Apply `normalize_frame` to each subject's row and keep the result as a
# one-column frame named `nf_valError`.
# NOTE(review): presumably flips the sign for 200-series subjects, as described
# in the notes above — confirm against _utils.clean.
nf_valerror = (
    pd.DataFrame(means_df.apply(normalize_frame, axis=1))
    .rename(columns={0: 'nf_valError'})
)
nf_valerror[85:91]

In [None]:
# Six-row slice of the GAIN-domain means, shown before frame normalization for comparison.
gain_ave_val_error[85:91]

In [None]:
# Frame-normalize the GAIN-domain means row by row; the result is a one-column
# frame named `nf_gainValError`.
nf_valerr_gain = (
    pd.DataFrame(gain_ave_val_error.apply(normalize_frame, axis=1))
    .rename(columns={0: 'nf_gainValError'})
)
nf_valerr_gain[85:91]

In [None]:
# Six-row slice of the LOSS-domain means, shown before frame normalization for comparison.
loss_ave_val_error[85:91]

In [None]:
# Frame-normalize the LOSS-domain means row by row; the result is a one-column
# frame named `nf_lossValError`.
nf_valerr_loss = (
    pd.DataFrame(loss_ave_val_error.apply(normalize_frame, axis=1))
    .rename(columns={0: 'nf_lossValError'})
)
nf_valerr_loss[85:91]

# Build Output DataFrame

In [None]:
# Attach the behavioral summaries to the questionnaire frame, one merge per
# measure. Each merge joins on the columns the two frames share (the subject
# id `ssid`) and is an inner join, so subjects missing from any piece are dropped.
output_df = (
    output_df
    # Overall per-subject mean. The column is selected before aggregating
    # because averaging the whole frame raises TypeError on pandas >= 2.0
    # when string columns such as `domain` are present.
    .merge(behav_data.groupby('ssid', as_index=False)[['valError_3sd']].mean())
    .merge(gain_ave_val_error.reset_index()
               .rename(columns={'valError_3sd': 'gainValError'}))
    .merge(loss_ave_val_error.reset_index()
               .rename(columns={'valError_3sd': 'lossValError'}))
    .merge(nf_valerror.reset_index())
    .merge(nf_valerr_gain.reset_index())
    .merge(nf_valerr_loss.reset_index())
    #.merge(optimal_choice_freq).rename(columns={'waschoiceoptimal':'optimal_choice_freq'})
)

In [None]:
# "Wedge": absolute gap between a subject's GAIN and LOSS mean valuation error,
# computed on both the raw and the frame-normalized means.
gain_loss_gap = output_df['gainValError'] - output_df['lossValError']
output_df['valWedge'] = gain_loss_gap.abs()
nf_gain_loss_gap = output_df['nf_gainValError'] - output_df['nf_lossValError']
output_df['nf_valWedge'] = nf_gain_loss_gap.abs()

In [None]:
# Write the subject-level table to a date-stamped CSV, without the row index.
#columns = ['ssid','valError','gainValError','lossValError','valWedge'] + zkeys
fname = output_dir / f'subject-level_{date}.csv'
output_df.to_csv(fname, index=False)

### Legacy output dataframe build below

Can be ignored.