# Subject Subset

Sample subjects and store the subset datasets.

`Top 100`: sample $1$ constitutes the top $100$ scoring subjects.
`Random 100`: sample $2$ constitutes a random set of $100$ subjects, drawn without replacement from the subjects whose WCST accuracy exceeds $0.40$.

----
```
author:     Zach Wolpe
email:      zachcolinwolpe@gmail.com
date:       21 January 2022
```
----

In [1]:
import sys
sys.path.append('../process data')
from dependencies import *

  from pandas import Int64Index as NumericIndex


In [8]:
# load -------------------------------------*
# Machine-specific absolute directories: `loc` is where the input pickles are read from,
# `sav_loc` is the prefix used for every file written further down.
sav_loc     = '/Users/zachwolpe/Documents/Production/Dynocog/Python Implementation/final instance/data objects/final data objects/'
loc         = '/Users/zachwolpe/Documents/Production/Dynocog/Python Implementation/final instance/model-free analysis/final_dataframes'

# The three reads are independent of one another.
covariates  = pd.read_pickle(loc + '/covariates.pkl')
wcst        = pd.read_pickle(loc + '/wcst.pkl')
wcst_data   = pd.read_pickle(loc + '/wcst_raw_data.pkl')

# Final expression of the cell: echoes the trial-level frame.
wcst

Unnamed: 0,participant,reward,status,action,rule,rule_correct,rule_used,n_t
0,816404.0,1,1,1,shape,shape,shape,0
1,816404.0,1,1,1,shape,shape,shape,1
2,816404.0,1,1,1,shape,shape,shape,2
3,816404.0,1,1,1,shape,shape,shape,3
4,816404.0,1,1,1,shape,shape,shape,4
...,...,...,...,...,...,...,...,...
27395,684712.0,1,1,3,color,color,color,95
27396,684712.0,1,1,3,color,color,color,96
27397,684712.0,1,1,3,color,color,color,97
27398,684712.0,1,1,3,color,color,color,98


# Raw demographic summary

In [43]:
# Restrict the covariates to the columns whose name contains 'demo'.
demo_columns = [name for name in covariates.columns if 'demo' in name]
demo = covariates[demo_columns]
print('subjects: ', len(demo.index.unique()))

# Frequency tables for gender, income, handedness and education.
# NOTE: the results are computed and discarded here; run a line on its own to see it.
for column in (
    'demographics_gender_a',        # gender
    'demographics_income_a',        # income
    'demographics_handedness_a',    # handedness
    'demographics_education_a',     # education
):
    getattr(demo, column).value_counts()


# Mean and standard deviation of age; as the final expression this is the displayed result.
age = demo.demographics_age_a
age.mean(), age.std()
# demo.demographics_income_a.mean(), demo.demographics_income_a.std()
# demo.demographics_computer_hours_a.mean(), demo.demographics_computer_hours_a.std()


subjects:  214


(37.598130841121495, 12.33785716786093)

# Subset data

1. Top 100
2. Random set

In [3]:
# top 100 participants ------------------------------------------------------------------------------------------------------------*
# Rank participants by WCST accuracy, best first. Repeated participant ids are dropped
# (keeping each participant's highest-ranked row) BEFORE slicing, so the cut holds 100
# distinct subjects even if `covariates` has more than one row per participant. The random
# sample below guards against the same situation with np.unique.
ranked       = covariates[['wcst_accuracy']].reset_index().sort_values('wcst_accuracy', ascending=False)
top_subjects = ranked.participant.drop_duplicates().values[:100]
wcst_top_100 = wcst.loc[wcst.participant.isin(top_subjects)]


# random 100 with accuracy > 0.40 --------------------------------------------------------------------------------------------------*
np.random.seed(0)                                                   # fixed seed so the sample is reproducible
lam             = 0.40                                              # minimum WCST accuracy to be eligible
subjects        = covariates.index[covariates.wcst_accuracy > lam]
subjects        = np.random.choice(np.unique(subjects), 100, replace=False)    # raises ValueError if fewer than 100 are eligible
wcst_random_set = wcst.loc[wcst.participant.isin(subjects)]

# save dataframes ------------------------------------------------------------------------------------------------------------------*
wcst_top_100.to_pickle(sav_loc      + 'df_wcst_top_100.pkl')
wcst_top_100.to_csv(sav_loc         + 'df_wcst_top_100.csv')
wcst_random_set.to_pickle(sav_loc   + 'df_wcst_ran_100.pkl')
wcst_random_set.to_csv(sav_loc      + 'df_wcst_ran_100.csv')

# pyStan data object


In [15]:
# top 100
def pyStan_data_object(wcst_set, sav_loc, save=True):
    """Build the data dictionary for the Stan model from trial-level WCST data.

    Parameters
    ----------
    wcst_set : pandas.DataFrame
        Long-format trials with at least the columns ``participant``, ``n_t``,
        ``action`` and ``reward``. Each (participant, n_t) pair must occur once,
        otherwise ``DataFrame.pivot`` raises ``ValueError``. The frame is not
        modified.
    sav_loc : str
        File path the pickled dictionary is written to when ``save`` is true.
    save : bool, default True
        Whether to pickle the result to ``sav_loc``.

    Returns
    -------
    dict
        ``n_s``: number of rows (participants) in the reward matrix;
        ``n_t``: number of columns (trials) in the reward matrix;
        ``action``: participant x trial DataFrame of integer actions, with
        missing actions replaced by 0;
        ``reward``: participant x trial DataFrame of integer rewards plus 1.
    """
    # Work on a private copy of the needed columns. The caller's frame is typically a
    # slice of a larger frame: assigning into it through chained indexing either mutates
    # the caller's data (with a SettingWithCopyWarning) or, under copy-on-write, silently
    # does nothing and leaves NaNs that break the integer cast below.
    trials = wcst_set[['participant', 'n_t', 'action', 'reward']].copy()
    trials['action'] = trials['action'].fillna(0).astype(int)   # missing action -> 0
    trials['reward'] = trials['reward'].astype(int)

    # One row per participant, one column per trial index.
    action_matrix = trials[['participant', 'n_t', 'action']].pivot(index='participant', columns='n_t')
    reward_matrix = trials[['participant', 'n_t', 'reward']].pivot(index='participant', columns='n_t')

    data_object = {
        'n_s':    reward_matrix.shape[0],
        'n_t':    reward_matrix.shape[1],
        'action': action_matrix,
        # Rewards are shifted up by one (presumably for 1-based indexing in Stan - confirm).
        'reward': reward_matrix + 1,
    }

    if save:
        # Imported here so the function does not rely on a star-import providing `pickle`.
        import pickle
        with open(sav_loc, "wb") as f:
            pickle.dump(data_object, f, protocol=-1)
    return data_object




# Build the Stan data dictionary for each subject subset; `save` defaults to True,
# so each call also pickles its dictionary to the given path under `sav_loc`.
pyStan_data_object(wcst_top_100,    sav_loc+'pyStan_data_object_wcst_top_100.pkl')
# Final expression of the cell: the dictionary returned for the random set is what gets displayed.
pyStan_data_object(wcst_random_set, sav_loc+'pyStan_data_object_wcst_random_set.pkl')

{'n_s': 100,
 'n_t': 100,
 'action':             action                             ...                             \
 n_t             0  1  2  3  4  5  6  7  8  9   ... 90 91 92 93 94 95 96 97 98   
 participant                                    ...                              
 107700.0         3  3  1  1  1  1  1  1  1  1  ...  1  1  1  1  3  3  1  3  3   
 117306.0         1  1  1  1  1  1  1  3  1  1  ...  1  1  1  1  0  3  3  3  3   
 120307.0         3  3  2  3  3  0  3  3  3  3  ...  1  1  1  1  3  3  3  3  3   
 130202.0         3  3  1  1  1  1  1  1  1  1  ...  1  1  1  1  3  3  3  3  3   
 130501.0         3  3  2  3  2  0  1  1  1  1  ...  1  1  1  1  3  3  3  3  3   
 141199.0         1  3  2  2  1  0  3  2  3  1  ...  1  1  1  1  3  1  3  3  3   
 152049.0         3  2  3  1  1  1  1  1  1  1  ...  1  1  1  1  3  3  3  3  3   
 153366.0         1  1  1  1  1  1  1  1  1  1  ...  1  1  1  1  3  3  3  3  3   
 157836.0         1  3  2  1  1  3  1  1  1  2  ...  1  1  1  

# Psychological & Demographic Covariates

In [16]:
# Column groups pulled out of `covariates`.
psychological_columns = [
    'wcst_RT',
    'fitts_mean_deviation',
    'nback_status',
    'nback_reaction_time_ms',
    'navon_perc_correct',
    'navon_reaction_time_ms',
    'corsi_block_span',
]
demographic_columns = [
    'demographics_mean_reation_time_ms',
    'demographics_income_a',
    'demographics_computer_hours_a',
    'demographics_age_a',
]

psychological_covariates    = covariates[psychological_columns]
demographic_covariates      = covariates[demographic_columns]

# save dataframes ------------------------------------------------------------------------------------------------------------------*
psychological_covariates.to_csv(sav_loc + 'psychological_covariates.csv')
demographic_covariates.to_csv(sav_loc   + 'demographic_covariates.csv')

In [17]:
def compute_X_matrix(df1, df2, params):
  """Return the distinct rows of the `params` columns of `df2`, aligned to `df1`.

  `df1` is re-indexed by its 'participant' column and joined with the
  'participant' + `params` columns of `df2`. The join matches `df1`'s
  participant values against `df2`'s index (not its 'participant' column).
  Only the `params` columns are kept, and duplicated rows are dropped.
  """
  wanted = ['participant', *params]
  by_participant = df1.set_index('participant')
  joined = by_participant.join(df2[wanted])
  return joined[params].drop_duplicates()


In [18]:

def covariate_set(df1, df2):
    """Return the rows of `df2` that belong to the participants present in `df1`.

    `df2`'s index is turned into a column (it must come out named 'participant'),
    the result is left-merged onto `df1` by 'participant', and only the columns
    that came from `df2` are kept. Duplicated rows are then dropped, so each
    distinct covariate row appears once, labelled with the index of its first
    occurrence in the merged frame.
    """
    covars = df2.reset_index()
    merged = df1.merge(covars, how='left', on='participant')
    return merged.loc[:, covars.columns].drop_duplicates()

# Covariate tables restricted to each subject subset.
psy_covars_top100 = covariate_set(wcst_top_100,     psychological_covariates)
dem_covars_top100 = covariate_set(wcst_top_100,     demographic_covariates)
psy_covars_ran100 = covariate_set(wcst_random_set,  psychological_covariates)
dem_covars_ran100 = covariate_set(wcst_random_set,  demographic_covariates)


# Write each table to its CSV file under `sav_loc`.
for frame, filename in (
    (psy_covars_top100, 'psy_covars_top100.csv'),
    (dem_covars_top100, 'dem_covars_top100.csv'),
    (psy_covars_ran100, 'psy_covars_ran100.csv'),
    (dem_covars_ran100, 'dem_covars_ran100.csv'),
):
    frame.to_csv(sav_loc + filename)
