In [1]:
import os, pandas as pd, numpy as np

In [2]:
# Both folders live under ../derivatives: cleaned per-task files are read from
# '0.3.cleaned' and the joined subject-level table is written to '0.4.joined'.
derivatives_dir = os.path.join('..','derivatives')
source_dir = os.path.join(derivatives_dir,'0.3.cleaned')
output_dir = os.path.join(derivatives_dir,'0.4.joined')

In [3]:
# Load every CSV in source_dir whose file name contains 'subject'.
# os.listdir() returns entries in arbitrary, platform-dependent order, but the
# cells below address these tables by position (frames[0] ... frames[3]), so the
# file names are sorted to make those positions reproducible across machines.
# NOTE(review): confirm that alphabetical file order matches the positions the
# later cells assume (comprehension, n-back, processing speed, survey).
subject_files = sorted(f for f in os.listdir(source_dir) if 'subject' in f)
frames = [pd.read_csv(os.path.join(source_dir, f)) for f in subject_files]

# Cognitive Tasks

### Comprehension

In [4]:
# Pivot the comprehension scores from one row per (SSID, time) pair to one row
# per SSID, with the 'Score-sum' for each time value in its own column.
comp_scores = frames[0].set_index(['SSID','time'])['Score-sum']
comp_df = comp_scores.unstack('time').reset_index()
# The score columns are still labelled with the raw time values 1 and 2 here.
comp_df['comp_change'] = comp_df[2] - comp_df[1]
comp_df = comp_df.rename(columns={'SSID':'sub',1:'comp_t1',2:'comp_t2'})
comp_df.head()

time,sub,comp_t1,comp_t2,comp_change
0,101,9.0,12.0,3.0
1,102,10.0,11.0,1.0
2,103,7.0,13.0,6.0
3,104,9.0,15.0,6.0
4,105,8.0,9.0,1.0


### N-back

In [5]:
# Keep the subject ID and the two n-back measures, prefixing the measures with
# 'nb_' so they stay identifiable once the tables are joined.
nback_renames = {'RT':'nb_RT','CoR':'nb_CoR'}
nback_df = frames[1].loc[:, ['sub','RT','CoR']].rename(columns=nback_renames)
nback_df.head()

Unnamed: 0,sub,nb_RT,nb_CoR
0,101,685.285714,0.8125
1,102,572.0,0.59375
2,103,618.571429,0.875
3,104,505.571429,0.875
4,105,715.25,0.53125


### Processing Speed (ProcSpd)

In [6]:
# Rename the subject column to the shared 'sub' key and give RT a task-specific name.
procspd_renames = {'Subject':'sub','RT':'procspd_RT'}
procspd_df = frames[2].rename(columns=procspd_renames)
procspd_df.head()

Unnamed: 0,sub,procspd_RT
0,101.0,308.5
1,102.0,303.85
2,103.0,289.85
3,104.0,302.95
4,105.0,391.052632


# Surveys

In [7]:
# Give the survey table the same subject-ID column name ('sub') as the task tables.
# The renamed frame is stored back into frames[3], which the survey cells below read.
frames[3] = frames[3].rename(columns=dict(SSID='sub'))
frames[3].head()

Unnamed: 0,Score-sum_x,sub,Condition,Age,Gender,EduYears,CurrentStudent,Major,Major_TEXT,SciDegree,...,SciTude-09,SciTude-10,SciTude-11r,SciTude-12,SciTude-13r,SciTude-14r,SciTude-15,SciTude-16r,SciTude-17r,SciTude-18r
0,30.0,101,1.0,20.0,2.0,14,1.0,5.0,Biomedical Sciences,,...,3,3,1,4,1,3,4,2,2,2
1,26.0,102,2.0,19.0,2.0,12,1.0,5.0,Forensic Science,,...,5,4,1,2,1,1,1,1,1,1
2,28.0,103,3.0,18.0,1.0,12,1.0,2.0,communications,,...,2,2,3,2,1,3,3,3,3,2
3,28.0,104,1.0,18.0,1.0,12,1.0,5.0,Statistics,,...,4,4,5,4,1,4,5,1,1,1
4,28.0,105,2.0,18.0,1.0,12,1.0,5.0,Biomedical Sciences,,...,1,3,4,4,1,2,4,1,1,1


### Demographics

In [8]:
def group_ages(age):
    """Map an age to an age-group code.

    Returns the string '1' for ages 18 through 35 (inclusive) and the string
    '2' for ages 65 through 90 (inclusive). Any other value, including NaN,
    falls through both checks and yields None.
    """
    if age >= 18 and age <= 35:
        return '1'
    elif age >= 65 and age <= 90:
        return '2'
    # Outside both ranges: no group assigned.
    return None

In [9]:
# Columns of the survey table that make up the demographics table.
demog_cols = [
    'sub','Condition','Age','Gender','EduYears','SciEdu_HS','SciEdu_UGrad','SciEdu_Grad'
]
demog_df = frames[3].loc[:, demog_cols]

In [10]:
# Derive each row's age-group code from its 'Age' value via group_ages().
demog_df['AgeGroup'] = demog_df['Age'].apply(group_ages)

In [11]:
# Preview the first rows of the demographics table, including the new AgeGroup column.
demog_df.head()

Unnamed: 0,sub,Condition,Age,Gender,EduYears,SciEdu_HS,SciEdu_UGrad,SciEdu_Grad,AgeGroup
0,101,1.0,20.0,2.0,14,4,7,0.0,1
1,102,2.0,19.0,2.0,12,3,3,0.0,1
2,103,3.0,18.0,1.0,12,3,0,0.0,1
3,104,1.0,18.0,1.0,12,4,0,0.0,1
4,105,2.0,18.0,1.0,12,6,0,0.0,1


#### Subscaling functions:
Helper functions for reverse-scoring items and summing subscale totals; used below for NFCS, TSSI, SciTude, and SciLit.

In [12]:
def sum_subscale(df,label):
    """Append a per-row total of every column except 'sub'.

    Parameters
    ----------
    df : DataFrame with a 'sub' column plus the columns to be totalled.
    label : str; the total is stored in a column named ``label + '_sum'``.

    Returns
    -------
    A new DataFrame with 'sub' back as the first column and the total column
    appended last. The frame passed in is not modified.
    """
    items = df.set_index('sub')
    # 'sub' sits in the index at this point, so it is left out of the row totals.
    totals = items.sum(axis=1)
    scored = items.assign(**{label+'_sum': totals})
    return scored.reset_index()

In [13]:
def reverse_score(df,max_likert,label):
    """Reverse-score the items whose column name ends in 'r', then total the scale.

    Parameters
    ----------
    df : DataFrame with a 'sub' column plus the item columns. Columns whose
        name ends in 'r' are treated as reverse-keyed items.
    max_likert : highest value on the response scale; a reverse-keyed response
        ``x`` becomes ``(max_likert + 1) - x``.
    label : str passed on to sum_subscale(); the total lands in ``label + '_sum'``.

    Returns
    -------
    A new DataFrame holding 'sub', the non-reversed columns in their original
    order, the reversed items (stored under their name without the trailing
    'r'), and the ``label + '_sum'`` total. The frame passed in is not modified.
    """
    reversed_cols = [c for c in df.columns if c.endswith('r')]
    kept_cols = [c for c in df.columns if not c.endswith('r')]
    # Build the result in a separate frame so the caller's DataFrame is untouched.
    scored = df[kept_cols].copy()
    for col in reversed_cols:
        # Remove only the one-character 'r' suffix. str.strip('r') would strip
        # every leading and trailing 'r', mangling names such as 'rating-01r'.
        scored[col[:-1]] = (max_likert + 1) - df[col]
    return sum_subscale(scored, label)

### NFCS
Need for Cognition Scale

In [14]:
# Collect 'sub' plus every survey column starting with 'NFCS', drop 'NFCS-00',
# then reverse-score the 'r'-suffixed items (scale maximum 5) and total the scale.
nfcs_cols = [c for c in frames[3].columns if c.startswith('NFCS')]
nfcs_df = frames[3][['sub']+nfcs_cols].drop(columns='NFCS-00')
nfcs_df = reverse_score(nfcs_df,5,'NFCS')
nfcs_df.head()

Unnamed: 0,sub,NFCS-01,NFCS-02,NFCS-06,NFCS-10,NFCS-11,NFCS-13,NFCS-14,NFCS-15,NFCS-18,NFCS-03,NFCS-04,NFCS-05,NFCS-07,NFCS-08,NFCS-09,NFCS-12,NFCS-16,NFCS-17,NFCS_sum
0,101,4.0,4.0,3.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,2.0,4.0,2.0,2.0,4.0,2.0,3.0,61.0
1,102,2.0,2.0,1.0,5.0,2.0,5.0,4.0,5.0,3.0,4.0,1.0,5.0,1.0,1.0,1.0,4.0,1.0,1.0,48.0
2,103,1.0,3.0,1.0,3.0,5.0,1.0,3.0,3.0,3.0,3.0,4.0,1.0,3.0,2.0,2.0,3.0,2.0,2.0,45.0
3,104,4.0,4.0,2.0,5.0,5.0,4.0,3.0,2.0,3.0,4.0,4.0,4.0,5.0,5.0,2.0,4.0,4.0,5.0,69.0
4,105,4.0,4.0,1.0,5.0,5.0,1.0,2.0,5.0,2.0,4.0,2.0,1.0,1.0,3.0,1.0,4.0,2.0,2.0,49.0


### TSSI
Trust in Science and Scientists Inventory

In [15]:
# Collect 'sub' plus every survey column starting with 'TSSI', drop 'TSSI-00',
# then reverse-score the 'r'-suffixed items (scale maximum 5) and total the scale.
tssi_cols = [c for c in frames[3].columns if c.startswith('TSSI')]
tssi_df = frames[3][['sub']+tssi_cols].drop(columns='TSSI-00')
tssi_df = reverse_score(tssi_df,5,'TSSI')
tssi_df.head()

Unnamed: 0,sub,TSSI-05,TSSI-07,TSSI-09,TSSI-10,TSSI-11,TSSI-12,TSSI-14,TSSI-15,TSSI-16,...,TSSI-04,TSSI-06,TSSI-08,TSSI-13,TSSI-17,TSSI-18,TSSI-19,TSSI-20,TSSI-21,TSSI_sum
0,101,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,5,4,87
1,102,5,5,3,5,3,5,4,5,5,...,3,4,4,5,3,4,3,4,4,83
2,103,2,3,3,3,3,3,4,3,4,...,5,5,4,5,3,3,4,3,3,76
3,104,1,5,5,5,5,1,5,5,5,...,5,5,5,5,5,5,5,5,5,97
4,105,2,5,5,5,5,3,4,3,5,...,3,2,4,5,4,5,5,1,5,82


### Science-Related Attitudes

In [16]:
# Collect 'sub' plus every survey column starting with 'SciTude', drop 'SciTude-00',
# then reverse-score the 'r'-suffixed items (scale maximum 5) and total the scale.
scitude_cols = [c for c in frames[3].columns if c.startswith('SciTude')]
scitude_df = frames[3][['sub']+scitude_cols].drop(columns='SciTude-00')
scitude_df = reverse_score(scitude_df,5,'SciTude')
scitude_df.head()

Unnamed: 0,sub,SciTude-03,SciTude-04,SciTude-05,SciTude-06,SciTude-07,SciTude-09,SciTude-10,SciTude-12,SciTude-15,SciTude-01,SciTude-02,SciTude-08,SciTude-11,SciTude-13,SciTude-14,SciTude-16,SciTude-17,SciTude-18,SciTude_sum
0,101,5,4,4,4,4,3,3,4,4,3,5,2,5,5,3,4,4,4,70
1,102,5,5,1,3,2,5,4,2,1,3,4,4,5,5,5,5,5,5,69
2,103,5,5,3,5,2,2,2,2,3,4,5,2,3,5,3,3,3,4,61
3,104,5,5,5,5,4,4,4,4,5,5,5,3,1,5,2,5,5,5,77
4,105,4,3,5,4,1,1,3,4,4,2,3,2,2,5,4,5,5,5,62


### Science Literacy

In [17]:
# Collect 'sub' plus every survey column starting with 'SciLit', drop 'SciLit-00',
# and total the remaining items (no reverse-scoring is applied to this scale).
sciLit_cols = [c for c in frames[3].columns if c.startswith('SciLit')]
sciLit_df = frames[3][['sub']+sciLit_cols].drop(columns='SciLit-00')
sciLit_df = sum_subscale(sciLit_df,'SciLit')
sciLit_df.head()

Unnamed: 0,sub,SciLit-01,SciLit-02,SciLit-03,SciLit-04,SciLit-05,SciLit-06,SciLit-07,SciLit_sum
0,101,2,1,2,3,3,2,3,16
1,102,4,1,1,1,4,2,3,16
2,103,3,1,1,3,2,2,3,15
3,104,3,1,2,3,2,2,3,16
4,105,2,1,3,3,1,2,3,15


### Openness to Experience

In [18]:
# Gather 'sub' plus every survey column starting with 'O', then discard the two
# matching columns that are not subscale items.
o2xp_df = frames[3][['sub']+[c for c in frames[3].columns if c.startswith('O')]]
o2xp_df = o2xp_df.drop(columns=['Open-0','Original_Feedback'])
o2xp_df = o2xp_df.set_index('sub')
# Item columns are named '<subscale>-<item>' (e.g. 'O1-3'). Snapshot them before
# any '_sum' columns are added so totals can never be folded into a later sum.
o2xp_items = list(o2xp_df.columns)
# sorted() fixes the order of the '_sum' columns; iterating a bare set of strings
# can yield a different order from one kernel session to the next.
o2xp_subscales = sorted(set(c.split('-')[0] for c in o2xp_items))
for subscale in o2xp_subscales:
    # Match the exact prefix before '-', so 'O1' cannot also pick up e.g. 'O10-...'.
    subscale_items = [c for c in o2xp_items if c.split('-')[0] == subscale]
    o2xp_df[subscale+'_sum'] = o2xp_df[subscale_items].sum(axis=1)
o2xp_df = o2xp_df.reset_index()
o2xp_df.head()

Unnamed: 0,sub,O1-3,O1-33,O1-63,O1-93,O2-8,O2-38,O2-68,O2-98,O3-13,...,O6-28,O6-58,O6-88,O6-118,O1_sum,O6_sum,O5_sum,O4_sum,O3_sum,O2_sum
0,101,2.0,2.0,3.0,2.0,4.0,4.0,4.0,2.0,4.0,...,4.0,4.0,2.0,5.0,9.0,15.0,10.0,11.0,11.0,14.0
1,102,3.0,4.0,5.0,5.0,5.0,5.0,2.0,1.0,5.0,...,3.0,2.0,4.0,4.0,17.0,13.0,15.0,17.0,14.0,13.0
2,103,4.0,4.0,4.0,4.0,3.0,4.0,3.0,3.0,4.0,...,4.0,4.0,1.0,3.0,16.0,12.0,9.0,16.0,12.0,13.0
3,104,5.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,5.0,...,4.0,2.0,3.0,2.0,17.0,11.0,11.0,14.0,12.0,13.0
4,105,5.0,5.0,2.0,5.0,1.0,4.0,2.0,2.0,5.0,...,3.0,3.0,3.0,2.0,17.0,11.0,12.0,14.0,18.0,9.0


### Shipley Vocab

In [19]:
# Collect 'sub' plus every survey column starting with 'Q' or 'Score-', and
# rename 'Score-sum_x' to 'vocab_sum'.
vocab_cols = [c for c in frames[3].columns if c.startswith(('Q','Score-'))]
vocab_df = frames[3][['sub']+vocab_cols].rename(columns={'Score-sum_x':'vocab_sum'})
vocab_df.head()

Unnamed: 0,sub,vocab_sum,Score-sum_y
0,101,30.0,0.0
1,102,26.0,0.0
2,103,28.0,0.0
3,104,28.0,0.0
4,105,28.0,0.0


# Output

In [20]:
# Join the selected columns of every per-subject table onto the demographics.
# Each merge names its key explicitly: without `on`, pandas joins on *all*
# column names the two frames share, so a column that later appears in two
# tables would silently become part of the key.
# how='inner' is the pandas default, spelled out here because it means a subject
# missing from any one table is dropped from the final output.
subject_tables = [
    comp_df[['sub','comp_t1','comp_t2','comp_change']],
    nback_df[['sub','nb_RT','nb_CoR']],
    procspd_df[['sub','procspd_RT']],
    nfcs_df[['sub','NFCS_sum']],
    tssi_df[['sub','TSSI_sum']],
    vocab_df[['sub','vocab_sum']],
    sciLit_df[['sub','SciLit_sum']],
    scitude_df[['sub','SciTude_sum']],
    o2xp_df[['sub','O1_sum','O2_sum','O3_sum','O4_sum','O5_sum','O6_sum']],
]
output_df = demog_df
for table in subject_tables:
    output_df = output_df.merge(table, on='sub', how='inner')
output_df

Unnamed: 0,sub,Condition,Age,Gender,EduYears,SciEdu_HS,SciEdu_UGrad,SciEdu_Grad,AgeGroup,comp_t1,...,TSSI_sum,vocab_sum,SciLit_sum,SciTude_sum,O1_sum,O2_sum,O3_sum,O4_sum,O5_sum,O6_sum
0,101,1.0,20.0,2.0,14,4,7,0.0,1,9.0,...,87,30.0,16,70,9.0,14.0,11.0,11.0,10.0,15.0
1,102,2.0,19.0,2.0,12,3,3,0.0,1,10.0,...,83,26.0,16,69,17.0,13.0,14.0,17.0,15.0,13.0
2,103,3.0,18.0,1.0,12,3,0,0.0,1,7.0,...,76,28.0,15,61,16.0,13.0,12.0,16.0,9.0,12.0
3,104,1.0,18.0,1.0,12,4,0,0.0,1,9.0,...,97,28.0,16,77,17.0,13.0,12.0,14.0,11.0,11.0
4,105,2.0,18.0,1.0,12,6,0,0.0,1,8.0,...,82,28.0,15,62,17.0,9.0,18.0,14.0,12.0,11.0
5,106,3.0,18.0,1.0,12,4,0,0.0,1,12.0,...,77,26.0,14,70,15.0,17.0,8.0,13.0,10.0,12.0
6,107,1.0,18.0,2.0,12,4,1,0.0,1,10.0,...,87,23.0,16,62,20.0,12.0,12.0,13.0,6.0,15.0
7,108,2.0,18.0,2.0,12,5,0,0.0,1,6.0,...,76,22.0,15,63,16.0,13.0,13.0,15.0,14.0,11.0
8,109,3.0,18.0,2.0,13,5,2,0.0,1,9.0,...,95,34.0,16,86,20.0,12.0,12.0,8.0,8.0,12.0
9,110,1.0,19.0,1.0,12,4,0,0.0,1,10.0,...,78,28.0,14,70,15.0,13.0,10.0,12.0,7.0,10.0


In [21]:
# Create the output folder if it is missing; to_csv() does not create directories
# and would otherwise fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
# Write the joined subject-level table without the DataFrame index.
output_df.to_csv(os.path.join(output_dir,'all_subject_level.csv'),index=False)

# Next step
## Primary Analyses
All pre-processing is done, so we can now move on to [1.1.analysis.ipynb](1.1.analysis.ipynb) to start looking at our demographics breakdown, correlations, and group ANOVAs.