In [1]:
import os, pandas as pd, numpy as np

In [2]:
# Both folders live under ../derivatives: source_dir is read from when the
# per-file frames are loaded, output_dir receives the merged CSV at the end.
derivatives_dir = os.path.join('..', 'derivatives')
source_dir = os.path.join(derivatives_dir, 'cleaned')
output_dir = os.path.join(derivatives_dir, 'all_data')

In [3]:
# os.listdir returns entries in arbitrary order, but the cells below address
# the loaded frames by position (frames[0] ... frames[3]). Sorting the file
# names makes that position -> file mapping the same on every machine.
# NOTE(review): assumes alphabetical filename order matches the positions the
# later cells expect (comprehension, n-back, processing speed, surveys) - confirm.
subject_files = sorted(f for f in os.listdir(source_dir) if 'subject' in f)
frames = [pd.read_csv(os.path.join(source_dir, f)) for f in subject_files]

# Cognitive Tasks

### Comprehension

In [4]:
# Within-subject change in 'Score-sum': each row minus that subject's previous
# row. The first row of every subject has no predecessor (NaN) and is dropped.
# Only 'Score-sum' is differenced, so other columns in the file are never touched.
comp_change = frames[0].groupby('SSID')['Score-sum'].diff()
has_change = comp_change.notna()
# Take the subject id from the very same rows as the difference instead of
# pairing unique() ids with the diffs by position; this stays correct when a
# subject has fewer or more than two rows or the rows are not sorted by SSID.
comp_df = pd.DataFrame(
    {
        'sub': frames[0].loc[has_change, 'SSID'],
        'comp_change': comp_change[has_change],
    }
)
comp_df.head()

Unnamed: 0,sub,comp_change
1,101,3.0
3,102,1.0
5,103,6.0
7,104,6.0
9,105,1.0


### N-back

In [5]:
# Keep the subject id plus the two n-back measures, prefixing the measure
# names with 'nb_' so they remain distinguishable once tables are merged.
nback_renames = {'RT': 'nb_RT', 'CoR': 'nb_CoR'}
nback_df = frames[1][['sub', *nback_renames]].rename(columns=nback_renames)
nback_df.head()

Unnamed: 0,sub,nb_RT,nb_CoR
0,101,779.888889,0.8125
1,102,626.272727,0.59375
2,103,618.571429,0.875
3,104,505.571429,0.875
4,105,742.913043,0.53125


### Processing Speed (ProcSpd)

In [6]:
# Align the column names with the other task tables: the subject id becomes
# 'sub' and the reaction-time column gets a task-specific name.
procspd_renames = {'Subject': 'sub', 'RT': 'procspd_RT'}
procspd_df = frames[2].rename(columns=procspd_renames)
procspd_df.head()

Unnamed: 0,sub,procspd_RT
0,101.0,308.5
1,102.0,303.85
2,103.0,289.85
3,104.0,302.95
4,105.0,391.052632


# Surveys

In [7]:
# Give the survey table the same subject-id column name ('sub') that the
# task tables use, so everything can later be merged on it.
frames[3] = frames[3].rename({'SSID': 'sub'}, axis='columns')
frames[3].head()

Unnamed: 0,Score-sum_x,sub,Condition,Age,Gender,EduYears,CurrentStudent,Major,Major_TEXT,SciDegree,...,SciAtt-09r,SciAtt-10r,SciAtt-11,SciAtt-12r,SciAtt-13,SciAtt-14,SciAtt-15r,SciAtt-16,SciAtt-17,SciAtt-18
0,30.0,101,1.0,20.0,2.0,14.0,1.0,5.0,Biomedical Sciences,,...,3,3,1,4,1,3,4,2,2,2
1,26.0,102,2.0,19.0,2.0,12.0,1.0,5.0,Forensic Science,,...,5,4,1,2,1,1,1,1,1,1
2,28.0,103,3.0,18.0,1.0,12.0,1.0,2.0,communications,,...,2,2,3,2,1,3,3,3,3,2
3,28.0,104,1.0,18.0,1.0,12.0,1.0,5.0,Statistics,,...,4,4,5,4,1,4,5,1,1,1
4,28.0,105,2.0,18.0,1.0,12.0,1.0,5.0,Biomedical Sciences,,...,1,3,4,4,1,2,4,1,1,1


In [8]:
def sum_subscale(df, label):
    """Append a per-row total of all item columns as ``<label>_sum``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sub' column; every other column is summed.
    label : str
        Prefix of the new total column, which is named ``label + '_sum'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'sub' restored as the first column, the original
        item columns, and the total column last. The input is not modified.
    """
    # Park 'sub' in the index so it is excluded from the row-wise sum.
    items = df.set_index('sub')
    row_totals = items.sum(axis=1)
    items[label + '_sum'] = row_totals
    return items.reset_index()

In [9]:
def reverse_score(df,max_likert,label):
    """Reverse-score the items flagged with a trailing 'r', then total the scale.

    Every column whose name ends in 'r' is replaced by a column named without
    that final 'r' and holding ``(max_likert + 1) - value``. The resulting
    frame is passed to ``sum_subscale`` to append ``<label>_sum``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sub' column plus the item columns of one scale.
        The caller's frame is left unmodified.
    max_likert : int or float
        Highest possible response value on the scale.
    label : str
        Prefix for the total column created by ``sum_subscale``.

    Returns
    -------
    pandas.DataFrame
        'sub', the non-reversed items, the re-scored items (in their original
        relative order), and ``<label>_sum``.
    """
    reversed_cols = [c for c in df.columns if c.endswith('r')]
    # drop() returns a new frame, so the columns added below never leak back
    # into the frame the caller passed in.
    scored = df.drop(columns=reversed_cols)
    for col in reversed_cols:
        # Remove only the trailing marker. str.strip('r') would also eat any
        # leading 'r' and repeated trailing 'r's, corrupting the item name.
        scored[col[:-1]] = (max_likert + 1) - df[col]
    return sum_subscale(scored, label)

### NFCS
Need for Cognition Scale

In [10]:
# Subject id plus every NFCS column except 'NFCS-00'; items whose names end
# in 'r' are reverse-scored with max_likert=5, then the scale is totalled.
nfcs_cols = [c for c in frames[3].columns if c.startswith('NFCS')]
nfcs_df = reverse_score(
    frames[3][['sub'] + nfcs_cols].drop(columns='NFCS-00'),
    5,
    'NFCS',
)
nfcs_df.head()

Unnamed: 0,sub,NFCS-01,NFCS-02,NFCS-06,NFCS-10,NFCS-11,NFCS-13,NFCS-14,NFCS-15,NFCS-18,NFCS-03,NFCS-04,NFCS-05,NFCS-07,NFCS-08,NFCS-09,NFCS-12,NFCS-16,NFCS-17,NFCS_sum
0,101,4.0,4.0,3.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,2.0,4.0,2.0,2.0,4.0,2.0,3.0,61.0
1,102,2.0,2.0,1.0,5.0,2.0,5.0,4.0,5.0,3.0,4.0,1.0,5.0,1.0,1.0,1.0,4.0,1.0,1.0,48.0
2,103,1.0,3.0,1.0,3.0,5.0,1.0,3.0,3.0,3.0,3.0,4.0,1.0,3.0,2.0,2.0,3.0,2.0,2.0,45.0
3,104,4.0,4.0,2.0,5.0,5.0,4.0,3.0,2.0,3.0,4.0,4.0,4.0,5.0,5.0,2.0,4.0,4.0,5.0,69.0
4,105,4.0,4.0,1.0,5.0,5.0,1.0,2.0,5.0,2.0,4.0,2.0,1.0,1.0,3.0,1.0,4.0,2.0,2.0,49.0


### TSSI
Trust in Science and Scientists Inventory

In [11]:
# Subject id plus every TSSI column except 'TSSI-00'; items whose names end
# in 'r' are reverse-scored with max_likert=5, then the scale is totalled.
tssi_cols = [c for c in frames[3].columns if c.startswith('TSSI')]
tssi_df = reverse_score(
    frames[3][['sub'] + tssi_cols].drop(columns='TSSI-00'),
    5,
    'TSSI',
)
tssi_df.head()

Unnamed: 0,sub,TSSI-05,TSSI-07,TSSI-09,TSSI-10,TSSI-11,TSSI-12,TSSI-14,TSSI-15,TSSI-16,...,TSSI-04,TSSI-06,TSSI-08,TSSI-13,TSSI-17,TSSI-18,TSSI-19,TSSI-20,TSSI-21,TSSI_sum
0,101,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,5,4,87
1,102,5,5,3,5,3,5,4,5,5,...,3,4,4,5,3,4,3,4,4,83
2,103,2,3,3,3,3,3,4,3,4,...,5,5,4,5,3,3,4,3,3,76
3,104,1,5,5,5,5,1,5,5,5,...,5,5,5,5,5,5,5,5,5,97
4,105,2,5,5,5,5,3,4,3,5,...,3,2,4,5,4,5,5,1,5,82


### Science-Related Attitudes

In [12]:
# Subject id plus every SciAtt column except 'SciAtt-00'; items whose names
# end in 'r' are reverse-scored with max_likert=5. The total column is
# labelled 'SciTude_sum'.
sciatt_cols = [c for c in frames[3].columns if c.startswith('SciAtt')]
scitude_df = reverse_score(
    frames[3][['sub'] + sciatt_cols].drop(columns='SciAtt-00'),
    5,
    'SciTude',
)
scitude_df.head()

Unnamed: 0,sub,SciAtt-01,SciAtt-02,SciAtt-08,SciAtt-11,SciAtt-13,SciAtt-14,SciAtt-16,SciAtt-17,SciAtt-18,SciAtt-03,SciAtt-04,SciAtt-05,SciAtt-06,SciAtt-07,SciAtt-09,SciAtt-10,SciAtt-12,SciAtt-15,SciTude_sum
0,101,3,1,4,1,1,3,2,2,2,1,2,2,2,2,3,3,2,2,38
1,102,3,2,2,1,1,1,1,1,1,1,1,5,3,4,1,2,4,5,39
2,103,2,1,4,3,1,3,3,3,2,1,1,3,1,4,4,4,4,3,47
3,104,1,1,3,5,1,4,1,1,1,1,1,1,1,2,2,2,2,1,31
4,105,4,3,4,4,1,2,1,1,1,2,3,1,2,5,5,3,2,2,46


### Science Literacy

In [13]:
# Subject id plus every SciLit column except 'SciLit-00'. No reverse scoring
# is applied here; the items are only totalled into 'SciLit_sum'.
scilit_cols = [c for c in frames[3].columns if c.startswith('SciLit')]
sciLit_df = sum_subscale(
    frames[3][['sub'] + scilit_cols].drop(columns='SciLit-00'),
    'SciLit',
)
sciLit_df.head()

Unnamed: 0,sub,SciLit-01,SciLit-02,SciLit-03,SciLit-04,SciLit-05,SciLit-06,SciLit-07,SciLit_sum
0,101,2,1,2,3,3,2,3,16
1,102,4,1,1,1,4,2,3,16
2,103,3,1,1,3,2,2,3,15
3,104,3,1,2,3,2,2,3,16
4,105,2,1,3,3,1,2,3,15


### Openness to Experience

In [14]:
# Subject id plus every column starting with 'O', minus the two columns that
# share the prefix but are dropped explicitly below.
o2xp_df = frames[3][['sub']+[c for c in frames[3].columns if c.startswith('O')]]
o2xp_df = o2xp_df.drop(columns=['Open-0','Original_Feedback'])
o2xp_df = o2xp_df.set_index('sub')
# Snapshot the item columns before any '<subscale>_sum' column is added so a
# freshly created total can never be picked up as an item.
o2xp_items = list(o2xp_df.columns)
# The subscale key is the part of the column name before the first '-'.
# sorted() gives a stable O1..O6 column order; iterating a bare set does not.
o2xp_subscales = sorted({c.split('-')[0] for c in o2xp_items})
for subscale in o2xp_subscales:
    # Compare the whole key rather than using startswith(), so one subscale
    # never absorbs another whose name merely begins with the same characters.
    subscale_items = [c for c in o2xp_items if c.split('-')[0] == subscale]
    o2xp_df[subscale+'_sum'] = o2xp_df[subscale_items].sum(axis=1)
o2xp_df = o2xp_df.reset_index()
o2xp_df.head()

Unnamed: 0,sub,O1-3,O1-33,O1-63,O1-93,O2-8,O2-38,O2-68,O2-98,O3-13,...,O6-28,O6-58,O6-88,O6-118,O4_sum,O3_sum,O2_sum,O1_sum,O5_sum,O6_sum
0,101,2.0,2.0,3.0,2.0,4.0,4.0,4.0,2.0,4.0,...,4.0,4.0,2.0,5.0,11.0,11.0,14.0,9.0,10.0,15.0
1,102,3.0,4.0,5.0,5.0,5.0,5.0,2.0,1.0,5.0,...,3.0,2.0,4.0,4.0,17.0,14.0,13.0,17.0,15.0,13.0
2,103,4.0,4.0,4.0,4.0,3.0,4.0,3.0,3.0,4.0,...,4.0,4.0,1.0,3.0,16.0,12.0,13.0,16.0,9.0,12.0
3,104,5.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,5.0,...,4.0,2.0,3.0,2.0,14.0,12.0,13.0,17.0,11.0,11.0
4,105,5.0,5.0,2.0,5.0,1.0,4.0,2.0,2.0,5.0,...,3.0,3.0,3.0,2.0,14.0,18.0,9.0,17.0,12.0,11.0


### Shipley Vocab

In [15]:
# Subject id plus every column starting with 'Q' or 'Score-'; the
# 'Score-sum_x' column is exposed under the name 'vocab_sum'.
vocab_cols = [c for c in frames[3].columns if c.startswith(('Q', 'Score-'))]
vocab_df = frames[3][['sub'] + vocab_cols].rename(columns={'Score-sum_x': 'vocab_sum'})
vocab_df.head()

Unnamed: 0,sub,vocab_sum,Q35,Q36,Q37,Q38,Q39,Q40,Q41,Q42,...,Q67,Q68,Q69,Q70,Q71,Q72,Q73,Q74,Q75,Score-sum_y
0,101,30.0,1.0,3.0,1.0,1.0,3.0,2.0,3.0,4.0,...,3.0,2.0,4.0,1.0,4.0,1.0,2.0,1.0,3.0,0.0
1,102,26.0,1.0,3.0,1.0,1.0,3.0,2.0,1.0,4.0,...,3.0,4.0,2.0,3.0,1.0,1.0,3.0,2.0,1.0,0.0
2,103,28.0,1.0,3.0,1.0,1.0,3.0,2.0,3.0,4.0,...,3.0,2.0,2.0,1.0,1.0,1.0,3.0,3.0,1.0,0.0
3,104,28.0,1.0,3.0,1.0,1.0,3.0,2.0,3.0,2.0,...,4.0,4.0,1.0,3.0,3.0,3.0,2.0,1.0,2.0,0.0
4,105,28.0,1.0,3.0,1.0,1.0,3.0,2.0,3.0,4.0,...,1.0,1.0,3.0,2.0,1.0,2.0,4.0,1.0,2.0,0.0


# Output

In [16]:
# Join every summary table onto the comprehension scores by subject id.
# DataFrame.merge defaults to an inner join, so a subject missing from any
# one of the tables is absent from the final result.
summary_tables = [
    nback_df[['sub', 'nb_RT', 'nb_CoR']],
    procspd_df[['sub', 'procspd_RT']],
    nfcs_df[['sub', 'NFCS_sum']],
    tssi_df[['sub', 'TSSI_sum']],
    vocab_df[['sub', 'vocab_sum']],
    sciLit_df[['sub', 'SciLit_sum']],
    scitude_df[['sub', 'SciTude_sum']],
    o2xp_df[['sub', 'O1_sum', 'O2_sum', 'O3_sum', 'O4_sum', 'O5_sum', 'O6_sum']],
]
output_df = comp_df
for table in summary_tables:
    # 'sub' is the only column the two sides share, so naming it explicitly
    # selects the same join key that merge would infer on its own.
    output_df = output_df.merge(table, on='sub')
output_df

Unnamed: 0,sub,comp_change,nb_RT,nb_CoR,procspd_RT,NFCS_sum,TSSI_sum,vocab_sum,SciLit_sum,SciTude_sum,O1_sum,O2_sum,O3_sum,O4_sum,O5_sum,O6_sum
0,101,3.0,779.888889,0.8125,308.5,61.0,87,30.0,16,38,9.0,14.0,11.0,11.0,10.0,15.0
1,102,1.0,626.272727,0.59375,303.85,48.0,83,26.0,16,39,17.0,13.0,14.0,17.0,15.0,13.0
2,103,6.0,618.571429,0.875,289.85,45.0,76,28.0,15,47,16.0,13.0,12.0,16.0,9.0,12.0
3,104,6.0,505.571429,0.875,302.95,69.0,97,28.0,16,31,17.0,13.0,12.0,14.0,11.0,11.0
4,105,1.0,742.913043,0.53125,391.052632,49.0,82,28.0,15,46,17.0,9.0,18.0,14.0,12.0,11.0
5,106,4.0,1186.7,0.3125,264.210526,75.0,77,26.0,14,38,15.0,17.0,8.0,13.0,10.0,12.0
6,107,0.0,779.25,1.0,300.5,62.0,87,23.0,16,46,20.0,12.0,12.0,13.0,6.0,15.0
7,108,6.0,721.75,0.84375,341.1,53.0,76,22.0,15,45,16.0,13.0,13.0,15.0,14.0,11.0
8,109,3.0,764.0,0.84375,311.85,83.0,95,34.0,16,22,20.0,12.0,12.0,8.0,8.0,12.0
9,110,3.0,862.857143,0.875,305.75,58.0,78,28.0,14,38,15.0,13.0,10.0,12.0,7.0,10.0


In [18]:
# to_csv does not create missing parent folders; make sure the output
# directory exists so the notebook also runs on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
output_df.to_csv(os.path.join(output_dir,'subject_level_data.csv'),index=False)