In [1]:
import numpy as np
import pandas as pd
# Let printed frames use up to 200 characters per line before pandas wraps them
pd.set_option('display.width', 200)

def add_data(in_file, col, sub_col, colname_old, colname_new, data):
    coltypes = {'subjectkey': str, colname_old: float}
    colnames = ['Sub_Key', colname_new]
    data_curr = pd.read_table(in_file, sep='\t', header=0, skiprows=[1], usecols=[sub_col, col], dtype=coltypes, names=colnames)
    data_curr = data_curr.dropna().reset_index(drop=True).drop_duplicates(subset='Sub_Key')
    if data.empty:
        data = data_curr
    else:
        data = data.merge(data_curr, how='inner', on='Sub_Key')
    
    return data

In [6]:
# One entry per variable, in join order:
# (file, measure column, subject column, source field name, output column name)
variable_specs = [
    # Psychometric variables
    ('HCP-A/Package_1193170/cogcomp01.txt', 14, 4, 'nih_fluidcogcomp_ageadjusted', 'CogFluidComp_AgeAdj'),
    ('HCP-A/Package_1193170/nffi01.txt', 78, 4, 'neo2_score_op', 'NEOFAC_O'),
    # Confounding variables
    ('HCP-A/Package_1193170/tlbx_motor01.txt', 22, 4, 'grip_standardsc_dom', 'Strength_Dom_Unadj'),
    ('HCP-A/Package_1193170/tlbx_motor01.txt', 23, 4, 'grip_standardsc_nondom', 'Strength_NonDom_Unadj'),
    ('HCP-A/Package_1193170/ssaga_cover_demo01.txt', 5, 4, 'interview_age', 'Age'),
    ('HCP-A/Package_1193170/ssaga_cover_demo01.txt', 7, 4, 'sex', 'Sex'),
    ('HCP-A/Package_1193170/edinburgh_hand01.txt', 70, 5, 'hcp_handedness_score', 'Handedness'),
]

# Start from an empty frame and join each variable onto it in turn
data = pd.DataFrame()
for spec in variable_specs:
    data = add_data(*spec, data)
# Encode sex as one 0/1 column. pd.get_dummies returns one indicator column per
# distinct value, and a multi-column frame cannot be assigned to the single
# 'Sex' column, so keep exactly one indicator: the first column in
# get_dummies' column order.
# NOTE(review): confirm that the first level is the intended one to code as 1.
sex_indicator = pd.get_dummies(data['Sex']).iloc[:, 0].astype(int)
age_squared = np.power(data['Age'], 2)

# Replace 'Sex' with its indicator and append the quadratic age term and the
# sex-by-age interaction terms.
data = data.assign(
    Sex=sex_indicator,
    Age2=age_squared,
    SexAge=sex_indicator * data['Age'],
    SexAge2=sex_indicator * age_squared,
)

# Exclude the 3 subjects with missing scans
subjects_without_scans = ['HCA6010538', 'HCA6792190', 'HCA8239378']
rows_to_remove = data.index[data['Sub_Key'].isin(subjects_without_scans)]
data = data.drop(index=rows_to_remove)

# Write the two psychometric scores to CSV, without an index column or header row
score_columns = ['CogFluidComp_AgeAdj', 'NEOFAC_O']
data[score_columns].to_csv('HCP-A/HCP-A_AllComp.csv', index=False, header=False)

(571, 43)
