In [42]:
import urllib
import urllib.request

import pandas as pd

In [43]:
# data source: http://preprocessed-connectomes-project.org/abide/download.html
# data used: (1) summary data sheet of phenotypic data
#            (2) ANTS cortical thickness data (separate text file per participant)

In [44]:
# read the phenotypic summary sheet (downloaded beforehand from the url above) into a dataframe

pheno_csv_path = 'Phenotypic_V1_0b_preprocessed1.csv'
abide_pheno_df = pd.read_csv(pheno_csv_path)

In [45]:
# dimensions of the phenotypic table as (rows, columns)
abide_pheno_df.shape

(1112, 106)

In [46]:
# preview the first rows of the phenotypic table
abide_pheno_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,...,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP
0,0,1,50002,1,50002,PITT,no_filename,1,1,16.77,...,,OK,,fail,ic-parietal-cerebellum,OK,,fail,ERROR #24,1
1,1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,...,,OK,,OK,,OK,,OK,,1
2,2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,...,,OK,,OK,,OK,,OK,,1
3,3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,...,,OK,,maybe,ic-parietal-cerebellum,OK,,OK,,0
4,4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,...,,OK,,maybe,ic-parietal slight,OK,,OK,,1


In [47]:
# collect every FILE_ID value (one per row of the phenotypic table);
# these ids are used later to locate each subject's text file

file_list = abide_pheno_df['FILE_ID'].tolist()

In [48]:
# number of FILE_ID entries collected (one per row of the phenotypic table)
len(file_list)

1112

In [52]:
# count the rows whose FILE_ID is the 'no_filename' placeholder
has_no_file = abide_pheno_df['FILE_ID'] == 'no_filename'
len(abide_pheno_df[has_no_file])

77

In [53]:
# create an empty dataframe that will hold the structural cortical thickness data

abide_struct_df = pd.DataFrame()

In [54]:
# open each subject's text file of cortical thickness data (excluding subjects who do not have a file)
# read each file as a dataframe
# add a 'FILE_ID' column to each dataframe for easier merging with pheno data later on
# collect the per-subject dataframes in a list and concatenate them once at the end:
#   - concatenating inside the loop re-copies the accumulated dataframe on every iteration
#   - rebuilding the result from a fresh list makes this cell safe to re-run (no duplicated rows)

roi_thickness_url = 'https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/ants/roi_thickness/{}_roi_thickness.txt'

subject_frames = []
for file in file_list:
    if file == 'no_filename':
        continue
    # the context manager closes the HTTP response as soon as the file has been parsed
    with urllib.request.urlopen(roi_thickness_url.format(file)) as temp_file:
        temp_df = pd.read_csv(temp_file, sep = '\t')
    temp_df['FILE_ID'] = file
    subject_frames.append(temp_df)

# each file's own row index is kept, exactly as the incremental concat did;
# pd.concat raises on an empty list, so fall back to an empty dataframe if nothing was loaded
abide_struct_df = pd.concat(subject_frames) if subject_frames else pd.DataFrame()

In [55]:
# alternatively, we could download the individual text files and then open and merge them afterward:
# for file in file_list:
#     if file != 'no_filename':
#         urllib.request.urlretrieve('https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/ants/roi_thickness/{}_roi_thickness.txt'.format(file),
#                                    '{}_roi_thickness.txt'.format(file))

In [56]:
# dimensions of the combined cortical thickness table as (rows, columns)
abide_struct_df.shape

(1035, 100)

In [59]:
# this row count is expected: 1112 subjects - 77 subjects without a file = 1035 subjects in the final dataset

In [57]:
# preview the first rows of the combined cortical thickness table
abide_struct_df.head()

Unnamed: 0,File,Sub-brick,Mean_4,Mean_5,Mean_6,Mean_7,Mean_10,Mean_11,Mean_12,Mean_13,...,Mean_2025,Mean_2026,Mean_2027,Mean_2028,Mean_2029,Mean_2030,Mean_2031,Mean_2034,Mean_2035,FILE_ID
0,/data/Projects/ABIDE_Initiative/Outputs/ants/a...,0[?],0.003061,1.040163,0.140702,9e-06,0.002665,0.0,0.0,0.0,...,2.125226,3.498914,2.340307,2.085285,0.950366,1.807719,1.565881,1.833145,3.194357,Pitt_0050003
0,/data/Projects/ABIDE_Initiative/Outputs/ants/a...,0[?],0.011357,0.993277,0.083727,4.2e-05,0.004407,0.0,0.0,0.0,...,2.715785,3.871004,2.626445,2.471903,1.558296,2.666532,2.430273,2.103015,4.404319,Pitt_0050004
0,/data/Projects/ABIDE_Initiative/Outputs/ants/a...,0[?],0.016591,0.872625,0.078526,1.4e-05,0.003133,0.0,0.0,0.0,...,2.147347,3.228483,2.53906,2.405678,1.354254,1.967489,1.721139,1.411481,3.148464,Pitt_0050005
0,/data/Projects/ABIDE_Initiative/Outputs/ants/a...,0[?],0.015656,1.396618,0.110892,2.2e-05,0.003367,0.0,0.0,2.7e-05,...,3.026796,4.073865,2.855339,2.869385,1.870028,2.676286,2.531543,2.043278,3.620175,Pitt_0050006
0,/data/Projects/ABIDE_Initiative/Outputs/ants/a...,0[?],0.003159,0.543084,0.087172,4e-06,0.001864,0.0,0.0,0.0,...,2.176468,3.241596,2.278591,2.171605,1.151163,1.94042,1.956121,1.299927,2.978172,Pitt_0050007


In [60]:
# write the combined cortical thickness dataframe to a csv file

struct_csv_path = 'abide_cortical_thickness_20190701.csv'
abide_struct_df.to_csv(struct_csv_path)