# ABIDE I and ABIDE II phenotypic dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw phenotypic tables: ABIDE I first, then the ABIDE II composite.
abide1, abide2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/Phenotypic_V1_0b.csv",
        "./data/ABIDEII_Composite_Phenotypic.csv",
    )
)

## Rename sites so that their names are consistent across the two datasets

In [3]:
# Distinct site IDs in ABIDE I, in order of first appearance.
abide1["SITE_ID"].unique()

array(['CALTECH', 'CMU', 'KKI', 'LEUVEN_1', 'LEUVEN_2', 'MAX_MUN', 'NYU',
       'OHSU', 'OLIN', 'PITT', 'SBL', 'SDSU', 'STANFORD', 'TRINITY',
       'UCLA_1', 'UCLA_2', 'UM_1', 'UM_2', 'USM', 'YALE'], dtype=object)

In [5]:
# Distinct site IDs in ABIDE II, in order of first appearance.
abide2["SITE_ID"].unique()

array(['ABIDEII-BNI_1', 'ABIDEII-EMC_1', 'ABIDEII-ETH_1', 'ABIDEII-GU_1',
       'ABIDEII-IP_1', 'ABIDEII-IU_1', 'ABIDEII-KKI_1', 'ABIDEII-KUL_3',
       'ABIDEII-NYU_1', 'ABIDEII-NYU_2', 'ABIDEII-OHSU_1',
       'ABIDEII-OILH_2', 'ABIDEII-SDSU_1', 'ABIDEII-SU_2',
       'ABIDEII-TCD_1', 'ABIDEII-UCD_1', 'ABIDEII-UCLA_1',
       'ABIDEII-U_MIA_1', 'ABIDEII-USM_1'], dtype=object)

A simple naming convention is used to label each data collection: `ABIDEII-<institution acronym name>_<collection number>` (e.g., ABIDEII-NYU_1). When a collection in ABIDE II is a continuation of one initiated in ABIDE I, we employ the same collection number used in ABIDE I (or 1 if none was used, e.g., SDSU_1, KKI_1). For new collections, a unique consecutive number is assigned (e.g., BNI_1, KUL_3). Accompanying the primary cross-sectional aggregate, two longitudinal collections are also aggregated in ABIDE II. These include MRI datasets collected as follow-ups to the MRI and phenotypic data released in ABIDE I (N total = 38 unique IDs). These pilot longitudinal collections are identified as `ABIDEII-<institution acronym name>_Long` (Table 1).

Misc notes on sites whose acronym changed between the two releases:

- LEUVEN in ABIDE I = KUL in ABIDE II
- OLIN in ABIDE I = OILH in ABIDE II
- TRINITY in ABIDE I = TCD in ABIDE II

In [6]:
# Remove the "ABIDEII-" prefix so ABIDE II site IDs look like "NYU_1".
#
# This is a single vectorised assignment instead of a loop over
# `Series.iteritems()`, which was deprecated in pandas 1.5 and removed in 2.0.
# Keeping the text after the last "-" matches the original per-row logic.
# Re-running the cell is harmless: IDs that no longer contain a "-" are left
# unchanged, and missing IDs stay missing instead of raising.
abide2["SITE_ID"] = abide2["SITE_ID"].str.split("-").str[-1]

In [7]:
# Give every un-numbered ABIDE I site a collection number of 1
# (e.g. "NYU" -> "NYU_1"). IDs already ending in "_1" or "_2" are kept as-is.
#
# This is a single vectorised assignment instead of a loop over
# `Series.iteritems()`, which was deprecated in pandas 1.5 and removed in 2.0.
# `na=False` treats a missing ID as un-numbered; NaN + "_1" is still NaN, so
# missing IDs stay missing instead of raising.
abide1_sites = abide1["SITE_ID"]
is_numbered = (
    abide1_sites.str.endswith("_1", na=False)
    | abide1_sites.str.endswith("_2", na=False)
)
abide1["SITE_ID"] = abide1_sites.where(is_numbered, abide1_sites + "_1")

In [8]:
# Rename and consolidate site names: switch the ABIDE I site IDs to the
# acronyms used in ABIDE II.
site_renames = {
    # Leuven to KUL
    "LEUVEN_1": "KUL_1",
    "LEUVEN_2": "KUL_2",
    # OLIN to OILH
    "OLIN_1": "OILH_1",
    # Trinity to TCD
    "TRINITY_1": "TCD_1",
}

# Assign the result back to the column rather than calling
# `abide1.SITE_ID.replace(..., inplace=True)`. That form mutates a Series
# obtained through attribute access; with pandas Copy-on-Write it no longer
# updates `abide1`, and pandas 2.2 already warns about it.
abide1["SITE_ID"] = abide1["SITE_ID"].replace(site_renames)

In [9]:
# Sorted, de-duplicated list of every site ID across both releases.
np.union1d(abide1["SITE_ID"], abide2["SITE_ID"])

array(['BNI_1', 'CALTECH_1', 'CMU_1', 'EMC_1', 'ETH_1', 'GU_1', 'IP_1',
       'IU_1', 'KKI_1', 'KUL_1', 'KUL_2', 'KUL_3', 'MAX_MUN_1', 'NYU_1',
       'NYU_2', 'OHSU_1', 'OILH_1', 'OILH_2', 'PITT_1', 'SBL_1', 'SDSU_1',
       'STANFORD_1', 'SU_2', 'TCD_1', 'UCD_1', 'UCLA_1', 'UCLA_2', 'UM_1',
       'UM_2', 'USM_1', 'U_MIA_1', 'YALE_1'], dtype=object)

## Misc data cleaning

In [40]:
# The ABIDE II age column header carries a trailing space ("AGE_AT_SCAN ").
# Drop it so the column lines up with ABIDE I's "AGE_AT_SCAN".
abide2 = abide2.rename(columns={"AGE_AT_SCAN ": "AGE_AT_SCAN"})

## Join dataframes

In [41]:
# Stack ABIDE I on top of ABIDE II. `join="inner"` keeps only the columns the
# two tables share, and `ignore_index=True` renumbers the rows from zero.
df = pd.concat(
    objs=[abide1, abide2],
    join="inner",
    ignore_index=True,
)

In [44]:
# Preview the first five rows of the combined table.
df.head(n=5)

Unnamed: 0,SITE_ID,SUB_ID,DX_GROUP,AGE_AT_SCAN,SEX,HANDEDNESS_CATEGORY,HANDEDNESS_SCORES,FIQ,VIQ,PIQ,...,VINELAND_COMMUNITY_V_SCALED,VINELAND_INTERPERSONAL_V_SCALED,VINELAND_PLAY_V_SCALED,VINELAND_COPING_V_SCALED,VINELAND_SOCIAL_STANDARD,VINELAND_SUM_SCORES,VINELAND_INFORMANT,EYE_STATUS_AT_SCAN,AGE_AT_MPRAGE,BMI
0,CALTECH_1,51456,1,55.4,1,R,,126.0,118.0,128.0,...,,,,,,,,2.0,,
1,CALTECH_1,51457,1,22.9,1,Ambi,,107.0,119.0,93.0,...,,,,,,,,2.0,,
2,CALTECH_1,51458,1,39.2,1,R,,93.0,80.0,108.0,...,,,,,,,,2.0,,
3,CALTECH_1,51459,1,22.8,1,R,,106.0,94.0,118.0,...,,,,,,,,2.0,,
4,CALTECH_1,51460,1,34.6,2,Ambi,,133.0,135.0,122.0,...,,,,,,,,2.0,,


In [50]:
# Count how often each value occurs in the site, diagnosis and sex columns;
# the result has one column of counts per variable.
df[["SITE_ID", "DX_GROUP", "SEX"]].apply(pd.Series.value_counts)

Unnamed: 0,SITE_ID,DX_GROUP,SEX
1,,1060.0,1804.0
2,,1166.0,422.0
BNI_1,58.0,,
CALTECH_1,38.0,,
CMU_1,27.0,,
EMC_1,54.0,,
ETH_1,37.0,,
GU_1,106.0,,
IP_1,56.0,,
IU_1,40.0,,


In [48]:
# Write the combined table to disk without the integer row index.
df.to_csv(
    path_or_buf="./data/composite_phenotypic_data.csv",
    index=False,
)