# Create dataset for statistical analysis
Code adapted by Aurora Berto and Isabella L.C. Mariani Wigley for PONS project (05 / 2025)

aurber@utu.fi; ilmawi@utu.fi

In [1]:
# import libraries and check versions

import numpy as np
import pandas as pd
import sklearn
import matplotlib
import scipy
import os
import sys

from itertools import product

from matplotlib import pyplot as plt

from sklearn.linear_model import ElasticNetCV

import warnings
warnings.filterwarnings("ignore")

# report the interpreter version first, then the version of each key library
print(sys.version)
for library in (np, pd, sklearn, matplotlib, scipy):
    print(library.__version__)

3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
1.26.4
2.3.0
1.7.0
3.10.3
1.15.3


In [2]:
# particular sklearn modules to use in ML analyses

from sklearn.model_selection import cross_validate, train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import clone

In [None]:
# path to the directory containing the data
# (the loading cells join this root with relative paths that start with
# "abcd-data-release-5.0/...", so it must be the folder holding that directory)

tabularData_root = r"/path/to/data" #USER.adapt!

In [4]:
# read non-imaging data

def _read_abcd_table(csv_path, columns, baseline_only=False):
    """Load selected columns from one tabular CSV file.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    columns : list of str
        Columns to keep, in the order they should appear in the result.
    baseline_only : bool, default False
        If True, keep only the rows whose ``eventname`` equals
        ``"baseline_year_1_arm_1"``.

    Returns
    -------
    pandas.DataFrame
        The requested columns (restricted to baseline rows when asked).
    """
    # parse only the requested columns instead of every column in the file
    table = pd.read_csv(csv_path, usecols=columns)
    # usecols ignores element order, so re-select to restore the requested order
    table = table.loc[:, columns]
    if baseline_only:
        table = table[table["eventname"] == "baseline_year_1_arm_1"]
    return table


# anthropometrics
ph_y_anthro_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/physical-health/ph_y_anthro.csv")
ph_y_anthro = _read_abcd_table(ph_y_anthro_path,
                               ["src_subject_id", "eventname", "anthro_1_height_in", "anthroweight1lb"])

# demographics
abcd_p_demo_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/abcd-general/abcd_p_demo.csv")
abcd_p_demo = _read_abcd_table(abcd_p_demo_path,
                               ["src_subject_id", "eventname", "demo_sex_v2", "race_ethnicity"])

# administrative info
mri_y_adm_info_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/imaging/mri_y_adm_info.csv")
mri_y_adm_info = _read_abcd_table(mri_y_adm_info_path,
                                  ["src_subject_id", "eventname", "mri_info_visitid", "mri_info_deviceserialnumber"])

# longitudinal tracking
abcd_y_lt_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/abcd-general/abcd_y_lt.csv")
abcd_y_lt = _read_abcd_table(abcd_y_lt_path,
                             ["src_subject_id", "eventname", "interview_age"])

# phys. health youth (puberty, reported by youth)
ph_y_pds_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/physical-health/ph_y_pds.csv")
ph_y_pds = _read_abcd_table(ph_y_pds_path,
                            ["src_subject_id", "eventname", "pds_y_ss_female_category",
                             "pds_y_ss_male_category"])

# phys. health parent (puberty, reported by parent)
ph_p_pds_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/physical-health/ph_p_pds.csv")
ph_p_pds = _read_abcd_table(ph_p_pds_path,
                            ["src_subject_id", "eventname", "pds_p_ss_female_category",
                             "pds_p_ss_male_category"])

# genetics, relatedness (twins) -- baseline rows only
gen_y_pihat_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/genetics/gen_y_pihat.csv")
gen_y_pihat = _read_abcd_table(gen_y_pihat_path,
                               ["src_subject_id", "eventname", "rel_family_id", "rel_group_id"],
                               baseline_only=True)

# weight at birth -- baseline rows only
ph_p_dhx_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/physical-health/ph_p_dhx.csv")
ph_p_dhx = _read_abcd_table(ph_p_dhx_path,
                            ["src_subject_id", "eventname", "birth_weight_lbs"],
                            baseline_only=True)

In [5]:
# get rs-fmri imaging tables: gordon network correlations and gordon - subcortical correlations

# gordon network correlations (all columns are kept)
mri_y_rsfmr_cor_gp_gp_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/imaging/mri_y_rsfmr_cor_gp_gp.csv")
mri_y_rsfmr_cor_gp_gp = pd.read_csv(mri_y_rsfmr_cor_gp_gp_path)

# gordon network correlations to subcortical (all columns are kept)
mri_y_rsfmr_cor_gp_aseg_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/imaging/mri_y_rsfmr_cor_gp_aseg.csv")
mri_y_rsfmr_cor_gp_aseg = pd.read_csv(mri_y_rsfmr_cor_gp_aseg_path)

# get image inclusion recommendations
# (path written with single separators, consistent with the other imaging tables)
mri_y_qc_incl_path = os.path.join(tabularData_root, "abcd-data-release-5.0/core/imaging/mri_y_qc_incl.csv")
mri_y_qc_incl_columns = ["src_subject_id", "eventname", "imgincl_rsfmri_include"]
# parse only the needed columns; usecols ignores element order, so re-select to fix the order
mri_y_qc_incl = pd.read_csv(mri_y_qc_incl_path, usecols=mri_y_qc_incl_columns)
mri_y_qc_incl = mri_y_qc_incl.loc[:, mri_y_qc_incl_columns]

In [None]:
### Load LEiDA results
megaLEiDA_root = r"/path/to/pooledLEiDA/results" #USER.adapt!
file_name = "megaLEiDA_outcomes_to_K20_dataset.csv"

# Build the full path of the results file
file_path = os.path.join(megaLEiDA_root, file_name)

# Read the CSV and rename its "ID" column to "src_subject_id",
# the key later used to merge these results into the main table
leida_results = pd.read_csv(file_path).rename(columns={"ID": "src_subject_id"})
leida_results

Unnamed: 0,site,src_subject_id,P_k2c1,P_k2c2,P_k3c1,P_k3c2,P_k3c3,P_k4c1,P_k4c2,P_k4c3,...,TR_K20_C20x11,TR_K20_C20x12,TR_K20_C20x13,TR_K20_C20x14,TR_K20_C20x15,TR_K20_C20x16,TR_K20_C20x17,TR_K20_C20x18,TR_K20_C20x19,TR_K20_C20x20
0,G010,NDAR_INV03XVEBPM,0.589947,0.410053,0.529101,0.190476,0.280423,0.457672,0.267196,0.121693,...,0.000000,0.035181,0.017590,0.0,0.035181,0.00000,0.000000,0.000000,0.0,0.879520
1,G010,NDAR_INV05LGG3GZ,0.698413,0.301587,0.552910,0.343915,0.103175,0.436508,0.171958,0.335979,...,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,0.000000
2,G010,NDAR_INV097LUBWX,0.555556,0.444444,0.476190,0.312169,0.211640,0.378307,0.198413,0.259259,...,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.100265,0.0,0.701857
3,G010,NDAR_INV0MPBK7TU,0.433862,0.566138,0.309524,0.097884,0.592593,0.179894,0.227513,0.171958,...,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,0.000000
4,G010,NDAR_INV15M33G49,0.394180,0.605820,0.269841,0.624339,0.105820,0.198413,0.317460,0.420635,...,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6619,S090,NDAR_INVZGLD1V00,0.193122,0.806878,0.132275,0.642857,0.224868,0.084656,0.568783,0.235450,...,0.000000,0.000000,0.000000,0.0,0.501326,0.00000,0.000000,0.000000,0.0,0.501326
6620,S090,NDAR_INVZKP2G8H4,0.296296,0.703704,0.164021,0.497354,0.338624,0.132275,0.386243,0.301587,...,0.055703,0.055703,0.000000,0.0,0.000000,0.00000,0.055703,0.000000,0.0,0.835544
6621,S090,NDAR_INVZM7EZFZF,0.232804,0.767196,0.105820,0.650794,0.243386,0.076720,0.277778,0.478836,...,0.000000,0.000000,0.143236,0.0,0.000000,0.00000,0.000000,0.000000,0.0,0.716180
6622,S090,NDAR_INVZT7CGM7G,0.296296,0.703704,0.230159,0.690476,0.079365,0.222222,0.343915,0.388889,...,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,0.835544


In [None]:
### Load harmonics results
harmonics_root = r"/path/to/FCH/metrics" #USER.adapt!
power_file_name = "Extracted_Harmonics_power.csv"
energy_file_name = "Extracted_Harmonics_energy.csv"

# Merge them into a full file path
power_file_path = os.path.join(harmonics_root, power_file_name)
energy_file_path = os.path.join(harmonics_root, energy_file_name)

# Load the CSV files
power_fch_results = pd.read_csv(power_file_path)
energy_fch_results = pd.read_csv(energy_file_path)

# The two tables are placed side by side and paired through their row index only.
# A differing number of rows would be padded with NaN silently, so fail loudly instead.
if len(power_fch_results) != len(energy_fch_results):
    raise ValueError(
        "Harmonics power and energy tables must have the same number of rows "
        "to be concatenated side by side; got {} and {}".format(
            len(power_fch_results), len(energy_fch_results)
        )
    )

# Concatenate harmonics metrics
harmonics_results = pd.concat([power_fch_results, energy_fch_results], axis=1)
harmonics_results

Unnamed: 0,Harmonics_power1,Harmonics_power2,Harmonics_power3,Harmonics_power4,Harmonics_power5,Harmonics_power6,Harmonics_power7,Harmonics_power8,Harmonics_power9,Harmonics_power10,...,Harmonics_energy104,Harmonics_energy105,Harmonics_energy106,Harmonics_energy107,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113
0,1.783521,2.688885,2.055411,2.075408,1.871080,1.408708,1.196510,1.833336,1.045182,1.027416,...,462.888314,223.317183,459.664177,322.604754,276.629610,265.813110,205.953641,432.280370,321.465018,341.670575
1,2.159388,2.577299,1.988765,1.911012,1.691410,1.505946,1.159920,1.107280,1.340075,1.266035,...,172.759383,225.637499,436.564375,326.120613,284.207251,359.351516,527.488489,505.815260,251.019860,379.406635
2,2.086070,2.664903,1.695239,1.607686,1.581769,1.960928,0.953869,1.278666,1.226120,1.334289,...,405.288384,306.392533,353.350951,277.818015,219.305059,243.236308,347.189513,400.985365,585.154055,638.891133
3,2.694713,2.590958,1.957676,2.132201,0.949357,1.680173,1.227515,1.703912,1.079348,1.282134,...,126.399057,228.569720,200.756679,235.971648,229.251685,501.230809,229.224788,233.686823,570.151894,506.548170
4,2.984622,3.239145,2.204868,1.874831,1.833388,1.480845,1.089543,1.312500,1.192105,1.120488,...,160.345031,208.778713,143.962055,110.864857,158.648955,350.628159,157.635268,266.785316,157.178072,103.730407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6619,2.596752,2.938736,1.732969,2.280949,1.631087,1.449912,1.320928,1.223210,1.215662,1.051116,...,128.990872,335.886707,204.788347,149.869865,237.975286,268.545082,186.387304,172.739344,166.635459,335.924767
6620,3.076791,2.580442,2.262474,1.867507,1.505603,1.382757,1.191697,1.635833,1.420014,1.163509,...,301.496409,142.953525,256.541567,276.321372,268.295691,418.423218,229.806521,220.935554,331.364448,300.727476
6621,3.345961,2.692583,1.996797,1.966084,1.288026,1.519338,1.636538,1.789212,1.365878,0.736602,...,213.283394,333.566657,207.325726,90.202241,275.211593,250.386482,321.633412,99.068609,104.665283,338.594991
6622,3.381917,3.161010,1.822705,1.646129,1.309378,1.941497,1.311219,1.182542,1.065619,1.202104,...,206.649705,127.245289,410.279948,108.523645,221.874848,215.694092,416.430330,267.037835,171.166985,302.959327


In [8]:
# fmri tables include both A - B and B - A correlations -> greater dimensionality with redundant information
# find the duplicated columns: two columns holding the same values correlate perfectly

# column-by-column correlation matrix of the numeric columns
corrs = mri_y_rsfmr_cor_gp_gp.corr(numeric_only=True)
# keep the strict upper triangle only (k=1 zeroes the diagonal and everything below it),
# so each pair of columns is looked at once and self-correlations are ignored
corrs_np = np.triu(corrs.to_numpy(), k=1)
# (row, column) positions where the correlation equals exactly 1.0
sames = np.nonzero(corrs_np == 1.0)
# the column-side member of each perfect pair is the one marked for removal
to_drop = corrs.index[sames[1]].tolist()

# to_drop

In [9]:
# remove the columns listed in to_drop (the duplicate correlations)
mri_y_rsfmr_cor_gp_gp = mri_y_rsfmr_cor_gp_gp.drop(columns=to_drop)

mri_y_rsfmr_cor_gp_gp # subject_id, eventname, 78 inter- and 13 intranetwork correlations

Unnamed: 0,src_subject_id,eventname,rsfmri_c_ngd_ad_ngd_ad,rsfmri_c_ngd_ad_ngd_cgc,rsfmri_c_ngd_ad_ngd_ca,rsfmri_c_ngd_ad_ngd_dt,rsfmri_c_ngd_ad_ngd_dla,rsfmri_c_ngd_ad_ngd_fo,rsfmri_c_ngd_ad_ngd_n,rsfmri_c_ngd_ad_ngd_rspltp,...,rsfmri_c_ngd_smh_ngd_smh,rsfmri_c_ngd_smh_ngd_smm,rsfmri_c_ngd_smh_ngd_vta,rsfmri_c_ngd_smh_ngd_vs,rsfmri_c_ngd_smm_ngd_smm,rsfmri_c_ngd_smm_ngd_vta,rsfmri_c_ngd_smm_ngd_vs,rsfmri_c_ngd_vta_ngd_vta,rsfmri_c_ngd_vta_ngd_vs,rsfmri_c_ngd_vs_ngd_vs
0,NDAR_INV003RTV85,baseline_year_1_arm_1,0.471330,0.256267,-0.076960,-0.116451,0.022202,-0.036302,-0.057183,-0.048132,...,0.314437,0.056185,-0.042587,-0.064416,0.610255,-0.042176,0.098679,0.233819,-0.155134,0.371960
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,0.279435,0.116256,0.063664,-0.024781,-0.000840,-0.023421,-0.016284,0.022696,...,0.302632,0.062337,0.023958,-0.061459,0.506645,-0.007564,0.075145,0.173929,-0.071943,0.346746
2,NDAR_INV007W6H7B,baseline_year_1_arm_1,0.294463,0.209772,-0.071834,-0.138693,-0.035168,0.044412,-0.041321,-0.185939,...,0.288158,0.088796,0.002308,-0.060259,0.683473,0.026875,0.070566,0.257184,-0.076156,0.400397
3,NDAR_INV00BD7VDC,baseline_year_1_arm_1,0.241918,0.163942,-0.090651,-0.044039,0.012523,-0.022455,-0.005482,-0.124627,...,0.333995,0.056474,0.020649,-0.029856,0.477314,0.011466,-0.038823,0.167689,-0.092095,0.334313
4,NDAR_INV00CY2MDM,2_year_follow_up_y_arm_1,0.343300,0.192000,-0.083796,-0.082260,0.007287,-0.082282,-0.042986,-0.051295,...,0.355581,0.117752,-0.042582,-0.073160,0.667292,-0.035790,0.151230,0.244665,-0.098899,0.514145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22125,NDAR_INVZZZ2ALR6,baseline_year_1_arm_1,0.411105,0.209193,-0.041473,-0.063504,-0.102791,-0.046049,-0.009828,-0.162120,...,0.302695,0.071764,-0.082954,0.000854,0.723260,0.029845,0.030236,0.301422,-0.153170,0.351791
22126,NDAR_INVZZZNB0XC,4_year_follow_up_y_arm_1,0.363400,0.090317,-0.231313,-0.150431,-0.010488,-0.140455,-0.063671,0.018483,...,0.407501,0.318553,-0.023083,0.174555,0.780099,-0.011418,0.119748,0.152578,-0.010881,0.392232
22127,NDAR_INVZZZNB0XC,baseline_year_1_arm_1,0.270290,0.147113,-0.111270,-0.093982,-0.005676,-0.083171,-0.018443,-0.074797,...,0.252571,0.072439,-0.033312,-0.040876,0.473995,-0.011148,-0.019973,0.157045,-0.105656,0.381383
22128,NDAR_INVZZZP87KR,2_year_follow_up_y_arm_1,0.411610,0.180035,-0.008353,-0.053541,-0.019124,-0.126567,-0.017896,0.031657,...,0.363188,0.107130,-0.117570,-0.004289,0.513691,-0.039455,0.000660,0.240772,-0.054846,0.203973


In [10]:
# merge tables to one dataframe

# a list of columns to merge on
merge_on = ["src_subject_id", "eventname"]

# non-imaging tables first, then the imaging tables; they are merged in this order
# with merge's default (inner) join, so only subject/event pairs present in every table remain
tables_to_merge = [
    # non-imaging data tables
    abcd_p_demo,
    mri_y_adm_info,
    abcd_y_lt,
    ph_y_pds,
    ph_p_pds,
    gen_y_pihat,
    ph_p_dhx,
    # imaging data tables
    mri_y_rsfmr_cor_gp_gp,
    mri_y_rsfmr_cor_gp_aseg,
    mri_y_qc_incl,
]

data = ph_y_anthro
for table in tables_to_merge:
    data = data.merge(table, on=merge_on)

# LEiDA and harmonics results
# The two result tables are placed side by side and paired through their row index only.
# A differing number of rows would be padded with NaN silently and could attach metrics
# to the wrong subject, so fail loudly instead.
if len(leida_results) != len(harmonics_results):
    raise ValueError(
        "leida_results and harmonics_results must have the same number of rows "
        "to be concatenated side by side; got {} and {}".format(
            len(leida_results), len(harmonics_results)
        )
    )
connectome_results = pd.concat([leida_results, harmonics_results], axis=1)

# the connectome results carry no eventname column here, so they are merged on subject ID alone
data = data.merge(connectome_results, on="src_subject_id")

data

Unnamed: 0,src_subject_id,eventname,anthro_1_height_in,anthroweight1lb,demo_sex_v2,race_ethnicity,mri_info_visitid,mri_info_deviceserialnumber,interview_age,pds_y_ss_female_category,...,Harmonics_energy104,Harmonics_energy105,Harmonics_energy106,Harmonics_energy107,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113
0,NDAR_INV003RTV85,baseline_year_1_arm_1,56.5,93.0,2.0,1.0,S042_INV003RTV85_baseline,HASH96a0c182,131.0,3.0,...,127.630559,227.400249,252.183920,142.608781,158.319426,326.542790,192.675931,171.474886,350.606622,228.556499
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,56.5,100.0,2.0,3.0,G031_INV005V6D2C_baseline,HASHe3ce02d3,121.0,,...,197.202327,245.524361,197.758253,142.962250,348.339724,343.428212,178.832274,178.995244,255.941254,274.102060
2,NDAR_INV00BD7VDC,baseline_year_1_arm_1,57.5,76.8,1.0,1.0,S090_INV00BD7VDC_baseline,HASH65b39280,112.0,,...,190.426120,270.456396,211.474409,524.022363,238.030667,506.216418,290.961745,209.787572,163.663905,348.097102
3,NDAR_INV00CY2MDM,baseline_year_1_arm_1,56.5,91.5,1.0,1.0,S021_INV00CY2MDM_baseline,HASHd422be27,130.0,,...,193.498699,169.819136,190.305110,254.987769,229.638996,228.446672,486.850276,488.304623,167.734859,563.542535
4,NDAR_INV00HEV6HB,baseline_year_1_arm_1,57.3,70.8,1.0,2.0,S012_INV00HEV6HB_baseline,HASHe4f6957a,124.0,,...,130.409553,70.583714,150.989921,149.453592,234.686758,269.924178,213.925214,245.301264,287.884664,591.824885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6542,NDAR_INVZZFG6J5U,baseline_year_1_arm_1,59.2,159.3,2.0,1.0,G032_INVZZFG6J5U_baseline,HASH4b0b8b05,129.0,3.0,...,317.924671,325.369158,373.087089,287.952766,366.704236,402.474420,222.845300,303.966891,360.215854,412.811383
6543,NDAR_INVZZJ3A7BK,baseline_year_1_arm_1,59.0,137.0,2.0,1.0,S042_INVZZJ3A7BK_baseline,HASH96a0c182,122.0,3.0,...,197.966008,253.303153,128.128514,204.454467,178.583760,320.437889,171.179551,157.742490,192.919296,340.688716
6544,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,59.5,123.0,2.0,1.0,S042_INVZZLZCKAY_baseline,HASH96a0c182,110.0,3.0,...,342.867369,164.908164,328.745927,109.322212,161.903962,421.030194,252.015164,370.685204,228.116156,328.253479
6545,NDAR_INVZZNX6W2P,baseline_year_1_arm_1,56.0,73.0,1.0,1.0,S020_INVZZNX6W2P_baseline,HASH11ad4ed5,131.0,,...,229.312126,140.047972,294.135866,178.133345,211.757970,281.201676,433.081786,381.094465,112.344809,223.804002


In [None]:
# read sample inclusion file
included_root = r"/path/to/tabulates/abcd-data-release-5.0" #USER.adapt!
included_path = os.path.join(included_root, "Sample_variable.xlsx")

# only the "Baseline" sheet of the workbook is loaded
included = pd.read_excel(io=included_path, sheet_name="Baseline")

included

Unnamed: 0,ID,Age,Sex at birth,Puberty,Race,Education,TMI
0,NDAR_INV003RTV85,10.0,2,3.0,1,13,14.27
1,NDAR_INV007W6H7B,10.0,1,1.5,1,19,12.71
2,NDAR_INV00BD7VDC,9.0,1,2.0,1,20,11.18
3,NDAR_INV00CY2MDM,10.0,1,2.0,1,15,14.04
4,NDAR_INV00HEV6HB,10.0,1,2.0,2,13,10.43
...,...,...,...,...,...,...,...
8994,NDAR_INVZZJ3A7BK,10.0,2,3.0,1,15,18.46
8995,NDAR_INVZZLZCKAY,9.0,2,3.0,1,15,16.16
8996,NDAR_INVZZNX6W2P,10.0,1,1.0,1,18,11.51
8997,NDAR_INVZZPKBDAC,9.0,2,3.0,1,19,12.21


In [12]:
# use screening file from Ru for excluding based on medication, health etc.

included_ids = included["ID"]

# keep only the rows whose subject ID appears in the inclusion list
is_included = data["src_subject_id"].isin(included_ids)
data = data.loc[is_included]

data

Unnamed: 0,src_subject_id,eventname,anthro_1_height_in,anthroweight1lb,demo_sex_v2,race_ethnicity,mri_info_visitid,mri_info_deviceserialnumber,interview_age,pds_y_ss_female_category,...,Harmonics_energy104,Harmonics_energy105,Harmonics_energy106,Harmonics_energy107,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113
0,NDAR_INV003RTV85,baseline_year_1_arm_1,56.5,93.0,2.0,1.0,S042_INV003RTV85_baseline,HASH96a0c182,131.0,3.0,...,127.630559,227.400249,252.183920,142.608781,158.319426,326.542790,192.675931,171.474886,350.606622,228.556499
2,NDAR_INV00BD7VDC,baseline_year_1_arm_1,57.5,76.8,1.0,1.0,S090_INV00BD7VDC_baseline,HASH65b39280,112.0,,...,190.426120,270.456396,211.474409,524.022363,238.030667,506.216418,290.961745,209.787572,163.663905,348.097102
3,NDAR_INV00CY2MDM,baseline_year_1_arm_1,56.5,91.5,1.0,1.0,S021_INV00CY2MDM_baseline,HASHd422be27,130.0,,...,193.498699,169.819136,190.305110,254.987769,229.638996,228.446672,486.850276,488.304623,167.734859,563.542535
4,NDAR_INV00HEV6HB,baseline_year_1_arm_1,57.3,70.8,1.0,2.0,S012_INV00HEV6HB_baseline,HASHe4f6957a,124.0,,...,130.409553,70.583714,150.989921,149.453592,234.686758,269.924178,213.925214,245.301264,287.884664,591.824885
5,NDAR_INV00LH735Y,baseline_year_1_arm_1,52.0,80.0,1.0,3.0,S011_INV00LH735Y_baseline,HASH5b0cf1bb,109.0,,...,133.995125,179.384664,207.861206,120.541056,155.455954,280.664438,254.253108,186.141947,183.348564,204.713464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6540,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,57.0,111.0,2.0,1.0,S042_INVZZ6ZJ2KY_baseline,HASH96a0c182,124.0,3.0,...,154.099349,172.161581,415.183771,120.998470,168.400396,229.004979,174.781912,303.759961,150.655009,233.486270
6541,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,53.5,57.8,1.0,2.0,S076_INVZZ81LEEV_baseline,HASH03db707f,108.0,,...,306.853265,179.750506,245.080396,114.355453,323.998671,474.123361,239.067239,278.370679,824.045017,140.978122
6543,NDAR_INVZZJ3A7BK,baseline_year_1_arm_1,59.0,137.0,2.0,1.0,S042_INVZZJ3A7BK_baseline,HASH96a0c182,122.0,3.0,...,197.966008,253.303153,128.128514,204.454467,178.583760,320.437889,171.179551,157.742490,192.919296,340.688716
6544,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,59.5,123.0,2.0,1.0,S042_INVZZLZCKAY_baseline,HASH96a0c182,110.0,3.0,...,342.867369,164.908164,328.745927,109.322212,161.903962,421.030194,252.015164,370.685204,228.116156,328.253479


In [13]:
# drop twins

# rel_group_id: "Group ID (twins and triplets in the same family share a group ID)"
# rows sharing the same [rel_family_id, rel_group_id] pair are treated as one duplicate set;
# the first row of each set is kept and the later ones are removed
twin_keys = ["rel_family_id", "rel_group_id"]
is_repeated_twin = data.duplicated(subset=twin_keys, keep="first")
data = data.loc[~is_repeated_twin]

data

Unnamed: 0,src_subject_id,eventname,anthro_1_height_in,anthroweight1lb,demo_sex_v2,race_ethnicity,mri_info_visitid,mri_info_deviceserialnumber,interview_age,pds_y_ss_female_category,...,Harmonics_energy104,Harmonics_energy105,Harmonics_energy106,Harmonics_energy107,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113
0,NDAR_INV003RTV85,baseline_year_1_arm_1,56.5,93.0,2.0,1.0,S042_INV003RTV85_baseline,HASH96a0c182,131.0,3.0,...,127.630559,227.400249,252.183920,142.608781,158.319426,326.542790,192.675931,171.474886,350.606622,228.556499
2,NDAR_INV00BD7VDC,baseline_year_1_arm_1,57.5,76.8,1.0,1.0,S090_INV00BD7VDC_baseline,HASH65b39280,112.0,,...,190.426120,270.456396,211.474409,524.022363,238.030667,506.216418,290.961745,209.787572,163.663905,348.097102
3,NDAR_INV00CY2MDM,baseline_year_1_arm_1,56.5,91.5,1.0,1.0,S021_INV00CY2MDM_baseline,HASHd422be27,130.0,,...,193.498699,169.819136,190.305110,254.987769,229.638996,228.446672,486.850276,488.304623,167.734859,563.542535
4,NDAR_INV00HEV6HB,baseline_year_1_arm_1,57.3,70.8,1.0,2.0,S012_INV00HEV6HB_baseline,HASHe4f6957a,124.0,,...,130.409553,70.583714,150.989921,149.453592,234.686758,269.924178,213.925214,245.301264,287.884664,591.824885
5,NDAR_INV00LH735Y,baseline_year_1_arm_1,52.0,80.0,1.0,3.0,S011_INV00LH735Y_baseline,HASH5b0cf1bb,109.0,,...,133.995125,179.384664,207.861206,120.541056,155.455954,280.664438,254.253108,186.141947,183.348564,204.713464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6540,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,57.0,111.0,2.0,1.0,S042_INVZZ6ZJ2KY_baseline,HASH96a0c182,124.0,3.0,...,154.099349,172.161581,415.183771,120.998470,168.400396,229.004979,174.781912,303.759961,150.655009,233.486270
6541,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,53.5,57.8,1.0,2.0,S076_INVZZ81LEEV_baseline,HASH03db707f,108.0,,...,306.853265,179.750506,245.080396,114.355453,323.998671,474.123361,239.067239,278.370679,824.045017,140.978122
6543,NDAR_INVZZJ3A7BK,baseline_year_1_arm_1,59.0,137.0,2.0,1.0,S042_INVZZJ3A7BK_baseline,HASH96a0c182,122.0,3.0,...,197.966008,253.303153,128.128514,204.454467,178.583760,320.437889,171.179551,157.742490,192.919296,340.688716
6544,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,59.5,123.0,2.0,1.0,S042_INVZZLZCKAY_baseline,HASH96a0c182,110.0,3.0,...,342.867369,164.908164,328.745927,109.322212,161.903962,421.030194,252.015164,370.685204,228.116156,328.253479


In [14]:
# drop based on QC

# keep only rows whose rs-fMRI inclusion flag equals 1
passes_qc = data["imgincl_rsfmri_include"].eq(1)
data = data.loc[passes_qc]

data

Unnamed: 0,src_subject_id,eventname,anthro_1_height_in,anthroweight1lb,demo_sex_v2,race_ethnicity,mri_info_visitid,mri_info_deviceserialnumber,interview_age,pds_y_ss_female_category,...,Harmonics_energy104,Harmonics_energy105,Harmonics_energy106,Harmonics_energy107,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113
0,NDAR_INV003RTV85,baseline_year_1_arm_1,56.5,93.0,2.0,1.0,S042_INV003RTV85_baseline,HASH96a0c182,131.0,3.0,...,127.630559,227.400249,252.183920,142.608781,158.319426,326.542790,192.675931,171.474886,350.606622,228.556499
2,NDAR_INV00BD7VDC,baseline_year_1_arm_1,57.5,76.8,1.0,1.0,S090_INV00BD7VDC_baseline,HASH65b39280,112.0,,...,190.426120,270.456396,211.474409,524.022363,238.030667,506.216418,290.961745,209.787572,163.663905,348.097102
3,NDAR_INV00CY2MDM,baseline_year_1_arm_1,56.5,91.5,1.0,1.0,S021_INV00CY2MDM_baseline,HASHd422be27,130.0,,...,193.498699,169.819136,190.305110,254.987769,229.638996,228.446672,486.850276,488.304623,167.734859,563.542535
4,NDAR_INV00HEV6HB,baseline_year_1_arm_1,57.3,70.8,1.0,2.0,S012_INV00HEV6HB_baseline,HASHe4f6957a,124.0,,...,130.409553,70.583714,150.989921,149.453592,234.686758,269.924178,213.925214,245.301264,287.884664,591.824885
5,NDAR_INV00LH735Y,baseline_year_1_arm_1,52.0,80.0,1.0,3.0,S011_INV00LH735Y_baseline,HASH5b0cf1bb,109.0,,...,133.995125,179.384664,207.861206,120.541056,155.455954,280.664438,254.253108,186.141947,183.348564,204.713464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6540,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,57.0,111.0,2.0,1.0,S042_INVZZ6ZJ2KY_baseline,HASH96a0c182,124.0,3.0,...,154.099349,172.161581,415.183771,120.998470,168.400396,229.004979,174.781912,303.759961,150.655009,233.486270
6541,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,53.5,57.8,1.0,2.0,S076_INVZZ81LEEV_baseline,HASH03db707f,108.0,,...,306.853265,179.750506,245.080396,114.355453,323.998671,474.123361,239.067239,278.370679,824.045017,140.978122
6543,NDAR_INVZZJ3A7BK,baseline_year_1_arm_1,59.0,137.0,2.0,1.0,S042_INVZZJ3A7BK_baseline,HASH96a0c182,122.0,3.0,...,197.966008,253.303153,128.128514,204.454467,178.583760,320.437889,171.179551,157.742490,192.919296,340.688716
6544,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,59.5,123.0,2.0,1.0,S042_INVZZLZCKAY_baseline,HASH96a0c182,110.0,3.0,...,342.867369,164.908164,328.745927,109.322212,161.903962,421.030194,252.015164,370.685204,228.116156,328.253479


In [15]:
# derive variables for total pubertal developmental scale

pds_columns = ["pds_y_ss_male_category", "pds_p_ss_male_category",
               "pds_y_ss_female_category", "pds_p_ss_female_category"]

# average over all puberty assessment features, ignoring nan
# (a row with no assessment at all gets nan, without the RuntimeWarning np.nanmean emits).
# assign() returns a new frame, so nothing is written into a filtered slice of an earlier frame.
data = data.assign(pubertal_developmental_scale=data[pds_columns].mean(axis=1, skipna=True))

# drop unnecessary columns
data = data.drop(columns=pds_columns)

In [16]:
# drop all-nan columns

print("Data shape before dropping all nan columns: {}".format(data.shape))

# True for every column that holds no observed value at all
all_nan_mask = data.isna().all(axis=0)
nan_cols = list(data.columns[all_nan_mask])

data = data.drop(columns=nan_cols)

print("Data shape after dropping all nan columns: {}".format(data.shape))

Data shape before dropping all nan columns: (4628, 3866)
Data shape after dropping all nan columns: (4628, 3866)


In [17]:
# drop QC features

print("Data shape before dropping QC features: {}".format(data.shape))

# the inclusion flag has already been used for filtering, so remove the column
data = data.drop("imgincl_rsfmri_include", axis=1)

print("Data shape after dropping QC features: {}".format(data.shape))

Data shape before dropping QC features: (4628, 3866)
Data shape after dropping QC features: (4628, 3865)


In [18]:
# function for computing TMI

# pounds -> kilograms
def convert_lbs_to_kg(weight_lbs):
    """Return ``weight_lbs`` (a weight in pounds) expressed in kilograms.

    The input is multiplied by 0.45359237 (kilograms per pound).
    """
    return weight_lbs * 0.45359237

# inches -> meters
def convert_in_to_m(height_in):
    """Return ``height_in`` (a height in inches) expressed in meters.

    The input is multiplied by 0.0254 (meters per inch).
    """
    return height_in * 0.0254

# Tri-ponderal mass index (TMI), calculated as weight (kg) / height (m)^3
# first conversions, next compute TMI
def compute_tmi(weight_lbs, height_in):
    """Compute the tri-ponderal mass index from a weight in pounds and a height in inches.

    Both inputs are converted to metric units first (kilograms and meters);
    TMI is then weight_kg / height_m**3. Only arithmetic operators are applied,
    so any inputs supporting ``*``, ``**`` and ``/`` can be passed.

    Parameters
    ----------
    weight_lbs : number
        Weight in pounds.
    height_in : number
        Height in inches.

    Returns
    -------
    The TMI value, or ``np.nan`` when it cannot be computed: a value of a
    non-numeric type such as ``None`` (TypeError), or a Python-level
    arithmetic error such as division by a zero height (ArithmeticError).
    """
    try:
        # the conversions sit inside the try as well, so a missing (None) value
        # yields nan instead of raising before the guarded section is reached
        weight_kg = convert_lbs_to_kg(weight_lbs)
        height_m = convert_in_to_m(height_in)
        height_m3 = height_m**3 # height in meters cubed
        tmi = weight_kg/height_m3
    except (TypeError, ArithmeticError):
        # only the expected failures are caught; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed
        tmi = np.nan
    return tmi

In [19]:
# compute and assign TMI to a column

# compute_tmi only applies arithmetic operators, so it can take the whole columns
# at once; this avoids building one Series per row with DataFrame.apply(axis=1)
tmi = compute_tmi(data["anthroweight1lb"], data["anthro_1_height_in"])

# a zero height gives an infinite TMI in column-wise arithmetic; treat it as missing
tmi = tmi.replace([np.inf, -np.inf], np.nan)

# assign() returns a new frame, so nothing is written into a filtered slice of an earlier frame
data = data.assign(triponderal_mass_index=tmi)

data = data.dropna(subset=["triponderal_mass_index"]) # drop missing TMIs if any

data

Unnamed: 0,src_subject_id,eventname,anthro_1_height_in,anthroweight1lb,demo_sex_v2,race_ethnicity,mri_info_visitid,mri_info_deviceserialnumber,interview_age,rel_family_id,...,Harmonics_energy106,Harmonics_energy107,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113,pubertal_developmental_scale,triponderal_mass_index
0,NDAR_INV003RTV85,baseline_year_1_arm_1,56.5,93.0,2.0,1.0,S042_INV003RTV85_baseline,HASH96a0c182,131.0,8781,...,252.183920,142.608781,158.319426,326.542790,192.675931,171.474886,350.606622,228.556499,3.0,14.272570
2,NDAR_INV00BD7VDC,baseline_year_1_arm_1,57.5,76.8,1.0,1.0,S090_INV00BD7VDC_baseline,HASH65b39280,112.0,3810,...,211.474409,524.022363,238.030667,506.216418,290.961745,209.787572,163.663905,348.097102,2.0,11.182072
3,NDAR_INV00CY2MDM,baseline_year_1_arm_1,56.5,91.5,1.0,1.0,S021_INV00CY2MDM_baseline,HASHd422be27,130.0,5355,...,190.305110,254.987769,229.638996,228.446672,486.850276,488.304623,167.734859,563.542535,2.0,14.042368
4,NDAR_INV00HEV6HB,baseline_year_1_arm_1,57.3,70.8,1.0,2.0,S012_INV00HEV6HB_baseline,HASHe4f6957a,124.0,2257,...,150.989921,149.453592,234.686758,269.924178,213.925214,245.301264,287.884664,591.824885,2.0,10.416792
5,NDAR_INV00LH735Y,baseline_year_1_arm_1,52.0,80.0,1.0,3.0,S011_INV00LH735Y_baseline,HASH5b0cf1bb,109.0,6069,...,207.861206,120.541056,155.455954,280.664438,254.253108,186.141947,183.348564,204.713464,1.0,15.748694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6540,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,57.0,111.0,2.0,1.0,S042_INVZZ6ZJ2KY_baseline,HASH96a0c182,124.0,9345,...,415.183771,120.998470,168.400396,229.004979,174.781912,303.759961,150.655009,233.486270,3.0,16.590635
6541,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,53.5,57.8,1.0,2.0,S076_INVZZ81LEEV_baseline,HASH03db707f,108.0,8433,...,245.080396,114.355453,323.998671,474.123361,239.067239,278.370679,824.045017,140.978122,2.0,10.447950
6543,NDAR_INVZZJ3A7BK,baseline_year_1_arm_1,59.0,137.0,2.0,1.0,S042_INVZZJ3A7BK_baseline,HASH96a0c182,122.0,9346,...,128.128514,204.454467,178.583760,320.437889,171.179551,157.742490,192.919296,340.688716,3.0,18.464142
6544,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,59.5,123.0,2.0,1.0,S042_INVZZLZCKAY_baseline,HASH96a0c182,110.0,9347,...,328.745927,109.322212,161.903962,421.030194,252.015164,370.685204,228.116156,328.253479,3.0,16.162882


In [20]:
# check ranges of height and weight for outliers
# printed in order: min height (m), max height (m), min weight (kg), max weight (kg)
heights_in = data["anthro_1_height_in"]
weights_lbs = data["anthroweight1lb"]

print(convert_in_to_m(heights_in.min()))
print(convert_in_to_m(heights_in.max()))
print(convert_lbs_to_kg(weights_lbs.min()))
print(convert_lbs_to_kg(weights_lbs.max()))

0.83058
1.778
19.50447191
97.06876718000001


In [21]:
# function to convert weight in pounds to weight in grams
def convert_lbs_to_g(weight_lbs):
    """Return ``weight_lbs`` (a weight in pounds) expressed in grams.

    The input is multiplied by 453.59237 (grams per pound).
    """
    return weight_lbs * 453.59237

In [22]:
# compute birth weight in grams and store it in a new column

# convert_lbs_to_g is a plain multiplication, so it is applied to the whole
# column at once instead of row by row; assign() returns a new frame, so no
# values are written into a (possibly filtered) view of an earlier frame
data = data.assign(birth_weight_g=convert_lbs_to_g(data["birth_weight_lbs"]))
data = data.dropna(subset=["birth_weight_g"])  # drop rows with a missing birth weight, if any

# drop unnecessary variables: the pound-based column is replaced by birth_weight_g
data = data.drop(columns=["birth_weight_lbs"])

In [23]:
# create batch_id column from the first four characters of mri_info_visitid

# vectorised string slicing instead of a row-wise apply; a missing visit id
# yields a missing batch_id here, whereas slicing it inside a lambda would raise
data = data.assign(batch_id=data["mri_info_visitid"].str[:4])

data

Unnamed: 0,src_subject_id,eventname,anthro_1_height_in,anthroweight1lb,demo_sex_v2,race_ethnicity,mri_info_visitid,mri_info_deviceserialnumber,interview_age,rel_family_id,...,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113,pubertal_developmental_scale,triponderal_mass_index,birth_weight_g,batch_id
0,NDAR_INV003RTV85,baseline_year_1_arm_1,56.5,93.0,2.0,1.0,S042_INV003RTV85_baseline,HASH96a0c182,131.0,8781,...,158.319426,326.542790,192.675931,171.474886,350.606622,228.556499,3.0,14.272570,3175.14659,S042
2,NDAR_INV00BD7VDC,baseline_year_1_arm_1,57.5,76.8,1.0,1.0,S090_INV00BD7VDC_baseline,HASH65b39280,112.0,3810,...,238.030667,506.216418,290.961745,209.787572,163.663905,348.097102,2.0,11.182072,3628.73896,S090
3,NDAR_INV00CY2MDM,baseline_year_1_arm_1,56.5,91.5,1.0,1.0,S021_INV00CY2MDM_baseline,HASHd422be27,130.0,5355,...,229.638996,228.446672,486.850276,488.304623,167.734859,563.542535,2.0,14.042368,2721.55422,S021
4,NDAR_INV00HEV6HB,baseline_year_1_arm_1,57.3,70.8,1.0,2.0,S012_INV00HEV6HB_baseline,HASHe4f6957a,124.0,2257,...,234.686758,269.924178,213.925214,245.301264,287.884664,591.824885,2.0,10.416792,2721.55422,S012
5,NDAR_INV00LH735Y,baseline_year_1_arm_1,52.0,80.0,1.0,3.0,S011_INV00LH735Y_baseline,HASH5b0cf1bb,109.0,6069,...,155.455954,280.664438,254.253108,186.141947,183.348564,204.713464,1.0,15.748694,3175.14659,S011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6540,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,57.0,111.0,2.0,1.0,S042_INVZZ6ZJ2KY_baseline,HASH96a0c182,124.0,9345,...,168.400396,229.004979,174.781912,303.759961,150.655009,233.486270,3.0,16.590635,3628.73896,S042
6541,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,53.5,57.8,1.0,2.0,S076_INVZZ81LEEV_baseline,HASH03db707f,108.0,8433,...,323.998671,474.123361,239.067239,278.370679,824.045017,140.978122,2.0,10.447950,2721.55422,S076
6543,NDAR_INVZZJ3A7BK,baseline_year_1_arm_1,59.0,137.0,2.0,1.0,S042_INVZZJ3A7BK_baseline,HASH96a0c182,122.0,9346,...,178.583760,320.437889,171.179551,157.742490,192.919296,340.688716,3.0,18.464142,3628.73896,S042
6544,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,59.5,123.0,2.0,1.0,S042_INVZZLZCKAY_baseline,HASH96a0c182,110.0,9347,...,161.903962,421.030194,252.015164,370.685204,228.116156,328.253479,3.0,16.162882,3175.14659,S042


In [24]:
# exclude the pilot site (batch_id "G054") from the dataset

is_pilot_site = data["batch_id"] == "G054"
data = data[~is_pilot_site]

data

Unnamed: 0,src_subject_id,eventname,anthro_1_height_in,anthroweight1lb,demo_sex_v2,race_ethnicity,mri_info_visitid,mri_info_deviceserialnumber,interview_age,rel_family_id,...,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113,pubertal_developmental_scale,triponderal_mass_index,birth_weight_g,batch_id
0,NDAR_INV003RTV85,baseline_year_1_arm_1,56.5,93.0,2.0,1.0,S042_INV003RTV85_baseline,HASH96a0c182,131.0,8781,...,158.319426,326.542790,192.675931,171.474886,350.606622,228.556499,3.0,14.272570,3175.14659,S042
2,NDAR_INV00BD7VDC,baseline_year_1_arm_1,57.5,76.8,1.0,1.0,S090_INV00BD7VDC_baseline,HASH65b39280,112.0,3810,...,238.030667,506.216418,290.961745,209.787572,163.663905,348.097102,2.0,11.182072,3628.73896,S090
3,NDAR_INV00CY2MDM,baseline_year_1_arm_1,56.5,91.5,1.0,1.0,S021_INV00CY2MDM_baseline,HASHd422be27,130.0,5355,...,229.638996,228.446672,486.850276,488.304623,167.734859,563.542535,2.0,14.042368,2721.55422,S021
4,NDAR_INV00HEV6HB,baseline_year_1_arm_1,57.3,70.8,1.0,2.0,S012_INV00HEV6HB_baseline,HASHe4f6957a,124.0,2257,...,234.686758,269.924178,213.925214,245.301264,287.884664,591.824885,2.0,10.416792,2721.55422,S012
5,NDAR_INV00LH735Y,baseline_year_1_arm_1,52.0,80.0,1.0,3.0,S011_INV00LH735Y_baseline,HASH5b0cf1bb,109.0,6069,...,155.455954,280.664438,254.253108,186.141947,183.348564,204.713464,1.0,15.748694,3175.14659,S011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6540,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,57.0,111.0,2.0,1.0,S042_INVZZ6ZJ2KY_baseline,HASH96a0c182,124.0,9345,...,168.400396,229.004979,174.781912,303.759961,150.655009,233.486270,3.0,16.590635,3628.73896,S042
6541,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,53.5,57.8,1.0,2.0,S076_INVZZ81LEEV_baseline,HASH03db707f,108.0,8433,...,323.998671,474.123361,239.067239,278.370679,824.045017,140.978122,2.0,10.447950,2721.55422,S076
6543,NDAR_INVZZJ3A7BK,baseline_year_1_arm_1,59.0,137.0,2.0,1.0,S042_INVZZJ3A7BK_baseline,HASH96a0c182,122.0,9346,...,178.583760,320.437889,171.179551,157.742490,192.919296,340.688716,3.0,18.464142,3628.73896,S042
6544,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,59.5,123.0,2.0,1.0,S042_INVZZLZCKAY_baseline,HASH96a0c182,110.0,9347,...,161.903962,421.030194,252.015164,370.685204,228.116156,328.253479,3.0,16.162882,3175.14659,S042


In [25]:
## Save the dataset
# written relative to the current working directory; the row index is not exported
output_csv = "main_dataset_rsfMRI_FCH_LEiDA_demo.csv"
data.to_csv(output_csv, index=False)