In [None]:
import csv
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
import openpyxl

# First-level models

## Body phenotypes

In [None]:
# Define modalities
###############################################################
modalities_body = [
'immune',
'renalhepatic',
'metabolic',
'cardiopulmonary',
'musculoskeletal',
'bone_densitometry',
'pwa',
'heart_mri',
'carotid_ultrasound',
'arterial_stiffness',
'ecg_rest',
'body_composition_by_impedance',
'body_composition_dxa',
'bone_dxa',
'kidneys_mri',
'liver_mri',
'abdominal_composition_mri_18_vars', #17 vars
'abdominal_organ_composition_mri_13_vars', #12 vars
'hearing'
]

###############################################################
# Assign groups
cardiopulmonary = [
'cardiopulmonary',
'pwa',
'heart_mri',
'carotid_ultrasound',
'arterial_stiffness',
'ecg_rest'
]

body_composition = [
'body_composition_by_impedance',
'body_composition_dxa',
'abdominal_composition_mri_18_vars', #17 vars
'abdominal_organ_composition_mri_13_vars', #12 vars
]

musculoskeletal = [
'musculoskeletal',
'bone_dxa',
'bone_densitometry']

renal_hepatic = [
'renalhepatic',
'kidneys_mri',
'liver_mri']

immune = ['immune']
metabolic = ['metabolic']

print('N body modalities:', len(modalities_body))

N body modalities: 19


In [3]:
# Rename modalities
modality_nonbrain_names = {
'diet': 'Diet',
'smoking': 'Smoking',
'sexual_factors': 'Sexual Factors',
'sun_i0': 'Sux Exposure',
'alcohol': 'Alcohol',
'electronic_device_use': 'Electronic Device Use',
'activity_touchscreen': 'Activity: Daily Behaviour',
'accelerometry': 'Activity: Accelerometry',
'activity_MET': 'Activity: MET',
'activity_byrecall_i3': 'Activity: Yesterday (by recall)',
'hearing': 'Hearing',
'sleep_ac': 'Sleep',
'localenv': 'Local Environment',
'immune': 'Immune',
'renalhepatic': 'Renal & Hepatic',
'metabolic': 'Metabolic',
'cardiopulmonary': 'Cardiopulmonary',
'musculoskeletal': 'Musculoskeletal',
'bone_densitometry': 'Bone Size, Mineral and Density by DXA',
'pwa': 'Pulse Wave Analysis',
'heart_mri': 'Heart MRI',
'carotid_ultrasound': 'Carotid Ultrasound',
'arterial_stiffness': 'Arterial Stiffness',
'ecg_rest': 'ECG at Rest',
'body_composition_by_impedance': 'Body Composition by Impedance',
'body_composition_dxa': 'Body composition by DXA',
'bone_dxa': 'Bone DXA',
'kidneys_mri': 'Kidney MRI',
'liver_mri': 'Liver MRI',
'abdominal_composition_mri_18_vars': 'Abdominal Composition by MRI',
'abdominal_organ_composition_mri_13_vars': 'Abdominal Organ Composition by MRI'
}

In [None]:
# Pool all results together: Body modalities
base_path = '/UK_BB/brainbody/lifestyle-envir-body/'
data_path = '/UK_BB/brainbody/body/'

five_folds = []
folds = range(0,5)
algorithm = 'XGB'

for modality in modalities_body:
    n_features_list = []
    n_train_list = []
    n_test_list = []
    
    for fold in folds:
        # Read the result file
        result = pd.read_csv(os.path.join(base_path, f'folds/fold_{fold}/models/{modality}_{algorithm}_result_fold_{fold}.csv'))
        
        # Get number of features and subjects
        test_data_path = os.path.join(data_path, f'folds/fold_{fold}/scaling/{modality}_test_scaled_fold_{fold}.csv')
        train_data_path = os.path.join(data_path, f'folds/fold_{fold}/scaling/{modality}_train_scaled_fold_{fold}.csv')
        
        n_features = pd.read_csv(test_data_path).shape[1]
        n_train = pd.read_csv(train_data_path).shape[0]
        n_test = pd.read_csv(test_data_path).shape[0]
        
        result['N Features'] = n_features
        result['N Train'] = n_train
        result['N Test'] = n_test
        
        # Rename modality using the dictionary
        modality_rename = modality_nonbrain_names.get(modality, modality)
        result['Modality'] = modality_rename
        
        # Add domain information
        if modality in cardiopulmonary:
            domain = 'Cardiopulmonary'
        elif modality in body_composition:
            domain = 'Body composition'
        elif modality in renal_hepatic:
            domain = 'Renal & Hepatic'
        elif modality in musculoskeletal:
            domain = 'Musculoskeletal'
        elif modality in immune:
            domain = 'Immune & Metabolic'
        elif modality in metabolic:
             domain = 'Immune & Metabolic'
        else:
            domain = 'Body'
        
        result['Domain'] = domain
        
        five_folds.append(result)
        
five_folds_all_modalities = pd.concat(five_folds, ignore_index=False)

# Remove underscores from column names
five_folds_all_modalities.columns = [col.replace('_', ' ') for col in five_folds_all_modalities.columns]

# Average across folds

column_formatting = {
    'Test R2': '$R$^2 Test',
    'Test Pearson r': 'Pearson $r$ Test',
    'Test MSE': '$MSE$ Test',
    'Test MAE': '$MAE$ Test',
    'Train R2': '$R$^2 Train',
    'Train Pearson r': 'Pearson $r$ Train',
    'Train MSE': '$MSE$ Train',
    'Train MAE': '$MAE$ Train',
    'N Train': '$N$ Train',
    'N Test': '$N$ Test',
    'N Features': '$N$ Features'
}


five_folds_all_modalities_mean = (
    five_folds_all_modalities
    .groupby(['Modality', 'Domain'])
    .agg({
        'Test R2': 'mean',
        'Test Pearson r': 'mean',
        'Test MSE': 'mean',
        'Test MAE': 'mean',
        'Train R2': 'mean',
        'Train Pearson r': 'mean',
        'Train MSE': 'mean',
        'Train MAE': 'mean',
        'N Train': 'mean',
        'N Test': 'mean',
        'N Features': 'first'
    })
    .round({
        'Test R2': 3,
        'Test Pearson r': 2,
        'Test MSE': 3,
        'Test MAE': 3,
        'Train R2': 3,
        'Train Pearson r': 2,
        'Train MSE': 2,
        'Train MAE': 2,
        'N Train': 0,
        'N Test': 0
    })
    .sort_values(by='Test R2', ascending=False)
    .reset_index()#.rename(columns=column_formatting)
)

five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']] = five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']].astype(int)
# Display the results
with pd.option_context('display.max_rows', None):
    display(five_folds_all_modalities_mean)

glob_mod = 'body'
output_csv_path = '/UK_BB/brainbody/result/1level/XGB'

five_folds_all_modalities.to_excel(
    os.path.join(output_csv_path, f'1level_result-folds_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

five_folds_all_modalities_mean.to_excel(
    os.path.join(output_csv_path, f'1level_result-mean_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)


five_folds_all_modalities = five_folds_all_modalities.sort_values(by='Test R2', ascending=False)

Unnamed: 0,Modality,Domain,Test R2,Test Pearson r,Test MSE,Test MAE,Train R2,Train Pearson r,Train MSE,Train MAE,N Train,N Test,N Features
0,Body Composition by Impedance,Body composition,0.124,0.35,0.446,0.528,0.239,0.52,0.39,0.49,23902.0,5976.0,32
1,Bone DXA,Body composition,0.099,0.31,0.458,0.535,0.202,0.48,0.41,0.5,20700.0,5175.0,68
2,Abdominal Composition by MRI,Body composition,0.082,0.29,0.453,0.533,0.141,0.39,0.42,0.52,19700.0,4925.0,17
3,Body composition by DXA,Body composition,0.08,0.28,0.471,0.544,0.175,0.45,0.42,0.51,18240.0,4560.0,43
4,Cardiopulmonary,Cardiopulmonary,0.079,0.28,0.466,0.539,0.121,0.36,0.45,0.53,16321.0,4080.0,7
5,Musculoskeletal,Musculoskeletal,0.061,0.25,0.459,0.535,0.132,0.4,0.42,0.51,14607.0,3652.0,13
6,Abdominal Organ Composition by MRI,Body composition,0.058,0.24,0.461,0.538,0.119,0.37,0.43,0.52,16965.0,4241.0,12
7,Metabolic,Immune & Metabolic,0.05,0.23,0.483,0.55,0.117,0.37,0.45,0.53,14496.0,3624.0,14
8,Renal & Hepatic,Renal & Hepatic,0.048,0.22,0.485,0.552,0.103,0.35,0.46,0.54,18404.0,4601.0,16
9,Pulse Wave Analysis,Cardiopulmonary,0.045,0.21,0.487,0.552,0.078,0.29,0.47,0.54,19391.0,4848.0,18


## Brain phenotypes

In [18]:
# Define modalities
modalities_smri = [
'struct_fast',
'struct_sub_first',
'struct_fs_aseg_mean_intensity',
'struct_fs_aseg_volume',
'struct_ba_exvivo_area', 
'struct_ba_exvivo_mean_thickness',
'struct_ba_exvivo_volume',
'struct_a2009s_area','struct_a2009s_mean_thickness','struct_a2009s_volume',
'struct_dkt_area', 'struct_dkt_mean_thickness', 'struct_dkt_volume',
'struct_desikan_gw', 'struct_desikan_pial',
'struct_desikan_white_area', 'struct_desikan_white_mean_thickness', 'struct_desikan_white_volume',
'struct_subsegmentation',
'add_t1',
'add_t2'
]

modalities_dwi = [
"dwi_FA_tbss", "dwi_FA_prob",
"dwi_MD_tbss", "dwi_MD_prob",
"dwi_L1_tbss", "dwi_L1_prob",
"dwi_L2_tbss", "dwi_L2_prob",
"dwi_L3_tbss", "dwi_L3_prob",
"dwi_MO_tbss", "dwi_MO_prob",
"dwi_OD_tbss", "dwi_OD_prob",
"dwi_ICVF_tbss", "dwi_ICVF_prob",
"dwi_ISOVF_tbss", "dwi_ISOVF_prob",

'aparc_Tian_S1_FA_i2',
'aparc_Tian_S1_Length_i2',
'aparc_Tian_S1_SIFT2_FBC_i2',
'aparc_Tian_S1_Streamline_Count_i2',

'aparc_a2009s_Tian_S1_FA_i2',
'aparc_a2009s_Tian_S1_Length_i2',
'aparc_a2009s_Tian_S1_SIFT2_FBC_i2',
'aparc_a2009s_Tian_S1_Streamline_Count_i2',

'Glasser_Tian_S1_FA_i2',
'Glasser_Tian_S1_Length_i2',
'Glasser_Tian_S1_SIFT2_FBC_i2',
'Glasser_Tian_S1_Streamline_Count_i2',

'Glasser_Tian_S4_FA_i2',
'Glasser_Tian_S4_Length_i2',
'Glasser_Tian_S4_SIFT2_FBC_i2',
'Glasser_Tian_S4_Streamline_Count_i2',

'Schaefer7n200p_Tian_S1_FA_i2',
'Schaefer7n200p_Tian_S1_Length_i2',
'Schaefer7n200p_Tian_S1_SIFT2_FBC_i2',
'Schaefer7n200p_Tian_S1_Streamline_Count_i2',

'Schaefer7n1000p_Tian_S4_FA_i2',
'Schaefer7n1000p_Tian_S4_Length_i2',
'Schaefer7n1000p_Tian_S4_SIFT2_FBC_i2',
'Schaefer7n1000p_Tian_S4_Streamline_Count_i2']

modalities_rs = [
"amplitudes_21",
"full_correlation_21",
"partial_correlation_21",
"amplitudes_55",
"full_correlation_55",
"partial_correlation_55",
'full_correlation_aparc_a2009s_Tian_S1',
'full_correlation_aparc_Tian_S1',
'full_correlation_Glasser_Tian_S1',
'full_correlation_Glasser_Tian_S4',
'full_correlation_Schaefer7n200p_Tian_S1',
'full_correlation_Schaefer7n500p_Tian_S4',
'partial_correlation_aparc_a2009s_Tian_S1',
'partial_correlation_aparc_Tian_S1',
'partial_correlation_Glasser_Tian_S1',
'partial_correlation_Glasser_Tian_S4',
'partial_correlation_Schaefer7n200p_Tian_S1',
'partial_correlation_Schaefer7n500p_Tian_S4',
]

modalities_brain = [
'struct_fast',
'struct_sub_first',
'struct_fs_aseg_mean_intensity',
'struct_fs_aseg_volume',
'struct_ba_exvivo_area', 
'struct_ba_exvivo_mean_thickness',
'struct_ba_exvivo_volume',
'struct_a2009s_area','struct_a2009s_mean_thickness','struct_a2009s_volume',
'struct_dkt_area', 'struct_dkt_mean_thickness', 'struct_dkt_volume',
'struct_desikan_gw', 'struct_desikan_pial',
'struct_desikan_white_area', 'struct_desikan_white_mean_thickness', 'struct_desikan_white_volume',
'struct_subsegmentation',
'add_t1',
'add_t2',
"dwi_FA_tbss", "dwi_FA_prob",
"dwi_MD_tbss", "dwi_MD_prob",
"dwi_L1_tbss", "dwi_L1_prob",
"dwi_L2_tbss", "dwi_L2_prob",
"dwi_L3_tbss", "dwi_L3_prob",
"dwi_MO_tbss", "dwi_MO_prob",
"dwi_OD_tbss", "dwi_OD_prob",
"dwi_ICVF_tbss", "dwi_ICVF_prob",
"dwi_ISOVF_tbss", "dwi_ISOVF_prob",
'aparc_Tian_S1_FA_i2',
'aparc_Tian_S1_Length_i2',
'aparc_Tian_S1_SIFT2_FBC_i2',
'aparc_Tian_S1_Streamline_Count_i2',
'aparc_a2009s_Tian_S1_FA_i2',
'aparc_a2009s_Tian_S1_Length_i2',
'aparc_a2009s_Tian_S1_SIFT2_FBC_i2',
'aparc_a2009s_Tian_S1_Streamline_Count_i2',
'Glasser_Tian_S1_FA_i2',
'Glasser_Tian_S1_Length_i2',
'Glasser_Tian_S1_SIFT2_FBC_i2',
'Glasser_Tian_S1_Streamline_Count_i2',
'Glasser_Tian_S4_FA_i2',
'Glasser_Tian_S4_Length_i2',
'Glasser_Tian_S4_SIFT2_FBC_i2',
'Glasser_Tian_S4_Streamline_Count_i2',
'Schaefer7n200p_Tian_S1_FA_i2',
'Schaefer7n200p_Tian_S1_Length_i2',
'Schaefer7n200p_Tian_S1_SIFT2_FBC_i2',
'Schaefer7n200p_Tian_S1_Streamline_Count_i2',
'Schaefer7n1000p_Tian_S4_FA_i2',
'Schaefer7n1000p_Tian_S4_Length_i2',
'Schaefer7n1000p_Tian_S4_SIFT2_FBC_i2',
'Schaefer7n1000p_Tian_S4_Streamline_Count_i2',
"amplitudes_21",
"full_correlation_21",
"partial_correlation_21",
"amplitudes_55",
"full_correlation_55",
"partial_correlation_55",
'full_correlation_aparc_a2009s_Tian_S1',
'full_correlation_aparc_Tian_S1',
'full_correlation_Glasser_Tian_S1',
'full_correlation_Glasser_Tian_S4',
'full_correlation_Schaefer7n200p_Tian_S1',
'partial_correlation_aparc_a2009s_Tian_S1',
'partial_correlation_aparc_Tian_S1',
'partial_correlation_Glasser_Tian_S1',
'partial_correlation_Glasser_Tian_S4',
'partial_correlation_Schaefer7n200p_Tian_S1',
'full_correlation_Schaefer7n500p_Tian_S4',
'partial_correlation_Schaefer7n500p_Tian_S4',
]

print('N brain modalities:', len(modalities_brain)) # 21+42+18

N brain modalities: 81


In [19]:
# Rename modalities
modality_mri_names = {
'struct_fast' : 'Regional grey matter volumes (FSL FAST)',
'struct_sub_first': 'Subcortical volumes (FSL FIRST)',

'struct_fs_aseg_mean_intensity' : 'ASEG Mean Intensity',
'struct_fs_aseg_volume' : 'ASEG Volume',


'struct_ba_exvivo_area' : 'BA ex-vivo Area',
'struct_ba_exvivo_mean_thickness' : 'BA ex-vivo Mean Thickness',
'struct_ba_exvivo_volume' : 'BA ex-vivo Volume',

'struct_a2009s_area' : 'a2009s Area',
'struct_a2009s_mean_thickness' : 'a2009s Mean Thickness',
'struct_a2009s_volume' : 'a2009s Volume',


'struct_dkt_area' : 'Desikan-Killiany-Tourville Area',
'struct_dkt_mean_thickness' : 'Desikan-Killiany-Tourville Mean Thickness',
'struct_dkt_volume' : 'Desikan-Killiany-Tourville Volume',


'struct_desikan_gw' : 'Desikan Grey/White Matter Contrast',
'struct_desikan_pial' : 'Desikan Pial',

'struct_desikan_white_area' : 'Desikan White Matter Area',
'struct_desikan_white_mean_thickness' : 'Desikan White Matter Mean Thickness',
'struct_desikan_white_volume' : 'Desikan White Matter Volume',
"struct_subsegmentation":'Subcortical Volumetric Subsegmentation',

'add_t1' : 'Whole-brain T1w',
'add_t2' : 'Whole-brain T2w',
"dwi_FA_tbss": "FA TBSS",
"dwi_FA_prob": "FA Prob.",
"dwi_MD_tbss": "MD TBSS",
"dwi_MD_prob": "MD Prob.",
"dwi_L1_tbss": "L1 TBSS",
"dwi_L1_prob": "L1 Prob.",
"dwi_L2_tbss": "L2 TBSS",
"dwi_L2_prob": "L2 Prob.",
"dwi_L3_tbss": "L3 TBSS",
"dwi_L3_prob": "L3 Prob.",
"dwi_MO_tbss": "MO TBSS",
"dwi_MO_prob": "MO Prob.",
"dwi_OD_tbss": "OD TBSS",
"dwi_OD_prob": "OD Prob.",
"dwi_ICVF_tbss": "ICVF TBSS",
"dwi_ICVF_prob": "ICVF Prob.",
"dwi_ISOVF_tbss": "ISOVF TBSS",
"dwi_ISOVF_prob": "ISOVF Prob.",
"amplitudes_21": " 21 IC amplitudes 21 IC",
"amplitudes_55": "55 IC amplitudes 55 IC",
"full_correlation_21": "21 IC Full corr.",
"full_correlation_55": "55 ICFull corr. 55 IC",
"partial_correlation_21": " 21 ICPartial corr. 21 IC",
"partial_correlation_55": " 55 ICPartial corr. 55 IC",
# aparc Tian S1 (I)
'aparc_Tian_S1_FA_i2': 'aparc-I FA',
'aparc_Tian_S1_Length_i2': 'aparc-I Length',
'aparc_Tian_S1_SIFT2_FBC_i2': 'aparc-I SIFT2 FBC',
'aparc_Tian_S1_Streamline_Count_i2': 'aparc-I Streamline Count',

# aparc a2009s Tian S1 (I)
'aparc_a2009s_Tian_S1_FA_i2': 'aparc.a2009s-I FA',
'aparc_a2009s_Tian_S1_Length_i2': 'aparc.a2009s-I Length',
'aparc_a2009s_Tian_S1_SIFT2_FBC_i2': 'aparc.a2009s-I SIFT2 FBC',
'aparc_a2009s_Tian_S1_Streamline_Count_i2': 'aparc.a2009s-I Streamline Count',

# Glasser Tian S1 (I)
'Glasser_Tian_S1_FA_i2': 'Glasser-I FA',
'Glasser_Tian_S1_Length_i2': 'Glasser-I Length',
'Glasser_Tian_S1_SIFT2_FBC_i2': 'Glasser-I SIFT2 FBC',
'Glasser_Tian_S1_Streamline_Count_i2': 'Glasser-I Streamline Count',

# Glasser Tian S4 (IV)
'Glasser_Tian_S4_FA_i2': 'Glasser-IV FA',
'Glasser_Tian_S4_Length_i2': 'Glasser-IV Length',
'Glasser_Tian_S4_SIFT2_FBC_i2': 'Glasser-IV SIFT2 FBC',
'Glasser_Tian_S4_Streamline_Count_i2': 'Glasser-IV Streamline Count',

# Schaefer7n1000p Tian S4 (IV) (in reality: Schaefer7n200p Tian S1)
'Schaefer7n1000p_Tian_S4_FA_i2': 'Schaefer7n200p-I FA', #'Schaefer7n1000p-IV FA',
'Schaefer7n1000p_Tian_S4_Length_i2': 'Schaefer7n200p-I Length',#'Schaefer7n1000p-IV Length',
'Schaefer7n1000p_Tian_S4_SIFT2_FBC_i2': 'Schaefer7n200p-I SIFT2 FBC',#'Schaefer7n1000p-IV SIFT2 FBC',
'Schaefer7n1000p_Tian_S4_Streamline_Count_i2': 'Schaefer7n200p-I Streamline Count', #'Schaefer7n1000p-IV Streamline Count'

# Schaefer7n200p Tian S4 (IV) (in reality: Schaefer7n500p Tian S4)
'Schaefer7n200p_Tian_S1_FA_i2': 'Schaefer7n500p-IV FA',
'Schaefer7n200p_Tian_S1_Length_i2': 'Schaefer7n500p-IV Length',
'Schaefer7n200p_Tian_S1_SIFT2_FBC_i2': 'Schaefer7n500p-IV SIFT2 FBC',
'Schaefer7n200p_Tian_S1_Streamline_Count_i2': 'Schaefer7n500p-IV Streamline Count',

# Schaefer7n500p Tian S4 (IV) (in reality: Schaefer7n1000p Tian S4)
'Schaefer7n500p_Tian_S4_FA_i2': 'Schaefer7n1000p-IV FA',
'Schaefer7n500p_Tian_S4_Length_i2': 'Schaefer7n1000p-IV Length',
'Schaefer7n500p_Tian_S4_SIFT2_FBC_i2': 'Schaefer7n1000p-IV SIFT2 FBC',
'Schaefer7n500p_Tian_S4_Streamline_Count_i2': 'Schaefer7n1000p-IV Streamline Count',

# Resting state 
'full_correlation_aparc_a2009s_Tian_S1' : 'aparc.a2009s-I Full Corr.',
'full_correlation_aparc_Tian_S1': 'aparc-I Full Corr.',
'full_correlation_Glasser_Tian_S1': 'Glasser-I Full Corr.',
'full_correlation_Glasser_Tian_S4': 'Glasser-IV Full Corr.',
'full_correlation_Schaefer7n200p_Tian_S1': 'Schaefer7n200p-I Full Corr.',
'full_correlation_Schaefer7n500p_Tian_S4': 'Schaefer7n500p-IV Full Corr.',
'partial_correlation_aparc_a2009s_Tian_S1': 'aparc.a2009s-I Partial Corr.',
'partial_correlation_aparc_Tian_S1': 'aparc-I Partial Corr.',
'partial_correlation_Glasser_Tian_S1': 'Glasser-I Partial Corr.',
'partial_correlation_Glasser_Tian_S4': 'Glasser-IV Partial Corr.',
'partial_correlation_Schaefer7n200p_Tian_S1': 'Schaefer7n200p-I Partial Corr.',
'partial_correlation_Schaefer7n500p_Tian_S4': 'Schaefer7n500p-IV Partial Corr.'
}

MISMATCH BETWEEN FOLDER NAME IN UKBB SHOWCASE AND FILES IN STRUCTURAL DTI MATRICES:

31024_Schaefer7n1000p_Tian_S4
- expected: 1000 + 54 = 1054 structures
- in fact: 216 structures = 200 + 16 (S1)

 = Schaefer7n200p_Tian_S1

31025_Schaefer7n200p_Tian_S1: 
- expected: 200 + 16 = 216 structures
- in fact: 554 structures = 500 + 54 (S4) 

= Schaefer7n500p_Tian_S4

31026_Schaefer7n500p_Tian_S4
- expected: 500 + 54 = 516 structures
- in fact: 1054 structures = 1000 + 54 (S4)

= Schaefer7n1000p_Tian_S4

In [None]:
# Pool all results together: Brain MRI modalities
base_path = '/UK_BB/brainbody/brain/'

five_folds = []
folds = range(0,5)
algorithm = 'XGB'

def count_csv_rows(file_path):
    with open(file_path) as f:
        return sum(1 for row in csv.reader(f)) - 1  # subtract header

def count_csv_columns(file_path):
    with open(file_path) as f:
        return len(next(csv.reader(f)))  # read header row

for modality in modalities_brain:
    n_features_list = []
    n_train_list = []
    n_test_list = []
    
    for fold in folds:
        # Read the result file
        result = pd.read_csv(os.path.join(base_path, f'folds/fold_{fold}/models/{modality}_{algorithm}_result_fold_{fold}.csv'))

        test_data_path = os.path.join(base_path, f'folds/fold_{fold}/scaling/{modality}_test_deconf_fold_{fold}.csv')
        train_data_path = os.path.join(base_path, f'folds/fold_{fold}/scaling/{modality}_train_deconf_fold_{fold}.csv')
        
        n_features = count_csv_columns(test_data_path)
        n_train = count_csv_rows(train_data_path)
        n_test = count_csv_rows(test_data_path)

        
        result['N Features'] = n_features
        result['N Train'] = n_train
        result['N Test'] = n_test
        
        # Rename modality using the dictionary
        modality_rename = modality_mri_names.get(modality, modality)
        result['Modality'] = modality_rename
        
        # Add domain information
        if modality in modalities_smri:
            domain = 'sMRI'
        elif modality in modalities_dwi:
            domain = 'dwMRI'
        elif modality in modalities_rs:
            domain = 'rsMRI'
        else:
            domain = 'Unknown'
        
        result['Domain'] = domain
        
        five_folds.append(result)
        
five_folds_all_modalities = pd.concat(five_folds, ignore_index=False)

# Remove underscores from column names
five_folds_all_modalities.columns = [col.replace('_', ' ') for col in five_folds_all_modalities.columns]

# Average across folds
column_formatting = {
    'Test R2': '$R$^2 Test',
    'Test Pearson r': 'Pearson $r$ Test',
    'Test MSE': '$MSE$ Test',
    'Test MAE': '$MAE$ Test',
    'Train R2': '$R$^2 Train',
    'Train Pearson r': 'Pearson $r$ Train',
    'Train MSE': '$MSE$ Train',
    'Train MAE': '$MAE$ Train',
    'N Train': '$N$ Train',
    'N Test': '$N$ Test',
    'N Features': '$N$ Features'
}

five_folds_all_modalities_mean = (
    five_folds_all_modalities
    .groupby(['Modality', 'Domain'])
    .agg({
        'Test R2': 'mean',
        'Test Pearson r': 'mean',
        'Test MSE': 'mean',
        'Test MAE': 'mean',
        'Train R2': 'mean',
        'Train Pearson r': 'mean',
        'Train MSE': 'mean',
        'Train MAE': 'mean',
        'N Train': 'mean',
        'N Test': 'mean',
        'N Features': 'first'
    })
    .round({
        'Test R2': 3,
        'Test Pearson r': 2,
        'Test MSE': 3,
        'Test MAE': 3,
        'Train R2': 3,
        'Train Pearson r': 2,
        'Train MSE': 2,
        'Train MAE': 2,
        'N Train': 0,
        'N Test': 0
    })
    .sort_values(by='Test R2', ascending=False)
    .reset_index()#.rename(columns=column_formatting)
)

five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']] = five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']].astype(int)
# Display the results
with pd.option_context('display.max_rows', None):
    display(five_folds_all_modalities_mean)

glob_mod = 'brain'
output_csv_path = '/UK_BB/brainbody/result/1level/XGB'

five_folds_all_modalities.to_excel(
    os.path.join(output_csv_path, f'1level_result-folds_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

five_folds_all_modalities_mean.to_excel(
    os.path.join(output_csv_path, f'1level_result-mean_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

five_folds_all_modalities = five_folds_all_modalities.sort_values(by='Test R2', ascending=False)
five_folds_all_modalities = five_folds_all_modalities.rename()

Unnamed: 0,Modality,Domain,Test R2,Test Pearson r,Test MSE,Test MAE,Train R2,Train Pearson r,Train MSE,Train MAE,N Train,N Test,N Features
0,Schaefer7n200p_Tian_S1_Streamline_Count_i2,dwMRI,0.186,0.43,0.404,0.502,0.513,0.76,0.24,0.39,20673.0,5168.0,153735
1,Schaefer7n200p_Tian_S1_SIFT2_FBC_i2,dwMRI,0.184,0.43,0.405,0.503,0.463,0.72,0.27,0.41,20673.0,5168.0,153735
2,Schaefer7n200p_Tian_S1_FA_i2,dwMRI,0.18,0.42,0.407,0.505,0.455,0.72,0.27,0.41,20673.0,5168.0,153735
3,Glasser_Tian_S4_Streamline_Count_i2,dwMRI,0.177,0.42,0.409,0.505,0.419,0.68,0.29,0.42,20673.0,5168.0,85905
4,Schaefer7n200p_Tian_S1_Length_i2,dwMRI,0.172,0.41,0.411,0.507,0.517,0.77,0.24,0.39,20673.0,5168.0,153735
5,Glasser_Tian_S4_SIFT2_FBC_i2,dwMRI,0.17,0.41,0.411,0.506,0.382,0.65,0.31,0.44,20673.0,5168.0,85905
6,Glasser_Tian_S4_Length_i2,dwMRI,0.157,0.4,0.418,0.511,0.351,0.63,0.32,0.45,20687.0,5154.0,85905
7,Glasser_Tian_S4_FA_i2,dwMRI,0.151,0.39,0.422,0.513,0.279,0.57,0.36,0.47,20673.0,5168.0,85905
8,Glasser_Tian_S1_FA_i2,dwMRI,0.148,0.39,0.423,0.514,0.274,0.56,0.36,0.48,20674.0,5169.0,70876
9,Glasser_Tian_S1_Streamline_Count_i2,dwMRI,0.148,0.39,0.423,0.514,0.274,0.57,0.36,0.48,20674.0,5169.0,70876


In [None]:
# Rename modalities
five_folds_all_modalities['Modality'] = five_folds_all_modalities['Modality'].map(modality_mri_names).fillna(five_folds_all_modalities['Modality'])
five_folds_all_modalities_mean['Modality'] = five_folds_all_modalities_mean['Modality'].map(modality_mri_names).fillna(five_folds_all_modalities_mean['Modality'])

# Save the updated DataFrames (optional - only if you want to overwrite the previous files)
output_csv_path = '/UK_BB/brainbody/result/1level/XGB'
glob_mod = 'brain'

five_folds_all_modalities.to_excel(
    os.path.join(output_csv_path, f'1level_result-folds_{glob_mod}_renamed.xlsx'),
    index=False,
    engine='openpyxl'
)

five_folds_all_modalities_mean.to_excel(
    os.path.join(output_csv_path, f'1level_result-mean_{glob_mod}_renamed.xlsx'),
    index=False,
    engine='openpyxl'
)

# Second-level model (stacked)

## Brain

In [None]:
# Pool all results together: all MRI
five_folds = []
folds = range(0,5)

modalities = [
'smri',
'dwi',
'rs',
'allmri'
]

modality_names = {
'smri': 'sMRI',
'dwi': 'dwMRI',
'rs': 'rsMRI',
'allmri': '3 Brain MRI Modalities'
}

import csv

def count_csv_rows(file_path):
    with open(file_path) as f:
        return sum(1 for row in csv.reader(f)) - 1  # subtract header

def count_csv_columns(file_path):
    with open(file_path) as f:
        return len(next(csv.reader(f)))  # read header row

algorithm = 'rf' #'rf' xgb
base_path = '/UK_BB/brainbody'
glob_mod = 'brain_stack'  # Added global mod name for saving
output_result_path = '/UK_BB/brainbody/result/2level'

for modality in modalities:
    stacking_path = os.path.join(base_path, f'stacking/brain/{modality}')

    for fold in folds:
        # Read the result file
        result = pd.read_csv(os.path.join(stacking_path, f'folds/fold_{fold}/models/{modality}_{algorithm}_stacked_result_fold_{fold}.csv'))
            
        # Get number of features and subjects
        test_data_path = (os.path.join(stacking_path, f'features_test_level1_stacked_inner/features_test_level1_inner_g_matched_fold_{fold}.csv'))
        train_data_path = (os.path.join(stacking_path, f'features_train_level1_stacked_inner/features_train_level1_inner_g_matched_fold_{fold}.csv'))
        
        # Using the more efficient counting functions
        n_features = count_csv_columns(test_data_path) - 2  # subtracting eid and g columns
        n_test = count_csv_rows(test_data_path)
        n_train = count_csv_rows(train_data_path)
            
        result['N Features'] = n_features
        result['N Train'] = n_train
        result['N Test'] = n_test
            
        # Rename modality using the dictionary
        modality_rename = modality_names.get(modality, modality)
        result['Modality'] = modality_rename
            
        # Add domain information
        if modality == 'smri':
            domain = 'sMRI'
        elif modality == 'dwi':
            domain = 'dwMRI'
        elif modality == 'rs':
            domain = 'rsMRI'
        elif modality == 'allmri':
            domain = 'MRI all'
        else:
            domain = 'Unknown'
            
        result['Domain'] = domain
            
        five_folds.append(result)
            
five_folds_all_modalities = pd.concat(five_folds, ignore_index=False)

# Remove underscores from column names
five_folds_all_modalities.columns = [col.replace('_', ' ') for col in five_folds_all_modalities.columns]

# Average across folds
column_formatting = {
    'Test R2': '$R$^2 Test',
    'Test Pearson r': 'Pearson $r$ Test',
    'Test MSE': '$MSE$ Test',
    'Test MAE': '$MAE$ Test',
    'Train R2': '$R$^2 Train',
    'Train Pearson r': 'Pearson $r$ Train',
    'Train MSE': '$MSE$ Train',
    'Train MAE': '$MAE$ Train',
    'N Train': '$N$ Train',
    'N Test': '$N$ Test',
    'N Features': '$N$ Features'
}

five_folds_all_modalities_mean = (
    five_folds_all_modalities
    .groupby(['Modality', 'Domain'])
    .agg({
        'Test R2': 'mean',
        'Test Pearson r': 'mean',
        'Test MSE': 'mean',
        'Test MAE': 'mean',
        'Train R2': 'mean',
        'Train Pearson r': 'mean',
        'Train MSE': 'mean',
        'Train MAE': 'mean',
        'N Train': 'mean',
        'N Test': 'mean',
        'N Features': 'first'
    })
    .round({
        'Test R2': 5,
        'Test Pearson r': 4,
        'Test MSE': 4,
        'Test MAE': 4,
        'Train R2': 4,
        'Train Pearson r': 4,
        'Train MSE': 4,
        'Train MAE': 4,
        'N Train': 0,
        'N Test': 0
    })
    .sort_values(by='Test R2', ascending=False)
    .reset_index()#.rename(columns=column_formatting)
)

five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']] = five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']].astype(int)
# Display the results
with pd.option_context('display.max_rows', None):
    display(five_folds_all_modalities_mean)

output_result_path = '/UK_BB/brainbody/result/2level'

# Save final results
five_folds_all_modalities = five_folds_all_modalities.sort_values(by='Test R2', ascending=False)

five_folds_all_modalities.to_excel(
    os.path.join(output_result_path, f'2level_result-folds_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

five_folds_all_modalities_mean.to_excel(
    os.path.join(output_result_path, f'2level_result-mean_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

Unnamed: 0,Modality,Domain,Test R2,Test Pearson r,Test MSE,Test MAE,Train R2,Train Pearson r,Train MSE,Train MAE,N Train,N Test,N Features
0,3 Brain MRI Modalities,MRI all,0.22697,0.477,0.7722,0.6941,0.8353,0.9194,0.1647,0.3202,20277.0,5069.0,81
1,dwMRI,dwMRI,0.17109,0.4434,0.8284,0.7201,0.6883,0.836,0.3117,0.4436,18789.0,5162.0,42
2,rsMRI,rsMRI,0.12843,0.3708,0.8707,0.7371,0.8342,0.9159,0.1658,0.3202,20327.0,5082.0,18
3,sMRI,sMRI,0.09834,0.3371,0.9011,0.7485,0.4108,0.6519,0.5892,0.6075,21700.0,5425.0,21


## Body

In [None]:
# Pool all results together: body
five_folds = []
folds = range(0,5)

modalities = [
'body',
]

modality_names = {
'body': 'Body Physiology',
}

def count_csv_rows(file_path):
    with open(file_path) as f:
        return sum(1 for row in csv.reader(f)) - 1  # subtract header

def count_csv_columns(file_path):
    with open(file_path) as f:
        return len(next(csv.reader(f)))  # read header row

algorithm = '0' # 0 = rf ofr outer output
base_path = '/UK_BB/brainbody'
merge_type = 'outer'
glob_mod = f'body_stack_{merge_type}'
output_result_path = '/UK_BB/brainbody/result/2level'


for modality in modalities:
    stacking_path = os.path.join(base_path, f'stacking/{modality}')

    for fold in folds:
        # Read the result file
        result = pd.read_csv(os.path.join(stacking_path, f'folds/fold_{fold}/models/{modality}_{algorithm}_{merge_type}_stacked_result_fold_{fold}.csv'))
            
        # Get number of features and subjects
        test_data_path = (os.path.join(stacking_path, f'features_test_level1_stacked_outer/features_test_level1_outer_g_matched_fold_{fold}.csv'))
        train_data_path = (os.path.join(stacking_path, f'features_train_level1_stacked_outer/features_train_level1_outer_g_matched_fold_{fold}.csv'))
        
        # Using the more efficient counting functions
        n_features = count_csv_columns(test_data_path) - 2  # subtracting eid and g columns
        n_test = count_csv_rows(test_data_path)
        n_train = count_csv_rows(train_data_path)
            
        result['N Features'] = n_features
        result['N Train'] = n_train
        result['N Test'] = n_test
            
        # Rename modality using the dictionary
        modality_rename = modality_names.get(modality, modality)
        result['Modality'] = modality_rename
            
        five_folds.append(result)
            
five_folds_all_modalities = pd.concat(five_folds, ignore_index=False)

# Remove underscores from column names
five_folds_all_modalities.columns = [col.replace('_', ' ') for col in five_folds_all_modalities.columns]

# Average across folds
column_formatting = {
    'Test R2': '$R$^2 Test',
    'Test Pearson r': 'Pearson $r$ Test',
    'Test MSE': '$MSE$ Test',
    'Test MAE': '$MAE$ Test',
    'Train R2': '$R$^2 Train',
    'Train Pearson r': 'Pearson $r$ Train',
    'Train MSE': '$MSE$ Train',
    'Train MAE': '$MAE$ Train',
    'N Train': '$N$ Train',
    'N Test': '$N$ Test',
    'N Features': '$N$ Features'
}

five_folds_all_modalities_mean = (
    five_folds_all_modalities
    .groupby(['Modality'])
    .agg({
        'Test R2': 'mean',
        'Test Pearson r': 'mean',
        'Test MSE': 'mean',
        'Test MAE': 'mean',
        'Train R2': 'mean',
        'Train Pearson r': 'mean',
        'Train MSE': 'mean',
        'Train MAE': 'mean',
        'N Train': 'mean',
        'N Test': 'mean',
        'N Features': 'first'
    })
    .round({
        'Test R2': 3,
        'Test Pearson r': 2,
        'Test MSE': 3,
        'Test MAE': 3,
        'Train R2': 3,
        'Train Pearson r': 2,
        'Train MSE': 2,
        'Train MAE': 2,
        'N Train': 0,
        'N Test': 0
    })
    .sort_values(by='Test R2', ascending=False)
    .reset_index()#.rename(columns=column_formatting)
)

five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']] = five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']].astype(int)
# Display the results
with pd.option_context('display.max_rows', None):
    display(five_folds_all_modalities_mean)

output_result_path = '/UK_BB/brainbody/result/2level'

# Save final results
five_folds_all_modalities = five_folds_all_modalities.sort_values(by='Test R2', ascending=False)

five_folds_all_modalities.to_excel(
    os.path.join(output_result_path, f'2level_result-folds_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

five_folds_all_modalities_mean.to_excel(
    os.path.join(output_result_path, f'2level_result-mean_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

Unnamed: 0,Modality,Test R2,Test Pearson r,Test MSE,Test MAE,Train R2,Train Pearson r,Train MSE,Train MAE,N Train,N Test,N Features
0,Body Physiology,0.163,0.4,0.835,0.722,0.357,0.62,0.64,0.63,25518.0,6379.0,19


## BOdy + brain

In [None]:
# Pool all results together: brain + body
five_folds = []
folds = range(0,5)

modalities = [
'brain-body',
]

modality_names = {
'brain-body': 'Brain MRI and Body Physiology',
}

def count_csv_rows(file_path):
    with open(file_path) as f:
        return sum(1 for row in csv.reader(f)) - 1  # subtract header

def count_csv_columns(file_path):
    with open(file_path) as f:
        return len(next(csv.reader(f)))  # read header row

algorithm = '0' # 0 = rf for outer output
base_path = '/UK_BB/brainbody'
merge_type = 'outer'
glob_mod = f'brain_and_body_stack_{merge_type}'
output_result_path = '/UK_BB/brainbody/result/2level'


for modality in modalities:
    stacking_path = os.path.join(base_path, f'stacking/{modality}')

    for fold in folds:
        # Read the result file
        result = pd.read_csv(os.path.join(stacking_path, f'folds/fold_{fold}/models/{modality}_{algorithm}_{merge_type}_stacked_result_fold_{fold}.csv'))
            
        # Get number of features and subjects
        test_data_path = (os.path.join(stacking_path, f'features_test_level1_stacked_outer/features_test_level1_outer_g_matched_fold_{fold}.csv'))
        train_data_path = (os.path.join(stacking_path, f'features_train_level1_stacked_outer/features_train_level1_outer_g_matched_fold_{fold}.csv'))
        
        # Using the more efficient counting functions
        n_features = count_csv_columns(test_data_path) - 2  # subtracting eid and g columns
        n_test = count_csv_rows(test_data_path)
        n_train = count_csv_rows(train_data_path)
            
        result['N Features'] = n_features
        result['N Train'] = n_train
        result['N Test'] = n_test
            
        # Rename modality using the dictionary
        modality_rename = modality_names.get(modality, modality)
        result['Modality'] = modality_rename
            
        five_folds.append(result)
            
five_folds_all_modalities = pd.concat(five_folds, ignore_index=False)

# Remove underscores from column names
five_folds_all_modalities.columns = [col.replace('_', ' ') for col in five_folds_all_modalities.columns]

# Average across folds
column_formatting = {
    'Test R2': '$R$^2 Test',
    'Test Pearson r': 'Pearson $r$ Test',
    'Test MSE': '$MSE$ Test',
    'Test MAE': '$MAE$ Test',
    'Train R2': '$R$^2 Train',
    'Train Pearson r': 'Pearson $r$ Train',
    'Train MSE': '$MSE$ Train',
    'Train MAE': '$MAE$ Train',
    'N Train': '$N$ Train',
    'N Test': '$N$ Test',
    'N Features': '$N$ Features'
}

five_folds_all_modalities_mean = (
    five_folds_all_modalities
    .groupby(['Modality'])
    .agg({
        'Test R2': 'mean',
        'Test Pearson r': 'mean',
        'Test MSE': 'mean',
        'Test MAE': 'mean',
        'Train R2': 'mean',
        'Train Pearson r': 'mean',
        'Train MSE': 'mean',
        'Train MAE': 'mean',
        'N Train': 'mean',
        'N Test': 'mean',
        'N Features': 'first'
    })
    .round({
        'Test R2': 3,
        'Test Pearson r': 2,
        'Test MSE': 3,
        'Test MAE': 3,
        'Train R2': 3,
        'Train Pearson r': 2,
        'Train MSE': 2,
        'Train MAE': 2,
        'N Train': 0,
        'N Test': 0
    })
    .sort_values(by='Test R2', ascending=False)
    .reset_index()#.rename(columns=column_formatting)
)

five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']] = five_folds_all_modalities_mean.loc[:, ['N Test', 'N Train']].astype(int)
# Display the results
with pd.option_context('display.max_rows', None):
    display(five_folds_all_modalities_mean)

output_result_path = '/UK_BB/brainbody/result/2level'

# Save final results
five_folds_all_modalities = five_folds_all_modalities.sort_values(by='Test R2', ascending=False)

five_folds_all_modalities.to_excel(
    os.path.join(output_result_path, f'2level_result-folds_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

five_folds_all_modalities_mean.to_excel(
    os.path.join(output_result_path, f'2level_result-mean_{glob_mod}.xlsx'),
    index=False,
    engine='openpyxl'
)

Unnamed: 0,Modality,Test R2,Test Pearson r,Test MSE,Test MAE,Train R2,Train Pearson r,Train MSE,Train MAE,N Train,N Test,N Features
0,Brain MRI and Body Physiology,0.218,0.47,0.781,0.698,0.706,0.85,0.29,0.4,25518.0,6379.0,100
