In [2]:
import csv
import os
import re
import random
import pickle
import warnings
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import textwrap
import pickle
import matplotlib as mpl
from datetime import datetime
from matplotlib_venn import venn2
from scipy.stats import pearsonr
from typing import List, Dict, Optional

In [3]:
# Define modality renaming dictionary (modality_map)
modality_map = {
'hearing': 'Hearing',
'immune': 'Immune',
'renalhepatic': 'Renal & Hepatic',
'metabolic': 'Metabolic',
'cardiopulmonary': 'Cardiopulmonary',
'musculoskeletal': 'Musculoskeletal',
'bone_densitometry': 'Bone Densitometry of Heel',
'pwa': 'Pulse Wave Analysis',
'heart_mri': 'Heart MRI',
'carotid_ultrasound': 'Carotid Ultrasound',
'arterial_stiffness': 'Arterial Stiffness',
'ecg_rest': 'ECG at Rest',
'body_composition_by_impedance': 'Body Composition by Impedance',
'body_composition_dxa': 'Body Composition by DXA',
'bone_dxa': 'Bone Size, Mineral and Density by DXA',
'kidneys_mri': 'Kidney MRI',
'liver_mri': 'Liver MRI',
'abdominal_composition_mri_18_vars': 'Abdominal Composition by MRI',
'abdominal_organ_composition_mri_13_vars': 'Abdominal Organ Composition by MRI',
'struct_fast' : 'Regional grey matter volumes (FSL FAST)',
'struct_sub_first': 'Subcortical volumes (FSL FIRST)',

'struct_fs_aseg_mean_intensity' : 'ASEG Mean Intensity',
'struct_fs_aseg_volume' : 'ASEG Volume',


'struct_ba_exvivo_area' : 'BA ex-vivo Area',
'struct_ba_exvivo_mean_thickness' : 'BA ex-vivo Mean Thickness',
'struct_ba_exvivo_volume' : 'BA ex-vivo Volume',

'struct_a2009s_area' : 'a2009s Area',
'struct_a2009s_mean_thickness' : 'a2009s Mean Thickness',
'struct_a2009s_volume' : 'a2009s Volume',


'struct_dkt_area' : 'Desikan-Killiany-Tourville Area',
'struct_dkt_mean_thickness' : 'Desikan-Killiany-Tourville Mean Thickness',
'struct_dkt_volume' : 'Desikan-Killiany-Tourville Volume',


'struct_desikan_gw' : 'Desikan Grey/White Matter Contrast',
'struct_desikan_pial' : 'Desikan Pial',

'struct_desikan_white_area' : 'Desikan White Matter Area',
'struct_desikan_white_mean_thickness' : 'Desikan White Matter Mean Thickness',
'struct_desikan_white_volume' : 'Desikan White Matter Volume',
"struct_subsegmentation":'Subcortical Volumetric Subsegmentation',

'add_t1' : 'Whole-Brain T1w',
'add_t2' : 'Whole-Brain T2w',
"dwi_FA_tbss": "FA TBSS",
"dwi_FA_prob": "FA Probabilistic",
"dwi_MD_tbss": "MD TBSS",
"dwi_MD_prob": "MD Probabilistic",
"dwi_L1_tbss": "L1 TBSS",
"dwi_L1_prob": "L1 Probabilistic",
"dwi_L2_tbss": "L2 TBSS",
"dwi_L2_prob": "L2 Probabilistic",
"dwi_L3_tbss": "L3 TBSS",
"dwi_L3_prob": "L3 Probabilistic",
"dwi_MO_tbss": "MO TBSS",
"dwi_MO_prob": "MO Probabilistic",
"dwi_OD_tbss": "OD TBSS",
"dwi_OD_prob": "OD Probabilistic",
"dwi_ICVF_tbss": "ICVF TBSS",
"dwi_ICVF_prob": "ICVF Probabilistic",
"dwi_ISOVF_tbss": "ISOVF TBSS",
"dwi_ISOVF_prob": 'ISOVF Probabilistic',
"amplitudes_21": " 21 IC Amplitudes",
"amplitudes_55": "55 IC Amplitudes",
"full_correlation_21": "21 IC Full Correlation",
"full_correlation_55": "55 IC Full Correlation",
"partial_correlation_21": " 21 IC Partial Correlation",
"partial_correlation_55": " 55 IC Partial Correlation",
# aparc Tian S1 (I)
'aparc_Tian_S1_FA_i2': 'aparc-I FA',
'aparc_Tian_S1_Length_i2': 'aparc-I Length',
'aparc_Tian_S1_SIFT2_FBC_i2': 'aparc-I SIFT2 FBC',
'aparc_Tian_S1_Streamline_Count_i2': 'aparc-I Streamline Count',

# aparc a2009s Tian S1 (I)
'aparc_a2009s_Tian_S1_FA_i2': 'aparc.a2009s-I FA',
'aparc_a2009s_Tian_S1_Length_i2': 'aparc.a2009s-I Length',
'aparc_a2009s_Tian_S1_SIFT2_FBC_i2': 'aparc.a2009s-I SIFT2 FBC',
'aparc_a2009s_Tian_S1_Streamline_Count_i2': 'aparc.a2009s-I Streamline Count',

# Glasser Tian S1 (I)
'Glasser_Tian_S1_FA_i2': 'Glasser-I FA',
'Glasser_Tian_S1_Length_i2': 'Glasser-I Length',
'Glasser_Tian_S1_SIFT2_FBC_i2': 'Glasser-I SIFT2 FBC',
'Glasser_Tian_S1_Streamline_Count_i2': 'Glasser-I Streamline Count',

# Glasser Tian S4 (IV)
'Glasser_Tian_S4_FA_i2': 'Glasser-IV FA',
'Glasser_Tian_S4_Length_i2': 'Glasser-IV Length',
'Glasser_Tian_S4_SIFT2_FBC_i2': 'Glasser-IV SIFT2 FBC',
'Glasser_Tian_S4_Streamline_Count_i2': 'Glasser-IV Streamline Count',

# Schaefer7n1000p Tian S4 (IV) (in reality: Schaefer7n200p Tian S1)
'Schaefer7n1000p_Tian_S4_FA_i2': 'Schaefer7n200p-I FA', #'Schaefer7n1000p-IV FA',
'Schaefer7n1000p_Tian_S4_Length_i2': 'Schaefer7n200p-I Length',#'Schaefer7n1000p-IV Length',
'Schaefer7n1000p_Tian_S4_SIFT2_FBC_i2': 'Schaefer7n200p-I SIFT2 FBC',#'Schaefer7n1000p-IV SIFT2 FBC',
'Schaefer7n1000p_Tian_S4_Streamline_Count_i2': 'Schaefer7n200p-I Streamline Count', #'Schaefer7n1000p-IV Streamline Count'

# Schaefer7n200p Tian S4 (IV) (in reality: Schaefer7n500p Tian S4)
'Schaefer7n200p_Tian_S1_FA_i2': 'Schaefer7n500p-IV FA',
'Schaefer7n200p_Tian_S1_Length_i2': 'Schaefer7n500p-IV Length',
'Schaefer7n200p_Tian_S1_SIFT2_FBC_i2': 'Schaefer7n500p-IV SIFT2 FBC',
'Schaefer7n200p_Tian_S1_Streamline_Count_i2': 'Schaefer7n500p-IV Streamline Count',

# Schaefer7n500p Tian S4 (IV) (in reality: Schaefer7n1000p Tian S4)
'Schaefer7n500p_Tian_S4_FA_i2': 'Schaefer7n1000p-IV FA',
'Schaefer7n500p_Tian_S4_Length_i2': 'Schaefer7n1000p-IV Length',
'Schaefer7n500p_Tian_S4_SIFT2_FBC_i2': 'Schaefer7n1000p-IV SIFT2 FBC',
'Schaefer7n500p_Tian_S4_Streamline_Count_i2': 'Schaefer7n1000p-IV Streamline Count',

# Resting state 
'full_correlation_aparc_a2009s_Tian_S1' : 'aparc.a2009s-I Full Correlation',
'full_correlation_aparc_Tian_S1': 'aparc-I Full Correlation',
'full_correlation_Glasser_Tian_S1': 'Glasser-I Full Correlation',
'full_correlation_Glasser_Tian_S4': 'Glasser-IV Full Correlation',
'full_correlation_Schaefer7n200p_Tian_S1': 'Schaefer7n200p-I Full Correlation',
'full_correlation_Schaefer7n500p_Tian_S4': 'Schaefer7n500p-IV Full Correlation',
'partial_correlation_aparc_a2009s_Tian_S1': 'aparc.a2009s-I Partial Correlation',
'partial_correlation_aparc_Tian_S1': 'aparc-I Partial Correlation',
'partial_correlation_Glasser_Tian_S1': 'Glasser-I Partial Correlation',
'partial_correlation_Glasser_Tian_S4': 'Glasser-IV Partial Correlation',
'partial_correlation_Schaefer7n200p_Tian_S1': 'Schaefer7n200p-I Partial Correlation',
'partial_correlation_Schaefer7n500p_Tian_S4': 'Schaefer7n500p-IV Partial Correlation',

'lifestyle-envir': 'Lifestyle & Environment',

'allmri': '3 Brain MRI Modalities Stacked',
'dwi': 'Brain dwMRI Stacked',
'smri': 'Brain sMRI Stacked',
'rs': 'Brain rsMRI Stacked',
'body': 'Body Physiology Stacked',
'body-comp': 'Body Composition Stacked',
#'cardiopulmonary': 'Cardiopulmonary stacked',
'renal-hepatic': 'Renal & Hepatic Stacked',
'lifestyle-envir': 'Lifestyle & Environment',
'brain-plus-body': '3 Brain MRI Modalities & Body Stacked',
'brain-body': 'Brain & Body Stacked',
'body-only': 'Body Physiology and Composition Stacked' 
}

# Define modality names for renaming
modality_names = {
'hearing': 'Hearing',
'immune': 'Immune',
'renalhepatic': 'Renal & Hepatic',
'metabolic': 'Metabolic',
'cardiopulmonary': 'Cardiopulmonary',
'musculoskeletal': 'Musculoskeletal',
'bone_densitometry': 'Bone Densitometry of Heel',
'pwa': 'Pulse Wave Analysis',
'heart_mri': 'Heart MRI',
'carotid_ultrasound': 'Carotid Ultrasound',
'arterial_stiffness': 'Arterial Stiffness',
'ecg_rest': 'ECG at Rest',
'body_composition_by_impedance': 'Body Composition by Impedance',
'body_composition_dxa': 'Body Composition by DXA',
'bone_dxa': 'Bone Size, Mineral and Density by DXA',
'kidneys_mri': 'Kidney MRI',
'liver_mri': 'Liver MRI',
'abdominal_composition_mri_18_vars': 'Abdominal Composition by MRI',
'abdominal_organ_composition_mri_13_vars': 'Abdominal Organ Composition by MRI',
'struct_fast' : 'Regional grey matter volumes (FSL FAST)',
'struct_sub_first': 'Subcortical volumes (FSL FIRST)',

'struct_fs_aseg_mean_intensity' : 'ASEG Mean Intensity',
'struct_fs_aseg_volume' : 'ASEG Volume',


'struct_ba_exvivo_area' : 'BA ex-vivo Area',
'struct_ba_exvivo_mean_thickness' : 'BA ex-vivo Mean Thickness',
'struct_ba_exvivo_volume' : 'BA ex-vivo Volume',

'struct_a2009s_area' : 'a2009s Area',
'struct_a2009s_mean_thickness' : 'a2009s Mean Thickness',
'struct_a2009s_volume' : 'a2009s Volume',


'struct_dkt_area' : 'Desikan-Killiany-Tourville Area',
'struct_dkt_mean_thickness' : 'Desikan-Killiany-Tourville Mean Thickness',
'struct_dkt_volume' : 'Desikan-Killiany-Tourville Volume',


'struct_desikan_gw' : 'Desikan Grey/White Matter Contrast',
'struct_desikan_pial' : 'Desikan Pial',

'struct_desikan_white_area' : 'Desikan White Matter Area',
'struct_desikan_white_mean_thickness' : 'Desikan White Matter Mean Thickness',
'struct_desikan_white_volume' : 'Desikan White Matter Volume',
"struct_subsegmentation":'Subcortical Volumetric Subsegmentation',

'add_t1' : 'Whole-Brain T1w',
'add_t2' : 'Whole-Brain T2w',
"dwi_FA_tbss": "FA TBSS",
"dwi_FA_prob": "FA Probabilistic",
"dwi_MD_tbss": "MD TBSS",
"dwi_MD_prob": "MD Probabilistic",
"dwi_L1_tbss": "L1 TBSS",
"dwi_L1_prob": "L1 Probabilistic",
"dwi_L2_tbss": "L2 TBSS",
"dwi_L2_prob": "L2 Probabilistic",
"dwi_L3_tbss": "L3 TBSS",
"dwi_L3_prob": "L3 Probabilistic",
"dwi_MO_tbss": "MO TBSS",
"dwi_MO_prob": "MO Probabilistic",
"dwi_OD_tbss": "OD TBSS",
"dwi_OD_prob": "OD Probabilistic",
"dwi_ICVF_tbss": "ICVF TBSS",
"dwi_ICVF_prob": "ICVF Probabilistic",
"dwi_ISOVF_tbss": "ISOVF TBSS",
"dwi_ISOVF_prob": 'ISOVF Probabilistic',
"amplitudes_21": " 21 IC Amplitudes",
"amplitudes_55": "55 IC Amplitudes",
"full_correlation_21": "21 IC Full Correlation",
"full_correlation_55": "55 IC Full Correlation",
"partial_correlation_21": " 21 IC Partial Correlation",
"partial_correlation_55": " 55 IC Partial Correlation",
# aparc Tian S1 (I)
'aparc_Tian_S1_FA_i2': 'aparc-I FA',
'aparc_Tian_S1_Length_i2': 'aparc-I Length',
'aparc_Tian_S1_SIFT2_FBC_i2': 'aparc-I SIFT2 FBC',
'aparc_Tian_S1_Streamline_Count_i2': 'aparc-I Streamline Count',

# aparc a2009s Tian S1 (I)
'aparc_a2009s_Tian_S1_FA_i2': 'aparc.a2009s-I FA',
'aparc_a2009s_Tian_S1_Length_i2': 'aparc.a2009s-I Length',
'aparc_a2009s_Tian_S1_SIFT2_FBC_i2': 'aparc.a2009s-I SIFT2 FBC',
'aparc_a2009s_Tian_S1_Streamline_Count_i2': 'aparc.a2009s-I Streamline Count',

# Glasser Tian S1 (I)
'Glasser_Tian_S1_FA_i2': 'Glasser-I FA',
'Glasser_Tian_S1_Length_i2': 'Glasser-I Length',
'Glasser_Tian_S1_SIFT2_FBC_i2': 'Glasser-I SIFT2 FBC',
'Glasser_Tian_S1_Streamline_Count_i2': 'Glasser-I Streamline Count',

# Glasser Tian S4 (IV)
'Glasser_Tian_S4_FA_i2': 'Glasser-IV FA',
'Glasser_Tian_S4_Length_i2': 'Glasser-IV Length',
'Glasser_Tian_S4_SIFT2_FBC_i2': 'Glasser-IV SIFT2 FBC',
'Glasser_Tian_S4_Streamline_Count_i2': 'Glasser-IV Streamline Count',

# Schaefer7n1000p Tian S4 (IV) (in reality: Schaefer7n200p Tian S1)
'Schaefer7n1000p_Tian_S4_FA_i2': 'Schaefer7n200p-I FA', #'Schaefer7n1000p-IV FA',
'Schaefer7n1000p_Tian_S4_Length_i2': 'Schaefer7n200p-I Length',#'Schaefer7n1000p-IV Length',
'Schaefer7n1000p_Tian_S4_SIFT2_FBC_i2': 'Schaefer7n200p-I SIFT2 FBC',#'Schaefer7n1000p-IV SIFT2 FBC',
'Schaefer7n1000p_Tian_S4_Streamline_Count_i2': 'Schaefer7n200p-I Streamline Count', #'Schaefer7n1000p-IV Streamline Count'

# Schaefer7n200p Tian S4 (IV) (in reality: Schaefer7n500p Tian S4)
'Schaefer7n200p_Tian_S1_FA_i2': 'Schaefer7n500p-IV FA',
'Schaefer7n200p_Tian_S1_Length_i2': 'Schaefer7n500p-IV Length',
'Schaefer7n200p_Tian_S1_SIFT2_FBC_i2': 'Schaefer7n500p-IV SIFT2 FBC',
'Schaefer7n200p_Tian_S1_Streamline_Count_i2': 'Schaefer7n500p-IV Streamline Count',

# Schaefer7n500p Tian S4 (IV) (in reality: Schaefer7n1000p Tian S4)
'Schaefer7n500p_Tian_S4_FA_i2': 'Schaefer7n1000p-IV FA',
'Schaefer7n500p_Tian_S4_Length_i2': 'Schaefer7n1000p-IV Length',
'Schaefer7n500p_Tian_S4_SIFT2_FBC_i2': 'Schaefer7n1000p-IV SIFT2 FBC',
'Schaefer7n500p_Tian_S4_Streamline_Count_i2': 'Schaefer7n1000p-IV Streamline Count',

# Resting state 
'full_correlation_aparc_a2009s_Tian_S1' : 'aparc.a2009s-I Full Correlation',
'full_correlation_aparc_Tian_S1': 'aparc-I Full Correlation',
'full_correlation_Glasser_Tian_S1': 'Glasser-I Full Correlation',
'full_correlation_Glasser_Tian_S4': 'Glasser-IV Full Correlation',
'full_correlation_Schaefer7n200p_Tian_S1': 'Schaefer7n200p-I Full Correlation',
'full_correlation_Schaefer7n500p_Tian_S4': 'Schaefer7n500p-IV Full Correlation',
'partial_correlation_aparc_a2009s_Tian_S1': 'aparc.a2009s-I Partial Correlation',
'partial_correlation_aparc_Tian_S1': 'aparc-I Partial Correlation',
'partial_correlation_Glasser_Tian_S1': 'Glasser-I Partial Correlation',
'partial_correlation_Glasser_Tian_S4': 'Glasser-IV Partial Correlation',
'partial_correlation_Schaefer7n200p_Tian_S1': 'Schaefer7n200p-I Partial Correlation',
'partial_correlation_Schaefer7n500p_Tian_S4': 'Schaefer7n500p-IV Partial Correlation',

'lifestyle-envir': 'Lifestyle & Environment',

'allmri': '3 Brain MRI Modalities Stacked',
'dwi': 'Brain dwMRI Stacked',
'smri': 'Brain sMRI Stacked',
'rs': 'Brain rsMRI Stacked',
'body': 'Body Physiology Stacked',
'body-comp': 'Body Composition Stacked',
#'cardiopulmonary': 'Cardiopulmonary stacked',
'renal-hepatic': 'Renal & Hepatic Stacked',
'lifestyle-envir': 'Lifestyle & Environment',
'brain-plus-body': '3 Brain MRI Modalities & Body Stacked',
'brain-body': 'Brain & Body Stacked',
'body-only': 'Body Physiology and Composition Stacked' 
}

In [4]:
# Define modalities
modalities_body = [
'immune',
'renalhepatic',
'metabolic',
'cardiopulmonary',
'musculoskeletal',
'bone_densitometry',
'pwa',
'heart_mri',
'carotid_ultrasound',
'arterial_stiffness',
'ecg_rest',
'body_composition_by_impedance',
'body_composition_dxa',
'bone_dxa',
'kidneys_mri',
'liver_mri',
'abdominal_composition_mri_18_vars', #17 vars
'abdominal_organ_composition_mri_13_vars', #12 vars
'hearing'
]

modalities_brain = [
'struct_fast',
'struct_sub_first',
'struct_fs_aseg_mean_intensity',
'struct_fs_aseg_volume',
'struct_ba_exvivo_area', 
'struct_ba_exvivo_mean_thickness',
'struct_ba_exvivo_volume',
'struct_a2009s_area',
'struct_a2009s_mean_thickness',
'struct_a2009s_volume',
'struct_dkt_area',
'struct_dkt_mean_thickness',
'struct_dkt_volume',
'struct_desikan_gw',
'struct_desikan_pial',
'struct_desikan_white_area',
'struct_desikan_white_mean_thickness',
'struct_desikan_white_volume',
'struct_subsegmentation',
'add_t1',
'add_t2',

"dwi_FA_tbss", "dwi_FA_prob",
"dwi_MD_tbss", "dwi_MD_prob",
"dwi_L1_tbss", "dwi_L1_prob",
"dwi_L2_tbss", "dwi_L2_prob",
"dwi_L3_tbss", "dwi_L3_prob",
"dwi_MO_tbss", "dwi_MO_prob",
"dwi_OD_tbss", "dwi_OD_prob",
"dwi_ICVF_tbss", "dwi_ICVF_prob",
"dwi_ISOVF_tbss", "dwi_ISOVF_prob",

'aparc_Tian_S1_FA_i2',
'aparc_Tian_S1_Length_i2',
'aparc_Tian_S1_SIFT2_FBC_i2',
'aparc_Tian_S1_Streamline_Count_i2',

'aparc_a2009s_Tian_S1_FA_i2',
'aparc_a2009s_Tian_S1_Length_i2',
'aparc_a2009s_Tian_S1_SIFT2_FBC_i2',
'aparc_a2009s_Tian_S1_Streamline_Count_i2',

'Glasser_Tian_S1_FA_i2',
'Glasser_Tian_S1_Length_i2',
'Glasser_Tian_S1_SIFT2_FBC_i2',
'Glasser_Tian_S1_Streamline_Count_i2',

'Glasser_Tian_S4_FA_i2',
'Glasser_Tian_S4_Length_i2',
'Glasser_Tian_S4_SIFT2_FBC_i2',
'Glasser_Tian_S4_Streamline_Count_i2',

'Schaefer7n200p_Tian_S1_FA_i2',
'Schaefer7n200p_Tian_S1_Length_i2',
'Schaefer7n200p_Tian_S1_SIFT2_FBC_i2',
'Schaefer7n200p_Tian_S1_Streamline_Count_i2',

'Schaefer7n1000p_Tian_S4_FA_i2',
'Schaefer7n1000p_Tian_S4_Length_i2',
'Schaefer7n1000p_Tian_S4_SIFT2_FBC_i2',
'Schaefer7n1000p_Tian_S4_Streamline_Count_i2',

"amplitudes_21",
"full_correlation_21",
"partial_correlation_21",
"amplitudes_55",
"full_correlation_55",
"partial_correlation_55",
'full_correlation_aparc_a2009s_Tian_S1',
'full_correlation_aparc_Tian_S1',
'full_correlation_Glasser_Tian_S1',
'full_correlation_Glasser_Tian_S4',
'full_correlation_Schaefer7n200p_Tian_S1',
'full_correlation_Schaefer7n500p_Tian_S4',
'partial_correlation_aparc_a2009s_Tian_S1',
'partial_correlation_aparc_Tian_S1',
'partial_correlation_Glasser_Tian_S1',
'partial_correlation_Glasser_Tian_S4',
'partial_correlation_Schaefer7n200p_Tian_S1',
'partial_correlation_Schaefer7n500p_Tian_S4'
]

In [None]:
# Define a function that renamed columns based on modality map
def clean_modality_name(name, prefix_to_remove=None):

    clean_name = name
    
    # Remove specified prefix(es) if provided
    if prefix_to_remove is not None:
        if isinstance(prefix_to_remove, str):
            # Single prefix
            if clean_name.startswith(prefix_to_remove):
                clean_name = clean_name.replace(prefix_to_remove, '', 1)
        elif isinstance(prefix_to_remove, list):
            # Multiple prefixes - remove each one that matches
            for prefix in prefix_to_remove:
                if clean_name.startswith(prefix):
                    clean_name = clean_name.replace(prefix, '', 1)
                    break  # Remove only the first matching prefix
    
    # Use modality_map for display name, fallback to original if not found
    return modality_map.get(clean_name, clean_name)

In [None]:
# Configuration
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime
folds = range(0, 5)
base_path = '/UK_BB/brainbody'
fig_path = '/UK_BB/brainbody/figures'

# Define body modalities
modalities_body = [
    'immune',
    'renalhepatic',
    'metabolic',
    'cardiopulmonary',
    'musculoskeletal',
    'bone_densitometry',
    'pwa',
    'heart_mri',
    'carotid_ultrasound',
    'arterial_stiffness',
    'ecg_rest',
    'body_composition_by_impedance',
    'body_composition_dxa',
    'bone_dxa',
    'kidneys_mri',
    'liver_mri',
    'abdominal_composition_mri_18_vars',
    'abdominal_organ_composition_mri_13_vars',
    'hearing'
]

# Demographics confounds
demo = pd.read_csv('/UK_BB/brainbody/demographics_full.csv')
# Rename columns and count NAs
df_demo_i2 = demo[[
'eid',
'31-0.0',
'21000-0.0',
'21003-2.0',]]
demo_full = df_demo_i2.rename(columns={
'31-0.0':'Sex',
'21000-0.0':'Ethnicity',
'21003-2.0':'Age',
})

age = demo_full['Age']
#/media/hcs-sci-psy-narun
sex = demo_full['Sex']

### Compute correlation between g-factors predicted from each modality with composite body marker

*g pred stack CORR g pred each body modality*

Supplementary S7

In [None]:
# Combine g-factors predicted from the stacked body model across five folds
g_pred_stacked = []
for fold in folds:
    try:
        stacked_path = os.path.join(
            base_path,
            'stacking/body',
            'folds',
            f'fold_{fold}',
            'g_pred',
            f'body_target_pred_2nd_level_0_outer_test_fold_{fold}.csv'
        )
        df = pd.read_csv(stacked_path)
        g_pred_stacked.append(df)
    except Exception as e:
        print(f"Error loading stacked g-factor for fold {fold}: {str(e)}")

# Combine stacked predictions
g_pred_stacked = pd.concat(g_pred_stacked, axis=0, ignore_index=True)

# -------------------------------------------------------------------
# Merge modality-specific g-factor predictions
# -------------------------------------------------------------------
modality_frames = []
for modality in modalities_body:
    modality_data = []
    for fold in folds:
        try:
            modality_path = os.path.join(
                base_path,
                'lifestyle-envir-body',
                'folds',
                f'fold_{fold}',
                'g_pred',
                f'{modality}_g_pred_XGB_test_with_id_fold_{fold}.csv'
            )
            df = pd.read_csv(modality_path)
            modality_data.append(df[['eid', f'g_pred_test_{modality}']])
        except Exception as e:
            print(f"Error loading {modality} for fold {fold}: {str(e)}")
    if modality_data:
        combined_modality = pd.concat(modality_data, axis=0, ignore_index=True)
        modality_frames.append(combined_modality)

# First modality DataFrame
merged_modalities = modality_frames[0]
# Merge the rest one by one
for df in modality_frames[1:]:
    merged_modalities = pd.merge(merged_modalities, df, on='eid', how='outer')

# Merge with stacked g-factor predictions
combined = pd.merge(merged_modalities, g_pred_stacked[['eid', f'g_pred_stack_test']], on='eid', how='inner')
combined.to_csv(os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', f'g_pred_from_body_mod_g_pred_stack_combined_{timestamp}.csv'), index=False)
print(combined.shape)

(31897, 21)


In [None]:
# Compute correlations
correlations = []
for col in combined.columns:
    if col not in ['eid', 'g_pred_stack_test']:
        try:
            x = combined['g_pred_stack_test']
            y = combined[col]
            mask = ~x.isna() & ~y.isna()
            if mask.sum() >= 2:
                r, p = stats.pearsonr(x[mask], y[mask])
            else:
                r, p = np.nan, np.nan
            correlations.append({'Modality': col, 'Pearson r': r, 'p-value': p})
        except Exception as e:
            print(f"Error correlating {col}: {str(e)}")
            correlations.append({'Modality': col, 'Pearson r': np.nan, 'p-value': np.nan})

# Save results
corr_results = pd.DataFrame(correlations).sort_values('Pearson r', ascending=False).reset_index(drop=True)

# Save to CSV
corr_results.to_csv(os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', 'g_pred_from_body_mod_g_pred_stack_correlations.csv'), index=False)

# Apply renaming function before saving to Excel
corr_results['Modality'] = corr_results['Modality'].apply(
    lambda x: clean_modality_name(x, prefix_to_remove='g_pred_test_')
)

# Save to Excel
corr_results.to_excel(
    os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', 'g_pred_from_body_mod_g_pred_stack_correlations.xlsx'),
    index=False,
    engine='openpyxl'
)

print("Correlation analysis complete. Results saved to CSV and Excel.")

Correlation analysis complete. Results saved to CSV and Excel.


### Correlation between body features and g-factors predicted from each body modality

*g pred each body modality CORR scaled body features*

Supplementary S7

In [None]:
# Compute correlation between features and predicted g-factor predicted from each modality
modality_results = {}
unsorted_results = {}

# Process each body modality
for modality in modalities_body:
    features, g_pred = [], []
    
    for fold in folds:
        try:
            # Path to predicted g-factor values (consistent location)
            test_path = os.path.join(
                base_path, 
                'lifestyle-envir-body',
                'folds', 
                f'fold_{fold}', 
                'g_pred', 
                f'{modality}_g_pred_XGB_test_with_id_fold_{fold}.csv'
            )
            
            # Special path handling for hearing
            if modality == 'hearing':
                features_path = os.path.join(
                    base_path,
                    'hearing-vision',  # Different path for hearing
                    'folds',
                    f'fold_{fold}',
                    'scaling',
                    f'{modality}_test_scaled_fold_{fold}.csv'
                )
            else:
                # Standard path for other body modalities
                features_path = os.path.join(
                    base_path,
                    'body',
                    'folds',
                    f'fold_{fold}',
                    'scaling',
                    f'{modality}_test_scaled_fold_{fold}.csv'
                )

            # Read data
            g_pred_test = pd.read_csv(test_path)
            features_corrected = pd.read_csv(features_path)
            
            features.append(features_corrected)
            g_pred.append(g_pred_test)
            
        except Exception as e:
            print(f"Error processing fold {fold} for {modality}: {str(e)}")
            continue
    
    if not features or not g_pred:
        print(f"Skipping {modality} - no valid data")
        continue
    
    # Concatenate and calculate correlations
    features_concat = pd.concat(features, axis=0, ignore_index=True)
    g_pred_concat = pd.concat(g_pred, axis=0, ignore_index=True)
    
    correlations = []
    for feature in features_concat.columns:
        try:
            x = g_pred_concat[f'g_pred_test_{modality}'].values
            y = features_concat[feature].values
            mask = ~np.isnan(y)
            
            if sum(mask) >= 2:
                r_pred, p_pred = stats.pearsonr(x[mask], y[mask])
            else:
                r_pred, p_pred = np.nan, np.nan
                
            correlations.append({
                'Feature': feature,
                'Pearson r': r_pred,
                'p-value': p_pred
            })
        except Exception as e:
            print(f"Error calculating {feature} in {modality}: {str(e)}")
            correlations.append({
                'Feature': feature,
                'Pearson r': np.nan,
                'p-value': np.nan
            })
    
    # Store both unsorted and sorted versions
    df = pd.DataFrame(correlations)
    unsorted_results[modality] = df.copy()
    modality_results[modality] = df.sort_values('Pearson r', ascending=False).reset_index(drop=True)
    
    # Save individual CSV
    os.makedirs(os.path.join(base_path, 'feature_imp', 'feature_imp_body'), exist_ok=True)
    output_path = os.path.join(base_path, f'feature_imp/feature_imp_body/{modality}_corr_with_g_pred.csv')
    df.round(4).to_csv(output_path, index=False)
    print(f"Saved CSV for {modality}")

########################################################

# Save Excel files for body modalities
with pd.ExcelWriter(os.path.join(base_path, 'feature_imp/feature_imp_body', f'body_modalities_unsorted.xlsx'), 
                   engine='openpyxl') as writer:
    for modality, df in unsorted_results.items():
        try:
            # Use original modality name for sheet name, truncate to 31 chars
            sheet_name = str(modality)[:31]
            df.round(4).to_excel(writer, sheet_name=sheet_name, index=False)
        except Exception as e:
            print(f"Error saving {modality} to unsorted Excel: {str(e)}")
            continue

with pd.ExcelWriter(os.path.join(base_path, 'feature_imp/feature_imp_body', f'body_modalities_sorted.xlsx'),
                   engine='openpyxl') as writer:
    # Individual sorted sheets
    for modality, df in modality_results.items():
        try:
            sheet_name = str(modality)[:31]
            df.round(4).to_excel(writer, sheet_name=sheet_name, index=False)
        except Exception as e:
            print(f"Error saving {modality} to sorted Excel: {str(e)}")
            continue
    
    # Combined sorted sheet with pretty names
    try:
        combined_sorted = pd.concat(
            [df.assign(Modality=modality_names.get(modality, modality))  # Use get() with fallback
             for modality, df in modality_results.items()],
            axis=0, ignore_index=True
        )
        combined_sorted.round(4).to_excel(
            writer, 
            sheet_name='All_body_sorted', 
            index=False
        )
    except Exception as e:
        print(f"Error creating combined sheet: {str(e)}")

print("\nProcessing complete! Created:")
print(f"- body_modalities_unsorted.xlsx (original names)")
print(f"- body_modalities_sorted.xlsx (sorted with combined 'All_body_sorted' sheet)")

print("\nProcessing complete! Created:")
print(f"- Individual CSV files for each body modality in 'feature_imp_body' folder")
print(f"- Excel with original names, unsorted results")
print(f"- Excel with original names (sorted) + combined pretty-named sorted sheet")

### Correlation between body features and composite body marker

*g pred body stack CORR scaled body features*

Supplementary S8

In [7]:
# Define a function that renamed columns based on modality map
def clean_modality_name(name, prefix_to_remove=None):

    clean_name = name
    
    # Remove specified prefix(es) if provided
    if prefix_to_remove is not None:
        if isinstance(prefix_to_remove, str):
            # Single prefix
            if clean_name.startswith(prefix_to_remove):
                clean_name = clean_name.replace(prefix_to_remove, '', 1)
        elif isinstance(prefix_to_remove, list):
            # Multiple prefixes - remove each one that matches
            for prefix in prefix_to_remove:
                if clean_name.startswith(prefix):
                    clean_name = clean_name.replace(prefix, '', 1)
                    break  # Remove only the first matching prefix
    
    # Use modality_map for display name, fallback to original if not found
    return modality_map.get(clean_name, clean_name)

In [8]:
# Combine g-factors predicted from the stacked body model across five folds
g_pred_stacked = []
for fold in folds:
    try:
        stacked_path = os.path.join(
            base_path,
            'stacking/body',
            'folds',
            f'fold_{fold}',
            'g_pred',
            f'body_target_pred_2nd_level_0_outer_test_fold_{fold}.csv'
        )
        df = pd.read_csv(stacked_path)
        g_pred_stacked.append(df)
    except Exception as e:
        print(f"Error loading stacked g-factor for fold {fold}: {str(e)}")

# Combine stacked predictions
g_pred_stacked = pd.concat(g_pred_stacked, axis=0, ignore_index=True)

# -------------------------------------------------------------------
# Pool scaled features and merge with eid from pre-scaled files
all_features = []
for modality in modalities_body:
    modality_features = []
    for fold in folds:
        try:
            # Load scaled features
            if modality == 'hearing':
                scaled_path = os.path.join(
                    base_path,
                    'hearing-vision',
                    'folds',
                    f'fold_{fold}',
                    'scaling',
                    f'{modality}_test_scaled_fold_{fold}.csv'
                )
                prescaled_path = os.path.join(
                    base_path,
                    'hearing-vision',
                    'folds',
                    f'fold_{fold}',
                    'suppl',
                    f'{modality}_test_feat_targ_fold_{fold}.csv'
                )
            else:
                scaled_path = os.path.join(
                    base_path,
                    'body',
                    'folds',
                    f'fold_{fold}',
                    'scaling',
                    f'{modality}_test_scaled_fold_{fold}.csv'
                )
                prescaled_path = os.path.join(
                    base_path,
                    'body',
                    'folds',
                    f'fold_{fold}',
                    'suppl',
                    f'{modality}_test_feat_targ_fold_{fold}.csv'
                )

            scaled_df = pd.read_csv(scaled_path)
            prescaled_df = pd.read_csv(prescaled_path)

            # Merge eid with scaled features
            merged_df = pd.concat([prescaled_df[['eid']], scaled_df], axis=1)
            modality_features.append(merged_df)
        except Exception as e:
            print(f"Error processing {modality}, fold {fold}: {str(e)}")
    if modality_features:
        combined_modality = pd.concat(modality_features, axis=0, ignore_index=True)
        all_features.append(combined_modality)

In [9]:
# Identify duplicate feature names across all modalities
print("=== Identifying duplicate features ===")

# Collect all feature names (excluding 'eid') from each modality
all_feature_names = {}
for i, df in enumerate(all_features):
    modality = modalities_body[i]
    features = [col for col in df.columns if col != 'eid']
    all_feature_names[modality] = set(features)
    print(f"{modality}: {len(features)} features")

# Find features that appear in multiple modalities
feature_counts = {}
for modality, features in all_feature_names.items():
    for feature in features:
        if feature not in feature_counts:
            feature_counts[feature] = []
        feature_counts[feature].append(modality)

# Show duplicate features
duplicate_features = {feature: mods for feature, mods in feature_counts.items() if len(mods) > 1}
print(f"\nFound {len(duplicate_features)} features in multiple modalities:")
for feature, mods in duplicate_features.items():
    print(f"  '{feature}': {mods}")

=== Identifying duplicate features ===
immune: 32 features
renalhepatic: 16 features
metabolic: 14 features
cardiopulmonary: 7 features
musculoskeletal: 13 features
bone_densitometry: 4 features
pwa: 18 features
heart_mri: 8 features
carotid_ultrasound: 12 features
arterial_stiffness: 4 features
ecg_rest: 9 features
body_composition_by_impedance: 32 features
body_composition_dxa: 43 features
bone_dxa: 68 features
kidneys_mri: 4 features
liver_mri: 2 features
abdominal_composition_mri_18_vars: 17 features
abdominal_organ_composition_mri_13_vars: 12 features
hearing: 2 features

Found 4 features in multiple modalities:
  'Pulse rate': ['cardiopulmonary', 'arterial_stiffness']
  'Weight': ['musculoskeletal', 'body_composition_by_impedance']
  'Body mass index (BMI)': ['musculoskeletal', 'body_composition_by_impedance']
  'Trunk fat mass': ['body_composition_by_impedance', 'body_composition_dxa']


In [None]:
# Define which features need modality suffixes (for duplicates)
modality_suffix_map = {
    'body_composition_by_impedance': ' (Impedance)',
    'body_composition_dxa': ' (DXA)', 
    'musculoskeletal': ' (Musculoskeletal)',
    'cardiopulmonary': ' (Cardiopulmonary)',
    'arterial_stiffness': ' (Arterial Stiffness)'
}

# List of features that appear in multiple modalities
duplicate_features = ['Trunk fat mass', 'Body mass index (BMI)', 'Weight', 'Pulse rate']

# Add suffixes to duplicate features
for i, df in enumerate(all_features):
    modality = modalities_body[i]
    if modality in modality_suffix_map:
        suffix = modality_suffix_map[modality]
        rename_dict = {}
        for col in df.columns:
            if col != 'eid' and col in duplicate_features:
                rename_dict[col] = f"{col}{suffix}"
        if rename_dict:
            all_features[i] = df.rename(columns=rename_dict)
            print(f"Added '{suffix}' to {len(rename_dict)} features in {modality}")

# Merge all modalities column-wise on 'eid' without _x/_y suffixes for duplicates
features_all_modalities = all_features[0]
for df in all_features[1:]:
    features_all_modalities = pd.merge(features_all_modalities, df, on='eid', how='outer')

print("\n=== Checking FINAL merged dataframe ===")
print(f"Total columns in features_all_modalities: {len(features_all_modalities.columns)}")

# Check if renamed columns exist in the final dataframe
renamed_in_final = [col for col in features_all_modalities.columns if any(feature in col for feature in duplicate_features)]
print(f"Found {len(renamed_in_final)} renamed columns in features_all_modalities:")
for col in renamed_in_final:
    print(f"  {col}")

# Check if any original duplicate names still exist
original_duplicates_in_final = [col for col in features_all_modalities.columns if col in duplicate_features]
print(f"Original duplicate names still in dataframe: {original_duplicates_in_final}")

Added ' (Cardiopulmonary)' to 1 features in cardiopulmonary
Added ' (Musculoskeletal)' to 2 features in musculoskeletal
Added ' (Arterial Stiffness)' to 1 features in arterial_stiffness
Added ' (Impedance)' to 3 features in body_composition_by_impedance
Added ' (DXA)' to 1 features in body_composition_dxa

=== Checking FINAL merged dataframe ===
Total columns in features_all_modalities: 318
Found 9 renamed columns in features_all_modalities:
  Pulse rate (Cardiopulmonary)
  Body mass index (BMI) (Musculoskeletal)
  Weight (Musculoskeletal)
  Pulse rate (Arterial Stiffness)
  Weight (Impedance)
  Body mass index (BMI) (Impedance)
  Trunk fat mass (Impedance)
  Trunk fat mass (DXA)
  Weight-to-muscle ratio
Original duplicate names still in dataframe: []


In [None]:
# Merge features with stacked g-factor predictions on 'eid'
combined = pd.merge(features_all_modalities, g_pred_stacked[['eid', 'g_pred_stack_test']], on='eid', how='inner')

# Save the final combined DataFrame
output_path = os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', f'body_features_g_pred_stack_combined.csv')
combined.to_csv(output_path, index=False)

print(f"Final combined shape: {combined.shape}")

# Check if the column exists
if 'Body mass index (BMI) (Musculoskeletal)' in combined.columns:
    print("Column exists! Here are the first few values:")
    print(combined['Body mass index (BMI) (Musculoskeletal)'].head())
else:
    print("Column NOT found. Available columns with 'BMI':")
    bmi_cols = [col for col in combined.columns if 'BMI' in col]
    for col in bmi_cols:
        print(f"  '{col}'")

In [None]:
# Compute correlations with stacked g-factor
correlations = []
for col in combined.columns:
    if col not in ['eid', 'g_pred_stack_test']:
        try:
            x = combined['g_pred_stack_test']
            y = combined[col]
            mask = ~x.isna() & ~y.isna()
            if mask.sum() >= 2:
                r, p = stats.pearsonr(x[mask], y[mask])
            else:
                r, p = np.nan, np.nan
            correlations.append({'Phenotype': col, 'Pearson r': r, 'p-value': p})
        except Exception as e:
            print(f"Error correlating {col}: {str(e)}")
            correlations.append({'Phenotype': col, 'Pearson r': np.nan, 'p-value': np.nan})

# Save results
corr_results = pd.DataFrame(correlations).sort_values('Pearson r', ascending=False).reset_index(drop=True)

# Save to CSV
corr_results.to_csv(os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', 'body_features_g_pred_stack_correlations.csv'), index=False)

# Apply renaming function before saving to Excel
corr_results['Phenotype'] = corr_results['Phenotype'].apply(
    lambda x: clean_modality_name(x, prefix_to_remove='g_pred_test_')
)

# Save to Excel
corr_results.to_excel(
    os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', 'body_features_g_pred_stack_correlations.xlsx'),
    index=False,
    engine='openpyxl'
)

print("Correlation analysis complete. Results saved to CSV and Excel.")

Correlation analysis complete. Results saved to CSV and Excel.


In [18]:
# Stratify analysis by sex
demo = demo_full.copy()

# Merge demographics with combined data
combined_with_demo = pd.merge(combined, demo[['eid', 'Sex']], on='eid', how='left')

# Compute correlations with stacked g-factor
correlations = []
for col in combined.columns:
    if col not in ['eid', 'g_pred_stack_test']:
        try:
            x = combined['g_pred_stack_test']
            y = combined[col]
            
            # Create mask for complete cases
            mask = ~x.isna() & ~y.isna()
            sample_size = mask.sum()
            missing_count = len(x) - sample_size
            
            if sample_size >= 2:
                # Overall correlation
                r, p = stats.pearsonr(x[mask], y[mask])
                
                # Male-only correlation
                #1	Male
                #0	Female
                mask_male = mask & (combined_with_demo['Sex'] == 1)  # 1 = male
                sample_size_male = mask_male.sum()
                if sample_size_male >= 2:
                    r_male, p_male = stats.pearsonr(x[mask_male], y[mask_male])
                else:
                    r_male, p_male = np.nan, np.nan
                
                # Female-only correlation  
                mask_female = mask & (combined_with_demo['Sex'] == 0)  # 0 = female
                sample_size_female = mask_female.sum()
                if sample_size_female >= 2:
                    r_female, p_female = stats.pearsonr(x[mask_female], y[mask_female])
                else:
                    r_female, p_female = np.nan, np.nan
                    
            else:
                r, p = np.nan, np.nan
                r_male, p_male = np.nan, np.nan
                r_female, p_female = np.nan, np.nan
                sample_size_male = np.nan
                sample_size_female = np.nan
                
            correlations.append({
                'Phenotype': col, 
                'Pearson r': r, 
                'p-value': p,
                'N': sample_size,
                'N missing': missing_count,
                'Pearson r male': r_male,
                'p-value male': p_male,
                'N male': sample_size_male,
                'Pearson r female': r_female, 
                'p-value female': p_female,
                'N female': sample_size_female
            })
            
        except Exception as e:
            print(f"Error correlating {col}: {str(e)}")
            correlations.append({
                'Phenotype': col, 
                'Pearson r': np.nan, 
                'p-value': np.nan,
                'N': np.nan,
                'N missing': np.nan,
                'Pearson r male': np.nan,
                'p-value male': np.nan,
                'N male': np.nan,
                'Pearson r female': np.nan,
                'p-value female': np.nan,
                'N female': np.nan
            })

# Save results
corr_results = pd.DataFrame(correlations)#.sort_values('Pearson r', ascending=False).reset_index(drop=True)

# Calculate number of valid tests for Bonferroni correction
valid_tests = corr_results['p-value'].notna().sum()
valid_tests_male = corr_results['p-value male'].notna().sum()
valid_tests_female = corr_results['p-value female'].notna().sum()

# Apply Bonferroni correction
corr_results['p-value bonferroni'] = corr_results['p-value'].apply(
    lambda x: min(x * valid_tests, 1.0) if not pd.isna(x) else np.nan
)
corr_results['p-value male bonferroni'] = corr_results['p-value male'].apply(
    lambda x: min(x * valid_tests_male, 1.0) if not pd.isna(x) else np.nan
)
corr_results['p-value female bonferroni'] = corr_results['p-value female'].apply(
    lambda x: min(x * valid_tests_female, 1.0) if not pd.isna(x) else np.nan
)

# Print summary statistics
print(f"Total number of phenotypes analyzed: {len(corr_results)}")
print(f"Number of valid tests for Bonferroni correction: {valid_tests}")
print(f"Number of valid male tests for Bonferroni correction: {valid_tests_male}")
print(f"Number of valid female tests for Bonferroni correction: {valid_tests_female}")
print(f"Average sample size: {corr_results['N'].mean():.0f}")
print(f"Total missing values across all phenotypes: {corr_results['N missing'].sum():,}")

# Print sample size ranges
print(f"\nSample size range: {corr_results['N'].min():.0f} - {corr_results['N'].max():.0f}")
print(f"Male sample size range: {corr_results['N male'].min():.0f} - {corr_results['N male'].max():.0f}")
print(f"Female sample size range: {corr_results['N female'].min():.0f} - {corr_results['N female'].max():.0f}")

corr_results_sorted = pd.DataFrame(corr_results).sort_values('Pearson r', ascending=False).reset_index(drop=True)

# Save to CSV
corr_results.to_csv(os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', 'body_features_g_pred_stack_correlations_detailed.csv'), index=False)
corr_results_sorted.to_csv(os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', 'body_features_g_pred_stack_correlations_detailed_sorted.csv'), index=False)

# Apply renaming function before saving to Excel
corr_results['Phenotype'] = corr_results['Phenotype'].apply(
    lambda x: clean_modality_name(x, prefix_to_remove='g_pred_test_')
)

# Save to Excel with multiple sheets
with pd.ExcelWriter(
    os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', 'body_features_g_pred_stack_correlations_detailed.xlsx'),
    engine='openpyxl'
) as writer:
    
    # Main results
    corr_results.to_excel(writer, sheet_name='All_correlations', index=False)
    
    # Male-only results
    male_results = corr_results[['Phenotype', 'Pearson r male', 'p-value male', 'p-value male bonferroni', 'N male']].copy()
    male_results = male_results.sort_values('Pearson r male', ascending=False)
    male_results.to_excel(writer, sheet_name='Male_correlations', index=False)
    
    # Female-only results  
    female_results = corr_results[['Phenotype', 'Pearson r female', 'p-value female', 'p-value female bonferroni', 'N female']].copy()
    female_results = female_results.sort_values('Pearson r female', ascending=False)
    female_results.to_excel(writer, sheet_name='Female_correlations', index=False)
    
    # Summary statistics
    summary_data = {
        'Metric': ['Total phenotypes', 'Valid tests for Bonferroni', 'Valid male tests', 'Valid female tests',
                  'Mean sample size', 'Total missing values',
                  'Min sample size', 'Max sample size',
                  'Min male sample size', 'Max male sample size',
                  'Min female sample size', 'Max female sample size'],
        'Value': [len(corr_results), valid_tests, valid_tests_male, valid_tests_female,
                 corr_results['N'].mean(), corr_results['N missing'].sum(),
                 corr_results['N'].min(), corr_results['N'].max(),
                 corr_results['N male'].min(), corr_results['N male'].max(),
                 corr_results['N female'].min(), corr_results['N female'].max()]
    }

    # Check for missing values
    print(f"Total rows in combined: {len(combined)}")
    print(f"Total rows in demo: {len(demo)}")
    print(f"Rows after merge: {len(combined_with_demo)}")
    
    # Check missing sex information
    print(f"\nMissing Sex values in combined_with_demo: {combined_with_demo['Sex'].isna().sum()}")
    print(f"Unique Sex values: {combined_with_demo['Sex'].dropna().unique()}")
    
    # Check for NaN in g_pred_stack_test
    print(f"\nMissing in g_pred_stack_test: {combined['g_pred_stack_test'].isna().sum()}")

    summary_df = pd.DataFrame(summary_data)
    summary_df.to_excel(writer, sheet_name='Summary_stats', index=False)

# Save to Excel with multiple sheets - sorted
with pd.ExcelWriter(
    os.path.join(base_path, 'feature_imp', 'feature_imp_body', 'combined', 'body_features_g_pred_stack_correlations_detailed_sorted.xlsx'),
    engine='openpyxl'
) as writer:
    
    # Main results
    corr_results_sorted.to_excel(writer, sheet_name='All_correlations', index=False)
    
    # Male-only results
    male_results = corr_results_sorted[['Phenotype', 'Pearson r male', 'p-value male', 'p-value male bonferroni', 'N male']].copy()
    male_results = male_results.sort_values('Pearson r male', ascending=False)
    male_results.to_excel(writer, sheet_name='Male_correlations', index=False)
    
    # Female-only results  
    female_results = corr_results_sorted[['Phenotype', 'Pearson r female', 'p-value female', 'p-value female bonferroni', 'N female']].copy()
    female_results = female_results.sort_values('Pearson r female', ascending=False)
    female_results.to_excel(writer, sheet_name='Female_correlations', index=False)
    
    # Summary statistics
    summary_data = {
        'Metric': ['Total phenotypes', 'Valid tests for Bonferroni', 'Valid male tests', 'Valid female tests',
                  'Mean sample size', 'Total missing values',
                  'Min sample size', 'Max sample size',
                  'Min male sample size', 'Max male sample size',
                  'Min female sample size', 'Max female sample size'],
        'Value': [len(corr_results_sorted), valid_tests, valid_tests_male, valid_tests_female,
                 corr_results_sorted['N'].mean(), corr_results_sorted['N missing'].sum(),
                 corr_results_sorted['N'].min(), corr_results_sorted['N'].max(),
                 corr_results_sorted['N male'].min(), corr_results_sorted['N male'].max(),
                 corr_results_sorted['N female'].min(), corr_results_sorted['N female'].max()]
    }
    summary_df = pd.DataFrame(summary_data)
    summary_df.to_excel(writer, sheet_name='Summary_stats', index=False)


print("Correlation analysis complete. Results saved to CSV and Excel with multiple sheets.")
print(f"Excel file contains: All_correlations, Male_correlations, Female_correlations, Summary_stats sheets")

Total number of phenotypes analyzed: 317
Number of valid tests for Bonferroni correction: 317
Number of valid male tests for Bonferroni correction: 317
Number of valid female tests for Bonferroni correction: 317
Average sample size: 24895
Total missing values across all phenotypes: 2,219,720

Sample size range: 17492 - 30971
Male sample size range: 8441 - 15029
Female sample size range: 8482 - 15942
Total rows in combined: 31897
Total rows in demo: 502356
Rows after merge: 31897

Missing Sex values in combined_with_demo: 0
Unique Sex values: [0 1]

Missing in g_pred_stack_test: 0
Correlation analysis complete. Results saved to CSV and Excel with multiple sheets.
Excel file contains: All_correlations, Male_correlations, Female_correlations, Summary_stats sheets
