In [1]:
import os
import csv
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels.api as sm
from scipy.stats import sem
from matplotlib import cm
from scipy import stats
from scipy.stats import pearsonr
import ukbiobank.utils.utils
from ukbiobank.utils import loadCsv
from ukbiobank.utils import addFields
from ukbiobank.utils.utils import fieldIdsToNames
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Upload UK Bioabank csv
csv_path = '/UK_BB/ukbbdata/ukb.csv'
ukb = ukbiobank.ukbio(ukb_csv=csv_path)

In [None]:
# Upload required fields
df_demo = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=['eid',
31, #Sex
21003, #Age when attended assessment centre
21000 #Ethnic background
])
df_demo.to_csv('/UK_BB/brainbody/demographics_full.csv', index=False)

## Full cognitive performance sample

In [None]:
# Combine data across folds: cognitive data full sample
folds = range(0, 5)
directory = '/UK_BB/brainbody/cognition/folds'

# Create empty lists to store all data
all_train_data = []
all_test_data = []

for fold in folds:
    # Process training data
    g_train = pd.read_csv(os.path.join(directory, f'fold_{fold}', 'g', f'g_train_with_id_{fold}.csv'))
    g_train['fold'] = fold  # Add fold identifier
    g_train['split'] = 'train'  # Add split identifier
    print(g_train.shape)
    
    # Process test data
    g_test = pd.read_csv(os.path.join(directory, f'fold_{fold}', 'g', f'g_test_with_id_{fold}.csv'))
    g_test['fold'] = fold  # Add fold identifier
    g_test['split'] = 'test'  # Add split identifier
    print(g_test.shape)
    all_test_data.append(g_test)

# Combine all folds into single tables
all_test_combined_full_cog = pd.concat(all_test_data, ignore_index=True)
print(all_test_combined_full_cog.shape)

(25517, 4)
(6380, 4)
(25517, 4)
(6380, 4)
(25518, 4)
(6379, 4)
(25518, 4)
(6379, 4)
(25518, 4)
(6379, 4)
(31897, 4)


Rename columns and count NAs

In [None]:
# Rename columns and count NAs
df_demo_i2 = df_demo[[
'eid',
'31-0.0',
'21000-0.0',
'21003-2.0',]]
df_demo_i2 = df_demo_i2.rename(columns={
'31-0.0':'Sex',
'21000-0.0':'Ethnicity',
'21003-2.0':'Age',
})
demo_full_cog = pd.merge(all_test_combined_full_cog,df_demo_i2, on = 'eid')
print('NA:', demo_full_cog.isna().sum())

NA: g            0
eid          0
fold         0
split        0
Sex          0
Ethnicity    7
Age          0
dtype: int64


Display age and sex

In [72]:
print('Sample size', demo_full_cog.shape[0])
print('Mean age', demo_full_cog['Age'].mean().round(2))
print(f"SD age {demo_full_cog['Age'].std():.3f}")
print('Age max range:', demo_full_cog['Age'].max())
print('Age min range:', demo_full_cog['Age'].min())
print('Proportion of males:', (demo_full_cog['Sex'].value_counts()[1] / len(demo_full_cog['Sex']) * 100).round(2))
print('Proportion of females:', (demo_full_cog['Sex'].value_counts()[0] / len(demo_full_cog['Sex']) * 100).round(2))

Sample size 31897
Mean age 64.55
SD age 7.663
Age max range: 83.0
Age min range: 46.0
Proportion of males: 48.68
Proportion of females: 51.32


Ethnicity

In [None]:
# Count ethnicity
demo_full_cog_ethnicity = demo_full_cog.dropna()
# Get counts
counts = demo_full_cog_ethnicity['Ethnicity'].value_counts()
# Calculate percentages
percentages = demo_full_cog_ethnicity['Ethnicity'].value_counts(normalize=True).mul(100).round(2)

# Combine into one DataFrame
result = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages
}).reset_index()

# Create a mapping dictionary
ethnicity_mapping = {
1001.0:'British',
1003.0: 'Any other white background',
1002.0: 'Irish'
}
# Apply the mapping
result['Ethnicity'] = result['Ethnicity'].replace(ethnicity_mapping)
print(result)

                     Ethnicity  Count  Percentage
0                      British  29120       91.31
1   Any other white background   1054        3.31
2                        Irish    795        2.49
3                       3001.0    203        0.64
4                          6.0    150        0.47
5                       4001.0    101        0.32
6                          5.0     80        0.25
7                       4002.0     67        0.21
8                         -3.0     65        0.20
9                       2003.0     49        0.15
10                      2004.0     48        0.15
11                      3004.0     48        0.15
12                      3002.0     33        0.10
13                      2001.0     30        0.09
14                      2002.0     20        0.06
15                         1.0     13        0.04
16                      3003.0      5        0.02
17                        -1.0      4        0.01
18                      4003.0      3        0.01


## Commonality analysis sample

In [None]:
# Combine data acrosss folds
commonality_path = '/UK_BB/brainbody/commonality'
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Configuration
folds = range(0, 5)
stacking_path = '/UK_BB/brainbody/stacking'

os.makedirs(commonality_path, exist_ok=True)

modality_groups = {
    'brain': {
        'submodalities': ['allmri'],
        'pred_pattern': '{submod}_target_pred_2nd_level_rf_test_fold_{fold}.csv',
        'obs_pattern': '{submod}/features_test_level1_outer/features_test_level1_outer_g_matched_fold_{fold}.csv'
    },
    'body': {
        'submodalities': ['body'],
        'pred_pattern': '{submod}_target_pred_2nd_level_{alg}_outer_test_fold_{fold}.csv',
        'obs_pattern': 'features_test_level1_stacked_outer/features_test_level1_outer_g_matched_fold_{fold}.csv'
    },
}

# Only compare brain modalities against body modalities
r2_results = []

# Process only brain modalities
brain_config = modality_groups['brain']
body_config = modality_groups['body']

for brain_submod in brain_config['submodalities']:
    for body_submod in body_config['submodalities']:
        df_concat = []
        for fold in folds:
            try:
                print(f'\n############## FOLD {fold} - {brain_submod} vs {body_submod} ##############')
                
                # Brain prediction
                brain_pred_path = os.path.join(
                    stacking_path, 'brain', brain_submod, 'folds', 
                    f'fold_{fold}', 'g_pred',
                    brain_config['pred_pattern'].format(submod=brain_submod, fold=fold))
                
                brain_pred = pd.read_csv(brain_pred_path).rename(
                    columns={'g_pred_stack_test': f'g_pred_brain_{brain_submod}_test'})
                
                print('\nBrain predictions:')
                print('Shape:', brain_pred.shape)
                print('Columns:', brain_pred.columns.tolist())
                print('NA counts:\n', brain_pred.isna().sum())
                
                # Body prediction
                body_pred_path = os.path.join(
                        stacking_path, 'body', 'folds', f'fold_{fold}', 'g_pred',
                        f'body_target_pred_2nd_level_0_outer_test_fold_{fold}.csv')
                body_pred = pd.read_csv(body_pred_path).rename(
                        columns={'g_pred_stack_test': 'g_pred_body_test'})
                
                print('\nBody predictions:')
                print('Shape:', body_pred.shape)
                print('Columns:', body_pred.columns.tolist())
                print('NA counts:\n', body_pred.isna().sum())
                
                # Observations (brain)
                brain_obs_path = os.path.join(
                    stacking_path, 'brain',
                    brain_config['obs_pattern'].format(submod=brain_submod, fold=fold))
                g_obs = pd.read_csv(brain_obs_path)[['eid', 'g']].rename(columns={'g': 'g_obs_test'})
                
                print('\nObservations:')
                print('Shape:', g_obs.shape)
                print('Columns:', g_obs.columns.tolist())
                print('NA counts:\n', g_obs.isna().sum())
                
                # Merge data
                all_g = (brain_pred.merge(body_pred, on='eid')
                         .merge(g_obs, on='eid'))
                
                print('\nMerged data:')
                print('Shape:', all_g.shape)
                print('Columns:', all_g.columns.tolist())
                print('NA counts:\n', all_g.isna().sum())
                
                df_concat.append(all_g)
                
            except FileNotFoundError as e:
                print(f"File not found: {e}")
                continue
        
        if not df_concat:
            continue
            
        all_g = pd.concat(df_concat, axis=0, ignore_index=True)
        output_name = f'g_obs_pred_brain_{brain_submod}_vs_body_{body_submod}'
        all_g.to_csv(os.path.join(commonality_path, f'{output_name}_with_id.csv'), index=False)
        

In [None]:
# Count NAs
sample_size_commonality = pd.read_csv(os.path.join(commonality_path, f'g_obs_pred_brain_allmri_vs_body_body_with_id.csv'))
demo_commonality = pd.merge(sample_size_commonality,df_demo_i2, on = 'eid')
print('NA:', demo_full_cog.isna().sum())

NA: g            0
eid          0
fold         0
split        0
Sex          0
Ethnicity    7
Age          0
dtype: int64


Display age and sex

In [75]:
print('Sample size', demo_commonality.shape[0])
print('Mean age', demo_commonality['Age'].mean().round(2))
print(f"SD age {demo_commonality['Age'].std():.3f}")
print('Age max range:', demo_commonality['Age'].max())
print('Age min range:', demo_commonality['Age'].min())
print('Proportion of males:', (demo_commonality['Sex'].value_counts()[1] / len(demo_commonality['Sex']) * 100).round(2))
print('Proportion of females:', (demo_commonality['Sex'].value_counts()[0] / len(demo_commonality['Sex']) * 100).round(2))

Sample size 25346
Mean age 64.08
SD age 7.516
Age max range: 82.0
Age min range: 47.0
Proportion of males: 46.82
Proportion of females: 53.18


Ethnicity

In [77]:
# Count ethnicity
demo_commonality_ethnicity = demo_commonality.dropna()
# Get counts
counts = demo_commonality_ethnicity['Ethnicity'].value_counts()
# Calculate percentages
percentages = demo_commonality_ethnicity['Ethnicity'].value_counts(normalize=True).mul(100).round(2)

# Combine into one DataFrame
result = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages
}).reset_index()

# Create a mapping dictionary
ethnicity_mapping = {
1001.0:'British',
1003.0: 'Any other white background',
1002.0: 'Irish'
}
# Apply the mapping to the Ethnicity_Code column
result['Ethnicity'] = result['Ethnicity'].replace(ethnicity_mapping)
print(result)

                     Ethnicity  Count  Percentage
0                      British  23121       91.25
1   Any other white background    848        3.35
2                        Irish    642        2.53
3                       3001.0    156        0.62
4                          6.0    131        0.52
5                       4001.0     82        0.32
6                          5.0     63        0.25
7                       4002.0     50        0.20
8                         -3.0     45        0.18
9                       2003.0     44        0.17
10                      3004.0     41        0.16
11                      2004.0     38        0.15
12                      2001.0     22        0.09
13                      3002.0     21        0.08
14                      2002.0     15        0.06
15                         1.0      9        0.04
16                      3003.0      5        0.02
17                        -1.0      3        0.01
18                      4003.0      2        0.01
