In [None]:
import os
import csv
import re
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Deconfound dwMRI IDPs

In [None]:
# Define phenotypes.
# Every diffusion metric is listed twice, once with the `_tbss` suffix and once
# with the `_prob` suffix, in that order, metric by metric.
dwi_metrics = ["FA", "MD", "L1", "L2", "L3", "MO", "OD", "ICVF", "ISOVF"]
dwi_variants = ("tbss", "prob")
dwi_modalities = [
    f"dwi_{metric}_{variant}"
    for metric in dwi_metrics
    for variant in dwi_variants
]

In [None]:
# Define confound regressor
def confound_regressor(features_train, features_test, confounds_train, confounds_test):
    """
    Regress out confounds from features using a linear model fitted on the training set.

    Features and confounds are each standardised with a StandardScaler that is
    fitted on the training split only and then applied to the test split. A
    LinearRegression without intercept is fitted on the scaled training
    confounds to predict the scaled training features, and its predictions are
    subtracted from the scaled features of both splits.

    If the confound matrix has no columns there is nothing to regress out and
    the scaled features are returned unchanged.

    Parameters:
        features_train (array-like): Training features (n_samples, n_features).
        features_test (array-like): Test features (n_samples, n_features).
        confounds_train (array-like): Training confounds (n_samples, n_confounds).
        confounds_test (array-like): Test confounds (n_samples, n_confounds).

    Returns:
        features_train_res (ndarray): Confound-corrected training features (scaled).
        features_test_res (ndarray): Confound-corrected test features (scaled).
    """
    # Scale features (statistics come from the training set only)
    scaler_features = StandardScaler()
    features_train = scaler_features.fit_transform(features_train)
    features_test = scaler_features.transform(features_test)

    # Handle the "no confounds" case before any confound scaling or model
    # fitting is attempted on an empty matrix.
    confound_shape = np.shape(confounds_train)
    if len(confound_shape) == 2 and confound_shape[1] == 0:
        print("No confounds supplied; returning scaled features without regression.")
        return features_train, features_test

    # Scale confounds (statistics come from the training set only)
    scaler_confounds = StandardScaler()
    confounds_train = scaler_confounds.fit_transform(confounds_train)
    confounds_test = scaler_confounds.transform(confounds_test)

    # No intercept: the scaled training features and confounds are already centred
    model = LinearRegression(fit_intercept=False)
    model.fit(confounds_train, features_train)

    # Residuals = scaled features minus the part predicted from the confounds
    features_train_res = features_train - model.predict(confounds_train)
    features_test_res = features_test - model.predict(confounds_test)

    # Sanity check on the training set: correlation between every confound and
    # every residualised feature (not just the first feature column).
    confounds_centered = confounds_train - confounds_train.mean(axis=0)
    residuals_centered = features_train_res - features_train_res.mean(axis=0)
    norms = np.outer(
        np.linalg.norm(confounds_centered, axis=0),
        np.linalg.norm(residuals_centered, axis=0),
    )
    # Zero-variance columns give 0/0; those pairs are undefined and are skipped
    with np.errstate(divide="ignore", invalid="ignore"):
        corrs = (confounds_centered.T @ residuals_centered) / norms
    finite = np.isfinite(corrs)
    mean_abs_corr = np.abs(corrs[finite]).mean() if finite.any() else float("nan")
    print(f"Mean |correlation| between confounds and residuals: {mean_abs_corr:.3f}")

    return features_train_res, features_test_res

In [None]:
# Match features to the target, compute sample sizes, and deconfound.
#
# For every modality and fold this cell:
#   1. creates the fold's output directories,
#   2. loads the fold's train/test targets, the modality features and the confounds,
#   3. joins features, targets and confounds on 'eid' (pandas' default inner join),
#   4. saves the merged tables and the targets to the fold's 'suppl' directory,
#   5. regresses the confounds out of the features and saves the residualised
#      features to the fold's 'scaling' directory.
folds = range(5)
base_path = '/UK_BB/brainbody/brain'

# The confounds table does not depend on modality or fold, and a modality's
# feature table does not depend on the fold. Both are read lazily on first use
# and then reused instead of being re-parsed from disk on every iteration.
confounds_file = os.path.join(base_path, 'data/dwMRI/dwi_conf.csv')
confounds = None

for modality in dwi_modalities:
    features_file = os.path.join(base_path, f'data/dwMRI/{modality}.csv')
    features = None  # loaded on the first fold that gets that far, then reused

    for fold in folds:
        print(f"\n=== Processing {modality}, Fold {fold} ===")

        # ===== 1. Path Setup =====
        fold_paths = {
            'main': os.path.join(base_path, f'folds/fold_{fold}'),
            'suppl': os.path.join(base_path, f'folds/fold_{fold}/suppl'),
            'scaling': os.path.join(base_path, f'folds/fold_{fold}/scaling'),
            'models': os.path.join(base_path, f'folds/fold_{fold}/models'),
            'g_pred': os.path.join(base_path, f'folds/fold_{fold}/g_pred'),
            'cognition': '/UK_BB/brainbody/cognition'
        }

        # Create directories
        for path in fold_paths.values():
            os.makedirs(path, exist_ok=True)

        # ===== 2. Data Loading =====
        try:
            # Load target variables (fold-specific, so always read)
            g_train = pd.read_csv(os.path.join(fold_paths['cognition'], f'folds/fold_{fold}/g/g_train_with_id_{fold}.csv'))
            g_test = pd.read_csv(os.path.join(fold_paths['cognition'], f'folds/fold_{fold}/g/g_test_with_id_{fold}.csv'))

            # Load features (once per modality)
            if features is None:
                features = pd.read_csv(features_file)

            # Load confounds (once for the whole run)
            if confounds is None:
                confounds = pd.read_csv(confounds_file)
            print(f"Loaded {modality} confounds from: {confounds_file}")

        except FileNotFoundError as e:
            # A failed read leaves the cached frame as None, so the next
            # iteration retries and reports the missing file again.
            print(f"Error loading files: {e}")
            continue

        # ===== 3. Data Merging =====
        # Get column lists
        feature_cols = [col for col in features.columns if col != 'eid']
        confound_cols = [col for col in confounds.columns if col != 'eid']
        # Not used below in this cell; kept defined in case later cells read it.
        target_col = [col for col in g_train.columns if col != 'eid']

        # Merge data: only participants present in all three tables are kept
        train_data = features.merge(g_train, on='eid').merge(confounds, on='eid')
        test_data = features.merge(g_test, on='eid').merge(confounds, on='eid')

        # Validate merges
        if train_data.empty or test_data.empty:
            print(f"Warning: Empty DataFrame after merging for {modality} fold {fold}")
            continue

        # ===== 4. Save Raw Data =====
        train_data.to_csv(os.path.join(fold_paths['suppl'], f'{modality}_train_feat_targ_conf_fold_{fold}.csv'), index=False)
        test_data.to_csv(os.path.join(fold_paths['suppl'], f'{modality}_test_feat_targ_conf_fold_{fold}.csv'), index=False)

        train_data[['eid', 'g']].to_csv(os.path.join(fold_paths['suppl'], f'{modality}_train_targets_fold_{fold}.csv'), index=False)
        test_data[['eid', 'g']].to_csv(os.path.join(fold_paths['suppl'], f'{modality}_test_targets_fold_{fold}.csv'), index=False)

        # ===== 5. Confound Regression =====
        print(f'Regressing out confounds from {modality} features...')
        features_train, features_test = confound_regressor(
            train_data[feature_cols],
            test_data[feature_cols],
            train_data[confound_cols],
            test_data[confound_cols]
        )

        # Save deconfounded features.
        # NOTE: 'eid' is not written to these files; rows follow the row order of
        # train_data / test_data, i.e. the order of the targets files saved above.
        pd.DataFrame(features_train, columns=feature_cols).to_csv(
            os.path.join(fold_paths['scaling'], f'{modality}_train_deconf_fold_{fold}.csv'),
            index=False
        )
        pd.DataFrame(features_test, columns=feature_cols).to_csv(
            os.path.join(fold_paths['scaling'], f'{modality}_test_deconf_fold_{fold}.csv'),
            index=False
        )

        print(f"Completed {modality} fold {fold}")
        print(f"Train shape: {train_data.shape}, Test shape: {test_data.shape}")


=== Processing dwi_FA_tbss, Fold 0 ===
Loaded dwi_FA_tbss confounds from: /media/hcs-sci-psy-narun/IBu/UK_BB/brainbody/brain/data/dwMRI/dwi_conf.csv
Regressing out confounds from dwi_FA_tbss features...
Mean |correlation| between confounds and residuals: 0.000
Completed dwi_FA_tbss fold 0
Train shape: (21555, 63), Test shape: (5389, 63)

=== Processing dwi_FA_tbss, Fold 1 ===
Loaded dwi_FA_tbss confounds from: /media/hcs-sci-psy-narun/IBu/UK_BB/brainbody/brain/data/dwMRI/dwi_conf.csv
Regressing out confounds from dwi_FA_tbss features...
Mean |correlation| between confounds and residuals: 0.000
Completed dwi_FA_tbss fold 1
Train shape: (21518, 63), Test shape: (5426, 63)

=== Processing dwi_FA_tbss, Fold 2 ===
Loaded dwi_FA_tbss confounds from: /media/hcs-sci-psy-narun/IBu/UK_BB/brainbody/brain/data/dwMRI/dwi_conf.csv
Regressing out confounds from dwi_FA_tbss features...
Mean |correlation| between confounds and residuals: 0.000
Completed dwi_FA_tbss fold 2
Train shape: (21556, 63), Tes