# Code to run ComBat harmonization on fMRI data
Code by Isabella L.C. Mariani Wigley and Aurora Berto (05 / 2025)

ilmawi@utu.fi; aurber@utu.fi

In [1]:
import numpy as np
import pandas as pd
import sklearn
import os
import sys
import neuroCombat as nc

from sklearn.impute import SimpleImputer

import warnings

# Blanket filter: every warning raised after this point is silenced.
warnings.filterwarnings(action="ignore")

# Record the interpreter and library versions (one per line) for reproducibility.
print(sys.version, np.__version__, pd.__version__, sklearn.__version__, sep="\n")

3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
1.26.4
2.3.0
1.7.0


In [None]:
# Root directory that contains the input data; the dataset CSV is looked up inside it.
# The value below is a placeholder and must be replaced with a real path before running.

tabularData_root = r"/path/to/data" # USER.adapt!

In [3]:
# Locate the dataset CSV inside the data directory and read it into a DataFrame.
data_path = os.path.join(tabularData_root, "main_dataset_rsfMRI_FCH_LEiDA_demo.csv")
data = pd.read_csv(filepath_or_buffer=data_path)
data

Unnamed: 0,src_subject_id,eventname,anthro_1_height_in,anthroweight1lb,demo_sex_v2,race_ethnicity,mri_info_visitid,mri_info_deviceserialnumber,interview_age,rel_family_id,...,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113,pubertal_developmental_scale,triponderal_mass_index,birth_weight_g,batch_id
0,NDAR_INV003RTV85,baseline_year_1_arm_1,56.5,93.0,2.0,1.0,S042_INV003RTV85_baseline,HASH96a0c182,131.0,8781,...,158.319426,326.542790,192.675931,171.474886,350.606622,228.556499,3.0,14.272570,3175.14659,S042
1,NDAR_INV00BD7VDC,baseline_year_1_arm_1,57.5,76.8,1.0,1.0,S090_INV00BD7VDC_baseline,HASH65b39280,112.0,3810,...,238.030667,506.216418,290.961745,209.787572,163.663905,348.097102,2.0,11.182072,3628.73896,S090
2,NDAR_INV00CY2MDM,baseline_year_1_arm_1,56.5,91.5,1.0,1.0,S021_INV00CY2MDM_baseline,HASHd422be27,130.0,5355,...,229.638996,228.446672,486.850276,488.304623,167.734859,563.542535,2.0,14.042368,2721.55422,S021
3,NDAR_INV00HEV6HB,baseline_year_1_arm_1,57.3,70.8,1.0,2.0,S012_INV00HEV6HB_baseline,HASHe4f6957a,124.0,2257,...,234.686758,269.924178,213.925214,245.301264,287.884664,591.824885,2.0,10.416792,2721.55422,S012
4,NDAR_INV00LH735Y,baseline_year_1_arm_1,52.0,80.0,1.0,3.0,S011_INV00LH735Y_baseline,HASH5b0cf1bb,109.0,6069,...,155.455954,280.664438,254.253108,186.141947,183.348564,204.713464,1.0,15.748694,3175.14659,S011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4448,NDAR_INVZZ6ZJ2KY,baseline_year_1_arm_1,57.0,111.0,2.0,1.0,S042_INVZZ6ZJ2KY_baseline,HASH96a0c182,124.0,9345,...,168.400396,229.004979,174.781912,303.759961,150.655009,233.486270,3.0,16.590635,3628.73896,S042
4449,NDAR_INVZZ81LEEV,baseline_year_1_arm_1,53.5,57.8,1.0,2.0,S076_INVZZ81LEEV_baseline,HASH03db707f,108.0,8433,...,323.998671,474.123361,239.067239,278.370679,824.045017,140.978122,2.0,10.447950,2721.55422,S076
4450,NDAR_INVZZJ3A7BK,baseline_year_1_arm_1,59.0,137.0,2.0,1.0,S042_INVZZJ3A7BK_baseline,HASH96a0c182,122.0,9346,...,178.583760,320.437889,171.179551,157.742490,192.919296,340.688716,3.0,18.464142,3628.73896,S042
4451,NDAR_INVZZLZCKAY,baseline_year_1_arm_1,59.5,123.0,2.0,1.0,S042_INVZZLZCKAY_baseline,HASH96a0c182,110.0,9347,...,161.903962,421.030194,252.015164,370.685204,228.116156,328.253479,3.0,16.162882,3175.14659,S042


In [4]:
# Covariate column names, grouped by how they are handed to neuroCombat.

# numeric (continuous) covariates
num_cov_cols = [
    "interview_age",
    "pubertal_developmental_scale",
    "triponderal_mass_index",
]

# categorical covariates
cat_cov_cols = [
    "demo_sex_v2",
    "race_ethnicity",
]

In [5]:
# rs-fMRI features: every column whose name starts with "rsfmri"
rsfmri_pattern = "^(rsfmri)"
# Resolve the regex against the dataset and keep the matching column names as a plain list
rsfmri_cols = data.filter(regex=rsfmri_pattern).columns.tolist()

In [6]:
# LEiDA features: columns whose names start with "P_k", "LT_k" or "TR_K"
leida_pattern = "^(P_k|LT_k|TR_K)"
# Resolve the regex against the dataset and keep the matching column names as a plain list
leida_cols = data.filter(regex=leida_pattern).columns.tolist()

In [7]:
# Harmonics (FCH) features: every column whose name starts with "Harmonics"
harmonics_pattern = "^(Harmonics)"
# Resolve the regex against the dataset and keep the matching column names as a plain list
harmonics_cols = data.filter(regex=harmonics_pattern).columns.tolist()

In [8]:
## rs-fMRI harmonization
# Replace missing rs-fMRI values with the per-column median, then transpose,
# because neuroCombat takes the data matrix as features x samples.
imputer = SimpleImputer(strategy="median")
features = imputer.fit_transform(data[rsfmri_cols]).T

# Roles of the covariate columns handed to neuroCombat:
# the batch label plus the categorical and continuous covariates.
batch_col = "batch_id"
categorical_cols = cat_cov_cols
continuous_cols = num_cov_cols
covars = data[continuous_cols + categorical_cols + [batch_col]]

# eb=False: adjustments are estimated without Empirical Bayes.
# Only the harmonized matrix (key "data") of the returned dict is kept.
data_combat = nc.neuroCombat(
    dat=features,
    covars=covars,
    batch_col=batch_col,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
    eb=False,
    parametric=True,
    mean_only=False,
    ref_batch=None,
)["data"]

# Transpose back so that rows line up with the rows of `data` again
data_rsfmri_harmonized = data_combat.T

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding L/S adjustments without Empirical Bayes
[neuroCombat] Final adjustment of data


In [9]:
# Sanity check: True if the harmonized matrix contains at least one NaN
np.any(np.isnan(data_combat))

False

In [10]:
# Wrap the harmonized rs-fMRI matrix in a DataFrame that reuses the original
# feature names as column labels and the row index of `data`.
# (pandas is already imported in the notebook's first cell, so no re-import here.)
data_rsfmri_harmonized = pd.DataFrame(
    data_rsfmri_harmonized,
    columns=rsfmri_cols,
    index=data.index
)
# Attach the subject identifiers. `.values` assigns them by position, so the
# rows are matched by order rather than by index label.
data_rsfmri_harmonized["src_subject_id"] = data["src_subject_id"].values
data_rsfmri_harmonized

Unnamed: 0,rsfmri_c_ngd_ad_ngd_ad,rsfmri_c_ngd_ad_ngd_cgc,rsfmri_c_ngd_ad_ngd_ca,rsfmri_c_ngd_ad_ngd_dt,rsfmri_c_ngd_ad_ngd_dla,rsfmri_c_ngd_ad_ngd_fo,rsfmri_c_ngd_ad_ngd_n,rsfmri_c_ngd_ad_ngd_rspltp,rsfmri_c_ngd_ad_ngd_sa,rsfmri_c_ngd_ad_ngd_smh,...,rsfmri_cor_ngd_vs_scs_hprh,rsfmri_cor_ngd_vs_scs_pllh,rsfmri_cor_ngd_vs_scs_plrh,rsfmri_cor_ngd_vs_scs_ptlh,rsfmri_cor_ngd_vs_scs_ptrh,rsfmri_cor_ngd_vs_scs_thplh,rsfmri_cor_ngd_vs_scs_thprh,rsfmri_cor_ngd_vs_scs_vtdclh,rsfmri_cor_ngd_vs_scs_vtdcrh,src_subject_id
0,0.457167,0.241479,-0.078296,-0.119625,0.023584,-0.033962,-0.061175,-0.054620,0.004115,0.192112,...,0.104002,0.011253,-0.108310,0.052413,0.105932,-0.150057,-0.066619,0.001615,0.101352,NDAR_INV003RTV85
1,0.232027,0.155129,-0.096984,-0.055408,-0.002533,-0.044779,-0.013374,-0.127971,0.080834,0.150156,...,0.079859,0.003752,0.055617,0.125971,0.019147,-0.046688,0.004899,-0.039778,-0.038543,NDAR_INV00BD7VDC
2,0.382290,0.182702,-0.145126,-0.061822,-0.050542,-0.127149,-0.020685,-0.029186,-0.026092,0.320763,...,-0.063577,-0.145793,-0.074672,0.054565,0.218420,-0.013031,-0.068786,-0.071914,-0.002755,NDAR_INV00CY2MDM
3,0.226547,0.187990,-0.077466,-0.061174,0.049958,0.006558,-0.009139,-0.089641,0.057503,0.116328,...,-0.014942,-0.017800,-0.022739,0.091560,0.063601,-0.068058,0.021340,-0.003984,-0.093801,NDAR_INV00HEV6HB
4,0.427065,0.102434,-0.164025,-0.053957,-0.053220,-0.085741,0.018350,0.013225,-0.090378,0.244666,...,-0.081375,0.054753,-0.095863,0.035250,-0.154294,0.017505,0.060892,0.016276,-0.260387,NDAR_INV00LH735Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4448,0.323764,0.210080,-0.141536,-0.058630,-0.028586,0.041546,-0.013603,-0.112605,0.069620,0.069825,...,0.061826,-0.060954,-0.037132,0.009244,0.089950,-0.007309,0.109443,0.045622,-0.069808,NDAR_INVZZ6ZJ2KY
4449,0.423402,0.214842,-0.037779,0.024255,-0.062967,0.018567,0.021773,0.008793,0.166203,-0.014783,...,0.107647,-0.049192,0.033706,-0.047667,0.193969,-0.008287,-0.064136,0.061139,0.001464,NDAR_INVZZ81LEEV
4450,0.551212,0.220227,-0.280502,-0.116360,-0.034828,-0.116768,-0.015944,0.056590,0.004113,0.239089,...,0.090404,-0.027061,-0.000111,0.239423,-0.004772,-0.092039,0.002249,-0.008852,-0.080285,NDAR_INVZZJ3A7BK
4451,0.304936,0.166057,-0.010896,-0.018170,-0.031671,-0.059340,0.025882,-0.050387,0.127491,0.095790,...,0.033272,0.051310,0.106973,0.125450,0.009328,-0.108542,0.023075,-0.084718,-0.011303,NDAR_INVZZLZCKAY


In [12]:
# Write the harmonized rs-fMRI table to a CSV in the current working directory;
# the row index is not written.
data_rsfmri_harmonized.to_csv(path_or_buf="dataset_rsfmri_harmonized.csv", index=False)

In [13]:
## LEiDA harmonization
# Replace missing LEiDA values with the per-column median, then transpose,
# because neuroCombat takes the data matrix as features x samples.
imputer = SimpleImputer(strategy="median")
features = imputer.fit_transform(data[leida_cols]).T

# Roles of the covariate columns handed to neuroCombat:
# the batch label plus the categorical and continuous covariates.
batch_col = "batch_id"
categorical_cols = cat_cov_cols
continuous_cols = num_cov_cols
covars = data[continuous_cols + categorical_cols + [batch_col]]

# eb=False: adjustments are estimated without Empirical Bayes.
# Only the harmonized matrix (key "data") of the returned dict is kept.
data_combat = nc.neuroCombat(
    dat=features,
    covars=covars,
    batch_col=batch_col,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
    eb=False,
    parametric=True,
    mean_only=False,
    ref_batch=None,
)["data"]

# Transpose back so that rows line up with the rows of `data` again
data_leida_harmonized = data_combat.T

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding L/S adjustments without Empirical Bayes
[neuroCombat] Final adjustment of data


In [14]:
# Sanity check: True if the harmonized matrix contains at least one NaN
np.any(np.isnan(data_combat))

False

In [15]:
# Wrap the harmonized LEiDA matrix in a DataFrame that reuses the original
# feature names as column labels and the row index of `data`.
# (pandas is already imported in the notebook's first cell, so no re-import here.)
data_leida_harmonized = pd.DataFrame(
    data_leida_harmonized,
    columns=leida_cols,
    index=data.index
)
# Attach the subject identifiers. `.values` assigns them by position, so the
# rows are matched by order rather than by index label.
data_leida_harmonized["src_subject_id"] = data["src_subject_id"].values
data_leida_harmonized

Unnamed: 0,P_k2c1,P_k2c2,P_k3c1,P_k3c2,P_k3c3,P_k4c1,P_k4c2,P_k4c3,P_k4c4,P_k5c1,...,TR_K20_C20x12,TR_K20_C20x13,TR_K20_C20x14,TR_K20_C20x15,TR_K20_C20x16,TR_K20_C20x17,TR_K20_C20x18,TR_K20_C20x19,TR_K20_C20x20,src_subject_id
0,0.647172,0.352828,0.562533,0.346135,0.088519,0.374702,0.083642,0.441505,0.085085,0.350522,...,-0.002291,-0.003037,-0.005565,0.880777,-0.007731,-0.001261,-0.001492,0.000165,-0.044559,NDAR_INV003RTV85
1,0.598438,0.401562,0.517813,0.379745,0.115142,0.394156,0.201396,0.327036,0.072790,0.304260,...,-0.003160,0.003679,-0.000671,-0.000847,-0.007886,0.005555,0.001750,-0.001430,0.681265,NDAR_INV00BD7VDC
2,0.521013,0.478987,0.346593,0.451067,0.206493,0.267944,0.142255,0.386025,0.203140,0.237144,...,0.000803,0.000553,0.000043,-0.002681,-0.003296,-0.002412,-0.003751,-0.003269,0.882005,NDAR_INV00CY2MDM
3,0.574936,0.425064,0.487729,0.230851,0.289935,0.373868,0.300234,0.160502,0.165179,0.302215,...,-0.000560,0.005127,0.001714,-0.004005,-0.002163,-0.000265,-0.001367,-0.001254,0.028720,NDAR_INV00HEV6HB
4,0.537369,0.462631,0.393295,0.403082,0.205353,0.282729,0.221302,0.265033,0.228642,0.247733,...,-0.000604,0.001009,-0.002796,-0.003525,0.006983,-0.005582,-0.001689,0.003833,0.016836,NDAR_INV00LH735Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4448,0.406129,0.593871,0.309663,0.317567,0.374865,0.183703,0.238299,0.284686,0.295287,0.191498,...,-0.002589,-0.004681,0.107303,0.056160,-0.008438,-0.001241,-0.001558,0.000703,0.781903,NDAR_INVZZ6ZJ2KY
4449,0.492747,0.507253,0.390497,0.504162,0.097349,0.327398,0.284921,0.298328,0.083266,0.129863,...,0.092533,0.003654,0.114608,-0.001069,-0.000917,-0.004972,0.165557,-0.001961,0.473631,NDAR_INVZZ81LEEV
4450,0.589519,0.410481,0.479907,0.151125,0.369604,0.328838,0.174795,0.240993,0.250619,0.216742,...,-0.002695,-0.005042,-0.007438,0.001170,-0.008927,-0.001231,-0.001572,0.000457,-0.044700,NDAR_INVZZJ3A7BK
4451,0.487296,0.512704,0.301820,0.262970,0.437932,0.209052,0.218849,0.222956,0.350473,0.208914,...,-0.003096,0.101219,-0.011897,0.000791,-0.008658,-0.001219,-0.001711,0.003451,0.850299,NDAR_INVZZLZCKAY


In [16]:
# Write the harmonized LEiDA table to a CSV in the current working directory;
# the row index is not written.
data_leida_harmonized.to_csv(path_or_buf="dataset_leida_harmonized.csv", index=False)

In [11]:
## FCH harmonization
# Replace missing Harmonics values with the per-column median, then transpose,
# because neuroCombat takes the data matrix as features x samples.
imputer = SimpleImputer(strategy="median")
features = imputer.fit_transform(data[harmonics_cols]).T

# Roles of the covariate columns handed to neuroCombat:
# the batch label plus the categorical and continuous covariates.
batch_col = "batch_id"
categorical_cols = cat_cov_cols
continuous_cols = num_cov_cols
covars = data[continuous_cols + categorical_cols + [batch_col]]

# eb=False: adjustments are estimated without Empirical Bayes.
# Only the harmonized matrix (key "data") of the returned dict is kept.
data_combat = nc.neuroCombat(
    dat=features,
    covars=covars,
    batch_col=batch_col,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
    eb=False,
    parametric=True,
    mean_only=False,
    ref_batch=None,
)["data"]

# Transpose back so that rows line up with the rows of `data` again, and display the array
data_fch_harmonized = data_combat.T
data_fch_harmonized

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding L/S adjustments without Empirical Bayes
[neuroCombat] Final adjustment of data


array([[  3.10828575,   3.47949213,   1.91487857, ..., 174.94518843,
        380.79205114, 232.88218112],
       [  2.566994  ,   2.79961016,   1.69513899, ..., 204.1863836 ,
        162.89606975, 349.63603734],
       [  3.00874646,   2.81261421,   1.64932786, ..., 503.17570037,
        168.56371282, 580.83242142],
       ...,
       [  2.25710731,   3.06832344,   1.40163737, ..., 161.77969363,
        204.15120959, 335.03666587],
       [  2.56901513,   2.21384887,   1.49265074, ..., 368.14515546,
        242.80852945, 323.71314428],
       [  2.78618955,   2.52338701,   1.95183266, ..., 388.50598368,
        110.41786337, 243.94520278]])

In [12]:
# Sanity check: True if the harmonized matrix contains at least one NaN
np.any(np.isnan(data_combat))

False

In [13]:
# Wrap the harmonized Harmonics (FCH) matrix in a DataFrame that reuses the
# original feature names as column labels and the row index of `data`.
# (pandas is already imported in the notebook's first cell, so no re-import here.)
data_fch_harmonized = pd.DataFrame(
    data_fch_harmonized,
    columns=harmonics_cols,
    index=data.index
)
# Attach the subject identifiers. `.values` assigns them by position, so the
# rows are matched by order rather than by index label.
data_fch_harmonized["src_subject_id"] = data["src_subject_id"].values
data_fch_harmonized

Unnamed: 0,Harmonics_power1,Harmonics_power2,Harmonics_power3,Harmonics_power4,Harmonics_power5,Harmonics_power6,Harmonics_power7,Harmonics_power8,Harmonics_power9,Harmonics_power10,...,Harmonics_energy105,Harmonics_energy106,Harmonics_energy107,Harmonics_energy108,Harmonics_energy109,Harmonics_energy110,Harmonics_energy111,Harmonics_energy112,Harmonics_energy113,src_subject_id
0,3.108286,3.479492,1.914879,1.960069,1.434098,1.762797,1.400450,1.205757,0.937912,1.341020,...,224.074842,270.635802,149.918747,159.906448,328.380006,187.422646,174.945188,380.792051,232.882181,NDAR_INV003RTV85
1,2.566994,2.799610,1.695139,2.020614,1.498308,1.963471,1.392052,1.204367,1.238820,0.905862,...,265.405805,203.827971,526.342366,240.469094,533.455116,294.448171,204.186384,162.896070,349.636037,NDAR_INV00BD7VDC
2,3.008746,2.812614,1.649328,1.799771,1.004009,2.297772,1.188784,1.615318,1.236740,1.089989,...,173.462449,184.516136,248.086995,230.721626,224.483155,479.057148,503.175700,168.563713,580.832421,NDAR_INV00CY2MDM
3,2.994816,3.684008,1.933276,1.872740,1.340740,1.353698,1.299522,1.011449,1.452345,0.873378,...,84.506488,155.708194,149.201409,228.160695,278.510313,227.256269,248.669168,296.606184,605.215230,NDAR_INV00HEV6HB
4,2.519016,2.829412,2.041698,2.781701,1.494378,1.522976,1.447296,1.398948,1.178128,0.889706,...,183.821034,215.331277,110.738939,156.302667,281.926626,256.659803,197.018614,183.662900,192.935310,NDAR_INV00LH735Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4448,2.928370,2.740830,1.521654,2.121269,1.276500,1.812047,1.931804,1.513678,1.386830,1.187286,...,168.604974,434.276274,126.581943,170.081743,227.010070,169.987891,303.172569,156.701236,237.394494,NDAR_INVZZ6ZJ2KY
4449,2.263671,2.602810,1.913711,1.810228,1.465550,1.719780,1.817028,1.594127,1.347437,1.311856,...,183.064944,237.591258,115.185943,322.262749,466.824489,238.828473,271.495885,748.898893,137.803381,NDAR_INVZZ81LEEV
4450,2.257107,3.068323,1.401637,2.707580,1.328600,1.243200,1.720705,1.328670,1.311073,1.124740,...,250.086663,146.099634,216.054519,180.365018,321.999509,166.455681,161.779694,204.151210,335.036666,NDAR_INVZZJ3A7BK
4451,2.569015,2.213849,1.492651,2.339882,1.065527,1.627266,1.424724,1.447765,1.516035,1.134284,...,161.322324,347.493733,113.967876,163.506345,426.298279,245.146912,368.145155,242.808529,323.713144,NDAR_INVZZLZCKAY


In [14]:
# Write the harmonized Harmonics (FCH) table to a CSV in the current working directory;
# the row index is not written.
data_fch_harmonized.to_csv(path_or_buf="dataset_fch_harmonized.csv", index=False)