In [7]:
import pandas as pd
import numpy as np
from neuroHarmonize import harmonizationLearn, harmonizationApply

# Directory prefixes used throughout the notebook. Each one ends with "/"
# because file names are appended to them by plain string concatenation.
data_path = "/volume/projects/CV_PHENOM/Data/"
data_path_MUSE = "/volume/projects/CV_PHENOM/Data/MUSE/"
data_path_GL = "/volume/projects/GL_Tesis/Analysis/site_harmonization/"

# Load the unharmonized table (one CSV with ROI volumes and demographics).
data = pd.read_csv(filepath_or_buffer=data_path + "muse_rois_demo.csv")


In [8]:
# Drop rows whose Diagnosis is the literal placeholder "n.a.". Take an explicit
# copy so that later column assignments on `data` act on an independent frame
# rather than on a slice of the loaded table (avoids SettingWithCopyWarning).
data = data[data["Diagnosis"] != "n.a."].copy()

In [9]:
# Work on an independent copy: `data` may be a filtered slice at this point and
# the assignments below would otherwise raise SettingWithCopyWarning.
data = data.copy()

# Keep the original diagnosis labels. The guard makes this cell safe to re-run:
# without it, a second execution would overwrite the labels with the boolean
# column created below and every subject would then be treated as non-"HC".
if "Diagnosis_detailed" not in data.columns:
    data["Diagnosis_detailed"] = data["Diagnosis"]

# Boolean diagnosis flag: True for every label other than "HC".
data["Diagnosis"] = data["Diagnosis_detailed"] != "HC"

# Shift the Sex coding down by one (column name suggests 0 = male, 1 = female;
# NOTE(review): assumes Sex is coded 1/2 in the input - confirm).
data["Sex0M1F"] = data["Sex"] - 1

# Restrict to subjects aged 16 to 50 inclusive.
data = data[(data["Age"] >= 16) & (data["Age"] <= 50)]

In [10]:
# Separate the table into the two time points. Membership is decided by the
# BOGEN_ID lists stored in t0_idx.csv / t1_idx.csv, matched against data["ID"].
t0idx_df = pd.read_csv(filepath_or_buffer=data_path_MUSE + "t0_idx.csv")
t1idx_df = pd.read_csv(filepath_or_buffer=data_path_MUSE + "t1_idx.csv")

t0idx = t0idx_df["BOGEN_ID"].values
t1idx = t1idx_df["BOGEN_ID"].values

data_t0 = data.loc[data["ID"].isin(t0idx)]
data_t1 = data.loc[data["ID"].isin(t1idx)]

# Persist both subsets (row index not written).
data_t0.to_csv(path_or_buf=data_path_GL + "data_t0_filtered.csv", index=False)
data_t1.to_csv(path_or_buf=data_path_GL + "data_t1_filtered.csv", index=False)

In [11]:
# --- Healthy controls ("HC") at t0 ---
# ROI matrix = columns 2..259 by position; covariates = site label plus age,
# DLICV_baseline and recoded sex.
controls_data_t0 = data_t0.loc[data_t0["Diagnosis_detailed"] == "HC"]
controls_rois_t0 = controls_data_t0.iloc[:, 2:260].values
controls_covars_t0 = controls_data_t0.loc[:, ["SITE", "Age", "DLICV_baseline", "Sex0M1F"]]

# --- Everyone else (any non-"HC" label) at t0, same column layout ---
patients_data_t0 = data_t0.loc[data_t0["Diagnosis_detailed"] != "HC"]
patients_rois_t0 = patients_data_t0.iloc[:, 2:260].values
patients_covars_t0 = patients_data_t0.loc[:, ["SITE", "Age", "DLICV_baseline", "Sex0M1F"]]

### neuroHarmonizelearn ###

# Step 1: fit the harmonization model on the control subjects only; this also
# returns the harmonized control data.
controls_t0_model, controls_hdata_t0 = harmonizationLearn(
    controls_rois_t0,
    controls_covars_t0,
)

# Step 2: apply the control-derived model to the patient subjects.
patients_hdata_t0 = harmonizationApply(
    patients_rois_t0,
    patients_covars_t0,
    controls_t0_model,
)

In [13]:
# Preview the first rows of the t0 control subset.
controls_data_t0.head()

Unnamed: 0,ID,DLICV_baseline,MUSE_Volume_701,MUSE_Volume_601,MUSE_Volume_604,MUSE_Volume_606,MUSE_Volume_607,MUSE_Volume_613,MUSE_Volume_614,MUSE_Volume_501,...,PSN,SITE,SITE_idx,Age,ICV,Sex,Diagnosis,Diagnosis_idx,Diagnosis_detailed,Sex0M1F
1,100427,1326045.0,1175882.0,644779.355164,495010.824639,318607.715006,241833.786825,317324.502048,242579.105037,10597.932777,...,401319,Uni Udine,2,28.408219,1.031461,2,False,1,HC,1
5,100742,1413257.0,1263687.0,708495.057268,525189.112927,352342.953128,255852.077922,346256.573163,257212.614555,12124.420449,...,401317,Uni Udine,2,23.934247,1.048959,2,False,1,HC,1
7,100861,1674748.0,1508261.0,838630.714577,630264.407685,412581.026083,305750.635538,415163.769588,312134.001598,12379.77055,...,401304,UBS,4,30.375342,2.902251,1,False,1,HC,0
10,100963,1607498.0,1445671.0,854801.969298,558203.005438,419888.074763,272330.485185,424408.418866,274528.166305,11344.353948,...,401301,UBS,4,16.583562,2.166256,1,False,1,HC,0
16,101712,1495299.0,1338417.0,787640.911058,520871.446542,388987.423807,253332.950793,388406.475641,256767.664008,10770.831741,...,401253,Uni Udine,2,21.117808,1.130877,1,False,1,HC,0


In [15]:
# Display the positional column slice 2:260 of the t0 patient subset - the same
# slice that is used to build patients_rois_t0.
patients_data_t0.iloc[:, 2:260]

Unnamed: 0,MUSE_Volume_701,MUSE_Volume_601,MUSE_Volume_604,MUSE_Volume_606,MUSE_Volume_607,MUSE_Volume_613,MUSE_Volume_614,MUSE_Volume_501,MUSE_Volume_502,MUSE_Volume_503,...,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207
2,1.046864e+06,597636.769217,419648.179731,291334.115284,204040.930667,297009.169835,203946.888412,11660.360652,124428.449430,53004.675526,...,7383.635324,8012.927419,5679.449049,5139.804710,8970.927955,7090.082866,2841.482324,3112.183393,1238.369500,1565.320142
3,1.154405e+06,678182.641803,445874.543028,333283.991964,217609.382295,336631.720443,218361.720330,9903.440403,121451.616571,50498.053940,...,8081.481586,9171.317058,6769.284521,7234.222396,8549.056159,8225.621116,3672.042421,3715.108500,1591.687129,2034.652514
4,1.300028e+06,770792.398584,469722.439810,379775.948283,232256.884646,380945.604788,225608.853002,11856.702162,162770.485437,65739.565204,...,10211.316671,9999.673460,8766.336698,10132.652823,8154.818926,7962.841678,3345.086496,4166.374529,1702.510428,1987.198640
6,1.252268e+06,715906.399302,493362.581947,355320.054762,240817.580697,352600.761462,237712.492342,14832.508907,124864.629593,61047.385461,...,11090.671433,11087.862246,7076.342791,6695.229715,7864.788025,7354.452333,3533.021219,4477.844545,1553.480573,2214.575983
8,1.052672e+06,601363.349586,424686.874474,295178.720642,207162.451149,297629.477547,207150.452339,10373.970985,121329.965042,53759.667473,...,8398.166969,8501.156754,6393.365829,4885.515396,8899.117279,7942.212196,3515.651276,4602.543465,951.905579,1499.851227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389,1.383496e+06,814677.453674,535131.248985,402575.490112,259841.362251,400937.752548,263940.705360,11349.181374,167847.103683,60034.379913,...,12070.065857,11537.151253,8792.591049,10463.323326,12485.999207,10899.253471,4457.285751,4490.280464,1838.705360,1791.712891
2391,1.088036e+06,593554.766942,452231.846640,289886.494621,214564.776488,294557.521812,224762.748843,12904.321309,121648.980796,55509.183594,...,8040.384816,8476.771879,5754.503225,5592.496869,7894.298159,7290.286023,2096.718100,2778.455829,1422.471993,1087.221846
2397,1.263962e+06,742822.599251,490477.541123,366681.977580,238589.507239,365605.333925,240367.507332,11520.526552,158827.350835,59806.016956,...,9351.419172,7931.128441,6879.972677,8085.813570,10969.460779,11146.118228,2780.816754,4540.360098,1817.550268,1922.138508
2399,1.132847e+06,641574.918419,445426.986880,314792.801596,216838.700885,317962.670536,216523.118660,12065.167335,128262.168142,61391.511963,...,9987.194165,8253.833040,7151.636366,6714.316190,7281.802328,7325.815279,3279.433096,3932.135799,582.469271,1370.020165


In [18]:
# Display the positional column slice 2:260 of the t0 control subset - the same
# slice that is used to build controls_rois_t0.
controls_data_t0.iloc[:, 2:260]

Unnamed: 0,MUSE_Volume_701,MUSE_Volume_601,MUSE_Volume_604,MUSE_Volume_606,MUSE_Volume_607,MUSE_Volume_613,MUSE_Volume_614,MUSE_Volume_501,MUSE_Volume_502,MUSE_Volume_503,...,MUSE_Volume_198,MUSE_Volume_199,MUSE_Volume_200,MUSE_Volume_201,MUSE_Volume_202,MUSE_Volume_203,MUSE_Volume_204,MUSE_Volume_205,MUSE_Volume_206,MUSE_Volume_207
1,1.175882e+06,644779.355164,495010.824639,318607.715006,241833.786825,317324.502048,242579.105037,10597.932777,133167.622059,56416.545615,...,6817.728027,8141.370983,5833.345484,7058.550185,8782.098549,8860.321805,3578.933676,3106.957403,1168.954270,1325.400782
5,1.263687e+06,708495.057268,525189.112927,352342.953128,255852.077922,346256.573163,257212.614555,12124.420449,172309.152123,58030.227227,...,7766.836065,8549.056739,6485.400398,7391.545921,9756.664836,9696.899661,4426.138556,5555.524586,1764.830465,2235.920669
7,1.508261e+06,838630.714577,630264.407685,412581.026083,305750.635538,415163.769588,312134.001598,12379.770550,186748.453787,67834.263298,...,11360.871738,10214.985538,9702.036479,8374.168351,11452.862602,11779.830131,5035.499918,4699.533284,2189.782530,2692.732581
10,1.445671e+06,854801.969298,558203.005438,419888.074763,272330.485185,424408.418866,274528.166305,11344.353948,180259.844521,65395.511193,...,14048.961513,12993.114715,8367.785845,8676.741016,12854.134881,11625.313182,4014.417513,4077.408373,1699.753368,1840.732912
16,1.338417e+06,787640.911058,520871.446542,388987.423807,253332.950793,388406.475641,256767.664008,10770.831741,157172.406390,59743.619621,...,11059.987485,11772.769578,8598.208643,7994.409263,11959.094860,10789.288491,3311.668217,3865.370706,1671.654180,1926.533103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2358,1.095625e+06,654878.041359,411196.212639,324163.845551,202071.956085,322165.043816,199177.243217,9947.013336,124331.667297,54167.627007,...,7869.219437,9117.095657,7520.254051,6650.340340,10619.946587,13002.710236,3165.685989,3126.689857,1020.898735,770.923531
2370,1.126317e+06,634727.158742,454855.383896,313130.531810,223879.071234,313707.470145,221015.377313,9960.935350,143839.626054,52964.339036,...,8731.066801,8187.124939,6382.317842,6777.275627,7652.182115,7181.232452,4043.567813,3141.664211,1079.884579,1212.870365
2390,1.360885e+06,776752.107958,513257.136689,384994.048417,251592.913234,383204.608884,250529.444749,11134.778707,134369.682514,63864.358665,...,12806.445697,12584.084105,7684.218978,8442.709311,11707.821229,12669.337363,3398.704654,3713.350701,1724.400965,1565.320142
2394,1.064409e+06,589826.963469,446264.544093,289729.719348,217465.031822,291333.645827,215341.129178,13458.383094,124124.310381,53589.543559,...,7657.648988,6475.703166,5711.738185,6600.697437,7392.661135,6917.682907,3045.860384,2897.867167,1526.930008,1468.932667


In [19]:
# (rows, columns) of the full t0 subset.
data_t0.shape

(1436, 270)

In [20]:
# Names of the ROI columns (positions 2..259) with the suffix "neurocombat".
# NOTE(review): this list is not used by the cells visible here, and its suffix
# differs from the "_neuroHarmonize" suffix applied to the harmonized frames -
# confirm whether it is still needed.
roinames = [roi_name + "neurocombat" for roi_name in controls_data_t0.columns[2:260]]

In [22]:
# Wrap the harmonized control matrix in a DataFrame. Columns are the ROI
# column names (positions 2..259) tagged with "_neuroHarmonize".
hdata_controls_df = pd.DataFrame(
    data=controls_hdata_t0,
    columns=[roi_name + "_neuroHarmonize" for roi_name in controls_data_t0.columns[2:260]],
)
# Re-attach subject IDs by position (row order matches controls_data_t0).
hdata_controls_df["ID"] = controls_data_t0["ID"].values

In [23]:
# Display the harmonized control table (suffixed ROI columns plus ID).
hdata_controls_df

Unnamed: 0,MUSE_Volume_701_neuroHarmonize,MUSE_Volume_601_neuroHarmonize,MUSE_Volume_604_neuroHarmonize,MUSE_Volume_606_neuroHarmonize,MUSE_Volume_607_neuroHarmonize,MUSE_Volume_613_neuroHarmonize,MUSE_Volume_614_neuroHarmonize,MUSE_Volume_501_neuroHarmonize,MUSE_Volume_502_neuroHarmonize,MUSE_Volume_503_neuroHarmonize,...,MUSE_Volume_199_neuroHarmonize,MUSE_Volume_200_neuroHarmonize,MUSE_Volume_201_neuroHarmonize,MUSE_Volume_202_neuroHarmonize,MUSE_Volume_203_neuroHarmonize,MUSE_Volume_204_neuroHarmonize,MUSE_Volume_205_neuroHarmonize,MUSE_Volume_206_neuroHarmonize,MUSE_Volume_207_neuroHarmonize,ID
0,1.134341e+06,631167.484465,464659.406398,312093.590887,227027.441050,310490.079201,226986.504554,10626.699239,127232.502211,57907.754487,...,8510.078321,5603.272433,6958.758332,7951.825640,8165.389357,3456.585434,2730.869454,1117.909658,1298.385779,100427
1,1.222849e+06,694956.509578,494825.453684,345818.363519,241032.872676,339444.487093,241635.455041,12171.574727,167486.031437,59258.645977,...,8931.696580,6256.756451,7270.107141,8926.336696,8988.773011,4311.004963,5156.539135,1716.895225,2210.340718,100742
2,1.458366e+06,820431.561996,595392.557273,403556.297119,288958.368442,406019.252486,293613.404009,12702.653155,179256.978828,68469.657516,...,10253.315258,9285.529947,8013.128502,11112.614835,11291.892626,4728.331840,4631.517245,2206.557209,2694.924888,100861
3,1.395021e+06,836333.878866,524476.464961,410691.491630,255984.833998,415172.260032,256808.669512,11668.905268,172883.481115,66079.371216,...,12783.103262,7962.664866,8312.895361,12470.112603,11132.290177,3802.660817,4067.351856,1737.593084,1890.478543,100963
4,1.297730e+06,774271.757093,490572.326161,382448.449800,238618.487929,381711.431014,241119.220706,10779.896573,151511.067873,60782.451612,...,12054.210009,8450.392757,7874.966960,11128.530040,10087.124495,3181.244087,3486.917922,1622.865004,1900.121344,101712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,1.113717e+06,661783.157474,424220.711455,328304.455523,208724.859300,324898.012325,205845.725297,9686.163523,130194.551412,53707.722880,...,9559.419933,7502.508241,6986.704131,9769.384935,12310.619565,3012.112705,3098.622373,1029.753591,745.088693,9532
453,1.075012e+06,616997.248165,420555.075026,304415.683199,207156.064738,304703.383310,203155.040797,10273.480814,136654.728790,53664.523087,...,8167.077522,5974.847879,6409.631051,7347.068259,6665.491107,3758.587790,3122.564070,1128.922069,1266.933629,9688
454,1.321452e+06,763302.497796,482987.311010,378461.673469,236907.626989,376465.780723,234825.472567,11149.840203,127916.479518,65562.554507,...,12832.459896,7483.367851,8361.729039,10877.311843,11818.197155,3269.210556,3336.670809,1675.891383,1538.216617,9905
455,1.082870e+06,597129.429673,459400.265140,294502.341173,224033.643490,293719.097231,222256.519432,12998.293226,129837.531861,53127.617347,...,6766.157716,5730.351714,6935.513650,7026.819891,6888.865250,2893.856128,2876.428627,1568.983496,1523.409412,99669


In [24]:
# Wrap the harmonized patient matrix in a DataFrame. Column names are read
# from the control frame; controls and patients are both row subsets of
# data_t0, so they have the same columns.
hdata_patients_df = pd.DataFrame(
    data=patients_hdata_t0,
    columns=[roi_name + "_neuroHarmonize" for roi_name in controls_data_t0.columns[2:260]],
)
# Re-attach subject IDs by position (row order matches patients_data_t0).
hdata_patients_df["ID"] = patients_data_t0["ID"].values

In [25]:
# Stack harmonized controls and patients into one t0 table. Both inputs were
# built from raw arrays and therefore each carry their own 0..n-1 row index;
# ignore_index=True renumbers the result so that row labels are unique.
hdata_t0 = pd.concat([hdata_controls_df, hdata_patients_df], ignore_index=True)

In [26]:
# Display the combined harmonized t0 table (controls followed by patients).
hdata_t0

Unnamed: 0,MUSE_Volume_701_neuroHarmonize,MUSE_Volume_601_neuroHarmonize,MUSE_Volume_604_neuroHarmonize,MUSE_Volume_606_neuroHarmonize,MUSE_Volume_607_neuroHarmonize,MUSE_Volume_613_neuroHarmonize,MUSE_Volume_614_neuroHarmonize,MUSE_Volume_501_neuroHarmonize,MUSE_Volume_502_neuroHarmonize,MUSE_Volume_503_neuroHarmonize,...,MUSE_Volume_199_neuroHarmonize,MUSE_Volume_200_neuroHarmonize,MUSE_Volume_201_neuroHarmonize,MUSE_Volume_202_neuroHarmonize,MUSE_Volume_203_neuroHarmonize,MUSE_Volume_204_neuroHarmonize,MUSE_Volume_205_neuroHarmonize,MUSE_Volume_206_neuroHarmonize,MUSE_Volume_207_neuroHarmonize,ID
0,1.134341e+06,631167.484465,464659.406398,312093.590887,227027.441050,310490.079201,226986.504554,10626.699239,127232.502211,57907.754487,...,8510.078321,5603.272433,6958.758332,7951.825640,8165.389357,3456.585434,2730.869454,1117.909658,1298.385779,100427
1,1.222849e+06,694956.509578,494825.453684,345818.363519,241032.872676,339444.487093,241635.455041,12171.574727,167486.031437,59258.645977,...,8931.696580,6256.756451,7270.107141,8926.336696,8988.773011,4311.004963,5156.539135,1716.895225,2210.340718,100742
2,1.458366e+06,820431.561996,595392.557273,403556.297119,288958.368442,406019.252486,293613.404009,12702.653155,179256.978828,68469.657516,...,10253.315258,9285.529947,8013.128502,11112.614835,11291.892626,4728.331840,4631.517245,2206.557209,2694.924888,100861
3,1.395021e+06,836333.878866,524476.464961,410691.491630,255984.833998,415172.260032,256808.669512,11668.905268,172883.481115,66079.371216,...,12783.103262,7962.664866,8312.895361,12470.112603,11132.290177,3802.660817,4067.351856,1737.593084,1890.478543,100963
4,1.297730e+06,774271.757093,490572.326161,382448.449800,238618.487929,381711.431014,241119.220706,10779.896573,151511.067873,60782.451612,...,12054.210009,8450.392757,7874.966960,11128.530040,10087.124495,3181.244087,3486.917922,1622.865004,1900.121344,101712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974,1.333362e+06,796239.602134,501092.637314,393314.124677,243407.409395,391756.098556,245998.379595,11667.535253,160626.592935,60893.557024,...,11411.302675,8380.646121,10014.692485,12097.052519,10402.651005,4190.074698,4434.229766,1863.190561,1836.675050,99035
975,1.146346e+06,621528.706406,484034.739836,303393.862857,230058.417309,308768.913488,241383.005038,12617.366564,128761.146280,55092.963921,...,8196.352250,6277.633796,5940.330121,9421.355338,8447.103208,2353.296149,2994.706617,1465.929413,1101.988976,99095
976,1.221237e+06,729266.180855,460178.113967,360162.862486,223873.254523,358801.152517,224709.071663,11556.618660,153312.166607,61154.518166,...,8367.267893,6642.658493,7981.294250,10139.055734,10405.762157,2644.695068,4154.189808,1770.247133,1895.881602,9988
977,1.195990e+06,666592.726632,477549.008542,326933.374899,232351.969773,330623.352735,233345.817456,11826.667711,135334.503460,60953.677628,...,7908.643426,7672.513512,7058.912729,8526.278602,8398.550821,3582.385749,4155.171301,625.720562,1399.084964,99986


In [27]:
# Save the harmonized t0 table; the row index is not written.
hdata_t0.to_csv(
    path_or_buf=data_path_GL + "pronia_muse_t0_harmonized_combat.csv",
    index=False,
)

In [33]:
# Join the harmonized values back onto the original table by subject ID.
# Inner join: only IDs present in both frames are kept.
all_df = pd.merge(left=hdata_t0, right=data, how="inner", on="ID")


In [34]:
# Save the merged original + harmonized table; the row index is not written.
all_df.to_csv(
    path_or_buf=data_path_GL + "pronia_muse_t0_original_and_harmonized_combat.csv",
    index=False,
)

In [35]:
# List every column of the merged table, one name per line.
# (Iterating a DataFrame yields its column labels.)
for column_name in all_df:
    print(column_name)

MUSE_Volume_701_neuroHarmonize
MUSE_Volume_601_neuroHarmonize
MUSE_Volume_604_neuroHarmonize
MUSE_Volume_606_neuroHarmonize
MUSE_Volume_607_neuroHarmonize
MUSE_Volume_613_neuroHarmonize
MUSE_Volume_614_neuroHarmonize
MUSE_Volume_501_neuroHarmonize
MUSE_Volume_502_neuroHarmonize
MUSE_Volume_503_neuroHarmonize
MUSE_Volume_504_neuroHarmonize
MUSE_Volume_505_neuroHarmonize
MUSE_Volume_506_neuroHarmonize
MUSE_Volume_507_neuroHarmonize
MUSE_Volume_508_neuroHarmonize
MUSE_Volume_509_neuroHarmonize
MUSE_Volume_510_neuroHarmonize
MUSE_Volume_511_neuroHarmonize
MUSE_Volume_512_neuroHarmonize
MUSE_Volume_513_neuroHarmonize
MUSE_Volume_514_neuroHarmonize
MUSE_Volume_515_neuroHarmonize
MUSE_Volume_516_neuroHarmonize
MUSE_Volume_517_neuroHarmonize
MUSE_Volume_518_neuroHarmonize
MUSE_Volume_519_neuroHarmonize
MUSE_Volume_520_neuroHarmonize
MUSE_Volume_521_neuroHarmonize
MUSE_Volume_522_neuroHarmonize
MUSE_Volume_523_neuroHarmonize
MUSE_Volume_524_neuroHarmonize
MUSE_Volume_525_neuroHarmonize
MUSE_Vol