# 01. Response: extra covariates

Reviewer 1 requested that we include mean functional connectivity and mean hemispheric differences as covariates, to check that the effect is local rather than global. This is not necessarily recommended for future analyses: it carries a risk of collider bias and is similar to global signal regression.

Steps:
- get mean functional connectivity and mean hemispheric differences
- residualize phenotypes as a preprocessing step for MOSTest
(Follow up with MOSTEST -> MATLAB)

Last update: 2024-06-26, JS Amelink

In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import quantile_transform

#### Set up functions

In [2]:
def get_mean_fc(sub):
    """Return one subject's mean thresholded functional connectivity.

    Loads the subject's z-transformed correlation matrix from the AICHA
    ``fcmatz`` directory, sets every negative entry and the diagonal to zero,
    and averages over *all* entries of the matrix (the zeroed ones included).

    Parameters
    ----------
    sub
        Subject identifier; it is formatted into the matrix file name.

    Returns
    -------
    float
        Mean of the thresholded matrix.
    """
    fcmat_path = "/data/workspaces/lag/workspaces/lg-ukbiobank/projects/multilateral/FuncNet_AICHA/mats_AICHA/fcmatz/{0}_fcmatz.txt".format(sub)
    fcmat = np.loadtxt(fcmat_path)

    # Keep positive connectivity only, and ignore self-connections.
    fcmat = np.where(fcmat < 0, 0, fcmat)
    np.fill_diagonal(fcmat, 0)

    return fcmat.mean()

def get_mean_hd(sub, l_inds, r_inds):
    """Return one subject's mean difference between two within-set sub-matrices.

    Loads the subject's z-transformed correlation matrix, sets negative
    entries and the diagonal to zero, extracts the ``l_inds`` x ``l_inds`` and
    ``r_inds`` x ``r_inds`` sub-matrices, and averages their element-wise
    difference (first minus second).

    Parameters
    ----------
    sub
        Subject identifier; it is formatted into the matrix file name.
    l_inds, r_inds
        Row/column indices of the two region sets. They must have equal
        length so the sub-matrices can be subtracted element-wise.
        Presumably left- and right-hemisphere regions -- confirm against the
        atlas ordering.

    Returns
    -------
    float
        Mean of ``within_l - within_r``.
    """
    fcmat_path = "/data/workspaces/lag/workspaces/lg-ukbiobank/projects/multilateral/FuncNet_AICHA/mats_AICHA/fcmatz/{0}_fcmatz.txt".format(sub)
    fcmat = np.loadtxt(fcmat_path)

    # Keep positive connectivity only, and ignore self-connections.
    fcmat = np.where(fcmat < 0, 0, fcmat)
    np.fill_diagonal(fcmat, 0)

    # Connectivity among the l_inds regions and among the r_inds regions.
    within_l = fcmat[np.ix_(l_inds, l_inds)]
    within_r = fcmat[np.ix_(r_inds, r_inds)]

    # Element-wise difference between the two sets, then its mean.
    return np.mean(within_l - within_r)

def get_all_mean_fc(sub_list):
    """Compute mean functional connectivity for every subject in ``sub_list``.

    Parameters
    ----------
    sub_list
        Iterable of subject identifiers, each passed to ``get_mean_fc``.

    Returns
    -------
    pandas.DataFrame
        One row per subject, indexed by ``sid``, with a single ``mean_fc``
        column. The identifiers keep their original type: building the frame
        column-wise avoids the float coercion (``1000099`` -> ``1000099.0``)
        that stacking IDs and values in one 2-D array causes. An empty
        ``sub_list`` gives an empty frame with the same layout.
    """
    sub_list = list(sub_list)

    # One value per subject, in the order given.
    mean_fc = [get_mean_fc(sub) for sub in sub_list]

    return pd.DataFrame({'mean_fc': mean_fc}, index=pd.Index(sub_list, name='sid'))

def get_all_mean_hd(sub_list, l_inds, r_inds):
    """Compute the mean hemispheric difference for every subject in ``sub_list``.

    Parameters
    ----------
    sub_list
        Iterable of subject identifiers, each passed to ``get_mean_hd``.
    l_inds, r_inds
        Region index sets forwarded unchanged to ``get_mean_hd``.

    Returns
    -------
    pandas.DataFrame
        One row per subject, indexed by ``sid``, with a single ``mean_hd``
        column. The identifiers keep their original type: building the frame
        column-wise avoids the float coercion (``1000099`` -> ``1000099.0``)
        that stacking IDs and values in one 2-D array causes. An empty
        ``sub_list`` gives an empty frame with the same layout.
    """
    sub_list = list(sub_list)

    # One value per subject, in the order given.
    mean_hd = [get_mean_hd(sub, l_inds, r_inds) for sub in sub_list]

    return pd.DataFrame({'mean_hd': mean_hd}, index=pd.Index(sub_list, name='sid'))

def _residualize_and_save(file_name, covs, exome_subs, drop_all, tag):
    """Residualize every phenotype column on ``covs`` and write the results.

    Shared implementation behind ``residualize_mean_fc`` and
    ``residualize_mean_hd``; ``tag`` only selects the output file names.

    Steps:
      1. Read ``file_name + '.csv'``, index it by its first column and keep
         the rows listed in ``exome_subs``.
      2. If ``drop_all``, drop every subject with any missing phenotype.
         Otherwise keep all subjects; each phenotype is then fitted on its
         own complete rows and stays missing (written as ``NA``) elsewhere.
      3. Fit one OLS model per phenotype column with ``covs`` as regressors
         and keep the residuals.
      4. Write the raw residuals, a rank-based normal-transformed copy, and
         the list of column names (all tab-separated, no header, no index).

    NOTE(review): ``covs`` is passed to ``sm.OLS`` as-is and statsmodels does
    not add an intercept on its own -- confirm ``covs`` contains a constant
    column if an intercept is intended.
    """
    data = pd.read_csv(file_name + '.csv')
    data = data.set_index(data.columns[0])
    data = data.loc[exome_subs]

    if drop_all:
        data = data.dropna()
    data = data.sort_index()

    # Covariate subjects that have no usable phenotype row (informational only).
    missing_sub = set(covs.index.values).difference(data.index.values)
    print("No. subjects missing from subject file: ", len(missing_sub))
    covs = covs.loc[data.index.values]

    # With drop_all the data is already complete, so keep statsmodels' default
    # ('none'); otherwise let each fit drop that phenotype's incomplete rows.
    missing = 'none' if drop_all else 'drop'

    # Pre-allocate the output frame; each column is replaced by its residuals
    # (index-aligned, so rows dropped from a fit remain NaN).
    data_new = pd.DataFrame(columns=data.columns, index=data.index.values)
    for dep_var in data.columns:
        results = sm.OLS(data[dep_var], exog=covs, missing=missing).fit()
        data_new[dep_var] = results.resid

    # Quantile transformation of the residuals to a normal distribution.
    X = data_new.to_numpy(dtype=float)
    data_norm = pd.DataFrame(
        data=quantile_transform(X, n_quantiles=1000, output_distribution='normal', random_state=0, copy=True),
        columns=data_new.columns,
        index=data_new.index.values,
    )

    # Prepend the ID columns; the subject ID is used for both.
    initial_cols = data_new.columns
    ids = data.index.values.astype(int)
    data_new['Subject_ID'] = ids
    data_new['Family_ID'] = ids
    data_new = data_new[['Subject_ID', 'Family_ID', *initial_cols]]

    # file_name[:-5] removes the last five characters of the input name; the
    # actual row count is then appended in their place. This assumes the name
    # ends in a five-character subject count (e.g. "..._N29682").
    prefix = file_name[:-5]
    n_rows = len(data_new)

    data_new.to_csv(prefix + '{0}_{1}_resid.txt'.format(n_rows, tag), na_rep="NA", sep="\t", index=False, header=False)

    # Note the ID column order here is Family_ID, Subject_ID (both hold the same values).
    data_norm = pd.concat([data_new[['Family_ID', 'Subject_ID']], data_norm], axis=1)
    data_norm.to_csv(prefix + '{0}_{1}_resid_norm.txt'.format(n_rows, tag), na_rep="NA", sep="\t", index=False, header=False)

    # The data files carry no header, so save the column names separately.
    with open(prefix + "_{0}_resid_col_names.txt".format(tag), "w") as file:
        for col in data_norm.columns:
            file.write(str(col) + '\n')


def residualize_mean_fc(file_name, covs, exome_subs, drop_all=True):
    """Residualize phenotypes on ``covs`` and save them with ``mean_fc`` file names.

    Parameters
    ----------
    file_name : str
        Path of the phenotype table without the ``.csv`` extension. Output
        names are derived from ``file_name[:-5]``.
    covs : pandas.DataFrame
        Covariates indexed by subject ID. They are used exactly as given, so
        the ``mean_fc`` column must already be part of this frame for it to
        be regressed out.
    exome_subs : list
        Subject IDs to keep; every ID must be present in the phenotype table.
    drop_all : bool, default True
        If True, subjects with any missing phenotype are dropped before
        fitting. If False, all subjects are kept and missing values remain
        ``NA`` in the output (previously this option raised ``NameError``).

    Writes ``<prefix><n>_mean_fc_resid.txt``,
    ``<prefix><n>_mean_fc_resid_norm.txt`` and
    ``<prefix>_mean_fc_resid_col_names.txt``; returns nothing.
    """
    _residualize_and_save(file_name, covs, exome_subs, drop_all, tag='mean_fc')


def residualize_mean_hd(file_name, covs, exome_subs, drop_all=True):
    """Residualize phenotypes on ``covs`` and save them with ``mean_hd`` file names.

    Parameters
    ----------
    file_name : str
        Path of the phenotype table without the ``.csv`` extension. Output
        names are derived from ``file_name[:-5]``.
    covs : pandas.DataFrame
        Covariates indexed by subject ID. They are used exactly as given, so
        the ``mean_hd`` column must already be part of this frame for it to
        be regressed out.
    exome_subs : list
        Subject IDs to keep; every ID must be present in the phenotype table.
    drop_all : bool, default True
        If True, subjects with any missing phenotype are dropped before
        fitting. If False, all subjects are kept and missing values remain
        ``NA`` in the output (previously this option raised ``NameError``).

    Writes ``<prefix><n>_mean_hd_resid.txt``,
    ``<prefix><n>_mean_hd_resid_norm.txt`` and
    ``<prefix>_mean_hd_resid_col_names.txt``; returns nothing.
    """
    _residualize_and_save(file_name, covs, exome_subs, drop_all, tag='mean_hd')

#### run analysis

In [3]:
# Split the 384 matrix rows/columns into two interleaved sets: even positions
# go to l_inds, odd positions to r_inds.
# NOTE(review): presumably left/right hemisphere regions alternate in the
# atlas ordering -- confirm.
l_inds = np.arange(384)[0::2]
r_inds = np.arange(384)[1::2]
print(l_inds)
print(r_inds)

[  0   2   4   6   8  10  12  14  16  18  20  22  24  26  28  30  32  34
  36  38  40  42  44  46  48  50  52  54  56  58  60  62  64  66  68  70
  72  74  76  78  80  82  84  86  88  90  92  94  96  98 100 102 104 106
 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142
 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178
 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214
 216 218 220 222 224 226 228 230 232 234 236 238 240 242 244 246 248 250
 252 254 256 258 260 262 264 266 268 270 272 274 276 278 280 282 284 286
 288 290 292 294 296 298 300 302 304 306 308 310 312 314 316 318 320 322
 324 326 328 330 332 334 336 338 340 342 344 346 348 350 352 354 356 358
 360 362 364 366 368 370 372 374 376 378 380 382]
[  1   3   5   7   9  11  13  15  17  19  21  23  25  27  29  31  33  35
  37  39  41  43  45  47  49  51  53  55  57  59  61  63  65  67  69  71
  73  75  77  79  81  83  85  87  89  91  93  95  97  99 101 103 105 107
 

In [4]:
base_path = "/data/clusterfs/lag/users/jitame/SENT_CORE/"

# Subject list: one integer ID per line. Read it inside a context manager so
# the file is closed, and skip blank lines instead of relying on exactly one
# trailing newline (the previous [:-1] slice dropped the last subject when the
# file did not end with a newline).
exome_sub_list = os.path.join(base_path, "subj_sent_N30652_exome_final_pass_sex.txt")
with open(exome_sub_list) as sub_file:
    exome_subs = [int(line) for line in sub_file if line.strip()]

# Per-subject global covariates, saved alongside the other covariate files.
mean_fc = get_all_mean_fc(exome_subs)
mean_fc.to_csv(os.path.join(base_path, "covars", "mean_fc.txt"), sep="\t")

mean_hd = get_all_mean_hd(exome_subs, l_inds, r_inds)
mean_hd.to_csv(os.path.join(base_path, "covars", "mean_hd.txt"), sep="\t")

In [5]:
print("Load covariates mean FC")
# Load the base covariates, drop subjects with any missing covariate
# (nothing is imputed), and keep the exome subjects in list order.
covs = pd.read_csv("/data/clusterfs/lag/users/jitame/SENT_CORE/covars/covars_pc10_for_correction_N30660_gwas_batch.txt", sep="\t")
covs = covs.set_index('subject_id')
covs = covs.dropna()
covs = covs.loc[exome_subs]

# DataFrame.join returns a new frame, so the result must be assigned back;
# otherwise mean_fc is only displayed here and never reaches the regression.
# rename(index=int) keeps the subject IDs integer even if mean_fc carries a
# float index.
covs = covs.join(mean_fc.rename(index=int))
assert covs['mean_fc'].notna().all(), "mean_fc is missing for some subjects"
covs


Load covariates mean FC


Unnamed: 0_level_0,sex,MR_head_motion_rs,MR_inv_tSNR_rs,MR_X_brain_pos,MR_Y_brain_pos,MR_Z_brain_pos,MR_table_pos,geno_PC_1,geno_PC_2,geno_PC_3,...,geno_PC_9,geno_PC_10,age,age_sq,age_sex,geno_array_dummy,site_dummy_11025,site_dummy_11026,site_dummy_11027,mean_fc
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000099.0,0,0.074204,0.028055,0.677210,62,11.59960,-1094,-14.0565,7.54184,-1.347950,...,6.880050,0.810708,64.750000,4192.562500,0.000000,1,1,0,0,0.138676
1000188.0,0,0.129244,0.027491,-2.758590,69,-64.41920,-1042,-12.1885,3.44927,-1.990930,...,1.256050,0.179602,75.083333,5637.506944,0.000000,1,0,1,0,0.192914
1000192.0,0,0.148394,0.029136,-1.934930,58,18.24810,-1102,-13.6554,4.80100,0.342491,...,10.145500,4.374790,63.250000,4000.562500,0.000000,1,1,0,0,0.195842
1000256.0,1,0.082996,0.027711,3.506090,69,-37.46600,-1042,-12.7615,3.41320,-0.328056,...,-0.559186,5.274770,50.916667,2592.506944,50.916667,1,0,0,1,0.166414
1000281.0,0,0.058310,0.026778,-1.862420,58,11.83490,-1084,-16.9820,5.47485,-5.504310,...,1.389980,2.973050,54.166667,2934.027778,0.000000,1,1,0,0,0.237258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025193.0,1,0.099184,0.029971,5.821390,64,4.28147,-1092,-10.4228,1.01482,-1.810400,...,-0.066395,-0.085145,69.583333,4841.840278,69.583333,1,1,0,0,0.096916
6025262.0,0,0.094660,0.027402,1.317250,70,-40.62810,-1042,-14.2012,2.80431,-4.491370,...,-1.170820,2.862540,74.916667,5612.506944,0.000000,1,1,0,0,0.386536
6025779.0,0,0.098519,0.027998,0.017461,57,-25.56760,-1042,-12.1130,6.60868,-1.777330,...,-1.181510,-2.822330,52.500000,2756.250000,0.000000,1,1,0,0,0.137810
6026006.0,1,0.092922,0.026725,3.767400,61,-61.19970,-1042,-13.7673,4.61137,-4.409260,...,0.083864,-0.847808,70.666667,4993.777778,70.666667,1,1,0,0,0.132773


In [6]:
print("Load covariates mean HD")
# Load the base covariates, drop subjects with any missing covariate
# (nothing is imputed), and keep the exome subjects in list order.
covs_hd = pd.read_csv("/data/clusterfs/lag/users/jitame/SENT_CORE/covars/covars_pc10_for_correction_N30660_gwas_batch.txt", sep="\t")
covs_hd = covs_hd.set_index('subject_id')
covs_hd = covs_hd.dropna()
covs_hd = covs_hd.loc[exome_subs]

# DataFrame.join returns a new frame, so the result must be assigned back;
# otherwise mean_hd is only displayed here and never reaches the regression.
# rename(index=int) keeps the subject IDs integer even if mean_hd carries a
# float index.
covs_hd = covs_hd.join(mean_hd.rename(index=int))
assert covs_hd['mean_hd'].notna().all(), "mean_hd is missing for some subjects"
covs_hd


Load covariates mean HD


Unnamed: 0_level_0,sex,MR_head_motion_rs,MR_inv_tSNR_rs,MR_X_brain_pos,MR_Y_brain_pos,MR_Z_brain_pos,MR_table_pos,geno_PC_1,geno_PC_2,geno_PC_3,...,geno_PC_9,geno_PC_10,age,age_sq,age_sex,geno_array_dummy,site_dummy_11025,site_dummy_11026,site_dummy_11027,mean_hd
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000099.0,0,0.074204,0.028055,0.677210,62,11.59960,-1094,-14.0565,7.54184,-1.347950,...,6.880050,0.810708,64.750000,4192.562500,0.000000,1,1,0,0,-0.015433
1000188.0,0,0.129244,0.027491,-2.758590,69,-64.41920,-1042,-12.1885,3.44927,-1.990930,...,1.256050,0.179602,75.083333,5637.506944,0.000000,1,0,1,0,-0.005265
1000192.0,0,0.148394,0.029136,-1.934930,58,18.24810,-1102,-13.6554,4.80100,0.342491,...,10.145500,4.374790,63.250000,4000.562500,0.000000,1,1,0,0,-0.001133
1000256.0,1,0.082996,0.027711,3.506090,69,-37.46600,-1042,-12.7615,3.41320,-0.328056,...,-0.559186,5.274770,50.916667,2592.506944,50.916667,1,0,0,1,0.004486
1000281.0,0,0.058310,0.026778,-1.862420,58,11.83490,-1084,-16.9820,5.47485,-5.504310,...,1.389980,2.973050,54.166667,2934.027778,0.000000,1,1,0,0,-0.014109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6025193.0,1,0.099184,0.029971,5.821390,64,4.28147,-1092,-10.4228,1.01482,-1.810400,...,-0.066395,-0.085145,69.583333,4841.840278,69.583333,1,1,0,0,0.008939
6025262.0,0,0.094660,0.027402,1.317250,70,-40.62810,-1042,-14.2012,2.80431,-4.491370,...,-1.170820,2.862540,74.916667,5612.506944,0.000000,1,1,0,0,0.007021
6025779.0,0,0.098519,0.027998,0.017461,57,-25.56760,-1042,-12.1130,6.60868,-1.777330,...,-1.181510,-2.822330,52.500000,2756.250000,0.000000,1,1,0,0,-0.018931
6026006.0,1,0.092922,0.026725,3.767400,61,-61.19970,-1042,-13.7673,4.61137,-4.409260,...,0.083864,-0.847808,70.666667,4993.777778,70.666667,1,1,0,0,-0.008323


In [7]:
# Subject count embedded in the phenotype file names.
subs_no = 29682

# Phenotype tables, given without the ".csv" extension (the residualize
# functions append it themselves).
fn_fc = os.path.join(base_path, "pheno", f"sent_edges_N{subs_no}")
fn_hd = os.path.join(base_path, "pheno", f"sent_edges_asym_N{subs_no}")

# Residualize each phenotype set on its covariate frame and write the outputs.
# NOTE(review): the covariate frames are used as-is, so covs / covs_hd must
# contain the mean_fc / mean_hd column for it to be regressed out.
residualize_mean_fc(file_name=fn_fc, covs=covs, exome_subs=exome_subs, drop_all=True)
residualize_mean_hd(file_name=fn_hd, covs=covs_hd, exome_subs=exome_subs, drop_all=True)


No. subjects missing from subject file:  0
No. subjects missing from subject file:  0
