In [1]:
import sys
sys.path.append('../../')

In [2]:
# Folder holding the preprocessed inputs, relative to the working directory.
# The trailing slash is required: file paths are built by plain string concatenation.
preprocessed_dataset_path = 'preprocessed_dataset/'

In [3]:
from codes.docs.analysis import data_preprocessing, data_exploration, visualisation, data_preprocessing_high_dimension
from codes.docs.analysis.gene_analysis import genetic_file_preprocess

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
%matplotlib inline
import tqdm

import statsmodels.api as sm
import statsmodels.formula.api as sfm

from scipy.stats import ttest_ind, pearsonr,zscore
import pandas as pd
import re

Here, the volumetric dataset is preprocessed to group the same brain regions.\
Then a mass univariate test is performed across multiple regions.

# Main analysis

In [5]:
# Load the European 'term' volumetric table from the preprocessed dataset folder.
european_volumetric_dataset_term = pd.read_csv(
    f'{preprocessed_dataset_path}imaging_data/volumetric/term/european_volumetric_df.csv'
)
# mixed_volumetric_dataset_term = pd.read_csv(
#     preprocessed_dataset_path +
#     'imaging_data/volumetric/term/mixed_volumetric_df.csv')

In [6]:
# Regional-volume columns of the loaded table, labelled "Imperial <n>" or "AAL <n>".
# Raw-string patterns keep `\d` as the regex digit class instead of an invalid
# string escape (which newer Python versions warn about).
dependentVar = [
    i for i in european_volumetric_dataset_term.columns
    if re.search(r'Imperial \d', i) or re.search(r'AAL \d', i)
]
# Work on a copy so the table as loaded stays available unchanged.
df = european_volumetric_dataset_term.copy()
# NOTE(review): Group_Imperial_volumes is a project helper; what grouping=None
# does to the Imperial columns is not visible here — confirm that every name in
# dependentVar still exists in df afterwards.
df = data_preprocessing.Volumes.Group_Imperial_volumes(df,grouping=None)
# NOTE(review): presumably drops subjects flagged as outliers (threshold=3) in
# more than 10% of the listed columns — confirm against remove_outliers.
df = data_exploration.MassUnivariate.remove_outliers(df,
                                                    col=dependentVar,
                                                     threshold=3,
                                                    remove_schemes='percentage',
                                                    percentage_of_outlier=0.1)
# Relabel the 'AAL 93' column as 'AAL 109'.
df = df.rename(columns = {'AAL 93':'AAL 109'})

In [7]:
# imputed_PRS = genetic_file_preprocess.Cohort.preprocess_PRSice_PRS_Anc_files('../../dataset/PRS/asd/imputed_data/ASD_HPC_merged_0_8_maf_geno_hwe_EUROPEANS.gwas.all.score',column_prefix='PRS_',column_suffix='_ASD_AllLDimputed')

# ASD polygenic-score file (European LD reference); the helper adds the given
# prefix/suffix to the score columns.
# NOTE(review): the returned frame is presumably indexed by subject ID — it is
# later matched against df['ID']; confirm against the helper.
PRS_ASD_EuroLD = genetic_file_preprocess.Cohort.preprocess_PRSice_PRS_Anc_files(
    '../../dataset/PRS/asd/ASD_eurold_lifted37_dHCP_merged_cleaned_EUROPEANS.gwas.all.score',
    column_prefix='PRS_',
    column_suffix='_ASD_Euro_LD',
)
# PRS_SCZ_EuroLD = genetic_file_preprocess.Cohort.preprocess_PRSice_PRS_Anc_files('../../dataset/PRS/scz/SCZ_eur_ld_lifted37_dHCP_merged_cleaned_EUROPEANS.gwas.all.score',column_prefix = 'PRS_',column_suffix='_SCZ_Euro_LD')
# PRS_ASD_EuroLD_imputed=genetic_file_preprocess.Cohort.preprocess_PRSice_PRS_Anc_files('../../dataset/PRS/asd/ASD_eurold_HPC_merged_0_8_maf_geno_hwe_EUROPEANS.gwas.all.score',column_prefix = 'PRS_',column_suffix='_ASD_EuroLDimputed')

# Append '_ASD_All_LD' to every column of df whose name contains 'PRS'
# (presumably to tell the PRS already in the table apart from the Euro-LD PRS
# merged in below — confirm).
# NOTE(review): df is reassigned in place, so re-running this cell appends the
# suffix a second time (also to the merged-in PRS columns) and repeats the merge.
df = df.rename(columns={i:i+'_ASD_All_LD' for i in df.columns if 'PRS' in i})

# df = df.merge(imputed_PRS,left_on='ID',right_on=imputed_PRS.index)
# Match df['ID'] against the index values of PRS_ASD_EuroLD; with merge's default
# (inner) join, only subjects present on both sides are kept.
df = df.merge(PRS_ASD_EuroLD,left_on='ID',right_on=PRS_ASD_EuroLD.index)
# df = df.merge(PRS_SCZ_EuroLD,left_on='ID',right_on=PRS_SCZ_EuroLD.index)


# df = df.merge(PRS_ASD_EuroLD_imputed,left_on='ID',right_on=PRS_ASD_EuroLD_imputed.index)

In [8]:
df.shape

(205, 4531)

In [9]:
# # plt.rcParams['font.family'] = 'sans-serif'
# plt.rcParams['patch.edgecolor'] = 'none'
# plt.rcParams["axes.grid.axis"] ="y"
# sns.set_style('whitegrid')
# g = sns.FacetGrid(mixed_volumetric_dataset_term[['GA_vol','PMA_vol','cohort']],col='cohort',height = 4)
# g.map(sns.histplot,'GA_vol',label='GA at birth',color='darkblue',alpha=0.15,shrink=.8)
# g.map(sns.histplot,'PMA_vol',label='PMA at scan',color='darkblue',alpha=0.5,shrink=.8)
# g.set_ylabels('Number of subjects',fontsize=15)
# g.set_xlabels('[weeks]',fontsize=15,fontname='Helvetica')
# g.axes[0][0].set_yticklabels(([int(i) for i in g.axes[0][0].get_yticks()]),size=15)
# g.axes[0][0].set_xticklabels(([int(i) for i in g.axes[0][0].get_xticks()]),size=15)
# g.axes[0][1].set_xticklabels(([int(i) for i in g.axes[0][0].get_xticks()]),size=15)


# g.axes[0][0].set_title('cohort = European',size=15)
# g.axes[0][1].set_title('cohort = Asian',size=15)
# plt.legend(ncol=2,loc='upper center',fontsize=15,frameon=False)
# sns.despine(bottom=True,left=True)
# # g.savefig(f'./{preprocessed_dataset_path}/output_plot/Cohort_age_distribution.pdf',transparent=True)

## AAL atlas

In [10]:
# Estimate the effective number of independent tests among the PRS columns
# (every column whose name contains 'Euro') from their correlation matrix.
# A previous selection of the 'AAL <n>' columns was assigned to `combined` and
# overwritten before being used; that dead assignment has been removed.
combined = df[[i for i in df.columns if 'Euro' in i]]
# rowvar=False: each column is a variable, each row an observation.
correlation_matrix = np.corrcoef(combined, rowvar=False)
# matSpDLite prints its estimates; the return value is not needed here.
_ = data_exploration.matSpDLite(correlation_matrix, alpha=0.05)

Effective Number of Independent Variables [Veff] is 9.12751237367706
Effective Number of Independent Variables [VeffLi] (Using equation 5 of Li and Ji 2005) is 7.0
The adjusted multiple testing correction p-val is alpha/lower(Meff) = 0.007142857142857143


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [22]:
# AAL regional-volume columns ("AAL <n>"). The raw string keeps `\d` as the
# regex digit class instead of an invalid string escape.
volumes_names = [i for i in df.columns if re.search(r'AAL \d', i)]
# PRS columns: those whose name contains 'Euro' (case-sensitive, so the
# lowercase 'euro_Anc_PC*' ancestry columns are not picked up).
prs_cols = [i for i in df.columns if 'Euro' in i]
# Names of the first three ancestry principal-component columns.
ancestry = [f'euro_Anc_PC{i}' for i in range(1, 4)]

In [23]:
# Features: AAL volumes. Targets: PRS columns. Both copied so later edits
# do not touch df.
X = df[volumes_names].copy()
y = df[prs_cols].copy()
# Covariate columns: Gender, PMA_vol, GA_vol, ICV and the ancestry PCs.
covariates = df[[
    'Gender',
    'PMA_vol',
    'GA_vol',
    '17 ICV (all except bone)',
    *ancestry,
]]

In [14]:
def splitting_dataset_into_k_fold(X,y,k=5):
    """
    Yield the outer train/test partitions of a k-fold cross-validation.

    Only the outer level is produced here; an inner (validation) split for
    nested CV has to be made by the caller on each yielded train+val part.
    Rows are selected by position, so numpy arrays as well as pandas
    DataFrame/Series objects (whatever their index labels) are accepted.

    Args:
        X: samples, one row per observation (array-like or DataFrame)
        y: targets aligned row-wise with X (array-like, Series or DataFrame)
        k: number of folds
    Yields:
        (X_trainval, y_trainval, X_test, y_test) for each of the k folds
    """
    def _rows(data, positions):
        # pandas objects need .iloc for positional row selection: plain
        # `data[positions]` is a label/column lookup and `data[positions, :]`
        # is not valid on a DataFrame.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    outer_cv = KFold(n_splits=k)
    for trainval_index, test_index in outer_cv.split(X):
        yield (_rows(X, trainval_index),
               _rows(y, trainval_index),
               _rows(X, test_index),
               _rows(y, test_index))

In [15]:
# Create the outer 5-fold splitter (only the outer level is built here; no inner folds yet).
outer_cv = KFold(n_splits = 5)

In [18]:
# Take only the first outer split: positional row indices for train+validation and for test.
train_val,test = next(outer_cv.split(X,y))

In [21]:
train_val

array([ 41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
       119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
       158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
       171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
       197, 198, 199, 200, 201, 202, 203, 204])

In [24]:
prs_cols

['PRS_1e-08_ASD_Euro_LD',
 'PRS_1e-07_ASD_Euro_LD',
 'PRS_1e-06_ASD_Euro_LD',
 'PRS_1e-05_ASD_Euro_LD',
 'PRS_0.0001_ASD_Euro_LD',
 'PRS_0.001_ASD_Euro_LD',
 'PRS_0.01_ASD_Euro_LD',
 'PRS_0.05_ASD_Euro_LD',
 'PRS_0.1_ASD_Euro_LD',
 'PRS_0.5_ASD_Euro_LD',
 'PRS_1_ASD_Euro_LD']

In [25]:
# Fit one model per PRS threshold (dependent variable), with Gender as the
# categorical term and age, ICV, ancestry PCs and all AAL volumes as
# continuous terms; keep only the summary table.
_, model_summary = data_exploration.MassUnivariate.mass_univariate(
    df,
    cat_independentVar_cols=['Gender'],
    cont_independentVar_cols=[
        'PMA_vol',
        'GA_vol',
        '17 ICV (all except bone)',
        *ancestry,
        *volumes_names,
    ],
    dependentVar_cols=prs_cols,
)

In [26]:

model_summary

Unnamed: 0,const_coef,const_pval,PMA_vol_coef,PMA_vol_pval,GA_vol_coef,GA_vol_pval,17 ICV (all except bone)_coef,17 ICV (all except bone)_pval,euro_Anc_PC1_coef,euro_Anc_PC1_pval,...,AAL 90_coef,AAL 90_pval,AAL 91_coef,AAL 91_pval,AAL 92_coef,AAL 92_pval,AAL 109_coef,AAL 109_pval,Gender_2.0_coef,Gender_2.0_pval
PRS_1e-08_ASD_Euro_LD,-0.008434,0.947288,-0.098772,0.707746,0.05897,0.684752,0.87193,0.407716,0.01191,0.90685,...,-0.486372,0.078225,0.010765,0.984057,-0.016653,0.977029,-0.149894,0.521425,0.017825,0.937033
PRS_1e-07_ASD_Euro_LD,0.071021,0.591695,0.055851,0.838019,-0.097658,0.517092,-0.987865,0.365953,-0.032344,0.759372,...,-0.227587,0.424139,-0.151974,0.785659,0.308218,0.607611,-0.086378,0.721492,-0.150095,0.521685
PRS_1e-06_ASD_Euro_LD,0.055065,0.676844,0.159043,0.560022,-0.091102,0.544806,0.606435,0.577716,0.087502,0.407081,...,-0.161727,0.569041,-0.029952,0.957168,0.128423,0.830184,0.064709,0.789024,-0.116375,0.618535
PRS_1e-05_ASD_Euro_LD,0.288994,0.035147,0.240128,0.392284,-0.056364,0.71527,0.580345,0.604105,-0.021384,0.843464,...,-0.240559,0.41018,0.450782,0.432207,-0.49434,0.422439,-0.106879,0.667224,-0.610761,0.012209
PRS_0.0001_ASD_Euro_LD,0.330395,0.010199,0.222727,0.394843,0.021796,0.879736,1.214338,0.245908,-0.111697,0.27008,...,-0.343882,0.207796,-0.623584,0.24484,0.898681,0.119459,-0.207518,0.371406,-0.698257,0.002292
PRS_0.001_ASD_Euro_LD,0.005524,0.964004,0.148302,0.557622,-0.031155,0.823055,2.853537,0.005499,-0.152791,0.119836,...,-0.209043,0.427406,-1.196456,0.022267,1.380711,0.014199,-0.310027,0.168453,-0.011674,0.956994
PRS_0.01_ASD_Euro_LD,0.079743,0.540694,0.338918,0.209224,-0.048732,0.742386,2.221946,0.040446,-0.093377,0.369759,...,0.189455,0.498857,-0.341572,0.5349,0.265215,0.653445,-0.128004,0.591619,-0.168529,0.464941
PRS_0.05_ASD_Euro_LD,-0.056002,0.660193,0.138898,0.59747,-0.029916,0.836457,2.080719,0.049481,-0.068114,0.502989,...,0.313626,0.253128,-0.181621,0.735573,0.440994,0.445507,-0.259182,0.267601,0.118355,0.599413
PRS_0.1_ASD_Euro_LD,-0.060939,0.63252,0.093138,0.72328,-0.053342,0.71298,1.604311,0.128566,-0.07245,0.476421,...,0.27237,0.320778,-0.151823,0.777766,0.294037,0.610824,-0.118997,0.610057,0.128788,0.567822
PRS_0.5_ASD_Euro_LD,-0.018002,0.886193,0.307189,0.238386,-0.106937,0.455637,0.813877,0.43305,-0.1366,0.175344,...,0.188223,0.486638,-0.269402,0.612197,0.189248,0.740032,-0.03263,0.887316,0.038045,0.864209


In [27]:
# Per-term summary for a single PRS threshold (0.0001), using the same
# independent variables as the mass-univariate call.
data_exploration.MassUnivariate.get_model_summary(
    df,
    cat_independentVar_cols=['Gender'],
    cont_independentVar_cols=[
        'PMA_vol',
        'GA_vol',
        '17 ICV (all except bone)',
        *ancestry,
        *volumes_names,
    ],
    dependentVar_cols=['PRS_0.0001_ASD_Euro_LD'],
)

Unnamed: 0,beta_coefs,pvalues,Rsquared
const,0.330395,0.010199,0.032710
PMA_vol,0.222727,0.394843,0.003487
GA_vol,0.021796,0.879736,0.000110
17 ICV (all except bone),1.214338,0.245908,0.006505
euro_Anc_PC1,-0.111697,0.270080,0.005873
...,...,...,...
AAL 90,-0.343882,0.207796,0.007675
AAL 91,-0.623584,0.244840,0.006535
AAL 92,0.898681,0.119459,0.011775
AAL 109,-0.207518,0.371406,0.003850
