In [1]:
import sys
sys.path.append('../../')

In [2]:
# Relative directory prefix that is prepended to the dataset file paths read below.
preprocessed_dataset_path = 'preprocessed_dataset/'

In [3]:
from codes.docs.analysis import data_preprocessing, data_exploration, visualisation, data_preprocessing_high_dimension,model_training
from codes.docs.analysis.gene_analysis import genetic_file_preprocess

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
%matplotlib inline
import tqdm

import statsmodels.api as sm
import statsmodels.formula.api as sfm

from scipy.stats import ttest_ind, pearsonr,zscore
import pandas as pd
import re

Here, the volumetric dataset is preprocessed to group the same brain regions.\
Then a mass univariate test is performed across multiple regions.

# Main analysis

In [5]:
# Build the CSV location first, then load the term-born European volumetric table.
term_volumetric_csv = (
    preprocessed_dataset_path
    + 'imaging_data/volumetric/term/european_volumetric_df_batch2.csv'
)
european_volumetric_dataset_term = pd.read_csv(term_volumetric_csv)

## AAL atlas

### performing individual volumes

#### European

In [23]:
# Work on a copy so the frame loaded from disk is left untouched.
df = european_volumetric_dataset_term.copy()
# Column names of ancestry principal components 1-3.
ancestry = [f'euro_Anc_PC{i}' for i in range(1,4)]
# Columns whose name contains 'AAL ' followed by a digit.
# The pattern is a raw string: in a plain literal '\d' is an invalid escape
# sequence, which newer Python versions warn about.
# NOTE(review): this list is built from the frame *before* the rename below, so
# it would still hold 'AAL 93' rather than 'AAL 109' -- confirm that is intended.
dependentVar = [
    i for i in european_volumetric_dataset_term.columns
    if re.search(r'AAL \d', i)
]
df = df.rename(columns = {'AAL 93':'AAL 109'})
# df = df.rename(columns={i:i+'_ASD_All_LD' for i in df.columns if 'PRS' in i})


In [24]:
# Pick every column whose name contains 'AAL ' followed by a digit.
# Raw-string pattern: '\d' in a plain literal is an invalid escape sequence.
aal_volume_cols = [i for i in df.columns if re.search(r'AAL \d', i)]
combined = df[aal_volume_cols].copy()
# combined = df[[i for i in df.columns if 'imputed' in i]]
# rowvar=False: each column is treated as a variable, each row as an observation.
correlation_matrix = np.corrcoef(combined, rowvar=False)
# The return value is discarded; the recorded output of this cell reports the
# effective number of independent variables and an adjusted p-value threshold.
_ = data_exploration.matSpDLite(correlation_matrix, alpha=0.05)

Effective Number of Independent Variables [Veff] is 48.27990074715493
Effective Number of Independent Variables [VeffLi] (Using equation 5 of Li and Ji 2005) is 28.0
The adjusted multiple testing correction p-val is alpha/lower(Meff) = 0.0017857142857142841


In [27]:
# Every column with 'PRS' in its name is passed as a threshold to test.
thresholds = [i for i in df.columns if 'PRS' in i]
# Outcome columns: names containing 'AAL ' followed by a digit.
# Raw-string pattern: '\d' in a plain literal is an invalid escape sequence.
dependentVar_cols = [i for i in df.columns if re.search(r'AAL \d', i)]

# Run the mass univariate helper once per threshold column, passing Gender as a
# categorical covariate and GA, PMA, ICV and the ancestry PCs as continuous ones.
european_volumetric_term_mass_univariate = data_exploration.MassUnivariate.calculate_mass_univariate_across_multiple_thresholds(
    df,
    thresholds=thresholds,
    cat_independentVar_cols=['Gender'],
    cont_independentVar_cols=['GA_vol', 'PMA_vol','17 ICV (all except bone)']+ancestry,
    dependentVar_cols=dependentVar_cols)

100%|███████████████████████████████████████████| 11/11 [00:02<00:00,  3.69it/s]


In [26]:
# Display the results table ordered by ascending PRS p-value.
european_volumetric_term_mass_univariate.sort_values('PRS_pval')

Unnamed: 0,Connection,const_coef,const_pval,GA_vol_coef,GA_vol_pval,PMA_vol_coef,PMA_vol_pval,17 ICV (all except bone)_coef,17 ICV (all except bone)_pval,euro_Anc_PC1_coef,euro_Anc_PC1_pval,euro_Anc_PC2_coef,euro_Anc_PC2_pval,euro_Anc_PC3_coef,euro_Anc_PC3_pval,PRS_coef,PRS_pval,Gender_male_coef,Gender_male_pval,threshold
7,head_circumference_scan,33.587928,1.353118e-119,0.611852,0.338604,-0.108091,0.909044,-0.305808,0.706382,0.188245,0.711015,-0.128368,0.797575,0.762025,0.134817,-0.667482,0.178267,0.551232,0.601332,PRS_0.05
3,head_circumference_scan,33.75419,4.0626590000000006e-119,0.698066,0.276823,-0.131121,0.889967,-0.3618,0.655725,0.309492,0.541437,-0.18636,0.708171,0.766558,0.133243,0.516056,0.299822,0.232996,0.828002,PRS_1e-05
8,head_circumference_scan,33.621261,1.368755e-119,0.652205,0.308391,-0.126792,0.893628,-0.334705,0.680593,0.200852,0.694376,-0.138909,0.783039,0.772036,0.13093,-0.454515,0.362363,0.487429,0.644134,PRS_0.1
10,head_circumference_scan,33.611468,1.7983830000000003e-119,0.661329,0.301928,-0.081943,0.93119,-0.354332,0.662721,0.186622,0.71701,-0.135402,0.789116,0.776551,0.129042,-0.434477,0.391908,0.506174,0.631999,PRS_1
9,head_circumference_scan,33.615289,1.7669820000000002e-119,0.666484,0.298451,-0.098406,0.917382,-0.344826,0.671487,0.194977,0.704568,-0.138097,0.785281,0.771468,0.131447,-0.407298,0.421661,0.498861,0.636898,PRS_0.5
2,head_circumference_scan,33.665344,1.130733e-119,0.678428,0.290478,-0.092307,0.922503,-0.403294,0.619838,0.288275,0.569156,-0.217959,0.661327,0.803919,0.118697,0.391033,0.427095,0.403053,0.702581,PRS_1e-06
5,head_circumference_scan,33.593179,3.784964e-119,0.61709,0.336208,-0.115548,0.903047,-0.374698,0.644576,0.237082,0.640434,-0.20802,0.675951,0.743324,0.145661,-0.382369,0.437336,0.54118,0.610433,PRS_0.001
4,head_circumference_scan,33.583016,6.910552e-118,0.640905,0.31756,-0.128826,0.892131,-0.354855,0.662957,0.26088,0.606609,-0.206925,0.678004,0.730342,0.154665,-0.232756,0.647083,0.560633,0.605939,PRS_0.0001
1,head_circumference_scan,33.654616,1.6397880000000002e-119,0.655484,0.307122,-0.084837,0.929136,-0.40603,0.620051,0.290634,0.569407,-0.219957,0.658867,0.769225,0.135082,0.15465,0.756917,0.423587,0.6887,PRS_1e-07
0,head_circumference_scan,33.656787,1.8076820000000002e-119,0.640004,0.318584,-0.082678,0.931003,-0.40943,0.617862,0.285373,0.575036,-0.226852,0.649589,0.771092,0.134832,0.151268,0.762133,0.419431,0.691865,PRS_1e-08


In [10]:
# prs_lds = ['All_LD','AllLDimputed','Euro_LD','EuroLDimputed']
prs_lds = ['All_LD']
for threshold in prs_lds:
    # Input columns: every column whose name contains the current label.
    # Columns written by a previous run of this cell ('prs_pca_...') also
    # contain the label, so they are excluded; otherwise re-running the cell
    # would feed earlier components back into the PCA.
    prs_input_cols = [
        i for i in df.columns
        if threshold in i and not i.startswith('prs_pca_')
    ]
    # Only the second element of the returned triple is used here.
    _,prs_pca,_ = data_preprocessing_high_dimension.FeatureReduction.perform_PCA(
        df = df,
        dependentVar_cols=prs_input_cols,
        scaling=True,
        n_components=1)
    # Store each returned column of `prs_pca` as its own column in df.
    for i in range(prs_pca.shape[1]):
        df[f'prs_pca_{threshold}_PC{i}'] = prs_pca[:,i]

## Imperial atlas 

In [6]:
# Start again from a fresh copy of the loaded frame for this section.
df = european_volumetric_dataset_term.copy()
# Column names of ancestry principal components 1-3.
ancestry = [f'euro_Anc_PC{i}' for i in range(1, 4)]
# Group the Imperial-atlas volume columns with the project helper
# ('segmented' grouping, duplicates removed -- see the helper for the details).
df = data_preprocessing.Volumes.Imperial.group_Imperial_volumes(
    df,
    grouping='segmented',
    remove_duplicated=True)
# Columns whose name contains 'Imperial ' followed by a digit.
# Raw-string pattern: '\d' in a plain literal is an invalid escape sequence.
dependentVar = [i for i in df.columns if re.search(r'Imperial \d', i)]
# Outlier handling over those columns via the project helper.
# NOTE(review): the exact meaning of threshold=3 / percentage_of_outlier=0.1 is
# defined by remove_outliers -- confirm against its implementation.
df = data_exploration.MassUnivariate.remove_outliers(
    df,
    col=dependentVar,
    threshold=3,
    remove_schemes='percentage',
    percentage_of_outlier=0.1)
df = df.rename(columns={'AAL 93': 'AAL 109'})

####################LABELS#######################
# Columns whose name contains 'Imperial ' followed by a digit, computed once.
# Raw-string pattern: '\d' in a plain literal is an invalid escape sequence.
imperial_cols = [i for i in df.columns if re.search(r'Imperial \d', i)]

# `df[imperial_cols]` is evaluated afresh for every call below, so each helper
# receives its own frame exactly as before.
WM_labels = data_preprocessing.Volumes.Imperial.extract_WM_Imperial(
    df[imperial_cols]).columns.tolist()
GM_labels = data_preprocessing.Volumes.Imperial.extract_GM_Imperial(
    df[imperial_cols]).columns.tolist()
DGM_labels = data_preprocessing.Volumes.Imperial.extract_deepGM_Imperial(
    df[imperial_cols]).columns.tolist()

# Hand-written label lists for individual structures.
bs_labels = ['Imperial 19']
cerebellum_labels = ['Imperial 17', 'Imperial 18']
ventricle_labels = [f'Imperial {i}' for i in range(49, 51)]  # 'Imperial 49', 'Imperial 50'
amygdala_label = ['Imperial 3', 'Imperial 4']

# Lobe selections: all four lobes together, then each lobe on its own.
lobes = data_preprocessing.Volumes.Imperial.extract_lobe(
    df[imperial_cols], lobes=['frontal','temporal','parietal','occipital'])
frontal_lobe = data_preprocessing.Volumes.Imperial.extract_lobe(
    df[imperial_cols], lobes=['frontal'])
occipital_lobe = data_preprocessing.Volumes.Imperial.extract_lobe(
    df[imperial_cols], lobes=['occipital'])
parietal_lobe  = data_preprocessing.Volumes.Imperial.extract_lobe(
    df[imperial_cols], lobes=['parietal'])
temporal_lobe  = data_preprocessing.Volumes.Imperial.extract_lobe(
    df[imperial_cols], lobes=['temporal'])

# Active selection; the commented lines are the alternatives that were tried.
# cor_labels = WM_labels+GM_labels+DGM_labels + bs_labels + cerebellum_labels + ventricle_labels
cor_labels = GM_labels + WM_labels
# cor_labels = GM_labels
# cor_labels = lobes
# cor_labels = WM_labels
# cor_labels = GM_labels
# cor_labels = GM_labels + WM_labels