In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import utils.constants as constants

In [3]:
# Build directory paths under the preprocessed-data root defined in utils.constants.
# NOTE(review): neither variable is referenced by any later cell visible in this
# notebook -- confirm whether they are still needed.
genes_associations_dir = os.path.join(constants.PREPROCESSED_BASED_DIR, 'gene_associations')
smultixcan_gene_association_dirs = os.path.join(genes_associations_dir, 'mashr')

# Load all S-PrediXcan gene mappings

In [4]:
# Load the two pickled gene lookup tables from the metadata directory.
# `genes_mapping_1` is indexed by gene symbol further down (e.g. genes_mapping_1['VIP']).
# NOTE: pickle.load can execute arbitrary code; only point these paths at trusted,
# project-generated files.
_genes_mapping_0_path = os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-0.pkl')
_genes_mapping_1_path = os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-1.pkl')

with open(_genes_mapping_0_path, 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(_genes_mapping_1_path, 'rb') as f:
    genes_mapping_1 = pickle.load(f)

In [5]:
# Gene annotation table (one row per gene); its `gene_id`, `genename`, `band` and
# `gene_type` columns are merged into the exported table later in this notebook.
genes_mappings = pd.read_pickle(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mappings.pkl'))

In [6]:
genes_mappings.head()

Unnamed: 0,gene,genename,gene_type,gene_id,band
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457,1q24.2
1,ENSG00000000460.16,C1orf112,protein_coding,ENSG00000000460,1q24.2
2,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938,1p35.3
3,ENSG00000000971.15,CFH,protein_coding,ENSG00000000971,1q31.3
4,ENSG00000001036.13,FUCA2,protein_coding,ENSG00000001036,6q24.2


# Load S-MultiXcan results

In [7]:
# Path to the pickled matrix of S-MultiXcan p-values; displayed so the resolved
# location is recorded in the notebook output.
smultixcan_pvalues_file = os.path.join(constants.DELIVERABLES_RESULTS_DIR, 'internal', 'smultixcan-mash_models-pvalues.pkl.xz')
display(smultixcan_pvalues_file)

'/mnt/phenomexcan/results/deliverables/internal/smultixcan-mash_models-pvalues.pkl.xz'

In [8]:
smultixcan_gene_associations = pd.read_pickle(smultixcan_pvalues_file)

In [9]:
smultixcan_gene_associations.shape

(22515, 4091)

In [10]:
smultixcan_gene_associations.head(5)

Unnamed: 0_level_0,D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,C54-Diagnoses_main_ICD10_C54_Malignant_neoplasm_of_corpus_uteri,N35-Diagnoses_main_ICD10_N35_Urethral_stricture,20521-Belittlement_by_partner_or_expartner_as_an_adult,O72-Diagnoses_main_ICD10_O72_Postpartum_haemorrhage,20547_1-Activities_undertaken_to_treat_depression_Talking_therapies_such_as_psychotherapy_counselling_group_therapy_or_CBT,22601_21133023-Job_coding_physicist_astronomer_geologist_geophysicist_meteorologist_oceanographer_seismologist,6138_3-Qualifications_O_levelsGCSEs_or_equivalent,3446_1-Type_of_tobacco_currently_smoked_Manufactured_cigarettes,24011_raw-Traffic_intensity_on_the_nearest_major_road,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.9812,0.403348,0.862788,0.024683,0.561934,0.956913,0.551198,0.516344,0.227106,0.129847,...,0.980281,0.413973,0.747822,0.609467,0.019467,0.371748,0.82301,0.71331,0.858389,0.409761
ENSG00000000457,0.757443,0.282676,0.645016,0.605318,0.76403,0.019528,0.814719,0.868149,0.688368,0.049198,...,0.228982,0.63683,0.654535,0.010907,0.7263,0.210658,0.008023,0.678749,0.836679,0.812484
ENSG00000000460,0.101185,0.465145,0.847302,0.928796,0.64585,0.513639,0.263445,0.661105,0.102007,0.477268,...,0.30146,0.140522,0.646442,0.587969,0.498724,0.521805,0.004462,0.736509,0.432229,0.486664
ENSG00000000938,0.579861,0.905545,0.395741,0.730425,0.269117,0.266458,0.710789,0.404122,0.953161,0.605097,...,0.588855,0.226977,0.576593,0.059247,0.435438,0.95316,0.101875,0.954998,0.097831,0.135045
ENSG00000000971,0.639334,0.336059,0.204268,0.176063,0.743292,0.109981,0.908151,0.442417,0.829314,0.333868,...,0.109883,0.040871,0.005662,0.020391,0.439466,0.690242,0.055059,0.00266,0.331132,0.99545


In [11]:
# Sanity check: the S-MultiXcan matrix must not contain +/- infinity.
_smultixcan_is_inf = smultixcan_gene_associations.isin([np.inf, -np.inf])
assert not _smultixcan_is_inf.values.any()

In [12]:
smultixcan_gene_associations.isna().sum().sum()

1053055

In [15]:
# Every non-missing S-MultiXcan p-value must lie in (0, 1].
# DataFrame.min()/max() skip NaN by default, so missing entries do not affect the check.
# The previous single expression chained `.all().all()` on a scalar boolean, which
# only worked because the operands were numpy scalars and gave no hint on failure.
_smultixcan_min_pvalue = smultixcan_gene_associations.min().min()
_smultixcan_max_pvalue = smultixcan_gene_associations.max().max()
assert _smultixcan_min_pvalue > 0, f'non-positive p-value found: {_smultixcan_min_pvalue}'
assert _smultixcan_max_pvalue <= 1, f'p-value above 1 found: {_smultixcan_max_pvalue}'

# Load fastENLOC results

In [16]:
# Path to the pickled matrix of fastENLOC RCP values; displayed so the resolved
# location is recorded in the notebook output.
fastenloc_rcp_file = os.path.join(constants.DELIVERABLES_RESULTS_DIR, 'internal', 'fastenloc-torus-rcp.pkl.xz')
display(fastenloc_rcp_file)

'/mnt/phenomexcan/results/deliverables/internal/fastenloc-torus-rcp.pkl.xz'

In [17]:
fastenloc_gene_associations = pd.read_pickle(fastenloc_rcp_file)

In [18]:
fastenloc_gene_associations.shape

(37967, 4091)

In [19]:
fastenloc_gene_associations.head(5)

Unnamed: 0_level_0,D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,C54-Diagnoses_main_ICD10_C54_Malignant_neoplasm_of_corpus_uteri,N35-Diagnoses_main_ICD10_N35_Urethral_stricture,20521-Belittlement_by_partner_or_expartner_as_an_adult,O72-Diagnoses_main_ICD10_O72_Postpartum_haemorrhage,20547_1-Activities_undertaken_to_treat_depression_Talking_therapies_such_as_psychotherapy_counselling_group_therapy_or_CBT,22601_21133023-Job_coding_physicist_astronomer_geologist_geophysicist_meteorologist_oceanographer_seismologist,6138_3-Qualifications_O_levelsGCSEs_or_equivalent,3446_1-Type_of_tobacco_currently_smoked_Manufactured_cigarettes,24011_raw-Traffic_intensity_on_the_nearest_major_road,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.000475,2e-06,7e-06,1.4e-05,4.072e-07,2.9e-05,3.644e-07,0.001523,1.4547e-07,9.367e-08,...,0.0,,0.001,0.0,,,0.001,,,0.001
ENSG00000000457,0.001202,7e-06,0.000107,6.4e-05,1.2358e-06,0.000932,4.70031e-06,0.004699,4.312066e-07,2.43052e-06,...,,,0.0,,,,0.0,,,0.0
ENSG00000000460,0.003668,1e-05,8.2e-05,4.8e-05,8.3935e-06,0.001715,4.33512e-06,0.004166,4.44508e-07,1.50068e-06,...,,,0.0,,,,0.0,,,0.0
ENSG00000000938,0.001505,8e-06,4.9e-05,5.5e-05,1.8095e-07,0.000441,2.8667e-07,0.005567,8.714e-08,5.5806e-07,...,,,,0.002,,,0.001,,,
ENSG00000000971,0.001648,1.4e-05,0.000117,0.000197,1.263e-06,0.002731,2.12097e-06,0.009212,3.326e-09,2.01584e-07,...,,,,,,,0.0,,,0.0


In [21]:
fastenloc_gene_associations.isna().sum().sum()

3341082

In [22]:
# Sanity check: the fastENLOC matrix must not contain +/- infinity.
_fastenloc_is_inf = fastenloc_gene_associations.isin([np.inf, -np.inf])
assert not _fastenloc_is_inf.values.any()

In [24]:
# Range check on the fastENLOC values (NaN entries are skipped by min()/max()).
# The upper bound is 3 rather than 1: values above 1 do occur in this matrix
# (e.g. 1.257727 appears in the exported table further down).
_fastenloc_min = fastenloc_gene_associations.min().min()
_fastenloc_max = fastenloc_gene_associations.max().max()
assert _fastenloc_min >= 0
assert _fastenloc_max <= 3

# S-MultiXcan

## Counts of significant associations

In [25]:
smultixcan_gene_associations.shape

(22515, 4091)

In [26]:
# Flatten the gene x trait matrix into one long vector of p-values and drop the
# missing entries; its length is the number of tests actually performed.
_flat_pvalues = smultixcan_gene_associations.values.flatten()
all_pvalues = pd.Series(_flat_pvalues).dropna()
display(all_pvalues.shape)

(91055810,)

In [27]:
all_pvalues.describe()

count     9.105581e+07
mean      4.839226e-01
std       2.934763e-01
min      9.999889e-321
25%       2.266266e-01
50%       4.794625e-01
75%       7.378647e-01
max       1.000000e+00
dtype: float64

In [28]:
(-np.log10(all_pvalues)).describe()

count    9.105581e+07
mean     4.990325e-01
std      1.258476e+00
min     -0.000000e+00
25%      1.320233e-01
50%      3.192453e-01
75%      6.446892e-01
max      3.200000e+02
dtype: float64

In [30]:
# Bonferroni correction: 0.05 divided by the number of non-missing tests.
_n_tests = len(all_pvalues)
PVALUE_THRESHOLD = 0.05 / _n_tests
display(PVALUE_THRESHOLD)

5.491137797796759e-10

In [32]:
# Hard-coded override of the threshold computed in the previous cell
# (0.05 / number of tests, displayed as 5.4911e-10), truncated to three
# significant digits. All later cells use this rounded value.
PVALUE_THRESHOLD = 5.49e-10

In [33]:
# Number and percentage of tests passing the Bonferroni threshold.
hits = all_pvalues.lt(PVALUE_THRESHOLD).sum()
display(hits)
display(hits / len(all_pvalues) * 100)

72994

0.08016402248247531

In [34]:
# Same count using a lenient, uncorrected cutoff of 0.01 for comparison.
hits = all_pvalues.lt(0.01).sum()
display(hits)
display(hits / len(all_pvalues) * 100)

1809587

1.9873383148203285

# fastENLOC

## Number of significant S-MultiXcan associations also supported by fastENLOC

In [35]:
# Gene IDs present in both result matrices; both matrices are restricted to this
# set below so they can be compared element-wise.
common_genes = fastenloc_gene_associations.index.intersection(smultixcan_gene_associations.index)
display(common_genes)

Index(['ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460',
       'ENSG00000000938', 'ENSG00000000971', 'ENSG00000001036',
       'ENSG00000001084', 'ENSG00000001167', 'ENSG00000001460',
       'ENSG00000001461',
       ...
       'ENSG00000284240', 'ENSG00000284308', 'ENSG00000284395',
       'ENSG00000284413', 'ENSG00000284418', 'ENSG00000284430',
       'ENSG00000284452', 'ENSG00000284513', 'ENSG00000284526',
       'ENSG00000284552'],
      dtype='object', name='gene_name', length=22427)

In [36]:
# Restrict fastENLOC to the shared genes and put its trait columns in the same
# order as the S-MultiXcan matrix.
_smultixcan_traits = smultixcan_gene_associations.columns
fastenloc_matrix_sorted = fastenloc_gene_associations.loc[common_genes, _smultixcan_traits]
display(fastenloc_matrix_sorted.shape)
display(fastenloc_matrix_sorted.head())

(22427, 4091)

Unnamed: 0_level_0,D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,C54-Diagnoses_main_ICD10_C54_Malignant_neoplasm_of_corpus_uteri,N35-Diagnoses_main_ICD10_N35_Urethral_stricture,20521-Belittlement_by_partner_or_expartner_as_an_adult,O72-Diagnoses_main_ICD10_O72_Postpartum_haemorrhage,20547_1-Activities_undertaken_to_treat_depression_Talking_therapies_such_as_psychotherapy_counselling_group_therapy_or_CBT,22601_21133023-Job_coding_physicist_astronomer_geologist_geophysicist_meteorologist_oceanographer_seismologist,6138_3-Qualifications_O_levelsGCSEs_or_equivalent,3446_1-Type_of_tobacco_currently_smoked_Manufactured_cigarettes,24011_raw-Traffic_intensity_on_the_nearest_major_road,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.000475,2e-06,7e-06,1.4e-05,4.072e-07,2.9e-05,3.644e-07,0.001523,1.4547e-07,9.367e-08,...,0.0,,0.001,0.0,,,0.001,,,0.001
ENSG00000000457,0.001202,7e-06,0.000107,6.4e-05,1.2358e-06,0.000932,4.70031e-06,0.004699,4.312066e-07,2.43052e-06,...,,,0.0,,,,0.0,,,0.0
ENSG00000000460,0.003668,1e-05,8.2e-05,4.8e-05,8.3935e-06,0.001715,4.33512e-06,0.004166,4.44508e-07,1.50068e-06,...,,,0.0,,,,0.0,,,0.0
ENSG00000000938,0.001505,8e-06,4.9e-05,5.5e-05,1.8095e-07,0.000441,2.8667e-07,0.005567,8.714e-08,5.5806e-07,...,,,,0.002,,,0.001,,,
ENSG00000000971,0.001648,1.4e-05,0.000117,0.000197,1.263e-06,0.002731,2.12097e-06,0.009212,3.326e-09,2.01584e-07,...,,,,,,,0.0,,,0.0


In [37]:
# Restrict S-MultiXcan to the shared genes, in the same row order as
# `fastenloc_matrix_sorted` (both are selected with `common_genes`).
multixcan_matrix_sorted = smultixcan_gene_associations.loc[common_genes]
display(multixcan_matrix_sorted.shape)
display(multixcan_matrix_sorted.head())

(22427, 4091)

Unnamed: 0_level_0,D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,C54-Diagnoses_main_ICD10_C54_Malignant_neoplasm_of_corpus_uteri,N35-Diagnoses_main_ICD10_N35_Urethral_stricture,20521-Belittlement_by_partner_or_expartner_as_an_adult,O72-Diagnoses_main_ICD10_O72_Postpartum_haemorrhage,20547_1-Activities_undertaken_to_treat_depression_Talking_therapies_such_as_psychotherapy_counselling_group_therapy_or_CBT,22601_21133023-Job_coding_physicist_astronomer_geologist_geophysicist_meteorologist_oceanographer_seismologist,6138_3-Qualifications_O_levelsGCSEs_or_equivalent,3446_1-Type_of_tobacco_currently_smoked_Manufactured_cigarettes,24011_raw-Traffic_intensity_on_the_nearest_major_road,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.9812,0.403348,0.862788,0.024683,0.561934,0.956913,0.551198,0.516344,0.227106,0.129847,...,0.980281,0.413973,0.747822,0.609467,0.019467,0.371748,0.82301,0.71331,0.858389,0.409761
ENSG00000000457,0.757443,0.282676,0.645016,0.605318,0.76403,0.019528,0.814719,0.868149,0.688368,0.049198,...,0.228982,0.63683,0.654535,0.010907,0.7263,0.210658,0.008023,0.678749,0.836679,0.812484
ENSG00000000460,0.101185,0.465145,0.847302,0.928796,0.64585,0.513639,0.263445,0.661105,0.102007,0.477268,...,0.30146,0.140522,0.646442,0.587969,0.498724,0.521805,0.004462,0.736509,0.432229,0.486664
ENSG00000000938,0.579861,0.905545,0.395741,0.730425,0.269117,0.266458,0.710789,0.404122,0.953161,0.605097,...,0.588855,0.226977,0.576593,0.059247,0.435438,0.95316,0.101875,0.954998,0.097831,0.135045
ENSG00000000971,0.639334,0.336059,0.204268,0.176063,0.743292,0.109981,0.908151,0.442417,0.829314,0.333868,...,0.109883,0.040871,0.005662,0.020391,0.439466,0.690242,0.055059,0.00266,0.331132,0.99545


In [38]:
# The two matrices are combined element-wise below with `&`, which aligns on
# labels. Equal shapes alone do not guarantee the same genes/traits in the same
# order, so check the labels as well.
assert fastenloc_matrix_sorted.shape == multixcan_matrix_sorted.shape
assert fastenloc_matrix_sorted.index.equals(multixcan_matrix_sorted.index), 'gene (row) labels differ'
assert fastenloc_matrix_sorted.columns.equals(multixcan_matrix_sorted.columns), 'trait (column) labels differ'

### Using stringent p-value threshold

In [39]:
display(PVALUE_THRESHOLD)

5.49e-10

In [40]:
# Boolean mask of gene-trait pairs passing the p-value threshold.
# Missing p-values compare as False, i.e. they count as not significant.
multixcan_signif = multixcan_matrix_sorted.lt(PVALUE_THRESHOLD)
display(multixcan_signif.shape)
display(multixcan_signif.head())

(22427, 4091)

Unnamed: 0_level_0,D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,C54-Diagnoses_main_ICD10_C54_Malignant_neoplasm_of_corpus_uteri,N35-Diagnoses_main_ICD10_N35_Urethral_stricture,20521-Belittlement_by_partner_or_expartner_as_an_adult,O72-Diagnoses_main_ICD10_O72_Postpartum_haemorrhage,20547_1-Activities_undertaken_to_treat_depression_Talking_therapies_such_as_psychotherapy_counselling_group_therapy_or_CBT,22601_21133023-Job_coding_physicist_astronomer_geologist_geophysicist_meteorologist_oceanographer_seismologist,6138_3-Qualifications_O_levelsGCSEs_or_equivalent,3446_1-Type_of_tobacco_currently_smoked_Manufactured_cigarettes,24011_raw-Traffic_intensity_on_the_nearest_major_road,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000000457,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000000460,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000000938,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000000971,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [41]:
# Boolean mask of gene-trait pairs whose fastENLOC value exceeds 0.1 (the same
# cutoff is used again when building the exported table).
# Missing values compare as False.
fastenloc_signif = fastenloc_matrix_sorted.gt(0.1)
display(fastenloc_signif.shape)
display(fastenloc_signif.head())

(22427, 4091)

Unnamed: 0_level_0,D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,C54-Diagnoses_main_ICD10_C54_Malignant_neoplasm_of_corpus_uteri,N35-Diagnoses_main_ICD10_N35_Urethral_stricture,20521-Belittlement_by_partner_or_expartner_as_an_adult,O72-Diagnoses_main_ICD10_O72_Postpartum_haemorrhage,20547_1-Activities_undertaken_to_treat_depression_Talking_therapies_such_as_psychotherapy_counselling_group_therapy_or_CBT,22601_21133023-Job_coding_physicist_astronomer_geologist_geophysicist_meteorologist_oceanographer_seismologist,6138_3-Qualifications_O_levelsGCSEs_or_equivalent,3446_1-Type_of_tobacco_currently_smoked_Manufactured_cigarettes,24011_raw-Traffic_intensity_on_the_nearest_major_road,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000000457,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000000460,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000000938,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ENSG00000000971,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [42]:
# Total significant S-MultiXcan pairs, and their fraction of all matrix cells.
multixcan_hits = multixcan_signif.values.sum()
display(multixcan_hits)
_n_cells = multixcan_signif.size
display(multixcan_hits / _n_cells)

72990

0.000795541245816283

In [43]:
# Total pairs above the fastENLOC cutoff, and their fraction of all matrix cells.
fastenloc_hits = fastenloc_signif.values.sum()
display(fastenloc_hits)
_n_cells = fastenloc_signif.size
display(fastenloc_hits / _n_cells)

292506

0.0031881160110801164

In [45]:
# Pairs significant in both methods (element-wise AND, aligned on labels).
fastenloc_and_multixcan_signif = multixcan_signif & fastenloc_signif
# If label alignment had introduced extra rows/columns the shapes would differ.
_joint_shape = fastenloc_and_multixcan_signif.shape
assert _joint_shape == multixcan_signif.shape
assert multixcan_signif.shape == fastenloc_signif.shape

In [46]:
# Share of significant S-MultiXcan pairs that also pass the fastENLOC cutoff.
fe_mu_sum = fastenloc_and_multixcan_signif.values.sum()
display(fe_mu_sum)
_perc = fe_mu_sum / multixcan_hits
display(_perc)
display('{:.2f}%'.format(_perc * 100))

22473

0.30789149198520344

'30.79%'

# Export significant associations as a table

In [47]:
# s-multixcan
# Reshape the gene x trait matrix into a long Series indexed by
# (trait column, gene_name), one entry per matrix cell (NaN cells included).
smultixcan_results = multixcan_matrix_sorted.unstack()

In [48]:
smultixcan_results.shape

(91748857,)

In [49]:
smultixcan_results.head()

                                            gene_name      
D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia  ENSG00000000419    0.981200
                                            ENSG00000000457    0.757443
                                            ENSG00000000460    0.101185
                                            ENSG00000000938    0.579861
                                            ENSG00000000971    0.639334
dtype: float64

In [50]:
smultixcan_results.isna().sum()

1049005

In [51]:
smultixcan_results = smultixcan_results.dropna()

In [52]:
smultixcan_results.shape

(90699852,)

In [53]:
assert smultixcan_results.isna().sum() == 0

In [54]:
(smultixcan_results < PVALUE_THRESHOLD).sum()

72990

In [55]:
# fastenloc
# Reshape the gene x trait matrix into a long Series indexed by
# (trait column, gene_name), one entry per matrix cell (NaN cells included).
fastenloc_results = fastenloc_matrix_sorted.unstack()

In [56]:
fastenloc_results.shape

(91748857,)

In [57]:
fastenloc_results.head()

                                            gene_name      
D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia  ENSG00000000419    0.000475
                                            ENSG00000000457    0.001202
                                            ENSG00000000460    0.003668
                                            ENSG00000000938    0.001505
                                            ENSG00000000971    0.001648
dtype: float64

In [58]:
fastenloc_results.isna().sum()

1554545

In [59]:
fastenloc_results = fastenloc_results.dropna()

In [60]:
fastenloc_results.shape

(90194312,)

In [61]:
# Start from an empty frame that only carries the (trait, gene) index of the
# non-missing S-MultiXcan results; the two value columns are added by the
# `assign` call in the next cell.
# Pre-declaring `columns=[...]` here would allocate two all-NaN object columns
# over ~90M rows only for them to be overwritten immediately.
# NOTE(review): on very old pandas (< 0.23) `assign` orders new columns
# alphabetically; later cells select columns by name, so this does not change results.
final_results = pd.DataFrame(index=smultixcan_results.index.copy())

In [None]:
# Fill both columns by aligning each long Series on `final_results`' index.
# (trait, gene) pairs with no fastENLOC value end up as NaN in `fastenloc_rcp`
# and are dropped a few cells below.
final_results = final_results.assign(smultixcan_pval=smultixcan_results, fastenloc_rcp=fastenloc_results)

In [66]:
final_results.shape

(90699852, 2)

In [67]:
final_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,smultixcan_pval,fastenloc_rcp
Unnamed: 0_level_1,gene_name,Unnamed: 2_level_1,Unnamed: 3_level_1
D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,ENSG00000000419,0.9812,0.000475
D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,ENSG00000000457,0.757443,0.001202
D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,ENSG00000000460,0.101185,0.003668
D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,ENSG00000000938,0.579861,0.001505
D3_ANAEMIA_IRONDEF-Iron_deficiency_anaemia,ENSG00000000971,0.639334,0.001648


In [68]:
final_results.isna().sum()

smultixcan_pval          0
fastenloc_rcp      1529981
dtype: int64

In [69]:
final_results = final_results.dropna(how='any')

In [70]:
final_results.shape

(89169871, 2)

In [71]:
PVALUE_THRESHOLD

5.49e-10

In [72]:
# Keep only pairs that pass both criteria: S-MultiXcan p-value below the
# threshold and fastENLOC value above 0.1.
_passes_smultixcan = final_results['smultixcan_pval'] < PVALUE_THRESHOLD
_passes_fastenloc = final_results['fastenloc_rcp'] > 0.1
publishable_final_results = final_results[_passes_smultixcan & _passes_fastenloc]

In [73]:
# Name the two index levels. `rename_axis` returns a new frame instead of
# mutating the index object in place, so the cell does not alter an object that
# was produced (and possibly displayed) by an earlier cell.
publishable_final_results = publishable_final_results.rename_axis(index=['trait', 'gene_id'])

In [74]:
publishable_final_results.shape

(22473, 2)

In [75]:
publishable_final_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,smultixcan_pval,fastenloc_rcp
trait,gene_id,Unnamed: 2_level_1,Unnamed: 3_level_1
VII_EYE_ADNEXA-Diseases_of_the_eye_and_adnexa,ENSG00000261070,4.689794e-11,0.4111
5135_raw-3mm_strong_meridian_left,ENSG00000049449,5.54295e-12,0.1817
5135_raw-3mm_strong_meridian_left,ENSG00000050438,1.426052e-13,0.953802
5135_raw-3mm_strong_meridian_left,ENSG00000087116,4.002974e-10,0.359875
5135_raw-3mm_strong_meridian_left,ENSG00000107404,1.450243e-18,0.3923


In [76]:
genes_mappings.head()

Unnamed: 0,gene,genename,gene_type,gene_id,band
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457,1q24.2
1,ENSG00000000460.16,C1orf112,protein_coding,ENSG00000000460,1q24.2
2,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938,1p35.3
3,ENSG00000000971.15,CFH,protein_coding,ENSG00000000971,1q31.3
4,ENSG00000001036.13,FUCA2,protein_coding,ENSG00000001036,6q24.2


In [77]:
# Attach gene symbol, cytoband and gene type to every significant pair.
# A left join keeps pairs whose gene_id is absent from `genes_mappings`
# (their annotation columns stay NaN).
_gene_annotations = genes_mappings[['gene_id', 'genename', 'band', 'gene_type']]
_tmp = (
    publishable_final_results
    .reset_index()
    .merge(_gene_annotations, on='gene_id', how='left')
    .rename(columns={'genename': 'gene_name', 'band': 'gene_band'})
)

In [78]:
_tmp.head()

Unnamed: 0,trait,gene_id,smultixcan_pval,fastenloc_rcp,gene_name,gene_band,gene_type
0,VII_EYE_ADNEXA-Diseases_of_the_eye_and_adnexa,ENSG00000261070,4.689794e-11,0.4111,RP11-554A11.8,11q13.3,lincRNA
1,5135_raw-3mm_strong_meridian_left,ENSG00000049449,5.54295e-12,0.1817,RCN1,11p13,protein_coding
2,5135_raw-3mm_strong_meridian_left,ENSG00000050438,1.426052e-13,0.953802,SLC4A8,12q13.13,protein_coding
3,5135_raw-3mm_strong_meridian_left,ENSG00000087116,4.002974e-10,0.359875,ADAMTS2,5q35.3,protein_coding
4,5135_raw-3mm_strong_meridian_left,ENSG00000107404,1.450243e-18,0.3923,DVL1,1p36.33,protein_coding


In [79]:
# Restore the (trait, gene_id) index and fix the column order of the exported table.
_export_columns = ['gene_name', 'gene_band', 'gene_type', 'smultixcan_pval', 'fastenloc_rcp']
_indexed = _tmp.set_index(['trait', 'gene_id'])
publishable_final_results = _indexed[_export_columns]

In [80]:
publishable_final_results = publishable_final_results.sort_index()

In [82]:
assert publishable_final_results.index.is_unique

In [95]:
publishable_final_results.isna().sum()

gene_name          11
gene_band          11
gene_type          11
smultixcan_pval     0
fastenloc_rcp       0
dtype: int64

In [96]:
publishable_final_results[publishable_final_results['gene_name'].isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_name,gene_band,gene_type,smultixcan_pval,fastenloc_rcp
trait,gene_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23101_raw-Whole_body_fatfree_mass,ENSG00000253972,,,,1.24621e-13,0.1933
23102_raw-Whole_body_water_mass,ENSG00000253972,,,,1.868528e-13,0.1813
23105_raw-Basal_metabolic_rate,ENSG00000253972,,,,6.70107e-11,0.1692
23121_raw-Arm_fatfree_mass_right,ENSG00000253972,,,,1.262021e-10,0.14
23122_raw-Arm_predicted_mass_right,ENSG00000253972,,,,1.539772e-10,0.1702
23125_raw-Arm_fatfree_mass_left,ENSG00000253972,,,,1.553128e-11,0.1275
23126_raw-Arm_predicted_mass_left,ENSG00000253972,,,,2.576215e-11,0.1144
23129_raw-Trunk_fatfree_mass,ENSG00000253972,,,,3.512784e-15,0.1968
23130_raw-Trunk_predicted_mass,ENSG00000253972,,,,5.304944e-15,0.2065
30090_raw-Platelet_crit,ENSG00000229694,,,,1.379899e-10,0.4605


In [83]:
publishable_final_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_name,gene_band,gene_type,smultixcan_pval,fastenloc_rcp
trait,gene_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
102_raw-Pulse_rate_automated_reading,ENSG00000031003,FAM13B,5q31.2,protein_coding,9.316678e-29,0.8686
102_raw-Pulse_rate_automated_reading,ENSG00000064989,CALCRL,2q32.1,protein_coding,1.553027e-22,0.8632
102_raw-Pulse_rate_automated_reading,ENSG00000072195,SPEG,2q35,protein_coding,1.707185e-38,0.877
102_raw-Pulse_rate_automated_reading,ENSG00000075420,FNDC3B,3q26.31,protein_coding,2.695478e-24,0.8411
102_raw-Pulse_rate_automated_reading,ENSG00000075702,WDR62,19q13.12,protein_coding,8.858931e-11,1.257727


In [84]:
publishable_final_results.shape

(22473, 5)

In [85]:
# some testing
# Spot check of one known association (VIP / chronotype) against exact stored values.
_vip_key = ('1180-Morningevening_person_chronotype', genes_mapping_1['VIP'])
_result = publishable_final_results.loc[_vip_key]
assert _result['smultixcan_pval'] == 1.8124965657173678e-17
assert _result['fastenloc_rcp'] == 0.30421

In [86]:
# some testing
# Spot check of a second known association (RP11-220I1.5 / chronotype).
_rp11_key = ('1180-Morningevening_person_chronotype', genes_mapping_1['RP11-220I1.5'])
_result = publishable_final_results.loc[_rp11_key]
assert _result['smultixcan_pval'] == 6.426619359444203e-11
assert _result['fastenloc_rcp'] == 0.2150000007971

In [87]:
# for publication (tsv.gz)
# float_format='%.4e' writes 5 significant digits, so values read back from the
# file match the in-memory ones only approximately (see the tolerance checks below).
# NOTE(review): gzip compression is presumably inferred by pandas from the '.gz'
# suffix -- confirm for the pandas version in use.
output_file = os.path.join(constants.DELIVERABLES_RESULTS_DIR, 'supp_tables', 'suppl_table_S1-significant_gene_trait_associations.tsv.gz')
display(output_file)

publishable_final_results.to_csv(output_file, sep='\t', float_format='%.4e')

'/mnt/phenomexcan/results/deliverables/supp_tables/suppl_table_S1-significant_gene_trait_associations.tsv.gz'

In [88]:
# some testing
# Read the TSV back to verify the round trip; note this rebinds `_tmp`, which
# previously held the merged frame.
_tmp = pd.read_csv(output_file, sep='\t')

In [89]:
assert publishable_final_results.shape == _tmp.set_index(['trait', 'gene_id']).shape

In [90]:
_tmp.shape

(22473, 7)

In [91]:
_tmp.head()

Unnamed: 0,trait,gene_id,gene_name,gene_band,gene_type,smultixcan_pval,fastenloc_rcp
0,102_raw-Pulse_rate_automated_reading,ENSG00000031003,FAM13B,5q31.2,protein_coding,9.316700000000001e-29,0.8686
1,102_raw-Pulse_rate_automated_reading,ENSG00000064989,CALCRL,2q32.1,protein_coding,1.553e-22,0.8632
2,102_raw-Pulse_rate_automated_reading,ENSG00000072195,SPEG,2q35,protein_coding,1.7071999999999998e-38,0.877
3,102_raw-Pulse_rate_automated_reading,ENSG00000075420,FNDC3B,3q26.31,protein_coding,2.6955e-24,0.8411
4,102_raw-Pulse_rate_automated_reading,ENSG00000075702,WDR62,19q13.12,protein_coding,8.8589e-11,1.2577


In [92]:
_tmp = _tmp.set_index(['trait', 'gene_id'])

In [93]:
# Spot-check the same two associations in the file that was read back; expected
# values are the originals rounded to 5 significant digits by '%.4e'.
_expected_written = [
    ('VIP', 1.8125e-17, 0.30421),
    ('RP11-220I1.5', 6.4266e-11, 0.2150),
]
for _gene_symbol, _expected_pval, _expected_rcp in _expected_written:
    _result = _tmp.loc[('1180-Morningevening_person_chronotype', genes_mapping_1[_gene_symbol])]
    assert _result['smultixcan_pval'] == _expected_pval
    assert _result['fastenloc_rcp'] == _expected_rcp

In [103]:
# Whole-table round-trip check. Rows are compared positionally, so this relies
# on the file preserving the row order of `publishable_final_results`.
# rtol=1e-4 absorbs the '%.4e' rounding; atol is effectively zero so that tiny
# p-values are still compared relatively.
_numeric_columns = ['smultixcan_pval', 'fastenloc_rcp']
_written_values = _tmp[_numeric_columns].values
_expected_values = publishable_final_results[_numeric_columns].values
assert np.allclose(_written_values, _expected_values, atol=1e-320, rtol=1e-4)

In [104]:
# for publication (xlsx)
# Same table as the TSV export; the index is turned into regular columns first
# because the sheet is written with index=False.
output_file = os.path.join(constants.DELIVERABLES_RESULTS_DIR, 'supp_tables', 'suppl_table_S1-significant_gene_trait_associations.xlsx')
display(output_file)

publishable_final_results.reset_index().to_excel(output_file, index=False, float_format='%.4e')

'/mnt/phenomexcan/results/deliverables/supp_tables/suppl_table_S1-significant_gene_trait_associations.xlsx'

In [105]:
# some testing
# Read the spreadsheet back to verify the round trip (rebinds `_tmp` again).
_tmp = pd.read_excel(output_file)

In [106]:
assert publishable_final_results.shape == _tmp.set_index(['trait', 'gene_id']).shape

In [107]:
_tmp.shape

(22473, 7)

In [108]:
_tmp.head()

Unnamed: 0,trait,gene_id,gene_name,gene_band,gene_type,smultixcan_pval,fastenloc_rcp
0,102_raw-Pulse_rate_automated_reading,ENSG00000031003,FAM13B,5q31.2,protein_coding,9.316700000000001e-29,0.8686
1,102_raw-Pulse_rate_automated_reading,ENSG00000064989,CALCRL,2q32.1,protein_coding,1.553e-22,0.8632
2,102_raw-Pulse_rate_automated_reading,ENSG00000072195,SPEG,2q35,protein_coding,1.7071999999999998e-38,0.877
3,102_raw-Pulse_rate_automated_reading,ENSG00000075420,FNDC3B,3q26.31,protein_coding,2.6955e-24,0.8411
4,102_raw-Pulse_rate_automated_reading,ENSG00000075702,WDR62,19q13.12,protein_coding,8.8589e-11,1.2577


In [109]:
_tmp = _tmp.set_index(['trait', 'gene_id'])

In [110]:
# Spot-check the same two associations in the spreadsheet that was read back;
# expected values are the originals rounded to 5 significant digits by '%.4e'.
_expected_written = [
    ('VIP', 1.8125e-17, 0.30421),
    ('RP11-220I1.5', 6.4266e-11, 0.2150),
]
for _gene_symbol, _expected_pval, _expected_rcp in _expected_written:
    _result = _tmp.loc[('1180-Morningevening_person_chronotype', genes_mapping_1[_gene_symbol])]
    assert _result['smultixcan_pval'] == _expected_pval
    assert _result['fastenloc_rcp'] == _expected_rcp

In [111]:
# Whole-table round-trip check for the spreadsheet. Rows are compared
# positionally, so this relies on the file preserving the row order of
# `publishable_final_results`. rtol=1e-4 absorbs the '%.4e' rounding.
_numeric_columns = ['smultixcan_pval', 'fastenloc_rcp']
_written_values = _tmp[_numeric_columns].values
_expected_values = publishable_final_results[_numeric_columns].values
assert np.allclose(_written_values, _expected_values, atol=1e-320, rtol=1e-4)

# QQ plots

In [None]:
import plots
from clustering.biclustering.analysis import Trait

# Cholesterol

In [None]:
# List the trait columns whose name mentions "cholesterol".
# `spredixcan_genes_associations` is never defined in this notebook (NameError on
# a fresh run); the p-value matrix loaded above is `smultixcan_gene_associations`.
_trait_names = smultixcan_gene_associations.columns
_trait_names[_trait_names.str.lower().str.contains('cholesterol')]

In [None]:
pheno_plain_name = '20002_1473-Noncancer_illness_code_selfreported_high_cholesterol'

In [None]:
# P-values of every gene for the selected trait, as a plain positional Series.
# Uses `smultixcan_gene_associations`: the previously referenced
# `spredixcan_genes_associations` is never defined in this notebook.
# NOTE(review): missing p-values are not dropped here -- confirm that
# plots.qqplot handles NaN as intended.
pheno_pvalues = pd.Series(smultixcan_gene_associations[pheno_plain_name].values.flatten())
display(pheno_pvalues.shape)

In [None]:
# Build the expected (uniform) vs observed p-value frame for the QQ plot.
# P-values are floored at `threshold` so that -log10 stays bounded.
threshold = 1e-30

unif_values = np.linspace(0.0001, 1, num=pheno_pvalues.shape[0])
unif_values = np.maximum(unif_values, threshold)

# clip() returns a new Series; the previous `real_values = pheno_pvalues`
# followed by a masked assignment silently modified `pheno_pvalues` itself.
real_values = pheno_pvalues.clip(lower=threshold)

df = pd.DataFrame({'unif': unif_values, 'real': real_values})
display(df.shape)

In [None]:
df_plot = df

# Raw strings: '\l' in a normal string literal is an invalid escape sequence
# (a warning on modern Python). The resulting label text is unchanged.
# `plots.qqplot` is a project helper; it is assumed to return a matplotlib Axes,
# since set_xlim/set_title are called on its result.
ax = plots.qqplot(
    -np.log10(df_plot), 'unif', 'real',
    xlabel=r'Expected $-\log_{10}(p)$',
    ylabel=r'Observed $-\log_{10}(p)$',
    s=5, linewidth=0,
)
ax.set_xlim([0, 5])
ax.set_title(Trait(pheno_plain_name).get_pretty_name())

# Height

In [None]:
# List the trait columns whose name mentions "height".
# `spredixcan_genes_associations` is never defined in this notebook (NameError on
# a fresh run); the p-value matrix loaded above is `smultixcan_gene_associations`.
_trait_names = smultixcan_gene_associations.columns
_trait_names[_trait_names.str.lower().str.contains('height')]

In [None]:
pheno_plain_name = '50_raw-Standing_height'

In [None]:
# P-values of every gene for the selected trait, as a plain positional Series.
# Uses `smultixcan_gene_associations`: the previously referenced
# `spredixcan_genes_associations` is never defined in this notebook.
# NOTE(review): missing p-values are not dropped here -- confirm that
# plots.qqplot handles NaN as intended.
pheno_pvalues = pd.Series(smultixcan_gene_associations[pheno_plain_name].values.flatten())
display(pheno_pvalues.shape)

In [None]:
# Build the expected (uniform) vs observed p-value frame for the QQ plot.
# P-values are floored at `threshold` so that -log10 stays bounded.
threshold = 1e-30

unif_values = np.linspace(0.0001, 1, num=pheno_pvalues.shape[0])
unif_values = np.maximum(unif_values, threshold)

# clip() returns a new Series; the previous `real_values = pheno_pvalues`
# followed by a masked assignment silently modified `pheno_pvalues` itself.
real_values = pheno_pvalues.clip(lower=threshold)

df = pd.DataFrame({'unif': unif_values, 'real': real_values})
display(df.shape)

In [None]:
df_plot = df

# Raw strings: '\l' in a normal string literal is an invalid escape sequence
# (a warning on modern Python). The resulting label text is unchanged.
# `plots.qqplot` is a project helper; it is assumed to return a matplotlib Axes,
# since set_xlim/set_title are called on its result.
ax = plots.qqplot(
    -np.log10(df_plot), 'unif', 'real',
    xlabel=r'Expected $-\log_{10}(p)$',
    ylabel=r'Observed $-\log_{10}(p)$',
    s=5, linewidth=0,
)
ax.set_xlim([0, 5])
ax.set_title(Trait(pheno_plain_name).get_pretty_name())

# Schizophrenia

In [None]:
# List the trait columns whose name mentions "schizophrenia".
# `spredixcan_genes_associations` is never defined in this notebook (NameError on
# a fresh run); the p-value matrix loaded above is `smultixcan_gene_associations`.
_trait_names = smultixcan_gene_associations.columns
_trait_names[_trait_names.str.lower().str.contains('schizophrenia')]

In [None]:
pheno_plain_name = '20002_1289-Noncancer_illness_code_selfreported_schizophrenia'

In [None]:
# P-values of every gene for the selected trait, as a plain positional Series.
# Uses `smultixcan_gene_associations`: the previously referenced
# `spredixcan_genes_associations` is never defined in this notebook.
# NOTE(review): missing p-values are not dropped here -- confirm that
# plots.qqplot handles NaN as intended.
pheno_pvalues = pd.Series(smultixcan_gene_associations[pheno_plain_name].values.flatten())
display(pheno_pvalues.shape)

In [None]:
# Build the expected (uniform) vs observed p-value frame for the QQ plot.
# P-values are floored at `threshold` so that -log10 stays bounded.
threshold = 1e-30

unif_values = np.linspace(0.0001, 1, num=pheno_pvalues.shape[0])
unif_values = np.maximum(unif_values, threshold)

# clip() returns a new Series; the previous `real_values = pheno_pvalues`
# followed by a masked assignment silently modified `pheno_pvalues` itself.
real_values = pheno_pvalues.clip(lower=threshold)

df = pd.DataFrame({'unif': unif_values, 'real': real_values})
display(df.shape)

In [None]:
df_plot = df

# Raw strings: '\l' in a normal string literal is an invalid escape sequence
# (a warning on modern Python). The resulting label text is unchanged.
# `plots.qqplot` is a project helper; it is assumed to return a matplotlib Axes,
# since set_xlim/set_title are called on its result.
ax = plots.qqplot(
    -np.log10(df_plot), 'unif', 'real',
    xlabel=r'Expected $-\log_{10}(p)$',
    ylabel=r'Observed $-\log_{10}(p)$',
    s=5, linewidth=0,
)
ax.set_xlim([0, 5])
ax.set_title(Trait(pheno_plain_name).get_pretty_name())

# All p-values

In [None]:
# Build the expected (uniform) vs observed frame over ALL non-missing p-values.
# P-values are floored at `threshold` so that -log10 stays bounded.
threshold = 1e-30

unif_values = np.linspace(0.0001, 1, num=all_pvalues.shape[0])
unif_values = np.maximum(unif_values, threshold)

# clip() returns a new Series. The previous `real_values = all_pvalues` followed
# by a masked assignment overwrote `all_pvalues` in place, so re-running earlier
# cells (e.g. the describe() summaries) would have shown altered data.
real_values = all_pvalues.clip(lower=threshold)

df = pd.DataFrame({'unif': unif_values, 'real': real_values})

In [None]:
df_plot = df

# Raw strings: '\l' in a normal string literal is an invalid escape sequence
# (a warning on modern Python). The resulting label text is unchanged.
# `plots.qqplot` is a project helper; it is assumed to return a matplotlib Axes,
# since set_xlim is called on its result.
ax = plots.qqplot(
    -np.log10(df_plot), 'unif', 'real',
    xlabel=r'Expected $-\log_{10}(p)$',
    ylabel=r'Observed $-\log_{10}(p)$',
    s=5, linewidth=0,
)
ax.set_xlim([0, 5])