In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
#from tqdm import tqdm
from scipy import stats
from sklearn.metrics import pairwise_distances

import utils.constants as constants
# from src.data import PhenoInfo, PhenoResults, get_all_tissues, get_genes
from data.multixcan_data import MXPhenoInfo, MXPhenoResults
from utils.utils import is_number, chunker

In [3]:
genes_associations_dir = os.path.join(constants.PREPROCESSED_BASED_DIR, 'gene_associations')
smultixcan_gene_association_dirs = os.path.join(genes_associations_dir, 'en_not_fixed')

output_dir = os.path.join(constants.RESULTS_DIR, 'roc_validation', 'classifier_tables')
os.makedirs(output_dir, exist_ok=True)

# Load metadata

In [4]:
with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

# Load MultiXcan associations

In [8]:
genes_associations_filename = os.path.join(smultixcan_gene_association_dirs, 'smultixcan-genes_associations-zscores.pkl.xz')
display(genes_associations_filename)

genes_associations = pd.read_pickle(genes_associations_filename)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/en_not_fixed/smultixcan-genes_associations-zscores.pkl.xz'

In [9]:
genes_associations.isin([np.inf, -np.inf]).any().any()

True

In [10]:
#min_pvals = pd.Series(spredixcan_genes_associations.values.flatten())
max_zscores = pd.Series(genes_associations.values.flatten())

In [11]:
_tmp = max_zscores.sort_values(ascending=False)

In [13]:
display(_tmp[~np.isinf(_tmp)].head())

20904547    37.691094
68870300    37.681572
61904608    37.601920
61616379    37.587317
59310196    37.577470
dtype: float64

In [14]:
# replace inf
genes_associations = genes_associations.replace(np.inf, 40)

assert not genes_associations.isin([np.inf, -np.inf]).any().any()

In [15]:
assert (genes_associations >= 0).all().all()

In [16]:
display(genes_associations.shape)
display(genes_associations.head())

(19910, 4077)

Unnamed: 0_level_0,20003_1141171932-Treatmentmedication_code_levetiracetam,F99-Diagnoses_main_ICD10_F99_Mental_disorder_not_otherwise_specified,4674-Private_healthcare,100670-White_wine_intake,6150_2-Vascularheart_problems_diagnosed_by_doctor_Angina,D23-Diagnoses_main_ICD10_D23_Other_benign_neoplasms_of_skin,20003_1140861090-Treatmentmedication_code_adalat_5mg_capsule,20003_99999-Treatmentmedication_code_Freetext_entry_unable_to_be_coded,20003_1140865872-Treatmentmedication_code_magnesium_citrate,22601_81172834-Job_coding_metal_making_or_metal_treating_process_worker_machine_operator_furnaceman,...,22617_3433-Job_SOC_coding_Public_relations_officers,6070_1-OCT_measured_right_Measurable,I9_IHD-Ischaemic_heart_disease_wide_definition,I9_ARTEMBTHR-Arterial_embolism_and_thrombosis,K25-Diagnoses_main_ICD10_K25_Gastric_ulcer,20110_8-Illnesses_of_mother_High_blood_pressure,M13_LATERALEPICOND-Lateral_epicondylitis,DM_RETINOPAT_NOS-Unclassified_diabetic_retinopathy,20003_1141189094-Treatmentmedication_code_avandamet_1mg_500mg_tablet,20118_5-Home_area_population_density_urban_or_rural_EnglandWales_Urban_less_sparse
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.397276,0.604616,0.775742,1.436307,1.412375,1.600041,0.028202,0.067765,0.40245,0.131819,...,1.999547,1.127596,0.35693,0.824782,0.082111,0.836645,0.622258,2.076279,0.688793,2.08774
ENSG00000000457,0.319545,2.279867,0.110495,1.164098,0.505677,1.277273,0.410254,0.838215,0.092569,2.099556,...,0.392008,0.354665,0.66534,0.961071,0.679878,2.025639,0.74599,0.476264,0.362441,2.131233
ENSG00000000460,0.015547,2.091471,0.068122,2.869138,4.393092,0.159562,0.062802,0.342448,0.164821,1.794037,...,0.490251,0.42036,2.021611,0.05123,0.898618,1.938927,2.192165,0.802752,2.289388,3.226133
ENSG00000000938,0.096502,0.736958,2.556985,0.117943,3.601362,0.804478,0.280109,0.552019,0.242493,0.124827,...,0.297255,1.215982,4.066878,0.578105,1.536092,2.17154,0.032683,0.334689,0.245518,0.203841
ENSG00000001036,1.668204,0.03811,0.534008,0.136528,0.112277,0.330241,0.189969,0.33502,0.403565,0.771542,...,0.180306,2.395588,0.18654,0.790372,0.136059,2.470515,1.457462,0.151217,0.06206,0.666584


# Load PheWAS catalog

In [17]:
phewas_catalog = pd.read_csv(os.path.join(constants.DATA_DIR, 'phewas-catalog.csv'), dtype={'phewas code': str})

In [18]:
phewas_catalog.shape

(215107, 9)

In [19]:
phewas_catalog[phewas_catalog['phewas code'].isna()].head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations


In [20]:
phewas_catalog[phewas_catalog['gene_name'].isna()].head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations
41,4 111710169,rs2200733,Atrial fibrillation,1950,1.527e-10,1.517,,427.21,"Atrial fibrillation, Atrial fibrillation/atria..."
49,4 111710169,rs2200733,Atrial fibrillation & flutter,2041,1.019e-09,1.481,,427.2,"Atrial fibrillation, Atrial fibrillation/atria..."
98,4,rs4698036,Gout,769,7.803e-08,0.6839,,274.1,Serum uric acid
108,4,rs4698036,Gout and other crystal arthropathies,904,1.99e-07,0.7132,,274.0,Serum uric acid
115,8 128485038,rs1447295,Prostate cancer,848,2.758e-07,1.606,,185.0,Prostate cancer


In [21]:
phewas_catalog[phewas_catalog['gene_name'].isna()].shape

(52140, 9)

In [22]:
phewas_catalog = phewas_catalog.dropna(subset=['gene_name', 'phewas code'])

In [23]:
phewas_catalog.shape

(162967, 9)

In [24]:
phewas_catalog['gene_name'].unique().shape

(1775,)

In [25]:
phewas_catalog['phewas code'].unique().shape

(1358,)

In [26]:
phewas_catalog = phewas_catalog.assign(gene_id=phewas_catalog['gene_name'].apply(lambda x: genes_mapping_1[x] if x in genes_mapping_1 else None))

In [27]:
phewas_catalog = phewas_catalog.dropna(subset=['gene_name', 'gene_id', 'phewas code'])

In [28]:
phewas_catalog.shape

(147970, 10)

In [29]:
phewas_catalog.head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,gene_id
0,19 45395619,rs2075650,Alzheimer's disease,737,5.237e-28,2.41,TOMM40,290.11,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204
1,19 45395619,rs2075650,Dementias,1170,2.409e-26,2.114,TOMM40,290.1,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204
2,6 396321,rs12203592,Actinic keratosis,2505,4.1409999999999996e-26,1.691,IRF4,702.1,"Eye color, Hair color, Freckling, Progressive ...",ENSG00000137265
3,6 26093141,rs1800562,Iron metabolism disorder,40,3.409e-25,12.27,HFE,275.1,"Mean corpuscular hemoglobin, Glycated hemoglob...",ENSG00000010704
4,19 45395619,rs2075650,Delirium dementia and amnestic disorders,1566,8.027e-24,1.841,TOMM40,290.0,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204


In [30]:
phewas_catalog.sort_values('phewas phenotype').head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,gene_id
35306,10,rs7923609,ASCVD,166,0.008094,1.361,JMJD1C,414.2,Alkaline phosphatase,ENSG00000171988
154790,22,rs1012068,ASCVD,166,0.03597,1.292,DEPDC5,414.2,Chronic Hepatitis C infection,ENSG00000100150
72358,5 158814533,rs10045431,ASCVD,166,0.01674,0.7242,IL12B,414.2,Crohn's disease,ENSG00000113302
130720,14 87896435,rs17124581,ASCVD,166,0.03037,1.609,SPATA7,414.2,Cognitive performance,ENSG00000042317
184453,6 31912648,rs429608,ASCVD,166,0.04284,1.344,SKIV2L,414.2,Age-related macular degeneration,ENSG00000204351


# Genes in common

In [31]:
shared_gene_ids = \
    set(phewas_catalog['gene_id'].values)\
    .intersection(genes_associations.index)

In [32]:
len(shared_gene_ids)

1483

# HPO to MIM

In [33]:
hpo_to_mim = pd.read_csv(os.path.join(constants.DATA_DIR, 'hpo-to-omim-and-phecode.csv'), dtype={'phecode': str})

In [34]:
hpo_to_mim.shape

(84031, 10)

In [35]:
hpo_to_mim.head()

Unnamed: 0,term_id,name,match_available,phecode,phecode string,match_type,class,dID,disease_name,modifier
0,28,Cryptorchidism,1,751.12,Congenital anomalies of male genital organs,General,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
1,49,Shawl scrotum,1,751.12,Congenital anomalies of male genital organs,General,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
2,175,Cleft palate,1,749.1,Cleft palate,Exact,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
3,202,Oral cleft,1,749.1,Cleft palate,Broader,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
4,204,Cleft upper lip,1,749.1,Cleft palate,Broader,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O


# Load silver standard to map from UKB to MIM

In [36]:
omim_silver_standard = pd.read_csv(os.path.join(constants.DATA_DIR, 'omim_silver_standard.tsv'), sep='\t')

In [37]:
ukb_to_mim_map = omim_silver_standard[['trait', 'pheno_mim']].dropna()

In [38]:
ukb_to_mim_map.shape

(7822, 2)

In [39]:
ukb_to_mim_map.head()

Unnamed: 0,trait,pheno_mim
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145


# Read gwas2gene (Yanyu) results

In [40]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [41]:
gwas2gene_results_dir = '/mnt/phenomexcan/results/roc_validation/ukb_gwas2gene_results_omim_silver_standard/'

In [42]:
readRDS = robjects.r['readRDS']

In [43]:
f_files = glob(os.path.join(gwas2gene_results_dir, '*.rds'))
display(len(f_files))

if len(f_files) != len(omim_silver_standard['trait'].unique()):
    print(f'WARNING: some files are not there. {len(omim_silver_standard.trait.unique())} expected, {len(f_files)} found.')

99



In [44]:
gwas2genes_results = {}

for f in f_files:
    f_base = os.path.basename(f)
    f_code = f_base.split('.')[0]
    
    #print(f_base)
    rds_contents = readRDS(f)
    
    if len(rds_contents[1]) > 0:
        f_gene_list = list(rds_contents[1][0].iter_labels())
    else:
        print(f'{f_code}: empty')
        f_gene_list = []
    
    gwas2genes_results[f_code] = f_gene_list

In [45]:
gwas2gene_all_genes = []

for k in gwas2genes_results.keys():
    gwas2gene_all_genes.extend(gwas2genes_results[k])

display(len(gwas2gene_all_genes))

gwas2gene_all_genes = set(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

gwas2gene_all_genes = shared_gene_ids.intersection(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

20837

10185

905

In [46]:
pd.Series(list(gwas2gene_all_genes)).head()

0    ENSG00000129038
1    ENSG00000072401
2    ENSG00000144560
3    ENSG00000204410
4    ENSG00000125741
dtype: object

# Universe

In [47]:
from clustering.biclustering.analysis import Trait

In [48]:
_ukb_traits = []
_ukb_traits_phecodes = []
_ukb_gene_available = []

for t in ukb_to_mim_map['trait'].unique():
    t_code = Trait(t).trait_code
    if t_code not in gwas2genes_results:
        print(t_code)
        continue
    
    for g in gwas2genes_results[t_code]:
        _ukb_traits.append(t)
        _ukb_gene_available.append(g)

M13
I95
I71
20002_1264
20002_1081
22130
C80
20002_1538


In [49]:
df = pd.DataFrame({'trait': _ukb_traits, 'gene': _ukb_gene_available})

In [50]:
df.shape

(20837, 2)

In [51]:
df.head()

Unnamed: 0,trait,gene
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263


# Add MIM/Phecode

In [52]:
# add mim
_tmp = pd.merge(df, ukb_to_mim_map, on='trait', how='inner')
display(_tmp.shape)
display(_tmp.head())

(1506780, 3)

Unnamed: 0,trait,gene,pheno_mim
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,102500
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,105830
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,108120
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,108145


In [53]:
_tmp[_tmp['pheno_mim'].isna()].shape

(0, 3)

In [54]:
# mim to phecode
_tmp = pd.merge(_tmp, hpo_to_mim[['phecode', 'dID']], left_on='pheno_mim', right_on='dID', how='inner').drop(columns=['dID'])
display(_tmp.shape)
display(_tmp.head())

(28272396, 4)

Unnamed: 0,trait,gene,pheno_mim,phecode
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,


In [55]:
_tmp[_tmp['phecode'].isna()].shape

(4377439, 4)

In [56]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,


In [57]:
# phecode to phewas catalog
_tmp = pd.merge(_tmp, phewas_catalog[['phewas code', 'gene_id']],
                left_on=['phecode', 'gene'], right_on=['phewas code', 'gene_id'],
                how='left').drop(columns=['phewas code'])
display(_tmp.shape)

(28293613, 5)

In [58]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode,gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1,
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1,
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2,
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,,


In [59]:
_tmp[_tmp['gene_id'].isna()].shape

(28124727, 5)

In [60]:
_tmp = _tmp.drop_duplicates(subset=['trait', 'phecode', 'gene_id'])

In [61]:
_tmp.shape

(55926, 5)

In [62]:
_tmp[_tmp['gene_id'].isna()].shape

(20873, 5)

In [63]:
_tmp = _tmp.assign(true_class=_tmp['gene_id'].apply(lambda x: int(not pd.isnull(x))))

In [64]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode,gene_id,true_class
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,,0
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1,,0
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1,,0
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2,,0
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,,,0


In [65]:
_tmp['true_class'].value_counts()

1    35053
0    20873
Name: true_class, dtype: int64

### Add score

In [66]:
_genes_unstacked = genes_associations.unstack()

In [67]:
_genes_unstacked.shape

(81173070,)

In [68]:
_genes_unstacked.head()

                                                         gene_name      
20003_1141171932-Treatmentmedication_code_levetiracetam  ENSG00000000419    0.397276
                                                         ENSG00000000457    0.319545
                                                         ENSG00000000460    0.015547
                                                         ENSG00000000938    0.096502
                                                         ENSG00000001036    1.668204
dtype: float64

In [69]:
classifier_table = _tmp.set_index(['trait', 'gene']).assign(score=_genes_unstacked).drop(columns=['gene_id'])

In [70]:
classifier_table.shape

(55926, 4)

In [71]:
classifier_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,true_class,score
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,0,
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1,0,
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1,0,
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2,0,
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,,0,


In [72]:
classifier_table[classifier_table['score'].isna()].shape

(3036, 4)

In [73]:
classifier_table = classifier_table.dropna(subset=['phecode'])

In [74]:
classifier_table = classifier_table.dropna(subset=['score'])

In [75]:
classifier_table.shape

(52799, 4)

In [76]:
N_TESTS = classifier_table.reset_index().drop_duplicates(subset=['trait', 'gene']).shape[0]
display(N_TESTS)

PVALUE_THRESHOLD = (0.05 / (N_TESTS))
display(PVALUE_THRESHOLD)

ZSCORE_THRESHOLD = np.abs(stats.norm.ppf(PVALUE_THRESHOLD / 2))
display(ZSCORE_THRESHOLD)

2206

2.2665457842248415e-05

4.236872674662613

In [77]:
classifier_table = classifier_table.assign(predicted_class=(classifier_table['score'] > ZSCORE_THRESHOLD).astype(int))

# Save classifier table

In [78]:
classifier_table = classifier_table.sort_index()

In [79]:
classifier_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,true_class,score,predicted_class
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,296.2,0,5.467206,1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,292.6,0,5.467206,1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,300.1,0,5.467206,1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,401.0,0,5.467206,1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,573.5,0,5.467206,1


In [80]:
classifier_table.shape

(52799, 5)

In [81]:
classifier_table.to_csv(
    os.path.join(output_dir, 'smultixcan-en_not_fixed-classifier_data-phewas_catalog.tsv.gz'),
    sep='\t', index=False
)