In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
#from tqdm import tqdm
from scipy import stats
from sklearn.metrics import pairwise_distances

import utils.constants as constants
# from src.data import PhenoInfo, PhenoResults, get_all_tissues, get_genes
from data.multixcan_data import MXPhenoInfo, MXPhenoResults
from utils.utils import is_number, chunker

In [3]:
genes_associations_dir = os.path.join(constants.PREPROCESSED_BASED_DIR, 'gene_associations')
smultixcan_gene_association_dirs = os.path.join(genes_associations_dir, 'mashr')

output_dir = os.path.join(constants.RESULTS_DIR, 'roc_validation', 'classifier_tables')
os.makedirs(output_dir, exist_ok=True)

# Load metadata

In [4]:
with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

# Load MultiXcan associations

In [5]:
genes_associations_filename = os.path.join(smultixcan_gene_association_dirs, 'smultixcan-genes_associations-zscores.pkl.xz')
display(genes_associations_filename)

genes_associations = pd.read_pickle(genes_associations_filename)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/smultixcan-genes_associations-zscores.pkl.xz'

In [6]:
# replace inf
genes_associations = genes_associations.replace(np.inf, 40)

assert not genes_associations.isin([np.inf, -np.inf]).any().any()

In [7]:
assert (genes_associations > 0).all().all()

In [8]:
display(genes_associations.shape)
display(genes_associations.head())

(22255, 4083)

Unnamed: 0_level_0,L12_EPIDERMALTHICKOTH-Other_epidermal_thickening,O42-Diagnoses_main_ICD10_O42_Premature_rupture_of_membranes,20002_1077-Noncancer_illness_code_selfreported_heart_arrhythmia,20445-Depression_possibly_related_to_childbirth,20077-Number_of_diet_questionnaires_completed,22601_91392832-Job_coding_other_work_in_this_industry_factory_hand_mate_assistant_handler_loader,I9_VTE-Venous_thromboembolism,22617_1161-Job_SOC_coding_Transport_and_distribution_managers,20002_1460-Noncancer_illness_code_selfreported_rectal_or_colon_adenomapolyps,5181-Ever_had_eye_surgery,...,20090_394-Type_of_fatoil_used_in_cooking_Unknown_soft_margarine,22617_3512-Job_SOC_coding_Aircraft_pilots_and_flight_engineers,6034-Target_heart_rate_achieved,20003_1140883066-Treatmentmedication_code_insulin_product,22601_41223241-Job_coding_accounts_and_wages_clerkassistantsupervisor_bookkeeper_cost_or_ledger_clerk_audit_assistant_budget_officer_student_loans_officer_paymaster,I82-Diagnoses_main_ICD10_I82_Other_venous_embolism_and_thrombosis,20107_12-Illnesses_of_father_Severe_depression,B07-Diagnoses_main_ICD10_B07_Viral_warts,22601_12253140-Job_coding_sports_centre_manager_riding_school_owner_sports_ground_manager_baths_manager,2664_2-Reason_for_reducing_amount_of_alcohol_drunk_Doctors_advice
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.118542,1.185152,0.506195,0.852355,0.604068,1.014925,0.874812,1.995403,1.226674,2.401157,...,0.950455,0.770204,0.788341,0.941923,1.470308,0.663953,0.270128,0.01834,2.068068,0.693219
ENSG00000000457,0.52017,1.062542,1.306725,0.989147,1.981863,0.389948,1.558053,0.780973,0.583048,0.039883,...,0.104441,0.474818,0.005264,1.766256,0.008613,0.539178,1.340549,0.745059,1.091906,0.11291
ENSG00000000460,0.181827,0.454945,0.422742,0.212844,1.121509,0.594248,3.425656,0.828176,0.877009,1.024421,...,0.181627,0.614678,1.237589,0.212545,1.021029,0.298612,1.556284,0.44378,0.261719,0.060068
ENSG00000000938,0.289141,0.212541,1.170981,0.735132,0.074349,0.626647,2.42166,2.716722,0.655375,0.536942,...,0.290044,0.717265,0.368518,1.496889,0.045447,0.018784,1.238725,0.876929,0.675103,3.171424
ENSG00000000971,0.465188,1.051131,0.797415,0.382712,1.04571,1.470092,1.435553,0.311438,1.213439,1.411339,...,1.691603,3.562145,2.040159,0.281551,0.933999,0.940077,0.14434,0.594995,0.00031,0.411814


# Load PheWAS catalog

In [9]:
phewas_catalog = pd.read_csv(os.path.join(constants.DATA_DIR, 'phewas-catalog.csv'), dtype={'phewas code': str})

In [10]:
phewas_catalog.shape

(215107, 9)

In [11]:
phewas_catalog[phewas_catalog['phewas code'].isna()].head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations


In [12]:
phewas_catalog[phewas_catalog['gene_name'].isna()].head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations
41,4 111710169,rs2200733,Atrial fibrillation,1950,1.527e-10,1.517,,427.21,"Atrial fibrillation, Atrial fibrillation/atria..."
49,4 111710169,rs2200733,Atrial fibrillation & flutter,2041,1.019e-09,1.481,,427.2,"Atrial fibrillation, Atrial fibrillation/atria..."
98,4,rs4698036,Gout,769,7.803e-08,0.6839,,274.1,Serum uric acid
108,4,rs4698036,Gout and other crystal arthropathies,904,1.99e-07,0.7132,,274.0,Serum uric acid
115,8 128485038,rs1447295,Prostate cancer,848,2.758e-07,1.606,,185.0,Prostate cancer


In [13]:
phewas_catalog[phewas_catalog['gene_name'].isna()].shape

(52140, 9)

In [14]:
phewas_catalog = phewas_catalog.dropna(subset=['gene_name', 'phewas code'])

In [15]:
phewas_catalog.shape

(162967, 9)

In [16]:
phewas_catalog['gene_name'].unique().shape

(1775,)

In [17]:
phewas_catalog['phewas code'].unique().shape

(1358,)

In [18]:
phewas_catalog = phewas_catalog.assign(gene_id=phewas_catalog['gene_name'].apply(lambda x: genes_mapping_1[x] if x in genes_mapping_1 else None))

In [19]:
phewas_catalog = phewas_catalog.dropna(subset=['gene_name', 'gene_id', 'phewas code'])

In [20]:
phewas_catalog.shape

(147970, 10)

In [21]:
phewas_catalog.head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,gene_id
0,19 45395619,rs2075650,Alzheimer's disease,737,5.237e-28,2.41,TOMM40,290.11,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204
1,19 45395619,rs2075650,Dementias,1170,2.409e-26,2.114,TOMM40,290.1,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204
2,6 396321,rs12203592,Actinic keratosis,2505,4.1409999999999996e-26,1.691,IRF4,702.1,"Eye color, Hair color, Freckling, Progressive ...",ENSG00000137265
3,6 26093141,rs1800562,Iron metabolism disorder,40,3.409e-25,12.27,HFE,275.1,"Mean corpuscular hemoglobin, Glycated hemoglob...",ENSG00000010704
4,19 45395619,rs2075650,Delirium dementia and amnestic disorders,1566,8.027e-24,1.841,TOMM40,290.0,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204


In [22]:
phewas_catalog.sort_values('phewas phenotype').head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,gene_id
35306,10,rs7923609,ASCVD,166,0.008094,1.361,JMJD1C,414.2,Alkaline phosphatase,ENSG00000171988
154790,22,rs1012068,ASCVD,166,0.03597,1.292,DEPDC5,414.2,Chronic Hepatitis C infection,ENSG00000100150
72358,5 158814533,rs10045431,ASCVD,166,0.01674,0.7242,IL12B,414.2,Crohn's disease,ENSG00000113302
130720,14 87896435,rs17124581,ASCVD,166,0.03037,1.609,SPATA7,414.2,Cognitive performance,ENSG00000042317
184453,6 31912648,rs429608,ASCVD,166,0.04284,1.344,SKIV2L,414.2,Age-related macular degeneration,ENSG00000204351


# Genes in common

In [23]:
shared_gene_ids = \
    set(phewas_catalog['gene_id'].values)\
    .intersection(genes_associations.index)

In [24]:
len(shared_gene_ids)

1589

# HPO to MIM

In [25]:
hpo_to_mim = pd.read_csv(os.path.join(constants.DATA_DIR, 'hpo-to-omim-and-phecode.csv'), dtype={'phecode': str})

In [26]:
hpo_to_mim.shape

(84031, 10)

In [27]:
hpo_to_mim.head()

Unnamed: 0,term_id,name,match_available,phecode,phecode string,match_type,class,dID,disease_name,modifier
0,28,Cryptorchidism,1,751.12,Congenital anomalies of male genital organs,General,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
1,49,Shawl scrotum,1,751.12,Congenital anomalies of male genital organs,General,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
2,175,Cleft palate,1,749.1,Cleft palate,Exact,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
3,202,Oral cleft,1,749.1,Cleft palate,Broader,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
4,204,Cleft upper lip,1,749.1,Cleft palate,Broader,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O


# Load silver standard to map from UKB to MIM

In [28]:
omim_silver_standard = pd.read_csv(os.path.join(constants.DATA_DIR, 'omim_silver_standard.tsv'), sep='\t')

In [29]:
ukb_to_mim_map = omim_silver_standard[['trait', 'pheno_mim']].dropna()

In [30]:
ukb_to_mim_map.shape

(7822, 2)

In [31]:
ukb_to_mim_map.head()

Unnamed: 0,trait,pheno_mim
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145


# Read gwas2gene (Yanyu) results

In [32]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [33]:
gwas2gene_results_dir = '/mnt/phenomexcan/results/roc_validation/ukb_gwas2gene_results_omim_silver_standard/'

In [34]:
readRDS = robjects.r['readRDS']

In [35]:
f_files = glob(os.path.join(gwas2gene_results_dir, '*.rds'))
display(len(f_files))

if len(f_files) != len(omim_silver_standard['trait'].unique()):
    print(f'WARNING: some files are not there. {len(omim_silver_standard.trait.unique())} expected, {len(f_files)} found.')

99



In [36]:
gwas2genes_results = {}

for f in f_files:
    f_base = os.path.basename(f)
    f_code = f_base.split('.')[0]
    
    #print(f_base)
    rds_contents = readRDS(f)
    
    if len(rds_contents[1]) > 0:
        f_gene_list = list(rds_contents[1][0].iter_labels())
    else:
        print(f'{f_code}: empty')
        f_gene_list = []
    
    gwas2genes_results[f_code] = f_gene_list

In [37]:
gwas2gene_all_genes = []

for k in gwas2genes_results.keys():
    gwas2gene_all_genes.extend(gwas2genes_results[k])

display(len(gwas2gene_all_genes))

gwas2gene_all_genes = set(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

gwas2gene_all_genes = shared_gene_ids.intersection(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

20837

10185

956

In [38]:
pd.Series(list(gwas2gene_all_genes)).head()

0    ENSG00000152254
1    ENSG00000145246
2    ENSG00000170011
3    ENSG00000204264
4    ENSG00000204463
dtype: object

# Universe

In [39]:
from clustering.biclustering.analysis import Trait

In [40]:
_ukb_traits = []
_ukb_traits_phecodes = []
_ukb_gene_available = []

for t in ukb_to_mim_map['trait'].unique():
    t_code = Trait(t).trait_code
    if t_code not in gwas2genes_results:
        print(t_code)
        continue
    
    for g in gwas2genes_results[t_code]:
        _ukb_traits.append(t)
        _ukb_gene_available.append(g)

M13
I95
I71
20002_1264
20002_1081
22130
C80
20002_1538


In [41]:
df = pd.DataFrame({'trait': _ukb_traits, 'gene': _ukb_gene_available})

In [42]:
df.shape

(20837, 2)

In [48]:
df.head()

Unnamed: 0,trait,gene
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263


# Add MIM/Phecode

In [49]:
# add mim
_tmp = pd.merge(df, ukb_to_mim_map, on='trait', how='inner')
display(_tmp.shape)
display(_tmp.head())

(1506780, 3)

Unnamed: 0,trait,gene,pheno_mim
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,102500
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,105830
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,108120
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,108145


In [50]:
_tmp[_tmp['pheno_mim'].isna()].shape

(0, 3)

In [51]:
# mim to phecode
_tmp = pd.merge(_tmp, hpo_to_mim[['phecode', 'dID']].dropna(), left_on='pheno_mim', right_on='dID', how='inner').drop(columns=['dID'])
display(_tmp.shape)
display(_tmp.head())

(23894957, 4)

Unnamed: 0,trait,gene,pheno_mim,phecode
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3


In [52]:
_tmp[_tmp['phecode'].isna()].shape

(0, 4)

In [53]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3


In [54]:
# phecode to phewas catalog
_tmp = pd.merge(_tmp, phewas_catalog[['phewas code', 'gene_id']],
                left_on=['phecode', 'gene'], right_on=['phewas code', 'gene_id'],
                how='left').drop(columns=['phewas code'])
display(_tmp.shape)

(23916174, 5)

In [68]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode,gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1,
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1,
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2,
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3,


In [69]:
_tmp[_tmp['gene_id'].isna()].shape

(23747288, 5)

In [70]:
_tmp = _tmp.drop_duplicates(subset=['trait', 'gene', 'gene_id'])

In [71]:
_tmp.shape

(23044, 5)

In [72]:
_tmp[_tmp['gene_id'].isna()].shape

(20837, 5)

In [77]:
_tmp = _tmp.assign(true_class=_tmp['gene_id'].apply(lambda x: int(not pd.isnull(x))))

In [78]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode,gene_id,true_class
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,,0
29,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089,101800,751.12,,0
58,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336,101800,751.12,,0
87,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401,101800,751.12,,0
116,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263,101800,751.12,,0


In [79]:
_tmp['true_class'].value_counts()

0    20837
1     2207
Name: true_class, dtype: int64

In [80]:
_tmp['true_class'].value_counts().sum()

23044

In [81]:
20837 / 23044

0.9042266967540358

In [82]:
2207 / 23044

0.09577330324596424

### Add score

In [83]:
_genes_unstacked = genes_associations.unstack()

In [84]:
_genes_unstacked.shape

(90867165,)

In [85]:
_genes_unstacked.head()

                                                  gene_name      
L12_EPIDERMALTHICKOTH-Other_epidermal_thickening  ENSG00000000419    0.118542
                                                  ENSG00000000457    0.520170
                                                  ENSG00000000460    0.181827
                                                  ENSG00000000938    0.289141
                                                  ENSG00000000971    0.465188
dtype: float64

In [86]:
classifier_table = _tmp.set_index(['trait', 'gene']).assign(score=_genes_unstacked).drop(columns=['gene_id'])

In [87]:
classifier_table.shape

(23044, 4)

In [88]:
classifier_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,true_class,score
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,0,0.643603
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089,101800,751.12,0,0.059318
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336,101800,751.12,0,0.843093
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401,101800,751.12,0,3.236173
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263,101800,751.12,0,0.474822


In [89]:
classifier_table[classifier_table['score'].isna()].shape

(1721, 4)

In [90]:
classifier_table = classifier_table.dropna(subset=['phecode'])

In [91]:
classifier_table = classifier_table.dropna(subset=['score'])

In [92]:
classifier_table.shape

(21323, 4)

In [93]:
N_TESTS = classifier_table.reset_index().drop_duplicates(subset=['trait', 'gene']).shape[0]
display(N_TESTS)

PVALUE_THRESHOLD = (0.05 / (N_TESTS))
display(PVALUE_THRESHOLD)

ZSCORE_THRESHOLD = np.abs(stats.norm.ppf(PVALUE_THRESHOLD / 2))
display(ZSCORE_THRESHOLD)

19119

2.615199539724881e-06

4.69893633607862

In [94]:
classifier_table = classifier_table.assign(predicted_class=(classifier_table['score'] > ZSCORE_THRESHOLD).astype(int))

In [95]:
classifier_table['true_class'].value_counts()

0    19119
1     2204
Name: true_class, dtype: int64

In [100]:
classifier_table['true_class'].value_counts().sum()

21323

# Save classifier table

In [96]:
classifier_table = classifier_table.sort_index()

In [97]:
classifier_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,true_class,score,predicted_class
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,296.2,0,5.986574,1
1200-Sleeplessness_insomnia,ENSG00000003756,121300,296.2,0,5.540858,1
1200-Sleeplessness_insomnia,ENSG00000004534,121300,296.2,0,5.270682,1
1200-Sleeplessness_insomnia,ENSG00000004838,121300,296.2,0,1.168578,0
1200-Sleeplessness_insomnia,ENSG00000004897,121300,296.2,0,0.042414,0


In [98]:
classifier_table.shape

(21323, 5)

In [99]:
classifier_table.to_csv(
    os.path.join(output_dir, 'smultixcan-mashr-classifier_data-phewas_catalog.tsv.gz'),
    sep='\t', index=False
)