In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
#from tqdm import tqdm
from scipy import stats
from sklearn.metrics import pairwise_distances

import utils.constants as constants
# from src.data import PhenoInfo, PhenoResults, get_all_tissues, get_genes
from data.multixcan_data import MXPhenoInfo, MXPhenoResults
from utils.utils import is_number, chunker

In [3]:
genes_associations_dir = os.path.join(constants.PREPROCESSED_BASED_DIR, 'gene_associations')
smultixcan_gene_association_dirs = os.path.join(genes_associations_dir, 'mashr')

output_dir = os.path.join(constants.RESULTS_DIR, 'roc_validation', 'classifier_tables')
os.makedirs(output_dir, exist_ok=True)

# Load metadata

In [4]:
with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

# Load fastENLOC results

In [5]:
genes_associations_filename = os.path.join(genes_associations_dir, 'fastenloc', 'fastenloc-torus-genes_rcp.pkl.xz')
display(genes_associations_filename)

genes_associations = pd.read_pickle(genes_associations_filename)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/fastenloc/fastenloc-torus-genes_rcp.pkl.xz'

In [6]:
genes_associations = genes_associations.rename(index=genes_mapping_1)

In [7]:
genes_associations.shape

(22233, 4083)

In [8]:
genes_associations.head(5)

Unnamed: 0_level_0,L12_NAILDIS-Nail_disorders,J69-Diagnoses_main_ICD10_J69_Pneumonitis_due_to_solids_and_liquids,R07-Diagnoses_main_ICD10_R07_Pain_in_throat_and_chest,M46-Diagnoses_main_ICD10_M46_Other_inflammatory_spondylopathies,20003_2038460150-Treatmentmedication_code_paracetamol,E10-Diagnoses_main_ICD10_E10_Insulindependent_diabetes_mellitus,5134_raw-6mm_strong_meridian_left,E86-Diagnoses_main_ICD10_E86_Volume_depletion,L12_ERYTHEMATOUSOTH-Other_erythematous_conditions,K83-Diagnoses_main_ICD10_K83_Other_diseases_of_biliary_tract,...,20090_394-Type_of_fatoil_used_in_cooking_Unknown_soft_margarine,22617_3512-Job_SOC_coding_Aircraft_pilots_and_flight_engineers,6034-Target_heart_rate_achieved,20003_1140883066-Treatmentmedication_code_insulin_product,22601_41223241-Job_coding_accounts_and_wages_clerkassistantsupervisor_bookkeeper_cost_or_ledger_clerk_audit_assistant_budget_officer_student_loans_officer_paymaster,I82-Diagnoses_main_ICD10_I82_Other_venous_embolism_and_thrombosis,20107_12-Illnesses_of_father_Severe_depression,B07-Diagnoses_main_ICD10_B07_Viral_warts,22601_12253140-Job_coding_sports_centre_manager_riding_school_owner_sports_ground_manager_baths_manager,2664_2-Reason_for_reducing_amount_of_alcohol_drunk_Doctors_advice
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000121410,1.5153e-08,2.216225e-08,0.005937,5.48296e-10,0.006081,0.000458,0.006905,2.241375e-10,1.7531e-09,2.7742e-05,...,0.000292,1.49871e-10,6.837e-06,0.000714,1.50342e-10,5.317725e-08,0.0005242997,1.807621e-08,7.5456e-08,0.000154
ENSG00000148584,2.796517e-11,1.696862e-10,0.00197,1.021794e-10,0.010776,6e-06,0.00585,7.8333e-11,2.507511e-10,3.895151e-10,...,0.000225,1.869519e-10,4.08106e-11,0.00027,2.78311e-11,1.083806e-10,3.367239e-07,5.118762e-11,1.050704e-10,0.000449
ENSG00000175899,1.92698e-08,7.8582e-09,0.000947,5.466e-11,0.003225,2.5e-05,0.003633,4.74549e-10,5.6103e-10,9.63365e-06,...,8.1e-05,1.60843e-10,3.14128e-06,0.000401,1.096541e-10,3.252662e-08,0.00056958,1.128942e-08,1.132e-08,0.00013
ENSG00000166535,1.14227e-09,3.4926e-09,0.000428,1.4761e-10,0.004078,0.000247,0.018624,3.0557e-10,3.5895e-10,3.79213e-06,...,0.000197,2.7592e-10,7.272498e-06,0.000148,1.64236e-11,8.91271e-08,0.000874904,1.3268e-10,2.2358e-09,5e-06
ENSG00000184389,1.99748e-08,5.515e-10,0.002141,2.41272e-09,0.004778,0.0002,0.14135,1.314787e-10,1.59708e-09,2.149724e-05,...,0.000311,2.357997e-10,1.82613e-09,0.000989,6.6555e-11,1.471335e-10,0.0006330287,4.799799e-09,1.11504e-07,5e-05


In [9]:
assert genes_associations.isna().sum().sum() == 0

In [10]:
assert not genes_associations.isin([np.inf, -np.inf]).any().any()

In [11]:
_all_values = pd.Series(genes_associations.values.flatten())

In [12]:
_tmp = _all_values.sort_values(ascending=True)

In [13]:
display(_tmp.head())

63673277    0.0
13822236    0.0
13822237    0.0
13822238    0.0
13822239    0.0
dtype: float64

In [14]:
_tmp = _all_values.sort_values(ascending=False)

In [15]:
display(_tmp.head())

84487107    2.661831
29863101    2.531200
41614643    2.484566
53648952    2.129925
53649340    2.116626
dtype: float64

# Load PheWAS catalog

In [16]:
phewas_catalog = pd.read_csv(os.path.join(constants.DATA_DIR, 'phewas-catalog.csv'), dtype={'phewas code': str})

In [17]:
phewas_catalog.shape

(215107, 9)

In [18]:
phewas_catalog[phewas_catalog['phewas code'].isna()].head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations


In [19]:
phewas_catalog[phewas_catalog['gene_name'].isna()].head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations
41,4 111710169,rs2200733,Atrial fibrillation,1950,1.527e-10,1.517,,427.21,"Atrial fibrillation, Atrial fibrillation/atria..."
49,4 111710169,rs2200733,Atrial fibrillation & flutter,2041,1.019e-09,1.481,,427.2,"Atrial fibrillation, Atrial fibrillation/atria..."
98,4,rs4698036,Gout,769,7.803e-08,0.6839,,274.1,Serum uric acid
108,4,rs4698036,Gout and other crystal arthropathies,904,1.99e-07,0.7132,,274.0,Serum uric acid
115,8 128485038,rs1447295,Prostate cancer,848,2.758e-07,1.606,,185.0,Prostate cancer


In [20]:
phewas_catalog[phewas_catalog['gene_name'].isna()].shape

(52140, 9)

In [21]:
phewas_catalog = phewas_catalog.dropna(subset=['gene_name', 'phewas code'])

In [22]:
phewas_catalog.shape

(162967, 9)

In [23]:
phewas_catalog['gene_name'].unique().shape

(1775,)

In [24]:
phewas_catalog['phewas code'].unique().shape

(1358,)

In [25]:
phewas_catalog = phewas_catalog.assign(gene_id=phewas_catalog['gene_name'].apply(lambda x: genes_mapping_1[x] if x in genes_mapping_1 else None))

In [26]:
phewas_catalog = phewas_catalog.dropna(subset=['gene_name', 'gene_id', 'phewas code'])

In [27]:
phewas_catalog.shape

(147970, 10)

In [28]:
phewas_catalog.head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,gene_id
0,19 45395619,rs2075650,Alzheimer's disease,737,5.237e-28,2.41,TOMM40,290.11,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204
1,19 45395619,rs2075650,Dementias,1170,2.409e-26,2.114,TOMM40,290.1,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204
2,6 396321,rs12203592,Actinic keratosis,2505,4.1409999999999996e-26,1.691,IRF4,702.1,"Eye color, Hair color, Freckling, Progressive ...",ENSG00000137265
3,6 26093141,rs1800562,Iron metabolism disorder,40,3.409e-25,12.27,HFE,275.1,"Mean corpuscular hemoglobin, Glycated hemoglob...",ENSG00000010704
4,19 45395619,rs2075650,Delirium dementia and amnestic disorders,1566,8.027e-24,1.841,TOMM40,290.0,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204


In [29]:
phewas_catalog.sort_values('phewas phenotype').head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,gene_id
35306,10,rs7923609,ASCVD,166,0.008094,1.361,JMJD1C,414.2,Alkaline phosphatase,ENSG00000171988
154790,22,rs1012068,ASCVD,166,0.03597,1.292,DEPDC5,414.2,Chronic Hepatitis C infection,ENSG00000100150
72358,5 158814533,rs10045431,ASCVD,166,0.01674,0.7242,IL12B,414.2,Crohn's disease,ENSG00000113302
130720,14 87896435,rs17124581,ASCVD,166,0.03037,1.609,SPATA7,414.2,Cognitive performance,ENSG00000042317
184453,6 31912648,rs429608,ASCVD,166,0.04284,1.344,SKIV2L,414.2,Age-related macular degeneration,ENSG00000204351


# Genes in common

In [30]:
shared_gene_ids = \
    set(phewas_catalog['gene_id'].values)\
    .intersection(genes_associations.index)

In [31]:
len(shared_gene_ids)

1594

# HPO to MIM

In [32]:
hpo_to_mim = pd.read_csv(os.path.join(constants.DATA_DIR, 'hpo-to-omim-and-phecode.csv'), dtype={'phecode': str})

In [33]:
hpo_to_mim.shape

(84031, 10)

In [34]:
hpo_to_mim.head()

Unnamed: 0,term_id,name,match_available,phecode,phecode string,match_type,class,dID,disease_name,modifier
0,28,Cryptorchidism,1,751.12,Congenital anomalies of male genital organs,General,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
1,49,Shawl scrotum,1,751.12,Congenital anomalies of male genital organs,General,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
2,175,Cleft palate,1,749.1,Cleft palate,Exact,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
3,202,Oral cleft,1,749.1,Cleft palate,Broader,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
4,204,Cleft upper lip,1,749.1,Cleft palate,Broader,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O


# Load silver standard to map from UKB to MIM

In [35]:
omim_silver_standard = pd.read_csv(os.path.join(constants.DATA_DIR, 'omim_silver_standard.tsv'), sep='\t')

In [36]:
ukb_to_mim_map = omim_silver_standard[['trait', 'pheno_mim']].dropna()

In [37]:
ukb_to_mim_map.shape

(7822, 2)

In [38]:
ukb_to_mim_map.head()

Unnamed: 0,trait,pheno_mim
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145


# Read gwas2gene (Yanyu) results

In [39]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [40]:
gwas2gene_results_dir = '/mnt/phenomexcan/results/roc_validation/ukb_gwas2gene_results_omim_silver_standard/'

In [41]:
readRDS = robjects.r['readRDS']

In [42]:
f_files = glob(os.path.join(gwas2gene_results_dir, '*.rds'))
display(len(f_files))

if len(f_files) != len(omim_silver_standard['trait'].unique()):
    print(f'WARNING: some files are not there. {len(omim_silver_standard.trait.unique())} expected, {len(f_files)} found.')

99



In [43]:
gwas2genes_results = {}

for f in f_files:
    f_base = os.path.basename(f)
    f_code = f_base.split('.')[0]
    
    #print(f_base)
    rds_contents = readRDS(f)
    
    if len(rds_contents[1]) > 0:
        f_gene_list = list(rds_contents[1][0].iter_labels())
    else:
        print(f'{f_code}: empty')
        f_gene_list = []
    
    gwas2genes_results[f_code] = f_gene_list

In [44]:
gwas2gene_all_genes = []

for k in gwas2genes_results.keys():
    gwas2gene_all_genes.extend(gwas2genes_results[k])

display(len(gwas2gene_all_genes))

gwas2gene_all_genes = set(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

gwas2gene_all_genes = shared_gene_ids.intersection(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

20837

10185

959

In [45]:
pd.Series(list(gwas2gene_all_genes)).head()

0    ENSG00000204060
1    ENSG00000091490
2    ENSG00000278318
3    ENSG00000165029
4    ENSG00000155897
dtype: object

# Universe

In [46]:
from clustering.biclustering.analysis import Trait

In [47]:
_ukb_traits = []
_ukb_traits_phecodes = []
_ukb_gene_available = []

for t in ukb_to_mim_map['trait'].unique():
    t_code = Trait(t).trait_code
    if t_code not in gwas2genes_results:
        print(t_code)
        continue
    
    for g in gwas2genes_results[t_code]:
        _ukb_traits.append(t)
        _ukb_gene_available.append(g)

M13
I95
I71
20002_1264
20002_1081
22130
C80
20002_1538


In [48]:
df = pd.DataFrame({'trait': _ukb_traits, 'gene': _ukb_gene_available})

In [49]:
df.shape

(20837, 2)

In [50]:
df.head()

Unnamed: 0,trait,gene
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263


# Add MIM/Phecode

In [51]:
# add mim
_tmp = pd.merge(df, ukb_to_mim_map, on='trait', how='inner')
display(_tmp.shape)
display(_tmp.head())

(1506780, 3)

Unnamed: 0,trait,gene,pheno_mim
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,102500
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,105830
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,108120
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,108145


In [52]:
_tmp[_tmp['pheno_mim'].isna()].shape

(0, 3)

In [53]:
# mim to phecode
_tmp = pd.merge(_tmp, hpo_to_mim[['phecode', 'dID']].dropna(), left_on='pheno_mim', right_on='dID', how='inner').drop(columns=['dID'])
display(_tmp.shape)
display(_tmp.head())

(23894957, 4)

Unnamed: 0,trait,gene,pheno_mim,phecode
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3


In [54]:
_tmp[_tmp['phecode'].isna()].shape

(0, 4)

In [55]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3


In [56]:
# phecode to phewas catalog
_tmp = pd.merge(_tmp, phewas_catalog[['phewas code', 'gene_id']],
                left_on=['phecode', 'gene'], right_on=['phewas code', 'gene_id'],
                how='left').drop(columns=['phewas code'])
display(_tmp.shape)

(23916174, 5)

In [57]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode,gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1,
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1,
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2,
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3,


In [58]:
_tmp[_tmp['gene_id'].isna()].shape

(23747288, 5)

In [59]:
_tmp = _tmp.drop_duplicates(subset=['trait', 'gene', 'gene_id'])

In [60]:
_tmp.shape

(23044, 5)

In [61]:
_tmp[_tmp['gene_id'].isna()].shape

(20837, 5)

In [62]:
_tmp = _tmp.assign(true_class=_tmp['gene_id'].apply(lambda x: int(not pd.isnull(x))))

In [63]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode,gene_id,true_class
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,,0
29,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089,101800,751.12,,0
58,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336,101800,751.12,,0
87,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401,101800,751.12,,0
116,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263,101800,751.12,,0


In [64]:
_tmp['true_class'].value_counts()

0    20837
1     2207
Name: true_class, dtype: int64

### Add score

In [65]:
_genes_unstacked = genes_associations.unstack()

In [66]:
_genes_unstacked.shape

(90777339,)

In [67]:
_genes_unstacked.head()

                            gene_name      
L12_NAILDIS-Nail_disorders  ENSG00000121410    1.515300e-08
                            ENSG00000148584    2.796517e-11
                            ENSG00000175899    1.926980e-08
                            ENSG00000166535    1.142270e-09
                            ENSG00000184389    1.997480e-08
dtype: float64

In [68]:
classifier_table = _tmp.set_index(['trait', 'gene']).assign(score=_genes_unstacked)

In [69]:
classifier_table.shape

(23044, 5)

In [70]:
classifier_table = classifier_table.assign(predicted_class=(classifier_table['score'] > 0.1).astype(int))

In [71]:
classifier_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,gene_id,true_class,score,predicted_class
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,,0,1.7728e-11,0
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089,101800,751.12,,0,1.48e-10,0
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336,101800,751.12,,0,1.577082e-06,0
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401,101800,751.12,,0,9.199e-08,0
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263,101800,751.12,,0,5.82357e-07,0


In [72]:
classifier_table[classifier_table['score'].isna()].shape

(1658, 6)

In [73]:
classifier_table = classifier_table.dropna(subset=['phecode'])

In [74]:
classifier_table = classifier_table.dropna(subset=['score'])

In [75]:
classifier_table.shape

(21386, 6)

In [77]:
classifier_table['true_class'].value_counts()

0    19179
1     2207
Name: true_class, dtype: int64

# Save classifier table

In [78]:
classifier_table = classifier_table.sort_index()

In [79]:
classifier_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,gene_id,true_class,score,predicted_class
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,296.2,,0,0.02563019,0
1200-Sleeplessness_insomnia,ENSG00000003756,121300,296.2,,0,0.07435,0
1200-Sleeplessness_insomnia,ENSG00000004534,121300,296.2,,0,0.420816,1
1200-Sleeplessness_insomnia,ENSG00000004838,121300,296.2,,0,0.0004417,0
1200-Sleeplessness_insomnia,ENSG00000004897,121300,296.2,,0,1.116773e-09,0


In [80]:
classifier_table.shape

(21386, 6)

In [81]:
classifier_table.to_csv(
    os.path.join(output_dir, 'fastenloc-torus-classifier_data-phewas_catalog.tsv.gz'),
    sep='\t', index=False
)