In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import pairwise_distances

import settings as conf

In [3]:
# Folder where this notebook writes its classifier table; it is created up
# front if it does not exist yet.
output_dir = os.path.join(conf.DELIVERABLES_DIR, 'roc_validation', 'classifier_tables')
os.makedirs(output_dir, exist_ok=True)

In [4]:
# fastENLOC RCP cutoff: _assign_score keeps a gene's S-MultiXcan z-score only
# when its RCP for the trait is strictly greater than this value.
RCP_CUTOFF = 0.10

# Load gene mappings

In [5]:
# Gene mapping tables stored as pickles in the genes metadata folder.
# genes_mapping_1 is used further down to translate PheWAS catalog gene names
# into gene IDs.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
# NOTE(review): genes_mapping_0 is not referenced in any cell visible here —
# confirm it is still needed.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

# Load MultiXcan associations

In [6]:
# S-MultiXcan gene/trait table: one row per gene (Ensembl ID), one column per
# trait. _assign_score reads its values as z-scores.
smultixcan_genes_associations_filename = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-zscores.pkl.xz')
display(smultixcan_genes_associations_filename)

smultixcan_genes_associations = pd.read_pickle(smultixcan_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-zscores.pkl.xz'

In [7]:
display(smultixcan_genes_associations.shape)
display(smultixcan_genes_associations.head())

(22515, 4091)

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


# Load fastENLOC results

In [8]:
# fastENLOC gene/trait table: one row per gene (Ensembl ID), one column per
# trait. Entries can be NaN; _assign_score treats a NaN RCP as 0.
fastenloc_genes_associations_filename = os.path.join(conf.GENE_ASSOC_DIR, 'fastenloc-torus-rcp.pkl.xz')
display(fastenloc_genes_associations_filename)

fastenloc_genes_associations = pd.read_pickle(fastenloc_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/fastenloc-torus-rcp.pkl.xz'

In [9]:
fastenloc_genes_associations.shape

(38062, 4091)

In [10]:
fastenloc_genes_associations.head(5)

Unnamed: 0_level_0,O46-Diagnoses_main_ICD10_O46_Antepartum_haemorrhage_not_elsewhere_classified,K30-Diagnoses_main_ICD10_K30_Dyspepsia,2907-Ever_stopped_smoking_for_6_months,H7_DIPLOPIA-Diplopia,1538_0-Major_dietary_changes_in_the_last_5_years_No,5663-Length_of_longest_manicirritable_episode,20002_1538-Noncancer_illness_code_selfreported_arthritis_nos,S30-Diagnoses_main_ICD10_S30_Superficial_injury_of_abdomen_lower_back_and_pelvis,24010_raw-Inverse_distance_to_the_nearest_road,3143_raw-Ankle_spacing_width,...,2237-Plays_computer_games,20002_1461-Noncancer_illness_code_selfreported_inflammatory_bowel_disease,20002_1508-Noncancer_illness_code_selfreported_jaundice_unknown_cause,20003_1140881882-Treatmentmedication_code_timoptol_025_eye_drops,22601_71253330-Job_coding_merchandiser_window_dresser,23112_raw-Leg_fat_mass_right,20003_1140861778-Treatmentmedication_code_dipyridamole,20003_1199-Treatmentmedication_code_food_supplementplantherbal_extract,1309-Fresh_fruit_intake,100920_2105-Type_milk_consumed_soya_with_calcium
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,0.001213,,,,,,...,,,,,,,,,0.002131,
ENSG00000000457,,,,,0.001873,,,,,0.01772,...,0.068852,,,,,0.003383,,,0.009195,
ENSG00000000460,,,,,0.00262,,,0.000207,,0.000105,...,0.093284,,,,,0.00879,,,0.003105,
ENSG00000000938,,,,,0.002928,,,,0.000762,0.012773,...,0.0043,,,0.000424,,0.000612,,,0.00324,
ENSG00000000971,,,,,0.002858,,,,,0.019304,...,0.005419,,,,,0.007427,,,0.004804,


# Genes in common between S-MultiXcan and fastENLOC

In [11]:
common_genes = fastenloc_genes_associations.index.intersection(smultixcan_genes_associations.index)
display(common_genes)

Index(['ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460',
       'ENSG00000000938', 'ENSG00000000971', 'ENSG00000001036',
       'ENSG00000001084', 'ENSG00000001167', 'ENSG00000001460',
       'ENSG00000001461',
       ...
       'ENSG00000284240', 'ENSG00000284308', 'ENSG00000284395',
       'ENSG00000284413', 'ENSG00000284418', 'ENSG00000284430',
       'ENSG00000284452', 'ENSG00000284513', 'ENSG00000284526',
       'ENSG00000284552'],
      dtype='object', length=22471)

# Load PheWAS catalog

In [12]:
# 'phewas code' is read as text (not as a number) so that it can be matched
# as-is against the string phecodes of hpo_to_mim in the merge further down.
phewas_catalog = pd.read_csv(os.path.join(conf.DATA_DIR, 'phewas-catalog.csv.gz'), dtype={'phewas code': str})

In [13]:
phewas_catalog.shape

(215107, 9)

In [14]:
phewas_catalog[phewas_catalog['phewas code'].isna()].head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations


In [15]:
phewas_catalog[phewas_catalog['gene_name'].isna()].head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations
41,4 111710169,rs2200733,Atrial fibrillation,1950,1.527e-10,1.517,,427.21,"Atrial fibrillation, Atrial fibrillation/atria..."
49,4 111710169,rs2200733,Atrial fibrillation & flutter,2041,1.019e-09,1.481,,427.2,"Atrial fibrillation, Atrial fibrillation/atria..."
98,4,rs4698036,Gout,769,7.803e-08,0.6839,,274.1,Serum uric acid
108,4,rs4698036,Gout and other crystal arthropathies,904,1.99e-07,0.7132,,274.0,Serum uric acid
115,8 128485038,rs1447295,Prostate cancer,848,2.758e-07,1.606,,185.0,Prostate cancer


In [16]:
phewas_catalog[phewas_catalog['gene_name'].isna()].shape

(52140, 9)

In [17]:
phewas_catalog = phewas_catalog.dropna(subset=['gene_name', 'phewas code'])

In [18]:
phewas_catalog.shape

(162967, 9)

In [19]:
phewas_catalog['gene_name'].unique().shape

(1775,)

In [20]:
phewas_catalog['phewas code'].unique().shape

(1358,)

In [21]:
# Translate each gene name into its gene ID through genes_mapping_1; names that
# are missing from the mapping get None (those rows are dropped right after).
phewas_catalog = phewas_catalog.assign(
    gene_id=phewas_catalog['gene_name'].apply(genes_mapping_1.get)
)

In [22]:
phewas_catalog = phewas_catalog.dropna(subset=['gene_name', 'gene_id', 'phewas code'])

In [23]:
phewas_catalog.shape

(147970, 10)

In [24]:
phewas_catalog.head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,gene_id
0,19 45395619,rs2075650,Alzheimer's disease,737,5.237e-28,2.41,TOMM40,290.11,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204
1,19 45395619,rs2075650,Dementias,1170,2.409e-26,2.114,TOMM40,290.1,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204
2,6 396321,rs12203592,Actinic keratosis,2505,4.1409999999999996e-26,1.691,IRF4,702.1,"Eye color, Hair color, Freckling, Progressive ...",ENSG00000137265
3,6 26093141,rs1800562,Iron metabolism disorder,40,3.409e-25,12.27,HFE,275.1,"Mean corpuscular hemoglobin, Glycated hemoglob...",ENSG00000010704
4,19 45395619,rs2075650,Delirium dementia and amnestic disorders,1566,8.027e-24,1.841,TOMM40,290.0,"Alzheimer's disease, Alzheimer's disease bioma...",ENSG00000130204


In [25]:
phewas_catalog.sort_values('phewas phenotype').head()

Unnamed: 0,chromosome,snp,phewas phenotype,cases,p-value,odds-ratio,gene_name,phewas code,gwas-associations,gene_id
35306,10,rs7923609,ASCVD,166,0.008094,1.361,JMJD1C,414.2,Alkaline phosphatase,ENSG00000171988
154790,22,rs1012068,ASCVD,166,0.03597,1.292,DEPDC5,414.2,Chronic Hepatitis C infection,ENSG00000100150
72358,5 158814533,rs10045431,ASCVD,166,0.01674,0.7242,IL12B,414.2,Crohn's disease,ENSG00000113302
130720,14 87896435,rs17124581,ASCVD,166,0.03037,1.609,SPATA7,414.2,Cognitive performance,ENSG00000042317
184453,6 31912648,rs429608,ASCVD,166,0.04284,1.344,SKIV2L,414.2,Age-related macular degeneration,ENSG00000204351


# Genes in common between the PheWAS catalog, S-MultiXcan and fastENLOC

In [26]:
# Gene IDs present in the PheWAS catalog that are also part of common_genes.
shared_gene_ids = set(phewas_catalog['gene_id'].values) & set(common_genes)

In [27]:
len(shared_gene_ids)

1592

# Load HPO to MIM/phecode mapping

In [28]:
# 'phecode' is read as text so it compares equal to the string 'phewas code'
# values of phewas_catalog when both are used as merge keys later on.
hpo_to_mim = pd.read_csv(os.path.join(conf.DATA_DIR, 'hpo-to-omim-and-phecode.csv.gz'), dtype={'phecode': str})

In [29]:
hpo_to_mim.shape

(84031, 10)

In [30]:
hpo_to_mim.head()

Unnamed: 0,term_id,name,match_available,phecode,phecode string,match_type,class,dID,disease_name,modifier
0,28,Cryptorchidism,1,751.12,Congenital anomalies of male genital organs,General,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
1,49,Shawl scrotum,1,751.12,Congenital anomalies of male genital organs,General,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
2,175,Cleft palate,1,749.1,Cleft palate,Exact,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
3,202,Oral cleft,1,749.1,Cleft palate,Broader,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O
4,204,Cleft upper lip,1,749.1,Cleft palate,Broader,Congenital,100050,"100050 AARSKOG SYNDROME, AUTOSOMAL DOMINANT",O


# Load silver standard to map from UKB to MIM

In [31]:
omim_silver_standard = pd.read_csv(os.path.join(conf.DATA_DIR, 'omim_silver_standard.tsv'), sep='\t')

In [32]:
ukb_to_mim_map = omim_silver_standard[['trait', 'pheno_mim']].dropna()

In [33]:
ukb_to_mim_map.shape

(7822, 2)

In [34]:
ukb_to_mim_map.head()

Unnamed: 0,trait,pheno_mim
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145


# Read gwas2gene results

In [35]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [36]:
readRDS = robjects.r['readRDS']

In [37]:
# Collect the gwas2gene result files; one .rds file is expected per trait of
# the silver standard, and a warning is printed when the counts differ.
f_files = glob(os.path.join(conf.OMIM_SILVER_STANDARD_GWAS_TO_GENE_DIR, '*.rds'))
_n_found = len(f_files)
display(_n_found)

_n_expected = len(omim_silver_standard['trait'].unique())
if _n_found != _n_expected:
    print(f'WARNING: some files are not there. {_n_expected} expected, {_n_found} found.')

99



In [38]:
# Map each trait code (file name up to its first '.') to the list of genes
# read from that trait's gwas2gene .rds file.
gwas2genes_results = {}

for f in f_files:
    f_base = os.path.basename(f)
    f_code = f_base.split('.')[0]
    
    rds_contents = readRDS(f)
    
    # The gene labels are taken from the first item of the second element of
    # the loaded R object.
    # NOTE(review): the layout of the .rds payload is not visible from this
    # notebook — confirm against the script that writes these files.
    if len(rds_contents[1]) > 0:
        f_gene_list = list(rds_contents[1][0].iter_labels())
    else:
        # Nothing stored for this trait: report it and keep an empty gene list.
        print(f'{f_code}: empty')
        f_gene_list = []
    
    gwas2genes_results[f_code] = f_gene_list

In [39]:
# All genes reported by gwas2gene, concatenated across traits (duplicates kept).
gwas2gene_all_genes = [
    gene
    for trait_genes in gwas2genes_results.values()
    for gene in trait_genes
]
display(len(gwas2gene_all_genes))

# Unique genes across all traits.
gwas2gene_all_genes = set(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

# Only those that are also in shared_gene_ids.
gwas2gene_all_genes = shared_gene_ids & gwas2gene_all_genes
display(len(gwas2gene_all_genes))

20837

10185

958

In [40]:
pd.Series(list(gwas2gene_all_genes)).head()

0    ENSG00000134061
1    ENSG00000150893
2    ENSG00000183273
3    ENSG00000183813
4    ENSG00000162104
dtype: object

# Universe of trait-gene pairs

In [41]:
from entity import Trait

In [42]:
# Parallel lists that become the long-format (trait, gene) table.
_ukb_traits = []
# NOTE(review): never filled in the cells visible here — confirm it is needed.
_ukb_traits_phecodes = []
_ukb_gene_available = []

for t in ukb_to_mim_map['trait'].unique():
    t_code = Trait(full_code=t).code
    t_genes = gwas2genes_results.get(t_code)
    if t_genes is None:
        # No gwas2gene result was loaded for this trait: print its code and skip it.
        print(t_code)
        continue

    # One (trait, gene) entry per gene reported for the trait.
    _ukb_traits.extend([t] * len(t_genes))
    _ukb_gene_available.extend(t_genes)

M13
I95
I71
20002_1264
20002_1081
22130
C80
20002_1538


In [43]:
df = pd.DataFrame({'trait': _ukb_traits, 'gene': _ukb_gene_available})

In [44]:
df.shape

(20837, 2)

In [45]:
df.drop_duplicates().shape

(20837, 2)

In [46]:
df.head()

Unnamed: 0,trait,gene
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263


# Add MIM/Phecode

In [47]:
# add mim
_tmp = pd.merge(df, ukb_to_mim_map, on='trait', how='inner')
display(_tmp.shape)
display(_tmp.head())

(1506780, 3)

Unnamed: 0,trait,gene,pheno_mim
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,102500
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,105830
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,108120
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,108145


In [48]:
_tmp[_tmp['pheno_mim'].isna()].shape

(0, 3)

In [49]:
# mim to phecode
_tmp = pd.merge(_tmp, hpo_to_mim[['phecode', 'dID']].dropna(), left_on='pheno_mim', right_on='dID', how='inner').drop(columns=['dID'])
display(_tmp.shape)
display(_tmp.head())

(23894957, 4)

Unnamed: 0,trait,gene,pheno_mim,phecode
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3


In [50]:
_tmp[_tmp['phecode'].isna()].shape

(0, 4)

In [51]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3


In [52]:
# phecode to phewas catalog
_tmp = pd.merge(_tmp, phewas_catalog[['phewas code', 'gene_id']],
                left_on=['phecode', 'gene'], right_on=['phewas code', 'gene_id'],
                how='left').drop(columns=['phewas code'])
display(_tmp.shape)

(23916174, 5)

In [53]:
_tmp.head()

Unnamed: 0,trait,gene,pheno_mim,phecode,gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,257.1,
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,331.1,
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,749.2,
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,526.3,


In [54]:
_tmp[_tmp['gene_id'].isna()].shape

(23747359, 5)

In [55]:
# Keep the first row of every (trait, gene, gene_id) combination. Since a
# matched gene_id equals 'gene' (it was a join key), each trait/gene pair is
# left with at most one unmatched row (gene_id NaN) and one matched row.
_tmp = _tmp.drop_duplicates(subset=['trait', 'gene', 'gene_id'])

In [56]:
_tmp.shape

(23042, 5)

In [57]:
_tmp.head(30)

Unnamed: 0,trait,gene,pheno_mim,phecode,gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000012504,101800,751.12,
29,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075089,101800,751.12,
58,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000075336,101800,751.12,
87,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000078401,101800,751.12,
116,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090263,101800,751.12,
145,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000090266,101800,751.12,
174,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000095951,101800,751.12,
203,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000111647,101800,751.12,
232,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000111863,101800,751.12,
261,M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000112137,101800,751.12,


In [58]:
_tmp[_tmp['gene_id'].isna()].shape

(20837, 5)

In [59]:
def _assign_true_class(x):
    tc = ~pd.isnull(x['gene_id'])
    idx = [0]
    if tc.shape[0] > 1 and tc.any():
        idx = np.where(tc)[0]
    return pd.Series({
        'pheno_mim': ', '.join(x.iloc[idx]['pheno_mim'].astype(str)),
        'phecode': ', '.join(x.iloc[idx]['phecode'].astype(str)),
        'true_class': int(tc.any()),
    })

In [60]:
_tmp2 = _tmp.groupby(['trait', 'gene']).apply(_assign_true_class)

In [61]:
_tmp2.shape

(20837, 3)

In [62]:
_tmp2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,true_class
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,296.2,0
1200-Sleeplessness_insomnia,ENSG00000003756,121300,296.2,0
1200-Sleeplessness_insomnia,ENSG00000004534,121300,296.2,0
1200-Sleeplessness_insomnia,ENSG00000004838,121300,296.2,0
1200-Sleeplessness_insomnia,ENSG00000004897,121300,296.2,0


In [63]:
assert not _tmp2.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000012504']['true_class']
assert not _tmp2.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000141665']['true_class']
assert _tmp2.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000112137']['true_class']

In [64]:
_tmp2['true_class'].value_counts()

0    18632
1     2205
Name: true_class, dtype: int64

In [65]:
# Fraction of trait-gene pairs labelled negative (true_class == 0), computed
# from the table itself so it cannot go stale when the inputs change.
(_tmp2['true_class'] == 0).mean()

0.8941786245620771

In [66]:
# Fraction of trait-gene pairs labelled positive (true_class == 1), computed
# from the table itself so it cannot go stale when the inputs change.
(_tmp2['true_class'] == 1).mean()

0.10582137543792293

### Add score

In [67]:
def _assign_score(x):
    trait, gene = x.name
    if gene not in smultixcan_genes_associations.index:
        return np.nan
    
    smultixcan_zscore = smultixcan_genes_associations.loc[gene, trait]
    
    fastenloc_rcp = fastenloc_genes_associations.loc[gene, trait]
    if np.isnan(fastenloc_rcp):
        fastenloc_rcp = 0.0
    
    if fastenloc_rcp > RCP_CUTOFF:
        return smultixcan_zscore
    else:
        if not np.isnan(smultixcan_zscore):
            return 0.0
        else:
            return smultixcan_zscore

In [68]:
classifier_table = _tmp2.assign(score=_tmp2.apply(_assign_score, axis=1))

In [69]:
classifier_table.shape

(20837, 4)

In [70]:
classifier_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,true_class,score
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,296.2,0,0.0
1200-Sleeplessness_insomnia,ENSG00000003756,121300,296.2,0,0.0
1200-Sleeplessness_insomnia,ENSG00000004534,121300,296.2,0,5.270682
1200-Sleeplessness_insomnia,ENSG00000004838,121300,296.2,0,0.0
1200-Sleeplessness_insomnia,ENSG00000004897,121300,296.2,0,0.0


In [71]:
classifier_table[classifier_table['score'].isna()].shape

(1718, 4)

In [72]:
classifier_table = classifier_table.dropna(subset=['score'])

In [73]:
classifier_table.shape

(19119, 4)

In [74]:
# Number of distinct (trait, gene) pairs left in the classifier table.
N_TESTS = classifier_table.reset_index().drop_duplicates(subset=['trait', 'gene']).shape[0]
display(N_TESTS)

# Bonferroni-adjusted significance level: 0.05 divided by the number of tests.
PVALUE_THRESHOLD = (0.05 / (N_TESTS))
display(PVALUE_THRESHOLD)

# Absolute standard-normal z value for that level, two-sided (hence the / 2).
ZSCORE_THRESHOLD = np.abs(stats.norm.ppf(PVALUE_THRESHOLD / 2))
display(ZSCORE_THRESHOLD)

19119

2.615199539724881e-06

4.69893633607862

In [75]:
def _assign_predicted_class(x):
    """Binarise a score against ZSCORE_THRESHOLD.

    Parameters
    ----------
    x : float
        Score of one (trait, gene) pair.

    Returns
    -------
    int
        1 when ``x`` is strictly greater than ``ZSCORE_THRESHOLD``, else 0.
    """
    return 1 if x > ZSCORE_THRESHOLD else 0

In [76]:
classifier_table = classifier_table.assign(predicted_class=classifier_table['score'].apply(_assign_predicted_class))

In [77]:
classifier_table['true_class'].value_counts()

0    16917
1     2202
Name: true_class, dtype: int64

In [78]:
classifier_table['true_class'].value_counts().sum()

19119

# Save classifier table

In [79]:
classifier_table = classifier_table.sort_index()

In [80]:
assert classifier_table.index.is_unique

In [81]:
classifier_table.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pheno_mim,phecode,true_class,score,predicted_class
trait,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,121300,296.2,0,0.0,0
1200-Sleeplessness_insomnia,ENSG00000003756,121300,296.2,0,0.0,0
1200-Sleeplessness_insomnia,ENSG00000004534,121300,296.2,0,5.270682,1
1200-Sleeplessness_insomnia,ENSG00000004838,121300,296.2,0,0.0,0
1200-Sleeplessness_insomnia,ENSG00000004897,121300,296.2,0,0.0,0


In [82]:
classifier_table.shape

(19119, 5)

In [83]:
# 'trait' and 'gene' only exist as index levels of classifier_table. They are
# moved into regular columns before writing; with index=False alone they would
# be left out of the file and its rows could not be traced back to a
# trait/gene pair.
classifier_table.reset_index().to_csv(
    os.path.join(output_dir, 'combined-classifier_data-phewas_catalog.tsv.gz'),
    sep='\t', index=False
)