In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import settings as conf

In [3]:
# Destination folder for the classifier tables written at the end of this
# notebook; created up front (no error if it already exists).
_output_dir_parts = (conf.DELIVERABLES_DIR, 'roc_validation', 'classifier_tables')
output_dir = os.path.join(*_output_dir_parts)
os.makedirs(output_dir, exist_ok=True)

# Load gene mappings

In [5]:
def _load_genes_mapping(file_name):
    """Unpickle and return the object stored in `file_name` under GENES_METADATA_DIR."""
    mapping_path = os.path.join(conf.GENES_METADATA_DIR, file_name)
    # pickle.load can execute arbitrary code: only point this at trusted files.
    with open(mapping_path, 'rb') as mapping_file:
        return pickle.load(mapping_file)


genes_mapping_0 = _load_genes_mapping('genes_mapping_simplified-0.pkl')
genes_mapping_1 = _load_genes_mapping('genes_mapping_simplified-1.pkl')

# Load fastENLOC results

In [6]:
# NOTE: despite the `spredixcan_` prefix in the variable names, the file loaded
# here is the fastENLOC table named below ('fastenloc-torus-rcp.pkl.xz').
spredixcan_genes_associations_filename = os.path.join(conf.GENE_ASSOC_DIR, 'fastenloc-torus-rcp.pkl.xz')
# Show the resolved path so the data source is visible in the notebook output.
display(spredixcan_genes_associations_filename)

# pd.read_pickle unpickles the file: only use trusted inputs.
spredixcan_genes_associations = pd.read_pickle(spredixcan_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/fastenloc-torus-rcp.pkl.xz'

In [None]:
#spredixcan_genes_associations = spredixcan_genes_associations.rename(index=genes_mapping_1)

In [7]:
# (rows, columns) of the loaded association table.
spredixcan_genes_associations.shape

(37967, 4091)

In [8]:
# Preview: the index is `gene_id` and each column is a trait.
spredixcan_genes_associations.head(5)

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,2.1732e-07,3.6e-05,1.020227e-10,2e-05,0.001497,3.42677e-11,1.3123e-10,1.5086e-09,1.1145e-05,4.1646e-09,...,0.001,,0.0,0.0,,0.001,,,,0.0
ENSG00000000457,1.3439e-06,0.000692,8.21606e-11,8.9e-05,0.004363,1.342669e-10,4.223476e-10,2.036232e-08,0.0004598924,3.57085e-08,...,0.0,,,,,0.0,,,,
ENSG00000000460,8.75775e-06,0.001713,1.11901e-10,1.6e-05,0.003566,1.533228e-10,3.7123e-10,6.5057e-09,1.8302e-05,6.0029e-08,...,0.0,,,,,0.0,,,,
ENSG00000000938,7.197e-08,0.000446,3.573442e-10,6e-06,0.004429,2.588022e-08,3.4615e-10,9.1102e-09,1.3514e-05,9.878e-09,...,0.263,,0.002,,,,,,,0.0
ENSG00000000971,2.72e-07,0.00031,1.3973e-10,0.000182,0.00316,3.940395e-11,1.7856e-10,2.329954e-10,5.325e-07,4.95874e-08,...,0.0,0.0,,,,0.0,,,,0.0


# Load OMIM silver standard

In [16]:
# Read the tab-separated OMIM silver standard file from the data directory.
_omim_silver_standard_path = os.path.join(conf.DATA_DIR, 'omim_silver_standard.tsv')
omim_silver_standard = pd.read_csv(_omim_silver_standard_path, sep='\t')

In [17]:
# Discard rows missing any of `ensembl_gene_id`, `trait` or `pheno_mim`.
omim_silver_standard = omim_silver_standard.dropna(subset=['ensembl_gene_id', 'trait', 'pheno_mim'])

In [18]:
# Size and first rows of the silver standard after removing incomplete rows.
display(omim_silver_standard.shape, omim_silver_standard.head())

(7809, 7)

Unnamed: 0,trait,pheno_mim,mim,entry_type,entrez_gene_id,approved_gene_symbol,ensembl_gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800,188830,gene,5573,PRKAR1A,ENSG00000108946
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500,600275,gene,4853,NOTCH2,ENSG00000134250
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830,601623,gene,7337,UBE3A,ENSG00000114062
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120,190990,gene,7169,TPM2,ENSG00000198467
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145,613629,gene,63895,PIEZO2,ENSG00000154864


### Save list of mapped UK Biobank traits

In [19]:
from entity import Trait

In [20]:
# Distinct trait names present in the silver standard, in order of first appearance.
ukb_traits_mapped = pd.Series(omim_silver_standard['trait'].unique())
display(ukb_traits_mapped.head())

0               M41-Diagnoses_main_ICD10_M41_Scoliosis
1            H80-Diagnoses_main_ICD10_H80_Otosclerosis
2    20002_1226-Noncancer_illness_code_selfreported...
3    I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
4    20002_1136-Noncancer_illness_code_selfreported...
dtype: object

In [21]:
# Number of distinct traits in the silver standard.
ukb_traits_mapped.shape

(107,)

In [22]:
# Reduce each full trait name to the `.code` attribute of its Trait object.
ukb_codes_mapped = [
    Trait(full_code=trait_full_code).code
    for trait_full_code in ukb_traits_mapped
]

In [23]:
# One code per mapped trait is expected.
len(ukb_codes_mapped)

107

In [24]:
# First few extracted trait codes.
ukb_codes_mapped[:5]

['M41', 'H80', '20002_1226', 'I25', '20002_1136']

In [25]:
# This is needed to run gwas2gene (Yanyu's scripts): one trait code per line,
# written without index or header.
# NOTE(review): hard-coded absolute path; consider deriving it from `conf`.
_selected_traits_path = '/mnt/tmp/selected_ukb_traits_omim.txt'
_selected_traits = pd.Series(ukb_codes_mapped)
_selected_traits.to_csv(_selected_traits_path, index=False, header=False)

# Read gwas2gene results

In [26]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [28]:
# Handle to R's `readRDS` function (through rpy2), used below to read the .rds files.
readRDS = robjects.r['readRDS']

In [29]:
# Collect every .rds file found in the gwas2gene results directory.
_rds_pattern = os.path.join(conf.OMIM_SILVER_STANDARD_GWAS_TO_GENE_DIR, '*.rds')
f_files = glob(_rds_pattern)
display(len(f_files))

# One result file per mapped trait code is expected; warn (without stopping)
# when the counts differ.
_counts_match = len(f_files) == len(ukb_codes_mapped)
if not _counts_match:
    print(f'WARNING: some files are not there. {len(ukb_codes_mapped)} expected, {len(f_files)} found.')

99



In [30]:
# Maps the code taken from each file name to the genes listed in that file,
# restricted to genes present in the association table index.
gwas2genes_results = {}

for rds_path in f_files:
    # The text before the first dot of the file name is the key; it is compared
    # against `Trait(...).code` values further down.
    f_code = os.path.basename(rds_path).split('.')[0]

    rds_contents = readRDS(rds_path)

    # Element 1 of the R object is empty when the file carries no genes.
    has_genes = len(rds_contents[1]) > 0
    if not has_genes:
        print(f'{f_code}: empty')
        file_genes = []
    else:
        file_genes = list(rds_contents[1][0].iter_labels())

    known_genes = spredixcan_genes_associations.index
    gwas2genes_results[f_code] = known_genes.intersection(set(file_genes))

In [31]:
# Number of codes with a gwas2gene result (one per file read).
len(gwas2genes_results)

99

In [32]:
# Flatten the per-code gene lists; a gene appears once per code that lists it.
_genes_with_repeats = [
    gene_id
    for code_genes in gwas2genes_results.values()
    for gene_id in code_genes
]
display(len(_genes_with_repeats))

# Distinct genes across all codes.
gwas2gene_all_genes = set(_genes_with_repeats)
display(len(gwas2gene_all_genes))

# gwas2gene_all_genes = spredixcan_genes_associations.index.intersection(gwas2gene_all_genes)
# display(len(gwas2gene_all_genes))

19709

9722

# Create list of UKB-OMIM traits

In [33]:
# Reminder of the silver standard layout before building the class labels.
omim_silver_standard.head()

Unnamed: 0,trait,pheno_mim,mim,entry_type,entrez_gene_id,approved_gene_symbol,ensembl_gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800,188830,gene,5573,PRKAR1A,ENSG00000108946
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500,600275,gene,4853,NOTCH2,ENSG00000134250
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830,601623,gene,7337,UBE3A,ENSG00000114062
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120,190990,gene,7169,TPM2,ENSG00000198467
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145,613629,gene,63895,PIEZO2,ENSG00000154864


# Create fastENLOC classifier table

In [34]:
_tmp = omim_silver_standard[['trait', 'ensembl_gene_id']]
ukb_traits_common = _tmp['trait'].unique()

# One row per distinct (trait, gene) pair of the silver standard, flagged with
# omim_value=1 and indexed by that pair.
omim_true_classes = (
    _tmp[['trait', 'ensembl_gene_id']]
    .drop_duplicates()
    .assign(omim_value=1)
    .set_index(['trait', 'ensembl_gene_id'])
)

In [35]:
# Number of distinct traits in the silver standard.
len(ukb_traits_common)

107

In [36]:
# Number of distinct (trait, gene) pairs in the silver standard.
omim_true_classes.shape

(7046, 1)

In [37]:
# Preview of the (trait, ensembl_gene_id)-indexed positives table.
omim_true_classes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,omim_value
trait,ensembl_gene_id,Unnamed: 2_level_1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000108946,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000134250,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000114062,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000198467,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000154864,1


In [38]:
# Repeated check of the trait count before building the index.
len(ukb_traits_common)

107

In [39]:
# (full trait name, gene) pairs for every trait that has a gwas2gene result;
# traits whose code is missing from `gwas2genes_results` contribute nothing.
index_tuples = []

for trait_full_code in ukb_traits_common:
    genes_for_trait = gwas2genes_results.get(Trait(full_code=trait_full_code).code)
    if genes_for_trait is None:
        continue

    index_tuples.extend((trait_full_code, gene_id) for gene_id in genes_for_trait)

In [40]:
# Total number of (trait, gene) pairs to classify.
len(index_tuples)

19709

In [41]:
# First few (trait, gene) pairs.
index_tuples[:5]

[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000012504'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000075089'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000075336'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000078401'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000090263')]

In [42]:
# Two-level index over the (trait, gene) pairs, in the order they were collected.
_classifier_level_names = ['ukb_efo', 'gene']
classifier_index = pd.MultiIndex.from_tuples(index_tuples, names=_classifier_level_names)

In [43]:
# Number of distinct genes across all gwas2gene results.
len(gwas2gene_all_genes)

9722

In [44]:
# Should equal the number of collected (trait, gene) pairs.
classifier_index.shape

(19709,)

In [45]:
# Empty classifier table: one row per (trait, gene) pair; the three columns
# are filled in by the cells that follow.
_classifier_columns = ['score', 'predicted_class', 'true_class']
predixcan_classifier_df = pd.DataFrame(index=classifier_index, columns=_classifier_columns)

In [46]:
# Sort rows by (ukb_efo, gene); `classifier_index` itself keeps its original order.
predixcan_classifier_df = predixcan_classifier_df.sort_index()

In [47]:
# Sorting must not change the table size.
predixcan_classifier_df.shape

(19709, 3)

In [48]:
# Default every pair to the negative class; positives are set from the silver standard below.
predixcan_classifier_df['true_class'] = 0

In [49]:
# At this point `score` and `predicted_class` are still empty.
predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,,,0
1200-Sleeplessness_insomnia,ENSG00000003756,,,0
1200-Sleeplessness_insomnia,ENSG00000004534,,,0
1200-Sleeplessness_insomnia,ENSG00000004838,,,0
1200-Sleeplessness_insomnia,ENSG00000004897,,,0


In [50]:
# Collapse the single-column frame into a Series indexed by (trait, ensembl_gene_id).
true_classes = omim_true_classes.squeeze()
display(true_classes.shape, true_classes.head())

(7046,)

trait                                   ensembl_gene_id
M41-Diagnoses_main_ICD10_M41_Scoliosis  ENSG00000108946    1
                                        ENSG00000134250    1
                                        ENSG00000114062    1
                                        ENSG00000198467    1
                                        ENSG00000154864    1
Name: omim_value, dtype: int64

In [51]:
# Pairs present both in the classifier table and in the silver standard are
# the positives.
_pairs_in_silver_standard = predixcan_classifier_df.index.intersection(true_classes.index)
predixcan_classifier_df.loc[_pairs_in_silver_standard, 'true_class'] = 1

In [52]:
# Every pair must have a class label (0 or 1) after the assignment above.
assert predixcan_classifier_df['true_class'].isna().sum() == 0

In [53]:
# some testing: rows of one trait, selected through the first index level
predixcan_classifier_df.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis',)].head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000012504,,,0
ENSG00000075089,,,0
ENSG00000075336,,,0
ENSG00000078401,,,0
ENSG00000090263,,,0


In [54]:
# Spot check: this (trait, gene) pair is listed in the silver standard.
true_classes.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000112234')]

1

In [55]:
# Spot check: this gene is not among the silver-standard genes for the trait.
'ENSG00000090263' not in true_classes.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis'].index

True

In [56]:
# The classifier table must agree with the two spot checks above:
# a silver-standard pair is labelled 1 and a non-listed pair is labelled 0.
assert predixcan_classifier_df.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000112234'), 'true_class'] == 1.0
assert predixcan_classifier_df.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000090263'), 'true_class'] == 0.0

In [57]:
# Repeated check of the number of distinct genes.
len(gwas2gene_all_genes)

9722

In [59]:
# score
# Container for the per-(trait, gene) scores, aligned with `classifier_index`.
# The dtype is given explicitly: a data-less Series without dtype raises a
# DeprecationWarning on pandas 1.x and defaults to `object` on pandas >= 2.0.
df_score = pd.Series(index=classifier_index, dtype=float)

for trait in ukb_traits_common:
    trait_code = Trait(full_code=trait).code
    if trait_code not in gwas2genes_results:
        # No gwas2gene result for this trait: report its code and skip it.
        print(trait_code)
        continue
    trait_genes = gwas2genes_results[trait_code]
    scores = spredixcan_genes_associations.loc[trait_genes, trait]
    # Positional assignment: relies on `classifier_index` listing this trait's
    # genes in the same order as `trait_genes` (both come from `gwas2genes_results`).
    df_score.loc[trait] = scores.values

M13
I95
I71
20002_1264
20002_1081
22130
C80
20002_1538


In [60]:
# some testing
# Remove pairs without a score, order by (ukb_efo, gene), then confirm that
# no missing value is left.
df_score = df_score.dropna().sort_index()
assert not df_score.isna().any()

In [61]:
# Preview of the cleaned, sorted scores.
df_score.head()

ukb_efo                      gene           
1200-Sleeplessness_insomnia  ENSG00000001617    2.563019e-02
                             ENSG00000003756    7.435000e-02
                             ENSG00000004534    4.208160e-01
                             ENSG00000004838    4.417000e-04
                             ENSG00000004897    1.116773e-09
dtype: float64

In [62]:
# some testing: the score stored for a pair must equal the value in the source table
_gene, _trait = ('ENSG00000090263', 'M41-Diagnoses_main_ICD10_M41_Scoliosis')
assert spredixcan_genes_associations.loc[_gene, _trait] == df_score.loc[_trait, _gene]

In [63]:
# Same consistency check for a second (gene, trait) pair.
_gene, _trait = ('ENSG00000070061', 'O14-Diagnoses_main_ICD10_O14_Gestational_pregnancyinduced_hypertension_with_significant_proteinuria')
assert spredixcan_genes_associations.loc[_gene, _trait] == df_score.loc[_trait, _gene]

In [64]:
# Number of pairs that have a score (pairs with a missing score were dropped).
df_score.shape

(19689,)

In [65]:
# Repeated preview of the scores.
df_score.head()

ukb_efo                      gene           
1200-Sleeplessness_insomnia  ENSG00000001617    2.563019e-02
                             ENSG00000003756    7.435000e-02
                             ENSG00000004534    4.208160e-01
                             ENSG00000004838    4.417000e-04
                             ENSG00000004897    1.116773e-09
dtype: float64

In [66]:
# Smallest score present.
df_score.min()

0.0

In [67]:
# Largest score present.
df_score.max()

1.759235608

In [68]:
# Attach the scores; values are aligned on the (ukb_efo, gene) index, so pairs
# absent from `df_score` end up with a missing score.
predixcan_classifier_df = predixcan_classifier_df.assign(score=df_score)

In [70]:
# Disabled check: `df_score` had its missing values dropped, so some pairs may
# have no score here; such rows are removed before the table is saved.
# assert not predixcan_classifier_df['score'].isna().any()

In [71]:
from scipy import stats

In [72]:
# Number of distinct genes considered, and the cut-off applied to `score`:
# pairs scoring strictly above it are predicted as positives.
_n_genes = len(gwas2gene_all_genes)
SCORE_THRESHOLD = 0.1

display(_n_genes)
display(SCORE_THRESHOLD)

9722

0.1

In [73]:
# Binarise the score: 1 when strictly above the threshold, 0 otherwise
# (a missing score compares as False and therefore yields 0).
_score_above_threshold = predixcan_classifier_df['score'] > SCORE_THRESHOLD
predixcan_classifier_df = predixcan_classifier_df.assign(
    predicted_class=_score_above_threshold.astype(int)
)

In [74]:
# The row count must be unchanged after adding scores and predictions.
predixcan_classifier_df.shape

(19709, 3)

In [75]:
# Preview of the completed classifier table.
predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,0.02563019,0,0
1200-Sleeplessness_insomnia,ENSG00000003756,0.07435,0,0
1200-Sleeplessness_insomnia,ENSG00000004534,0.420816,1,0
1200-Sleeplessness_insomnia,ENSG00000004838,0.0004417,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,1.116773e-09,0,0


In [76]:
# Rows of one trait with the silver-standard positives listed first.
predixcan_classifier_df.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis'].sort_values('true_class', ascending=False).head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000157764,6.9122e-08,0,1
ENSG00000162769,3.23106e-06,0,1
ENSG00000112234,6.2225e-07,0,1
ENSG00000012504,1.7728e-11,0,0
ENSG00000173208,1.7775e-07,0,0


## Select genes per trait

In [77]:
# No per-trait gene selection is applied (the filter below is disabled);
# the name is bound to the very same DataFrame object, not to a copy.
#selected_predixcan_classifier_df = predixcan_classifier_df.loc[predixcan_classifier_df.index.intersection(trait_genes_to_keep)]
selected_predixcan_classifier_df = predixcan_classifier_df

In [78]:
# some testing

In [79]:
# Size of the selected table.
selected_predixcan_classifier_df.shape

(19709, 3)

In [80]:
# Preview of the selected table.
selected_predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,0.02563019,0,0
1200-Sleeplessness_insomnia,ENSG00000003756,0.07435,0,0
1200-Sleeplessness_insomnia,ENSG00000004534,0.420816,1,0
1200-Sleeplessness_insomnia,ENSG00000004838,0.0004417,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,1.116773e-09,0,0


In [81]:
# Pairs predicted as positives listed first.
selected_predixcan_classifier_df.sort_values('predicted_class', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20002_1226-Noncancer_illness_code_selfreported_hypothyroidismmyxoedema,ENSG00000146830,0.895277,1,0
I48-Diagnoses_main_ICD10_I48_Atrial_fibrillation_and_flutter,ENSG00000148120,0.1568,1,0
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000008283,0.307305,1,0
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000008083,0.2033,1,0
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000006047,0.187403,1,0


In [82]:
# Order by true class, then trait (both descending), and look at the rows
# labelled as positives by the silver standard.
_tmp = selected_predixcan_classifier_df.sort_values(['true_class', 'ukb_efo'], ascending=False)
_tmp_positive_rows = _tmp[_tmp['true_class'] > 0]
display(_tmp.shape, _tmp_positive_rows.shape, _tmp_positive_rows.head())

(19709, 3)

(126, 3)

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R69-Diagnoses_main_ICD10_R69_Unknown_and_unspecified_causes_of_morbidity,ENSG00000068305,5.758e-06,0,1
R69-Diagnoses_main_ICD10_R69_Unknown_and_unspecified_causes_of_morbidity,ENSG00000140443,4.7581e-06,0,1
R40-Diagnoses_main_ICD10_R40_Somnolence_stupor_and_coma,ENSG00000133812,5.454e-11,0,1
R40-Diagnoses_main_ICD10_R40_Somnolence_stupor_and_coma,ENSG00000183287,1.14338e-10,0,1
N20-Diagnoses_main_ICD10_N20_Calculus_of_kidney_and_ureter,ENSG00000131183,0.002062,0,1


### Test classes

In [83]:
# Number of distinct traits in the classifier table.
selected_predixcan_classifier_df.index.get_level_values('ukb_efo').unique().shape

(99,)

In [84]:
# Number of distinct genes in the classifier table.
selected_predixcan_classifier_df.index.get_level_values('gene').unique().shape

(9722,)

In [85]:
# Trait used for the manual checks in the following cells.
_pheno = 'N20-Diagnoses_main_ICD10_N20_Calculus_of_kidney_and_ureter'

In [86]:
# Distinct silver-standard genes for `_pheno`.
# NOTE: despite its name, this variable is neither ClinVar- nor asthma-specific;
# it holds the OMIM silver standard genes of whichever trait `_pheno` names.
_pheno_rows = omim_silver_standard['trait'] == _pheno
_clinvar_asthma_genes = omim_silver_standard[_pheno_rows]['ensembl_gene_id'].unique()
display(_clinvar_asthma_genes, _clinvar_asthma_genes.shape)

array(['ENSG00000075891', 'ENSG00000130600', 'ENSG00000269821',
       'ENSG00000129757', 'ENSG00000143473', 'ENSG00000275410',
       'ENSG00000019186', 'ENSG00000134371', 'ENSG00000036828',
       'ENSG00000088256', 'ENSG00000187091', 'ENSG00000122194',
       'ENSG00000138592', 'ENSG00000040531', 'ENSG00000138079',
       'ENSG00000021488', 'ENSG00000090402', 'ENSG00000131482',
       'ENSG00000137700', 'ENSG00000164007', 'ENSG00000113946',
       'ENSG00000116039', 'ENSG00000168000', 'ENSG00000123191',
       'ENSG00000171365', 'ENSG00000165704', 'ENSG00000042753',
       'ENSG00000075643', 'ENSG00000169692', 'ENSG00000109667',
       'ENSG00000131183', 'ENSG00000109062', 'ENSG00000149257',
       'ENSG00000198931', 'ENSG00000157388', 'ENSG00000167207',
       'ENSG00000124827', 'ENSG00000134873'], dtype=object)

(38,)

In [87]:
# Rows of the classifier table for `_pheno` whose gene is in the
# silver-standard gene list computed in the previous cell.
_tmp = selected_predixcan_classifier_df.loc[_pheno]
_tmp.loc[_tmp.index.intersection(_clinvar_asthma_genes)]

Unnamed: 0,score,predicted_class,true_class
ENSG00000131183,0.002062,0,1


In [88]:
# All classifier rows for `_pheno` (the variable name is generic; the trait is
# whatever `_pheno` holds).
_predixcan_asthma_genes = selected_predixcan_classifier_df.loc[_pheno]

In [89]:
# Preview of the classifier rows for `_pheno`.
_predixcan_asthma_genes.head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000005187,2.666423e-09,0,0
ENSG00000005189,3.43733e-09,0,0
ENSG00000011638,3.278863e-09,0,0
ENSG00000027847,2.658019e-08,0,0
ENSG00000048140,1.2442e-08,0,0


In [90]:
# Repeated size check of the selected table.
selected_predixcan_classifier_df.shape

(19709, 3)

In [91]:
# How many pairs fall on each side of the score threshold.
selected_predixcan_classifier_df['predicted_class'].value_counts()

0    18679
1     1030
Name: predicted_class, dtype: int64

In [92]:
# Class balance according to the silver standard.
selected_predixcan_classifier_df['true_class'].value_counts()

0    19583
1      126
Name: true_class, dtype: int64

In [93]:
# Whole table with the silver-standard positives first (the display is truncated by pandas).
selected_predixcan_classifier_df.sort_values(['true_class'], ascending=[False])

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000179218,7.260000e-04,0,1
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000111252,7.058000e-02,0,1
E66-Diagnoses_main_ICD10_E66_Obesity,ENSG00000176842,1.501600e-10,0,1
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000111057,7.881000e-10,0,1
22127-Doctor_diagnosed_asthma,ENSG00000232810,1.597013e-07,0,1
20002_1111-Noncancer_illness_code_selfreported_asthma,ENSG00000124299,4.289000e-06,0,1
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000125730,5.730000e-04,0,1
20002_1456-Noncancer_illness_code_selfreported_malabsorptioncoeliac_disease,ENSG00000196735,3.151000e-10,0,1
R40-Diagnoses_main_ICD10_R40_Somnolence_stupor_and_coma,ENSG00000133812,5.454000e-11,0,1
M05-Diagnoses_main_ICD10_M05_Seropositive_rheumatoid_arthritis,ENSG00000204498,9.828148e-04,0,1


# Save classifier table

In [94]:
# remove nans: drops every row with a missing value in any column
# (in practice, pairs for which no score was available)
selected_predixcan_classifier_df = selected_predixcan_classifier_df.dropna()

In [95]:
# Preview of the table that will be saved.
selected_predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,0.02563019,0,0
1200-Sleeplessness_insomnia,ENSG00000003756,0.07435,0,0
1200-Sleeplessness_insomnia,ENSG00000004534,0.420816,1,0
1200-Sleeplessness_insomnia,ENSG00000004838,0.0004417,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,1.116773e-09,0,0


In [96]:
# Final number of rows after removing pairs with missing values.
selected_predixcan_classifier_df.shape

(19689, 3)

In [97]:
# Write the classifier table as a gzip-compressed, tab-separated file.
# index=False: the (ukb_efo, gene) index is NOT written, so the file contains
# only the score / predicted_class / true_class columns.
# NOTE(review): confirm that downstream consumers do not need the trait and
# gene identifiers.
_classifier_table_path = os.path.join(output_dir, 'fastenloc-torus-classifier_data.tsv.gz')
selected_predixcan_classifier_df.to_csv(_classifier_table_path, sep='\t', index=False)