In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import utils.constants as constants

In [3]:
# Input: preprocessed gene associations, 'en_not_fixed' subdirectory.
genes_associations_dir = os.path.join(
    constants.PREPROCESSED_BASED_DIR,
    'gene_associations',
)
smultixcan_gene_association_dirs = os.path.join(
    constants.PREPROCESSED_BASED_DIR,
    'gene_associations',
    'en_not_fixed',
)

# Output: directory for the classifier tables; created if it does not exist yet.
output_dir = os.path.join(
    constants.RESULTS_DIR,
    'roc_validation',
    'classifier_tables',
)
os.makedirs(output_dir, exist_ok=True)

# Load all S-PrediXcan gene mappings

In [4]:
def _load_metadata_pickle(filename):
    """Unpickle `filename` from the preprocessed metadata directory.

    pickle.load can execute arbitrary code stored in the file, so this must
    only be pointed at trusted, project-generated files.
    """
    pickle_path = os.path.join(constants.PREPROCESSED_METADATA_DIR, filename)
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


genes_mapping_0 = _load_metadata_pickle('genes_mapping_simplified-0.pkl')
genes_mapping_1 = _load_metadata_pickle('genes_mapping_simplified-1.pkl')

# Load S-MultiXcan results (gene association z-scores)

In [5]:
spredixcan_genes_associations_filename = os.path.join(
    smultixcan_gene_association_dirs,
    'smultixcan-genes_associations-zscores.pkl.xz',
)
display(spredixcan_genes_associations_filename)

# Gene x trait table: genes in the index, one column per trait.
spredixcan_genes_associations = pd.read_pickle(
    spredixcan_genes_associations_filename
)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/en_not_fixed/smultixcan-genes_associations-zscores.pkl.xz'

In [6]:
spredixcan_genes_associations.shape

(19910, 4077)

In [7]:
spredixcan_genes_associations.head(5)

Unnamed: 0_level_0,20003_1141171932-Treatmentmedication_code_levetiracetam,F99-Diagnoses_main_ICD10_F99_Mental_disorder_not_otherwise_specified,4674-Private_healthcare,100670-White_wine_intake,6150_2-Vascularheart_problems_diagnosed_by_doctor_Angina,D23-Diagnoses_main_ICD10_D23_Other_benign_neoplasms_of_skin,20003_1140861090-Treatmentmedication_code_adalat_5mg_capsule,20003_99999-Treatmentmedication_code_Freetext_entry_unable_to_be_coded,20003_1140865872-Treatmentmedication_code_magnesium_citrate,22601_81172834-Job_coding_metal_making_or_metal_treating_process_worker_machine_operator_furnaceman,...,22617_3433-Job_SOC_coding_Public_relations_officers,6070_1-OCT_measured_right_Measurable,I9_IHD-Ischaemic_heart_disease_wide_definition,I9_ARTEMBTHR-Arterial_embolism_and_thrombosis,K25-Diagnoses_main_ICD10_K25_Gastric_ulcer,20110_8-Illnesses_of_mother_High_blood_pressure,M13_LATERALEPICOND-Lateral_epicondylitis,DM_RETINOPAT_NOS-Unclassified_diabetic_retinopathy,20003_1141189094-Treatmentmedication_code_avandamet_1mg_500mg_tablet,20118_5-Home_area_population_density_urban_or_rural_EnglandWales_Urban_less_sparse
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.397276,0.604616,0.775742,1.436307,1.412375,1.600041,0.028202,0.067765,0.40245,0.131819,...,1.999547,1.127596,0.35693,0.824782,0.082111,0.836645,0.622258,2.076279,0.688793,2.08774
ENSG00000000457,0.319545,2.279867,0.110495,1.164098,0.505677,1.277273,0.410254,0.838215,0.092569,2.099556,...,0.392008,0.354665,0.66534,0.961071,0.679878,2.025639,0.74599,0.476264,0.362441,2.131233
ENSG00000000460,0.015547,2.091471,0.068122,2.869138,4.393092,0.159562,0.062802,0.342448,0.164821,1.794037,...,0.490251,0.42036,2.021611,0.05123,0.898618,1.938927,2.192165,0.802752,2.289388,3.226133
ENSG00000000938,0.096502,0.736958,2.556985,0.117943,3.601362,0.804478,0.280109,0.552019,0.242493,0.124827,...,0.297255,1.215982,4.066878,0.578105,1.536092,2.17154,0.032683,0.334689,0.245518,0.203841
ENSG00000001036,1.668204,0.03811,0.534008,0.136528,0.112277,0.330241,0.189969,0.33502,0.403565,0.771542,...,0.180306,2.395588,0.18654,0.790372,0.136059,2.470515,1.457462,0.151217,0.06206,0.666584


In [8]:
spredixcan_genes_associations.isin([np.inf, -np.inf]).any().any()

True

In [9]:
# Flatten every gene-trait value into a single Series so the largest finite
# values can be inspected in the following cells.
max_zscores = pd.Series(spredixcan_genes_associations.values.flatten())

In [10]:
_tmp = max_zscores.sort_values(ascending=False)

In [11]:
display(_tmp[~np.isinf(_tmp)].head())

20904547    37.691094
68870300    37.681572
61904608    37.601920
61616379    37.587317
59310196    37.577470
dtype: float64

In [12]:
# Cap +inf at 40, just above the largest finite value seen in the data (~37.69).
# Only +inf is replaced; the assertion below fails if any -inf is present.
spredixcan_genes_associations = spredixcan_genes_associations.replace(np.inf, 40)

_still_has_inf = spredixcan_genes_associations.isin([np.inf, -np.inf]).any().any()
assert not _still_has_inf

# Load OMIM silver standard

In [13]:
omim_silver_standard = pd.read_csv(os.path.join(constants.DATA_DIR, 'omim_silver_standard.tsv'), sep='\t')

In [14]:
omim_silver_standard = omim_silver_standard.dropna(subset=['ensembl_gene_id', 'trait', 'pheno_mim'])

In [15]:
display(omim_silver_standard.shape)
display(omim_silver_standard.head())

(7809, 7)

Unnamed: 0,trait,pheno_mim,mim,entry_type,entrez_gene_id,approved_gene_symbol,ensembl_gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800,188830,gene,5573,PRKAR1A,ENSG00000108946
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500,600275,gene,4853,NOTCH2,ENSG00000134250
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830,601623,gene,7337,UBE3A,ENSG00000114062
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120,190990,gene,7169,TPM2,ENSG00000198467
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145,613629,gene,63895,PIEZO2,ENSG00000154864


### Save list of mapped UK Biobank traits

In [16]:
from clustering.biclustering.analysis import Trait

In [17]:
ukb_traits_mapped = pd.Series(omim_silver_standard['trait'].unique())
display(ukb_traits_mapped.head())

0               M41-Diagnoses_main_ICD10_M41_Scoliosis
1            H80-Diagnoses_main_ICD10_H80_Otosclerosis
2    20002_1226-Noncancer_illness_code_selfreported...
3    I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
4    20002_1136-Noncancer_illness_code_selfreported...
dtype: object

In [18]:
ukb_traits_mapped.shape

(107,)

In [19]:
ukb_codes_mapped = [Trait(t).trait_code for t in ukb_traits_mapped]

In [20]:
len(ukb_codes_mapped)

107

In [21]:
ukb_codes_mapped[:5]

['M41', 'H80', '20002_1226', 'I25', '20002_1136']

In [22]:
# Input for gwas2gene (Yanyu's scripts): one UK Biobank trait code per line,
# with no header and no index column.
_selected_traits_file = '/mnt/tmp/selected_ukb_traits_omim.txt'
_codes_to_export = pd.Series(ukb_codes_mapped)
_codes_to_export.to_csv(_selected_traits_file, index=False, header=False)

# Read gwas2gene (Yanyu) results

In [23]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [24]:
# Directory holding the gwas2gene .rds result files read below.
# NOTE(review): hardcoded absolute path; the other input/output locations in this
# notebook are built from `constants` — consider doing the same here.
gwas2gene_results_dir = '/mnt/phenomexcan/results/roc_validation/ukb_gwas2gene_results_omim_silver_standard/'

In [25]:
readRDS = robjects.r['readRDS']

In [26]:
# One .rds result file is expected per mapped UK Biobank trait code.
_rds_pattern = os.path.join(gwas2gene_results_dir, '*.rds')
f_files = glob(_rds_pattern)

_n_found, _n_expected = len(f_files), len(ukb_codes_mapped)
display(_n_found)

if _n_found != _n_expected:
    print(f'WARNING: some files are not there. {_n_expected} expected, {_n_found} found.')

99



In [27]:
# Maps trait code -> Index of genes reported by gwas2gene for that trait,
# restricted to genes present in the association table.
gwas2genes_results = {}

for rds_path in f_files:
    # The trait code is the file name up to its first dot.
    trait_code_from_file = os.path.basename(rds_path).split('.')[0]

    rds_data = readRDS(rds_path)

    # NOTE(review): element [1] appears to hold the gene list, with gene ids as
    # the labels of its first entry — confirm against the gwas2gene output format.
    gene_ids = []
    if len(rds_data[1]) > 0:
        gene_ids = list(rds_data[1][0].iter_labels())
    else:
        print(f'{trait_code_from_file}: empty')

    gwas2genes_results[trait_code_from_file] = (
        spredixcan_genes_associations.index.intersection(set(gene_ids))
    )

In [28]:
# Pool the genes of every trait; a gene reported for several traits is repeated.
gwas2gene_all_genes = [
    gene_id
    for genes_of_trait in gwas2genes_results.values()
    for gene_id in genes_of_trait
]
display(len(gwas2gene_all_genes))

# Unique genes across all traits.
gwas2gene_all_genes = set(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

18330

9058

# Create list of UKB-OMIM traits

In [29]:
omim_silver_standard.head()

Unnamed: 0,trait,pheno_mim,mim,entry_type,entrez_gene_id,approved_gene_symbol,ensembl_gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800,188830,gene,5573,PRKAR1A,ENSG00000108946
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500,600275,gene,4853,NOTCH2,ENSG00000134250
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830,601623,gene,7337,UBE3A,ENSG00000114062
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120,190990,gene,7169,TPM2,ENSG00000198467
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145,613629,gene,63895,PIEZO2,ENSG00000154864


# Create S-MultiXcan classifier table

In [30]:
# Trait/gene pairs listed in the OMIM silver standard.
_omim_trait_gene_pairs = omim_silver_standard[['trait', 'ensembl_gene_id']]
ukb_traits_common = _omim_trait_gene_pairs['trait'].unique()

# One row per distinct (trait, gene) pair, flagged with omim_value=1 and
# indexed by the pair itself.
omim_true_classes = (
    _omim_trait_gene_pairs
    .drop_duplicates()
    .assign(omim_value=1)
    .set_index(['trait', 'ensembl_gene_id'])
)

In [31]:
len(ukb_traits_common)

107

In [32]:
omim_true_classes.shape

(7046, 1)

In [33]:
omim_true_classes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,omim_value
trait,ensembl_gene_id,Unnamed: 2_level_1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000108946,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000134250,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000114062,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000198467,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000154864,1


In [34]:
len(ukb_traits_common)

107

In [35]:
# (trait, gene) pairs for every trait that has a gwas2gene result; traits
# without a result are skipped.
index_tuples = []

for trait_name in ukb_traits_common:
    genes_for_trait = gwas2genes_results.get(Trait(trait_name).trait_code)
    if genes_for_trait is None:
        continue

    index_tuples.extend((trait_name, gene_id) for gene_id in genes_for_trait)

In [36]:
len(index_tuples)

18330

In [37]:
index_tuples[:5]

[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000075336'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000078401'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000090263'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000090266'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000095951')]

In [38]:
classifier_index = pd.MultiIndex.from_tuples(
    index_tuples,
    names=['ukb_efo', 'gene']
)

In [39]:
len(gwas2gene_all_genes)

9058

In [40]:
classifier_index.shape

(18330,)

In [41]:
predixcan_classifier_df = pd.DataFrame(index=classifier_index, columns=['score', 'predicted_class', 'true_class'])

In [42]:
predixcan_classifier_df = predixcan_classifier_df.sort_index()

In [43]:
predixcan_classifier_df.shape

(18330, 3)

In [44]:
predixcan_classifier_df['true_class'] = 0

In [45]:
predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,,,0
1200-Sleeplessness_insomnia,ENSG00000003756,,,0
1200-Sleeplessness_insomnia,ENSG00000004534,,,0
1200-Sleeplessness_insomnia,ENSG00000004838,,,0
1200-Sleeplessness_insomnia,ENSG00000004897,,,0


In [46]:
# Positive labels as a Series indexed by (trait, ensembl_gene_id).
# The column is selected by name instead of using .squeeze(): squeeze would
# collapse a one-row table to a scalar, breaking the .shape/.index uses below.
true_classes = omim_true_classes['omim_value']
display(true_classes.shape)
display(true_classes.head())

(7046,)

trait                                   ensembl_gene_id
M41-Diagnoses_main_ICD10_M41_Scoliosis  ENSG00000108946    1
                                        ENSG00000134250    1
                                        ENSG00000114062    1
                                        ENSG00000198467    1
                                        ENSG00000154864    1
Name: omim_value, dtype: int64

In [47]:
# Mark as positive every (trait, gene) row that also appears in the OMIM labels.
_positive_pairs = predixcan_classifier_df.index.intersection(true_classes.index)
predixcan_classifier_df.loc[_positive_pairs, 'true_class'] = 1

In [48]:
assert predixcan_classifier_df['true_class'].isna().sum() == 0

In [49]:
predixcan_classifier_df['true_class'].value_counts()

0    18208
1      122
Name: true_class, dtype: int64

In [50]:
# some testing
predixcan_classifier_df.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis',)].head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000075336,,,0
ENSG00000078401,,,0
ENSG00000090263,,,0
ENSG00000090266,,,0
ENSG00000095951,,,0


In [51]:
true_classes.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000112234')]

1

In [52]:
'ENSG00000090263' not in true_classes.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis'].index

True

In [53]:
assert predixcan_classifier_df.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000112234'), 'true_class'] == 1.0
assert predixcan_classifier_df.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000090263'), 'true_class'] == 0.0

In [54]:
len(gwas2gene_all_genes)

9058

In [55]:
# score
# The dtype is stated explicitly: pandas 1.x emits a DeprecationWarning when a
# Series is created without data and without a dtype.
df_score = pd.Series(index=classifier_index, dtype=float)

for trait in ukb_traits_common:
    trait_code = Trait(trait).trait_code
    if trait_code not in gwas2genes_results:
        # No gwas2gene result for this trait: report its code and move on.
        print(trait_code)
        continue

    trait_genes = gwas2genes_results[trait_code]
    if len(trait_genes) == 0:
        # The trait contributed no rows to classifier_index, so there is
        # nothing to fill (and assigning to a missing key would misbehave).
        continue

    scores = spredixcan_genes_associations.loc[trait_genes, trait]
    # Positional assignment: the classifier_index rows of this trait were built
    # from this same gene Index, in the same order.
    df_score.loc[trait] = scores.values

M13
I95
I71
20002_1264
20002_1081
22130
C80
20002_1538


In [56]:
# Drop pairs that were left without a score, then order rows by (ukb_efo, gene).
df_score = df_score.dropna()
df_score = df_score.sort_index()

# Sanity check: no missing scores remain.
assert not df_score.isna().any()

In [57]:
df_score.head()

ukb_efo                      gene           
1200-Sleeplessness_insomnia  ENSG00000001617    5.467206
                             ENSG00000003756    5.163661
                             ENSG00000004534    5.500549
                             ENSG00000004838    0.882240
                             ENSG00000004897    0.150213
dtype: float64

In [58]:
# some testing
_gene, _trait = ('ENSG00000090263', 'M41-Diagnoses_main_ICD10_M41_Scoliosis')
assert spredixcan_genes_associations.loc[_gene, _trait] == df_score.loc[_trait, _gene]

In [59]:
_gene, _trait = ('ENSG00000070061', 'O14-Diagnoses_main_ICD10_O14_Gestational_pregnancyinduced_hypertension_with_significant_proteinuria')
assert spredixcan_genes_associations.loc[_gene, _trait] == df_score.loc[_trait, _gene]

In [60]:
df_score.shape

(18330,)

In [61]:
df_score.head()

ukb_efo                      gene           
1200-Sleeplessness_insomnia  ENSG00000001617    5.467206
                             ENSG00000003756    5.163661
                             ENSG00000004534    5.500549
                             ENSG00000004838    0.882240
                             ENSG00000004897    0.150213
dtype: float64

In [62]:
df_score.min()

3.747375629054999e-08

In [63]:
df_score.max()

40.0

In [64]:
predixcan_classifier_df = predixcan_classifier_df.assign(score=df_score)

In [65]:
assert not predixcan_classifier_df['score'].isna().any()

In [66]:
from scipy import stats

In [67]:
# Bonferroni-style cutoff: 0.05 divided by (unique genes) x (mapped traits).
# NOTE(review): this count (969206) is larger than the 18330 gene-trait rows
# actually scored, which cover 99 traits — confirm the intended denominator.
_n_genes = len(gwas2gene_all_genes)
display(_n_genes)

_n_ukb_traits = len(ukb_traits_mapped)
display(_n_ukb_traits)

_n_tests = _n_genes * _n_ukb_traits
display(_n_tests)

PVALUE_THRESHOLD = 0.05 / _n_tests
display(PVALUE_THRESHOLD)

# Two-sided conversion of the p-value cutoff into the matching |z| cutoff.
_lower_tail_z = stats.norm.ppf(PVALUE_THRESHOLD / 2)
ZSCORE_THRESHOLD = np.abs(_lower_tail_z)
display(ZSCORE_THRESHOLD)

9058

107

969206

5.158861996314509e-08

5.445746310266604

In [68]:
# A pair is predicted positive (1) when its score is strictly above the cutoff.
_is_significant = predixcan_classifier_df['score'] > ZSCORE_THRESHOLD
predixcan_classifier_df = predixcan_classifier_df.assign(
    predicted_class=_is_significant.astype(int)
)

In [69]:
predixcan_classifier_df.shape

(18330, 3)

In [70]:
predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,5.467206,1,0
1200-Sleeplessness_insomnia,ENSG00000003756,5.163661,0,0
1200-Sleeplessness_insomnia,ENSG00000004534,5.500549,1,0
1200-Sleeplessness_insomnia,ENSG00000004838,0.88224,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,0.150213,0,0


In [71]:
predixcan_classifier_df.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis'].sort_values('true_class', ascending=False).head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000157764,1.79563,0,1
ENSG00000112234,0.930357,0,1
ENSG00000162769,0.189666,0,1
ENSG00000075336,0.88619,0,0
ENSG00000166347,0.252202,0,0


## Select genes per trait

In [72]:
# No per-trait gene filtering is applied: every row is kept, and the name below
# refers to the same DataFrame object (not a copy).
selected_predixcan_classifier_df = predixcan_classifier_df

In [73]:
# some testing

In [74]:
selected_predixcan_classifier_df.shape

(18330, 3)

In [75]:
selected_predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,5.467206,1,0
1200-Sleeplessness_insomnia,ENSG00000003756,5.163661,0,0
1200-Sleeplessness_insomnia,ENSG00000004534,5.500549,1,0
1200-Sleeplessness_insomnia,ENSG00000004838,0.88224,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,0.150213,0,0


In [76]:
selected_predixcan_classifier_df.sort_values('predicted_class').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20002_1226-Noncancer_illness_code_selfreported_hypothyroidismmyxoedema,ENSG00000126561,3.463177,0,0
20002_1459-Noncancer_illness_code_selfreported_colitisnot_crohns_or_ulcerative_colitis,ENSG00000166278,1.258235,0,0
20002_1459-Noncancer_illness_code_selfreported_colitisnot_crohns_or_ulcerative_colitis,ENSG00000168477,0.321865,0,0
20002_1459-Noncancer_illness_code_selfreported_colitisnot_crohns_or_ulcerative_colitis,ENSG00000179344,2.735683,0,0
20002_1459-Noncancer_illness_code_selfreported_colitisnot_crohns_or_ulcerative_colitis,ENSG00000196126,3.369544,0,0


In [77]:
selected_predixcan_classifier_df.sort_values('predicted_class', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,5.467206,1,0
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease,ENSG00000175003,10.307097,1,0
20002_1453-Noncancer_illness_code_selfreported_psoriasis,ENSG00000241404,18.607254,1,0
20002_1453-Noncancer_illness_code_selfreported_psoriasis,ENSG00000242574,5.741933,1,0
20002_1453-Noncancer_illness_code_selfreported_psoriasis,ENSG00000243649,12.103758,1,0


In [78]:
_tmp = selected_predixcan_classifier_df.sort_values(['true_class', 'ukb_efo'], ascending=False)
display(_tmp.shape)
display(_tmp[_tmp['true_class'] > 0].shape)
display(_tmp[_tmp['true_class'] > 0].head())

(18330, 3)

(122, 3)

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R69-Diagnoses_main_ICD10_R69_Unknown_and_unspecified_causes_of_morbidity,ENSG00000068305,0.611996,0,1
R69-Diagnoses_main_ICD10_R69_Unknown_and_unspecified_causes_of_morbidity,ENSG00000140443,1.579324,0,1
R40-Diagnoses_main_ICD10_R40_Somnolence_stupor_and_coma,ENSG00000133812,2.943825,0,1
R40-Diagnoses_main_ICD10_R40_Somnolence_stupor_and_coma,ENSG00000183287,1.652494,0,1
N20-Diagnoses_main_ICD10_N20_Calculus_of_kidney_and_ureter,ENSG00000131183,6.566289,1,1


### Test classes

In [79]:
selected_predixcan_classifier_df.index.get_level_values('ukb_efo').unique().shape

(99,)

In [80]:
selected_predixcan_classifier_df.index.get_level_values('gene').unique().shape

(9058,)

In [81]:
_pheno = 'N20-Diagnoses_main_ICD10_N20_Calculus_of_kidney_and_ureter'

In [82]:
# NOTE: despite the variable name, these are the OMIM silver-standard genes for
# `_pheno` (here N20, calculus of kidney and ureter), not ClinVar or asthma genes.
_clinvar_asthma_genes = omim_silver_standard[omim_silver_standard['trait'] == _pheno]['ensembl_gene_id'].unique()
display(_clinvar_asthma_genes)
display(_clinvar_asthma_genes.shape)

array(['ENSG00000075891', 'ENSG00000130600', 'ENSG00000269821',
       'ENSG00000129757', 'ENSG00000143473', 'ENSG00000275410',
       'ENSG00000019186', 'ENSG00000134371', 'ENSG00000036828',
       'ENSG00000088256', 'ENSG00000187091', 'ENSG00000122194',
       'ENSG00000138592', 'ENSG00000040531', 'ENSG00000138079',
       'ENSG00000021488', 'ENSG00000090402', 'ENSG00000131482',
       'ENSG00000137700', 'ENSG00000164007', 'ENSG00000113946',
       'ENSG00000116039', 'ENSG00000168000', 'ENSG00000123191',
       'ENSG00000171365', 'ENSG00000165704', 'ENSG00000042753',
       'ENSG00000075643', 'ENSG00000169692', 'ENSG00000109667',
       'ENSG00000131183', 'ENSG00000109062', 'ENSG00000149257',
       'ENSG00000198931', 'ENSG00000157388', 'ENSG00000167207',
       'ENSG00000124827', 'ENSG00000134873'], dtype=object)

(38,)

In [83]:
_tmp = selected_predixcan_classifier_df.loc[_pheno]
_tmp.loc[_tmp.index.intersection(_clinvar_asthma_genes)]

Unnamed: 0,score,predicted_class,true_class
ENSG00000131183,6.566289,1,1


In [84]:
_predixcan_asthma_genes = selected_predixcan_classifier_df.loc[_pheno]

In [85]:
_predixcan_asthma_genes.head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000005187,2.629161,0,0
ENSG00000005189,2.183872,0,0
ENSG00000011638,1.033352,0,0
ENSG00000027847,0.92211,0,0
ENSG00000048140,1.544258,0,0


In [86]:
selected_predixcan_classifier_df.shape

(18330, 3)

In [87]:
selected_predixcan_classifier_df['predicted_class'].value_counts()

0    16105
1     2225
Name: predicted_class, dtype: int64

In [88]:
selected_predixcan_classifier_df['true_class'].value_counts()

0    18208
1      122
Name: true_class, dtype: int64

In [89]:
selected_predixcan_classifier_df.sort_values(['true_class'], ascending=[False])

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20002_1111-Noncancer_illness_code_selfreported_asthma,ENSG00000169194,12.736808,1,1
20002_1309-Noncancer_illness_code_selfreported_osteoporosis,ENSG00000106080,0.673340,0,1
N20-Diagnoses_main_ICD10_N20_Calculus_of_kidney_and_ureter,ENSG00000131183,6.566289,1,1
20002_1226-Noncancer_illness_code_selfreported_hypothyroidismmyxoedema,ENSG00000115705,9.620747,1,1
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease,ENSG00000111252,3.933903,0,1
20002_1226-Noncancer_illness_code_selfreported_hypothyroidismmyxoedema,ENSG00000115415,2.964074,0,1
20002_1226-Noncancer_illness_code_selfreported_hypothyroidismmyxoedema,ENSG00000171862,3.526507,0,1
20002_1075-Noncancer_illness_code_selfreported_heart_attackmyocardial_infarction,ENSG00000111252,3.159834,0,1
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease,ENSG00000185950,0.260638,0,1
I48-Diagnoses_main_ICD10_I48_Atrial_fibrillation_and_flutter,ENSG00000120457,0.689504,0,1


# Save classifier table

In [103]:
selected_predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,5.467206,1,0
1200-Sleeplessness_insomnia,ENSG00000003756,5.163661,0,0
1200-Sleeplessness_insomnia,ENSG00000004534,5.500549,1,0
1200-Sleeplessness_insomnia,ENSG00000004838,0.88224,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,0.150213,0,0


In [104]:
selected_predixcan_classifier_df.shape

(18330, 3)

In [105]:
# index=False drops the (ukb_efo, gene) MultiIndex, so the file holds only the
# score, predicted_class and true_class columns.
# NOTE(review): confirm downstream consumers do not need the trait/gene ids.
_classifier_table_path = os.path.join(
    output_dir,
    'smultixcan-en_not_fixed-classifier_data.tsv.gz',
)
selected_predixcan_classifier_df.to_csv(_classifier_table_path, sep='\t', index=False)