In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import utils.constants as constants

In [3]:
# Input locations: gene-association results live under the preprocessed-data tree;
# this notebook reads from its 'mashr' subdirectory.
genes_associations_dir = os.path.join(
    constants.PREPROCESSED_BASED_DIR,
    'gene_associations',
)
smultixcan_gene_association_dirs = os.path.join(genes_associations_dir, 'mashr')

# Output location for the classifier tables; created up front if it does not exist yet.
_classifier_tables_parts = ('roc_validation', 'classifier_tables')
output_dir = os.path.join(constants.RESULTS_DIR, *_classifier_tables_parts)
os.makedirs(output_dir, exist_ok=True)

# Load all S-PrediXcan gene mappings

In [4]:
# Load the two simplified gene-mapping pickles from the preprocessed metadata directory.
# NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
# NOTE(review): neither mapping is referenced again in the cells visible here — confirm
# whether these loads are still needed.
_genes_mapping_0_path = os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-0.pkl')
with open(_genes_mapping_0_path, 'rb') as mapping_file:
    genes_mapping_0 = pickle.load(mapping_file)

_genes_mapping_1_path = os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-1.pkl')
with open(_genes_mapping_1_path, 'rb') as mapping_file:
    genes_mapping_1 = pickle.load(mapping_file)

# Load S-MultiXcan results (gene association z-scores)

In [5]:
# Path of the pickled association matrix. Despite the `spredixcan_` prefix on the variable
# names, the file being read is the S-MultiXcan z-score matrix.
spredixcan_genes_associations_filename = os.path.join(
    smultixcan_gene_association_dirs,
    'smultixcan-genes_associations-zscores.pkl.xz',
)
display(spredixcan_genes_associations_filename)

# Genes are in the index and traits in the columns (see the preview below).
spredixcan_genes_associations = pd.read_pickle(spredixcan_genes_associations_filename)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/smultixcan-genes_associations-zscores.pkl.xz'

In [6]:
spredixcan_genes_associations.shape

(22255, 4083)

In [7]:
spredixcan_genes_associations.head(5)

Unnamed: 0_level_0,L12_EPIDERMALTHICKOTH-Other_epidermal_thickening,O42-Diagnoses_main_ICD10_O42_Premature_rupture_of_membranes,20002_1077-Noncancer_illness_code_selfreported_heart_arrhythmia,20445-Depression_possibly_related_to_childbirth,20077-Number_of_diet_questionnaires_completed,22601_91392832-Job_coding_other_work_in_this_industry_factory_hand_mate_assistant_handler_loader,I9_VTE-Venous_thromboembolism,22617_1161-Job_SOC_coding_Transport_and_distribution_managers,20002_1460-Noncancer_illness_code_selfreported_rectal_or_colon_adenomapolyps,5181-Ever_had_eye_surgery,...,20090_394-Type_of_fatoil_used_in_cooking_Unknown_soft_margarine,22617_3512-Job_SOC_coding_Aircraft_pilots_and_flight_engineers,6034-Target_heart_rate_achieved,20003_1140883066-Treatmentmedication_code_insulin_product,22601_41223241-Job_coding_accounts_and_wages_clerkassistantsupervisor_bookkeeper_cost_or_ledger_clerk_audit_assistant_budget_officer_student_loans_officer_paymaster,I82-Diagnoses_main_ICD10_I82_Other_venous_embolism_and_thrombosis,20107_12-Illnesses_of_father_Severe_depression,B07-Diagnoses_main_ICD10_B07_Viral_warts,22601_12253140-Job_coding_sports_centre_manager_riding_school_owner_sports_ground_manager_baths_manager,2664_2-Reason_for_reducing_amount_of_alcohol_drunk_Doctors_advice
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.118542,1.185152,0.506195,0.852355,0.604068,1.014925,0.874812,1.995403,1.226674,2.401157,...,0.950455,0.770204,0.788341,0.941923,1.470308,0.663953,0.270128,0.01834,2.068068,0.693219
ENSG00000000457,0.52017,1.062542,1.306725,0.989147,1.981863,0.389948,1.558053,0.780973,0.583048,0.039883,...,0.104441,0.474818,0.005264,1.766256,0.008613,0.539178,1.340549,0.745059,1.091906,0.11291
ENSG00000000460,0.181827,0.454945,0.422742,0.212844,1.121509,0.594248,3.425656,0.828176,0.877009,1.024421,...,0.181627,0.614678,1.237589,0.212545,1.021029,0.298612,1.556284,0.44378,0.261719,0.060068
ENSG00000000938,0.289141,0.212541,1.170981,0.735132,0.074349,0.626647,2.42166,2.716722,0.655375,0.536942,...,0.290044,0.717265,0.368518,1.496889,0.045447,0.018784,1.238725,0.876929,0.675103,3.171424
ENSG00000000971,0.465188,1.051131,0.797415,0.382712,1.04571,1.470092,1.435553,0.311438,1.213439,1.411339,...,1.691603,3.562145,2.040159,0.281551,0.933999,0.940077,0.14434,0.594995,0.00031,0.411814


In [8]:
spredixcan_genes_associations.isin([np.inf, -np.inf]).any().any()

True

In [9]:
# Flatten the whole gene x trait matrix into one long Series of z-scores.
# NOTE(review): despite its name, `max_zscores` holds every value, not per-gene or
# per-trait maxima; it is only used to inspect the largest finite values.
_flat_zscores = spredixcan_genes_associations.values.flatten()
max_zscores = pd.Series(_flat_zscores)

In [10]:
# Order all flattened z-scores from largest to smallest (infinite values sort first).
_tmp = max_zscores.sort_values(ascending=False)

In [11]:
display(_tmp[~np.isinf(_tmp)].head())

22781443    37.737142
21129675    37.723815
5424550     37.720171
89795722    37.710072
21164553    37.698289
dtype: float64

In [12]:
# Cap infinite z-scores at 40, a value just above the largest finite z-score shown
# above (~37.74), so that every score is finite from here on.
# Only +inf is substituted; the assertion below also guards against -inf.
spredixcan_genes_associations = spredixcan_genes_associations.replace(to_replace=np.inf, value=40)

_still_has_inf = spredixcan_genes_associations.isin([np.inf, -np.inf]).any().any()
assert not _still_has_inf

# Load OMIM silver standard

In [13]:
# Tab-separated table linking UK Biobank traits to OMIM phenotypes and genes.
_omim_silver_standard_path = os.path.join(constants.DATA_DIR, 'omim_silver_standard.tsv')
omim_silver_standard = pd.read_csv(_omim_silver_standard_path, sep='\t')

In [14]:
# Keep only rows where the gene id, the trait and the OMIM phenotype id are all present.
_required_columns = ['ensembl_gene_id', 'trait', 'pheno_mim']
omim_silver_standard = omim_silver_standard.dropna(subset=_required_columns)

In [15]:
display(omim_silver_standard.shape)
display(omim_silver_standard.head())

(7809, 7)

Unnamed: 0,trait,pheno_mim,mim,entry_type,entrez_gene_id,approved_gene_symbol,ensembl_gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800,188830,gene,5573,PRKAR1A,ENSG00000108946
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500,600275,gene,4853,NOTCH2,ENSG00000134250
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830,601623,gene,7337,UBE3A,ENSG00000114062
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120,190990,gene,7169,TPM2,ENSG00000198467
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145,613629,gene,63895,PIEZO2,ENSG00000154864


### Save list of mapped UK Biobank traits

In [16]:
from clustering.biclustering.analysis import Trait

In [17]:
# One entry per distinct UK Biobank trait present in the OMIM silver standard.
_unique_traits = omim_silver_standard['trait'].unique()
ukb_traits_mapped = pd.Series(_unique_traits)
display(ukb_traits_mapped.head())

0               M41-Diagnoses_main_ICD10_M41_Scoliosis
1            H80-Diagnoses_main_ICD10_H80_Otosclerosis
2    20002_1226-Noncancer_illness_code_selfreported...
3    I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic...
4    20002_1136-Noncancer_illness_code_selfreported...
dtype: object

In [18]:
ukb_traits_mapped.shape

(107,)

In [19]:
# Reduce each full trait name to the short code exposed by the project's Trait helper.
ukb_codes_mapped = [Trait(trait_name).trait_code for trait_name in ukb_traits_mapped]

In [20]:
len(ukb_codes_mapped)

107

In [21]:
ukb_codes_mapped[:5]

['M41', 'H80', '20002_1226', 'I25', '20002_1136']

In [22]:
# This is needed to run gwas2gene (Yanyu's scripts): one trait code per line,
# written without index or header.
# NOTE(review): the destination is a hard-coded absolute path; consider deriving it
# from a configurable directory.
_ukb_codes_series = pd.Series(ukb_codes_mapped)
_ukb_codes_series.to_csv('/mnt/tmp/selected_ukb_traits_omim.txt', index=False, header=False)

# Read gwas2gene (Yanyu) results

In [23]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [24]:
# Directory holding the gwas2gene output (.rds files) for the OMIM silver-standard traits.
# NOTE(review): hard-coded absolute path; other locations in this notebook are built from
# `constants` — consider doing the same here.
gwas2gene_results_dir = '/mnt/phenomexcan/results/roc_validation/ukb_gwas2gene_results_omim_silver_standard/'

In [25]:
readRDS = robjects.r['readRDS']

In [26]:
# Collect every .rds result file produced by gwas2gene.
_rds_pattern = os.path.join(gwas2gene_results_dir, '*.rds')
f_files = glob(_rds_pattern)
display(len(f_files))

# One file per mapped trait code is expected; warn (but carry on) when the counts differ.
_all_files_present = len(f_files) == len(ukb_codes_mapped)
if not _all_files_present:
    print(f'WARNING: some files are not there. {len(ukb_codes_mapped)} expected, {len(f_files)} found.')

99



In [27]:
# For each gwas2gene RDS file, keep the genes it reports that also appear in the
# association matrix, keyed by the code taken from the file name.
gwas2genes_results = {}

for rds_path in f_files:
    # The part of the file name before the first dot is used as the key.
    f_code = os.path.basename(rds_path).split('.')[0]

    rds_contents = readRDS(rds_path)

    # NOTE(review): element [1] presumably holds the gene-level results and the labels
    # of its first entry the gene ids — confirm against the gwas2gene output format.
    if len(rds_contents[1]) == 0:
        print(f'{f_code}: empty')
        reported_genes = []
    else:
        reported_genes = list(rds_contents[1][0].iter_labels())

    # Restrict to genes that are present in the association matrix index.
    gwas2genes_results[f_code] = spredixcan_genes_associations.index.intersection(set(reported_genes))

In [28]:
# Pool the genes of every trait: first with repetitions (total number of trait-gene
# pairs), then as a set (number of distinct genes).
_all_gene_hits = [
    gene
    for trait_gene_index in gwas2genes_results.values()
    for gene in trait_gene_index
]
display(len(_all_gene_hits))

gwas2gene_all_genes = set(_all_gene_hits)
display(len(gwas2gene_all_genes))

19119

9463

# Create list of UKB-OMIM traits

In [29]:
omim_silver_standard.head()

Unnamed: 0,trait,pheno_mim,mim,entry_type,entrez_gene_id,approved_gene_symbol,ensembl_gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800,188830,gene,5573,PRKAR1A,ENSG00000108946
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500,600275,gene,4853,NOTCH2,ENSG00000134250
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830,601623,gene,7337,UBE3A,ENSG00000114062
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120,190990,gene,7169,TPM2,ENSG00000198467
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145,613629,gene,63895,PIEZO2,ENSG00000154864


# Create S-MultiXcan classifier table

In [30]:
# Work only with the (trait, gene) columns of the silver standard.
_tmp = omim_silver_standard[['trait', 'ensembl_gene_id']]
ukb_traits_common = _tmp['trait'].unique()

# Every distinct (trait, gene) pair in the silver standard is a positive (omim_value = 1),
# indexed by the pair itself for later lookups.
omim_true_classes = (
    _tmp
    .drop_duplicates()
    .assign(omim_value=1)
    .set_index(['trait', 'ensembl_gene_id'])
)

In [31]:
len(ukb_traits_common)

107

In [32]:
omim_true_classes.shape

(7046, 1)

In [33]:
omim_true_classes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,omim_value
trait,ensembl_gene_id,Unnamed: 2_level_1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000108946,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000134250,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000114062,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000198467,1
M41-Diagnoses_main_ICD10_M41_Scoliosis,ENSG00000154864,1


In [34]:
len(ukb_traits_common)

107

In [35]:
# Build the (trait, gene) pairs of the classifier table: for every silver-standard trait
# that has gwas2gene results, one pair per gene reported for it. Traits without results
# are skipped. Pair order follows ukb_traits_common, then the per-trait gene order.
_traits_with_codes = ((trait, Trait(trait).trait_code) for trait in ukb_traits_common)

index_tuples = [
    (trait, gene)
    for trait, code in _traits_with_codes
    if code in gwas2genes_results
    for gene in gwas2genes_results[code]
]

In [36]:
len(index_tuples)

19119

In [37]:
index_tuples[:5]

[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000012504'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000075089'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000075336'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000078401'),
 ('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000090263')]

In [38]:
# Two-level index over the (trait, gene) pairs. The first level is named 'ukb_efo'
# although it holds the UK Biobank trait names taken from the silver standard.
_index_level_names = ['ukb_efo', 'gene']
classifier_index = pd.MultiIndex.from_tuples(index_tuples, names=_index_level_names)

In [39]:
len(gwas2gene_all_genes)

9463

In [40]:
classifier_index.shape

(19119,)

In [41]:
# Empty classifier table: one row per (trait, gene) pair; the three columns are filled in
# by the following cells.
_classifier_columns = ['score', 'predicted_class', 'true_class']
predixcan_classifier_df = pd.DataFrame(index=classifier_index, columns=_classifier_columns)

In [42]:
# Sort rows by (trait, gene) so that label-based lookups on the MultiIndex work on a
# sorted index.
predixcan_classifier_df = predixcan_classifier_df.sort_index()

In [43]:
predixcan_classifier_df.shape

(19119, 3)

In [44]:
# Default every pair to the negative class; OMIM-supported pairs are set to 1 further down.
predixcan_classifier_df['true_class'] = 0

In [45]:
predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,,,0
1200-Sleeplessness_insomnia,ENSG00000003756,,,0
1200-Sleeplessness_insomnia,ENSG00000004534,,,0
1200-Sleeplessness_insomnia,ENSG00000004838,,,0
1200-Sleeplessness_insomnia,ENSG00000004897,,,0


In [46]:
# Collapse the single-column frame into a Series indexed by (trait, gene), then show
# its size and first rows.
true_classes = omim_true_classes.squeeze()
display(true_classes.shape, true_classes.head())

(7046,)

trait                                   ensembl_gene_id
M41-Diagnoses_main_ICD10_M41_Scoliosis  ENSG00000108946    1
                                        ENSG00000134250    1
                                        ENSG00000114062    1
                                        ENSG00000198467    1
                                        ENSG00000154864    1
Name: omim_value, dtype: int64

In [47]:
# Pairs that are both in the classifier table and in the OMIM silver standard are the
# positives; label them 1 in place.
_omim_pairs_in_table = predixcan_classifier_df.index.intersection(true_classes.index)
predixcan_classifier_df.loc[_omim_pairs_in_table, 'true_class'] = 1

In [48]:
assert predixcan_classifier_df['true_class'].isna().sum() == 0

In [49]:
# some testing
predixcan_classifier_df.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis',)].head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000012504,,,0
ENSG00000075089,,,0
ENSG00000075336,,,0
ENSG00000078401,,,0
ENSG00000090263,,,0


In [50]:
true_classes.loc[('M41-Diagnoses_main_ICD10_M41_Scoliosis', 'ENSG00000112234')]

1

In [51]:
'ENSG00000090263' not in true_classes.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis'].index

True

In [52]:
# Spot checks on the scoliosis trait: a gene listed in the silver standard must be
# labelled 1, a gene absent from it must stay 0.
_scoliosis_trait = 'M41-Diagnoses_main_ICD10_M41_Scoliosis'
_positive_pair = (_scoliosis_trait, 'ENSG00000112234')
_negative_pair = (_scoliosis_trait, 'ENSG00000090263')

assert predixcan_classifier_df.loc[_positive_pair, 'true_class'] == 1.0
assert predixcan_classifier_df.loc[_negative_pair, 'true_class'] == 0.0

In [53]:
len(gwas2gene_all_genes)

9463

In [54]:
# score
# One z-score per (trait, gene) pair in classifier_index, initialised to NaN.
# The dtype is given explicitly: creating an empty Series without one raises a pandas
# deprecation warning and yields object dtype on newer pandas versions.
df_score = pd.Series(index=classifier_index, dtype='float64')

for trait in ukb_traits_common:
    trait_code = Trait(trait).trait_code
    if trait_code not in gwas2genes_results:
        # No gwas2gene results for this trait: report its code and skip it.
        print(trait_code)
        continue
    trait_genes = gwas2genes_results[trait_code]
    scores = spredixcan_genes_associations.loc[trait_genes, trait]
    # Positional assignment: this relies on the trait's rows in classifier_index being
    # in the same gene order as gwas2genes_results[trait_code], which is how
    # index_tuples was built.
    df_score.loc[trait] = scores.values

M13
I95
I71
20002_1264
20002_1081
22130
C80
20002_1538


In [55]:
# some testing
# Every (trait, gene) pair must have received a score. The check runs on the Series as
# filled above; asserting after a dropna() could never fail and would hide missing scores.
_n_missing_scores = int(df_score.isna().sum())
assert _n_missing_scores == 0, f'{_n_missing_scores} trait-gene pairs have no score'

# Sort by (trait, gene) to match the row order of the classifier table.
df_score = df_score.sort_index()

In [56]:
df_score.head()

ukb_efo                      gene           
1200-Sleeplessness_insomnia  ENSG00000001617    5.986574
                             ENSG00000003756    5.540858
                             ENSG00000004534    5.270682
                             ENSG00000004838    1.168578
                             ENSG00000004897    0.042414
dtype: float64

In [57]:
# some testing
# The score stored for a (trait, gene) pair must equal the value in the association matrix.
_trait = 'M41-Diagnoses_main_ICD10_M41_Scoliosis'
_gene = 'ENSG00000090263'
_matrix_value = spredixcan_genes_associations.loc[_gene, _trait]
assert _matrix_value == df_score.loc[_trait, _gene]

In [58]:
# Same consistency check for a second (trait, gene) pair.
_trait = 'O14-Diagnoses_main_ICD10_O14_Gestational_pregnancyinduced_hypertension_with_significant_proteinuria'
_gene = 'ENSG00000070061'
_matrix_value = spredixcan_genes_associations.loc[_gene, _trait]
assert _matrix_value == df_score.loc[_trait, _gene]

In [59]:
df_score.shape

(19119,)

In [60]:
df_score.head()

ukb_efo                      gene           
1200-Sleeplessness_insomnia  ENSG00000001617    5.986574
                             ENSG00000003756    5.540858
                             ENSG00000004534    5.270682
                             ENSG00000004838    1.168578
                             ENSG00000004897    0.042414
dtype: float64

In [61]:
df_score.min()

9.034959836740552e-05

In [62]:
df_score.max()

40.0

In [63]:
# Attach the scores; assign() aligns df_score to the table on the (trait, gene) index,
# so any pair missing from df_score would end up as NaN here.
predixcan_classifier_df = predixcan_classifier_df.assign(score=df_score)

In [64]:
assert not predixcan_classifier_df['score'].isna().any()

In [65]:
from scipy import stats

In [66]:
# Bonferroni-style significance threshold: 0.05 divided by the number of distinct
# gwas2gene genes times the number of mapped UK Biobank traits.
_n_genes = len(gwas2gene_all_genes)
_n_ukb_traits = len(ukb_traits_mapped)
_n_tests = _n_genes * _n_ukb_traits

display(_n_genes)
display(_n_ukb_traits)
display(_n_tests)

PVALUE_THRESHOLD = 0.05 / _n_tests
display(PVALUE_THRESHOLD)

# Convert the p-value cut-off into the matching two-sided |z| cut-off, since the
# scores being thresholded are z-scores.
_lower_tail_quantile = stats.norm.ppf(PVALUE_THRESHOLD / 2)
ZSCORE_THRESHOLD = np.abs(_lower_tail_quantile)
display(ZSCORE_THRESHOLD)

9463

107

1012541

4.938071643518633e-08

5.453526019972858

In [67]:
# A pair is predicted positive (1) when its score is strictly above the z-score threshold.
_above_threshold = predixcan_classifier_df['score'] > ZSCORE_THRESHOLD
predixcan_classifier_df = predixcan_classifier_df.assign(predicted_class=_above_threshold.astype(int))

In [68]:
predixcan_classifier_df.shape

(19119, 3)

In [69]:
predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,5.986574,1,0
1200-Sleeplessness_insomnia,ENSG00000003756,5.540858,1,0
1200-Sleeplessness_insomnia,ENSG00000004534,5.270682,0,0
1200-Sleeplessness_insomnia,ENSG00000004838,1.168578,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,0.042414,0,0


In [70]:
predixcan_classifier_df.loc['M41-Diagnoses_main_ICD10_M41_Scoliosis'].sort_values('true_class', ascending=False).head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000157764,0.258736,0,1
ENSG00000162769,0.086607,0,1
ENSG00000112234,0.270457,0,1
ENSG00000012504,0.643603,0,0
ENSG00000173208,0.162836,0,0


## Select genes per trait

In [71]:
# No per-trait gene selection is applied: the "selected" table is the full classifier
# table (same object, not a copy).
selected_predixcan_classifier_df = predixcan_classifier_df

In [72]:
# some testing

In [73]:
selected_predixcan_classifier_df.shape

(19119, 3)

In [74]:
selected_predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,5.986574,1,0
1200-Sleeplessness_insomnia,ENSG00000003756,5.540858,1,0
1200-Sleeplessness_insomnia,ENSG00000004534,5.270682,0,0
1200-Sleeplessness_insomnia,ENSG00000004838,1.168578,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,0.042414,0,0


In [75]:
selected_predixcan_classifier_df.sort_values('predicted_class').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20002_1226-Noncancer_illness_code_selfreported_hypothyroidismmyxoedema,ENSG00000128699,0.636362,0,0
20002_1461-Noncancer_illness_code_selfreported_inflammatory_bowel_disease,ENSG00000168505,1.407419,0,0
20002_1461-Noncancer_illness_code_selfreported_inflammatory_bowel_disease,ENSG00000169372,0.46903,0,0
20002_1461-Noncancer_illness_code_selfreported_inflammatory_bowel_disease,ENSG00000171940,0.889229,0,0
20002_1461-Noncancer_illness_code_selfreported_inflammatory_bowel_disease,ENSG00000173598,0.589888,0,0


In [76]:
selected_predixcan_classifier_df.sort_values('predicted_class', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,5.986574,1,0
20002_1453-Noncancer_illness_code_selfreported_psoriasis,ENSG00000204618,7.450705,1,0
20002_1453-Noncancer_illness_code_selfreported_psoriasis,ENSG00000204632,12.074117,1,0
20002_1453-Noncancer_illness_code_selfreported_psoriasis,ENSG00000204644,8.480428,1,0
20002_1453-Noncancer_illness_code_selfreported_psoriasis,ENSG00000204655,7.722879,1,0


In [77]:
# Sort with positives first (then by trait, descending) and show how many rows the
# table has, how many of them are positives, and the first few positives.
_tmp = selected_predixcan_classifier_df.sort_values(['true_class', 'ukb_efo'], ascending=False)
_positives = _tmp[_tmp['true_class'] > 0]

display(_tmp.shape)
display(_positives.shape)
display(_positives.head())

(19119, 3)

(125, 3)

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R69-Diagnoses_main_ICD10_R69_Unknown_and_unspecified_causes_of_morbidity,ENSG00000068305,0.540877,0,1
R69-Diagnoses_main_ICD10_R69_Unknown_and_unspecified_causes_of_morbidity,ENSG00000140443,2.216935,0,1
R40-Diagnoses_main_ICD10_R40_Somnolence_stupor_and_coma,ENSG00000133812,1.323981,0,1
R40-Diagnoses_main_ICD10_R40_Somnolence_stupor_and_coma,ENSG00000183287,1.210377,0,1
N20-Diagnoses_main_ICD10_N20_Calculus_of_kidney_and_ureter,ENSG00000131183,6.246578,1,1


### Test classes

In [78]:
selected_predixcan_classifier_df.index.get_level_values('ukb_efo').unique().shape

(99,)

In [79]:
selected_predixcan_classifier_df.index.get_level_values('gene').unique().shape

(9463,)

In [80]:
_pheno = 'N20-Diagnoses_main_ICD10_N20_Calculus_of_kidney_and_ureter'

In [81]:
# Distinct genes that the OMIM silver standard lists for the phenotype in `_pheno`.
# NOTE(review): the variable name is misleading — the genes come from the OMIM silver
# standard, not ClinVar, and `_pheno` here is the N20 kidney/ureter calculus trait, not
# asthma. It is left unchanged because a later cell references this name.
_clinvar_asthma_genes = omim_silver_standard[omim_silver_standard['trait'] == _pheno]['ensembl_gene_id'].unique()
display(_clinvar_asthma_genes)
display(_clinvar_asthma_genes.shape)

array(['ENSG00000075891', 'ENSG00000130600', 'ENSG00000269821',
       'ENSG00000129757', 'ENSG00000143473', 'ENSG00000275410',
       'ENSG00000019186', 'ENSG00000134371', 'ENSG00000036828',
       'ENSG00000088256', 'ENSG00000187091', 'ENSG00000122194',
       'ENSG00000138592', 'ENSG00000040531', 'ENSG00000138079',
       'ENSG00000021488', 'ENSG00000090402', 'ENSG00000131482',
       'ENSG00000137700', 'ENSG00000164007', 'ENSG00000113946',
       'ENSG00000116039', 'ENSG00000168000', 'ENSG00000123191',
       'ENSG00000171365', 'ENSG00000165704', 'ENSG00000042753',
       'ENSG00000075643', 'ENSG00000169692', 'ENSG00000109667',
       'ENSG00000131183', 'ENSG00000109062', 'ENSG00000149257',
       'ENSG00000198931', 'ENSG00000157388', 'ENSG00000167207',
       'ENSG00000124827', 'ENSG00000134873'], dtype=object)

(38,)

In [82]:
_tmp = selected_predixcan_classifier_df.loc[_pheno]
_tmp.loc[_tmp.index.intersection(_clinvar_asthma_genes)]

Unnamed: 0,score,predicted_class,true_class
ENSG00000131183,6.246578,1,1


In [83]:
_predixcan_asthma_genes = selected_predixcan_classifier_df.loc[_pheno]

In [84]:
_predixcan_asthma_genes.head()

Unnamed: 0_level_0,score,predicted_class,true_class
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000005187,1.587926,0,0
ENSG00000005189,0.457434,0,0
ENSG00000011638,0.059849,0,0
ENSG00000027847,1.586332,0,0
ENSG00000048140,0.688334,0,0


In [85]:
selected_predixcan_classifier_df.shape

(19119, 3)

In [86]:
selected_predixcan_classifier_df['predicted_class'].value_counts()

0    17397
1     1722
Name: predicted_class, dtype: int64

In [87]:
selected_predixcan_classifier_df['true_class'].value_counts()

0    18994
1      125
Name: true_class, dtype: int64

In [88]:
selected_predixcan_classifier_df.sort_values(['true_class'], ascending=[False])

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease,ENSG00000185950,0.646109,0,1
22127-Doctor_diagnosed_asthma,ENSG00000232810,1.996019,0,1
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000231852,7.628562,1,1
20002_1111-Noncancer_illness_code_selfreported_asthma,ENSG00000204539,11.715739,1,1
N20-Diagnoses_main_ICD10_N20_Calculus_of_kidney_and_ureter,ENSG00000131183,6.246578,1,1
20002_1197-Noncancer_illness_code_selfreported_kidney_stoneureter_stonebladder_stone,ENSG00000019186,0.973195,0,1
20002_1309-Noncancer_illness_code_selfreported_osteoporosis,ENSG00000162337,0.003963,0,1
20002_1381-Noncancer_illness_code_selfreported_systemic_lupus_erythematosissle,ENSG00000244731,8.850207,1,1
20002_1065-Noncancer_illness_code_selfreported_hypertension,ENSG00000184937,5.294412,0,1
I25-Diagnoses_main_ICD10_I25_Chronic_ischaemic_heart_disease,ENSG00000111252,3.635113,0,1


# Save classifier table

In [89]:
selected_predixcan_classifier_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,predicted_class,true_class
ukb_efo,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1200-Sleeplessness_insomnia,ENSG00000001617,5.986574,1,0
1200-Sleeplessness_insomnia,ENSG00000003756,5.540858,1,0
1200-Sleeplessness_insomnia,ENSG00000004534,5.270682,0,0
1200-Sleeplessness_insomnia,ENSG00000004838,1.168578,0,0
1200-Sleeplessness_insomnia,ENSG00000004897,0.042414,0,0


In [90]:
selected_predixcan_classifier_df.shape

(19119, 3)

In [91]:
# Write the classifier table as a gzip-compressed TSV into the output directory.
# NOTE(review): index=False drops the (ukb_efo, gene) MultiIndex, so the file contains
# only score / predicted_class / true_class — confirm that downstream consumers do not
# need the trait and gene identifiers.
_classifier_table_path = os.path.join(output_dir, 'smultixcan-mashr-classifier_data.tsv.gz')
selected_predixcan_classifier_df.to_csv(_classifier_table_path, sep='\t', index=False)