In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import settings as conf

In [3]:
output_dir = os.path.join(conf.DELIVERABLES_DIR, 'roc_validation', 'classifier_tables', 't2d')
os.makedirs(output_dir, exist_ok=True)
display(output_dir)

'/mnt/phenomexcan_base/deliverables/roc_validation/classifier_tables/t2d'

# Load gene mappings

In [4]:
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

# Load T2D genes

In [5]:
t2d_genes_filename = os.path.join(output_dir, 't2d_genes.pkl.xz')
display(t2d_genes_filename)

'/mnt/phenomexcan_base/deliverables/roc_validation/classifier_tables/t2d/t2d_genes.pkl.xz'

In [6]:
all_t2d_genes = pd.read_pickle(t2d_genes_filename)

In [7]:
all_t2d_genes.shape

(76, 3)

In [8]:
all_t2d_genes.head()

Unnamed: 0,gene_id,gene_name,type
0,ENSG00000006071,ABCC8,causal
1,ENSG00000167772,ANGPTL4,causal
2,ENSG00000154122,ANKH,causal
3,ENSG00000130203,APOE,causal
4,ENSG00000111276,CDKN1B,causal


# Load selected T2D traits

In [9]:
t2d_traits_filename = os.path.join(output_dir, 't2d_traits.pkl.xz')

In [10]:
diabetes_traits = pd.read_pickle(t2d_traits_filename).to_list()

In [11]:
diabetes_traits

['E11-Diagnoses_main_ICD10_E11_Noninsulindependent_diabetes_mellitus',
 '20002_1223-Noncancer_illness_code_selfreported_type_2_diabetes',
 'E14-Diagnoses_main_ICD10_E14_Unspecified_diabetes_mellitus',
 'E4_DM2NOCOMP-Type_2_diabetes_without_complications',
 'E4_DM2OPTH-Type_2_diabetes_with_ophthalmic_complications',
 'E4_DM2-Type_2_diabetes',
 'E4_DM2PERIPH-Type_2_diabetes_with_peripheral_circulatory_complications']

# Load S-MultiXcan results

In [12]:
smultixcan_zscores_filename = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-zscores.pkl.xz')
display(smultixcan_zscores_filename)

smultixcan_zscores = pd.read_pickle(smultixcan_zscores_filename)

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-zscores.pkl.xz'

In [13]:
smultixcan_zscores.shape

(22515, 4091)

In [14]:
smultixcan_zscores.head(5)

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


# Read gwas2gene results

In [15]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [16]:
readRDS = robjects.r['readRDS']

In [17]:
n_expected_traits = len(diabetes_traits)

In [18]:
f_files = glob(os.path.join(conf.VALIDATION_TRAITS_T2D_BASE_DIR, 'gwas2gene', '*.rds'))
display(len(f_files))

if len(f_files) < n_expected_traits:
    print(f'WARNING: some files are not there. {n_expected_traits} expected, {len(f_files)} found.')

7

In [19]:
gwas2genes_results = {}

for f in f_files:
    f_base = os.path.basename(f)
    f_code = f_base.split('.')[0]
    
    #print(f_base)
    rds_contents = readRDS(f)
    
    if len(rds_contents[1]) > 0:
        f_gene_list = list(rds_contents[1][0].iter_labels())
    else:
        print(f'{f_code}: empty')
        f_gene_list = []
    
    gwas2genes_results[f_code] = smultixcan_zscores.index.intersection(set(f_gene_list))

In [20]:
gwas2gene_all_genes = []

for k in gwas2genes_results.keys():
    gwas2gene_all_genes.extend(gwas2genes_results[k])

display(len(gwas2gene_all_genes))

gwas2gene_all_genes = set(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

5260

5088

# Create classifier table

In [21]:
from entity import Trait

In [22]:
index_genes = []

for t in diabetes_traits:
    t_code = Trait(full_code=t).code
    if t_code not in gwas2genes_results:
        raise Exception(f'Not found: {t_code}')
    
    for g in gwas2genes_results[t_code]:
        index_genes.append(g)

In [23]:
index_genes = set(index_genes)

In [24]:
len(index_genes)

5088

In [25]:
classifier_index = pd.Index(
    index_genes
)

In [26]:
len(gwas2gene_all_genes)

5088

In [27]:
classifier_index.shape

(5088,)

In [28]:
predixcan_classifier_df = pd.DataFrame(index=classifier_index, columns=['score', 'predicted_class', 'true_class'])

In [29]:
predixcan_classifier_df = predixcan_classifier_df.sort_index()

In [30]:
predixcan_classifier_df.shape

(5088, 3)

In [31]:
predixcan_classifier_df['true_class'] = 0

In [32]:
predixcan_classifier_df.head()

Unnamed: 0,score,predicted_class,true_class
ENSG00000000457,,,0
ENSG00000000460,,,0
ENSG00000000938,,,0
ENSG00000000971,,,0
ENSG00000001460,,,0


In [33]:
predixcan_classifier_df.loc[predixcan_classifier_df.index.intersection(all_t2d_genes['gene_id']), 'true_class'] = 1

In [34]:
assert predixcan_classifier_df['true_class'].isna().sum() == 0

In [35]:
predixcan_classifier_df['true_class'].value_counts()

0    5068
1      20
Name: true_class, dtype: int64

In [36]:
# some testing
_tmp = predixcan_classifier_df[predixcan_classifier_df['true_class'] == 1]
display(_tmp.shape)
display(_tmp.head())
assert all([tg in all_t2d_genes['gene_id'].values for tg in _tmp.index])

(20, 3)

Unnamed: 0,score,predicted_class,true_class
ENSG00000050820,,,1
ENSG00000073792,,,1
ENSG00000075035,,,1
ENSG00000113231,,,1
ENSG00000118971,,,1


In [37]:
# some testing
_tmp = predixcan_classifier_df[predixcan_classifier_df['true_class'] == 0]
display(_tmp.shape)
display(_tmp)
assert all([tg not in all_t2d_genes['gene_id'].values for tg in _tmp.index])

(5068, 3)

Unnamed: 0,score,predicted_class,true_class
ENSG00000000457,,,0
ENSG00000000460,,,0
ENSG00000000938,,,0
ENSG00000000971,,,0
ENSG00000001460,,,0
ENSG00000001461,,,0
ENSG00000003249,,,0
ENSG00000004399,,,0
ENSG00000004455,,,0
ENSG00000004487,,,0


In [38]:
len(gwas2gene_all_genes)

5088

In [39]:
df_score = pd.Series(
    index=classifier_index,
    data=smultixcan_zscores.loc[classifier_index, diabetes_traits].max(axis=1)
)

In [40]:
df_score.shape

(5088,)

In [41]:
df_score.head()

ENSG00000135740    1.852541
ENSG00000143006    1.294871
ENSG00000164117    1.181043
ENSG00000171503    3.738239
ENSG00000116353    2.612568
dtype: float64

In [42]:
# some testing
df_score = df_score.dropna().sort_index()
assert df_score.isna().sum().sum() == 0

In [43]:
df_score.shape

(5036,)

In [44]:
df_score.describe()

count    5036.000000
mean        1.671620
std         0.679521
min         0.151417
25%         1.225765
50%         1.570870
75%         2.007102
max         7.882385
dtype: float64

In [45]:
predixcan_classifier_df = predixcan_classifier_df.assign(score=df_score)

In [46]:
from scipy import stats

In [47]:
_n_genes = len(gwas2gene_all_genes)
display(_n_genes)

_n_ukb_traits = len(diabetes_traits)
display(_n_ukb_traits)

display(_n_genes * _n_ukb_traits)

PVALUE_THRESHOLD = (0.05 / (_n_genes * _n_ukb_traits))
display(PVALUE_THRESHOLD)

ZSCORE_THRESHOLD = np.abs(stats.norm.ppf(PVALUE_THRESHOLD / 2))
display(ZSCORE_THRESHOLD)

5088

7

35616

1.403863432165319e-06

4.8244553753610875

In [48]:
predixcan_classifier_df = predixcan_classifier_df.assign(predicted_class=(predixcan_classifier_df['score'] > ZSCORE_THRESHOLD).astype(int))

In [49]:
predixcan_classifier_df.shape

(5088, 3)

In [50]:
predixcan_classifier_df.head()

Unnamed: 0,score,predicted_class,true_class
ENSG00000000457,1.571204,0,0
ENSG00000000460,1.347714,0,0
ENSG00000000938,0.376593,0,0
ENSG00000000971,0.561353,0,0
ENSG00000001460,1.17634,0,0


## Some stats

In [51]:
_tmp = predixcan_classifier_df.sort_values('predicted_class', ascending=False)
display(_tmp.shape)
display(_tmp.head())

(5088, 3)

Unnamed: 0,score,predicted_class,true_class
ENSG00000196735,5.039673,1,0
ENSG00000204304,5.357603,1,0
ENSG00000204421,5.530053,1,0
ENSG00000204420,5.019182,1,0
ENSG00000204410,5.733511,1,0


In [52]:
_tmp = predixcan_classifier_df.sort_values(['true_class'], ascending=False)
display(_tmp.shape)
display(_tmp[_tmp['true_class'] > 0].shape)
display(_tmp[_tmp['true_class'] > 0].head())

(5088, 3)

(20, 3)

Unnamed: 0,score,predicted_class,true_class
ENSG00000075035,1.346615,0,1
ENSG00000152359,1.084058,0,1
ENSG00000148737,7.882385,1,1
ENSG00000132849,1.947622,0,1
ENSG00000163581,2.484417,0,1


In [53]:
# what kind of genes are we detecting in GWAS? (strong, weak, etc)
all_t2d_genes[all_t2d_genes['gene_id'].isin(predixcan_classifier_df.index)]

Unnamed: 0,gene_id,gene_name,type
10,ENSG00000135100,HNF1A,causal
18,ENSG00000166603,MC4R,causal
23,ENSG00000145730,PAM,causal
24,ENSG00000132849,PATJ,causal
28,ENSG00000152359,POC5,causal
33,ENSG00000164756,SLC30A8,causal
38,ENSG00000075035,WSCD2,causal
40,ENSG00000150967,ABCB9,strong
41,ENSG00000050820,BCAR1,strong
43,ENSG00000183049,CAMK1D,strong


In [54]:
predixcan_classifier_df.index.unique().shape

(5088,)

In [55]:
predixcan_classifier_df.shape

(5088, 3)

In [56]:
predixcan_classifier_df['predicted_class'].value_counts()

0    5061
1      27
Name: predicted_class, dtype: int64

In [57]:
predixcan_classifier_df['true_class'].value_counts()

0    5068
1      20
Name: true_class, dtype: int64

# Save classifier table

In [58]:
predixcan_classifier_df.shape

(5088, 3)

In [59]:
# remove nans
predixcan_classifier_df = predixcan_classifier_df.dropna()

In [60]:
predixcan_classifier_df.shape

(5036, 3)

In [61]:
assert predixcan_classifier_df.index.is_unique

In [62]:
predixcan_classifier_df.head()

Unnamed: 0,score,predicted_class,true_class
ENSG00000000457,1.571204,0,0
ENSG00000000460,1.347714,0,0
ENSG00000000938,0.376593,0,0
ENSG00000000971,0.561353,0,0
ENSG00000001460,1.17634,0,0


In [63]:
predixcan_classifier_df.to_csv(
    os.path.join(output_dir, 't2d-smultixcan-mashr-classifier_data.tsv.gz'),
    sep='\t', index=False
)