In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import settings as conf

In [3]:
output_dir = os.path.join(conf.DELIVERABLES_DIR, 'roc_validation', 'classifier_tables', 't2d')
os.makedirs(output_dir, exist_ok=True)
display(output_dir)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/deliverables/roc_validation/classifier_tables/t2d'

# Load gene mappings

In [4]:
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

# Load T2D genes

In [5]:
t2d_genes_filename = os.path.join(output_dir, 't2d_genes.pkl.xz')
display(t2d_genes_filename)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/deliverables/roc_validation/classifier_tables/t2d/t2d_genes.pkl.xz'

In [6]:
all_t2d_genes = pd.read_pickle(t2d_genes_filename)

In [7]:
all_t2d_genes.shape

(76, 3)

In [8]:
all_t2d_genes.head()

Unnamed: 0,gene_id,gene_name,type
0,ENSG00000006071,ABCC8,causal
1,ENSG00000167772,ANGPTL4,causal
2,ENSG00000154122,ANKH,causal
3,ENSG00000130203,APOE,causal
4,ENSG00000111276,CDKN1B,causal


# Load selected T2D traits

In [9]:
t2d_traits_filename = os.path.join(output_dir, 't2d_traits.pkl.xz')

In [10]:
diabetes_traits = pd.read_pickle(t2d_traits_filename).to_list()

In [11]:
diabetes_traits

['E11-Diagnoses_main_ICD10_E11_Noninsulindependent_diabetes_mellitus',
 '20002_1223-Noncancer_illness_code_selfreported_type_2_diabetes',
 'E14-Diagnoses_main_ICD10_E14_Unspecified_diabetes_mellitus',
 'E4_DM2NOCOMP-Type_2_diabetes_without_complications',
 'E4_DM2OPTH-Type_2_diabetes_with_ophthalmic_complications',
 'E4_DM2-Type_2_diabetes',
 'E4_DM2PERIPH-Type_2_diabetes_with_peripheral_circulatory_complications']

# Load fastENLOC results

In [12]:
smultixcan_zscores_filename = os.path.join(conf.GENE_ASSOC_DIR, 'fastenloc-torus-rcp.pkl.xz')
display(smultixcan_zscores_filename)

smultixcan_zscores = pd.read_pickle(smultixcan_zscores_filename)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/gene_assoc/fastenloc-torus-rcp.pkl.xz'

In [13]:
smultixcan_zscores.shape

(38062, 4091)

In [14]:
smultixcan_zscores.head(5)

Unnamed: 0_level_0,O46-Diagnoses_main_ICD10_O46_Antepartum_haemorrhage_not_elsewhere_classified,K30-Diagnoses_main_ICD10_K30_Dyspepsia,2907-Ever_stopped_smoking_for_6_months,H7_DIPLOPIA-Diplopia,1538_0-Major_dietary_changes_in_the_last_5_years_No,5663-Length_of_longest_manicirritable_episode,20002_1538-Noncancer_illness_code_selfreported_arthritis_nos,S30-Diagnoses_main_ICD10_S30_Superficial_injury_of_abdomen_lower_back_and_pelvis,24010_raw-Inverse_distance_to_the_nearest_road,3143_raw-Ankle_spacing_width,...,2237-Plays_computer_games,20002_1461-Noncancer_illness_code_selfreported_inflammatory_bowel_disease,20002_1508-Noncancer_illness_code_selfreported_jaundice_unknown_cause,20003_1140881882-Treatmentmedication_code_timoptol_025_eye_drops,22601_71253330-Job_coding_merchandiser_window_dresser,23112_raw-Leg_fat_mass_right,20003_1140861778-Treatmentmedication_code_dipyridamole,20003_1199-Treatmentmedication_code_food_supplementplantherbal_extract,1309-Fresh_fruit_intake,100920_2105-Type_milk_consumed_soya_with_calcium
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,0.001213,,,,,,...,,,,,,,,,0.002131,
ENSG00000000457,,,,,0.001873,,,,,0.01772,...,0.068852,,,,,0.003383,,,0.009195,
ENSG00000000460,,,,,0.00262,,,0.000207,,0.000105,...,0.093284,,,,,0.00879,,,0.003105,
ENSG00000000938,,,,,0.002928,,,,0.000762,0.012773,...,0.0043,,,0.000424,,0.000612,,,0.00324,
ENSG00000000971,,,,,0.002858,,,,,0.019304,...,0.005419,,,,,0.007427,,,0.004804,


# Read gwas2gene results

In [15]:
from glob import glob

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

In [16]:
readRDS = robjects.r['readRDS']

In [17]:
n_expected_traits = len(diabetes_traits)

In [18]:
f_files = glob(os.path.join(conf.BASE_DIR, 'results', 't2d', 'gwas2gene', '*.rds'))
display(len(f_files))

if len(f_files) < n_expected_traits:
    print(f'WARNING: some files are not there. {n_expected_traits} expected, {len(f_files)} found.')

10

In [19]:
gwas2genes_results = {}

for f in f_files:
    f_base = os.path.basename(f)
    f_code = f_base.split('.')[0]
    
    #print(f_base)
    rds_contents = readRDS(f)
    
    if len(rds_contents[1]) > 0:
        f_gene_list = list(rds_contents[1][0].iter_labels())
    else:
        print(f'{f_code}: empty')
        f_gene_list = []
    
    gwas2genes_results[f_code] = smultixcan_zscores.index.intersection(set(f_gene_list))

In [20]:
gwas2gene_all_genes = []

for k in gwas2genes_results.keys():
    gwas2gene_all_genes.extend(gwas2genes_results[k])

display(len(gwas2gene_all_genes))

gwas2gene_all_genes = set(gwas2gene_all_genes)
display(len(gwas2gene_all_genes))

30238

13785

# Create classifier table

In [21]:
from entity import Trait

In [22]:
index_genes = []

for t in diabetes_traits:
    t_code = Trait(full_code=t).code
    if t_code not in gwas2genes_results:
        raise Exception(f'Not found: {t_code}')
    
    for g in gwas2genes_results[t_code]:
        index_genes.append(g)

In [23]:
index_genes = set(index_genes)

In [24]:
len(index_genes)

5183

In [25]:
classifier_index = pd.Index(
    index_genes
)

In [26]:
len(gwas2gene_all_genes)

13785

In [27]:
classifier_index.shape

(5183,)

In [28]:
predixcan_classifier_df = pd.DataFrame(index=classifier_index, columns=['score', 'predicted_class', 'true_class'])

In [29]:
predixcan_classifier_df = predixcan_classifier_df.sort_index()

In [30]:
predixcan_classifier_df.shape

(5183, 3)

In [31]:
predixcan_classifier_df['true_class'] = 0

In [32]:
predixcan_classifier_df.head()

Unnamed: 0,score,predicted_class,true_class
ENSG00000000457,,,0
ENSG00000000460,,,0
ENSG00000000938,,,0
ENSG00000000971,,,0
ENSG00000001460,,,0


In [33]:
predixcan_classifier_df.loc[predixcan_classifier_df.index.intersection(all_t2d_genes['gene_id']), 'true_class'] = 1

In [34]:
assert predixcan_classifier_df['true_class'].isna().sum() == 0

In [35]:
predixcan_classifier_df['true_class'].value_counts()

0    5163
1      20
Name: true_class, dtype: int64

In [36]:
# some testing
_tmp = predixcan_classifier_df[predixcan_classifier_df['true_class'] == 1]
display(_tmp.shape)
display(_tmp.head())
assert all([tg in all_t2d_genes['gene_id'].values for tg in _tmp.index])

(20, 3)

Unnamed: 0,score,predicted_class,true_class
ENSG00000050820,,,1
ENSG00000073792,,,1
ENSG00000075035,,,1
ENSG00000113231,,,1
ENSG00000118971,,,1


In [37]:
# some testing
_tmp = predixcan_classifier_df[predixcan_classifier_df['true_class'] == 0]
display(_tmp.shape)
display(_tmp)
assert all([tg not in all_t2d_genes['gene_id'].values for tg in _tmp.index])

(5163, 3)

Unnamed: 0,score,predicted_class,true_class
ENSG00000000457,,,0
ENSG00000000460,,,0
ENSG00000000938,,,0
ENSG00000000971,,,0
ENSG00000001460,,,0
...,...,...,...
ENSG00000283992,,,0
ENSG00000284188,,,0
ENSG00000284413,,,0
ENSG00000284484,,,0


In [38]:
len(gwas2gene_all_genes)

13785

In [39]:
df_score = pd.Series(
    index=classifier_index,
    data=smultixcan_zscores.loc[classifier_index, diabetes_traits].max(axis=1)
)

In [40]:
df_score.shape

(5183,)

In [41]:
df_score.head()

ENSG00000178053    0.000451
ENSG00000172155         NaN
ENSG00000145113    0.000345
ENSG00000163554    0.000224
ENSG00000173093    0.000156
dtype: float64

In [42]:
# some testing
df_score = df_score.fillna(0.0).sort_index()
assert df_score.isna().sum().sum() == 0

In [43]:
df_score.shape

(5183,)

In [44]:
df_score.describe()

count    5183.000000
mean        0.001090
std         0.018922
min         0.000000
25%         0.000102
50%         0.000195
75%         0.000413
max         0.888500
dtype: float64

In [45]:
predixcan_classifier_df = predixcan_classifier_df.assign(score=df_score)

In [46]:
from scipy import stats

In [47]:
_n_genes = len(gwas2gene_all_genes)
display(_n_genes)

SCORE_THRESHOLD = 0.1
display(SCORE_THRESHOLD)

13785

0.1

In [48]:
predixcan_classifier_df = predixcan_classifier_df.assign(predicted_class=(predixcan_classifier_df['score'] > SCORE_THRESHOLD).astype(int))

In [49]:
predixcan_classifier_df.shape

(5183, 3)

In [50]:
predixcan_classifier_df.head()

Unnamed: 0,score,predicted_class,true_class
ENSG00000000457,0.000324,0,0
ENSG00000000460,0.00035,0,0
ENSG00000000938,0.000265,0,0
ENSG00000000971,0.0,0,0
ENSG00000001460,0.000339,0,0


## Some stats

In [51]:
_tmp = predixcan_classifier_df.sort_values('predicted_class', ascending=False)
display(_tmp.shape)
display(_tmp.head())

(5183, 3)

Unnamed: 0,score,predicted_class,true_class
ENSG00000164252,0.7125,1,0
ENSG00000183049,0.3048,1,1
ENSG00000232629,0.8885,1,0
ENSG00000204531,0.612837,1,0
ENSG00000179344,0.192411,1,0


In [52]:
_tmp = predixcan_classifier_df.sort_values(['true_class'], ascending=False)
display(_tmp.shape)
display(_tmp[_tmp['true_class'] > 0].shape)
display(_tmp[_tmp['true_class'] > 0].head())

(5183, 3)

(20, 3)

Unnamed: 0,score,predicted_class,true_class
ENSG00000073792,0.002912,0,1
ENSG00000183049,0.3048,1,1
ENSG00000164756,0.0,0,1
ENSG00000127603,0.000772,0,1
ENSG00000118971,0.0,0,1


In [53]:
# what kind of genes are we detecting in GWAS? (strong, weak, etc)
all_t2d_genes[all_t2d_genes['gene_id'].isin(predixcan_classifier_df.index)]

Unnamed: 0,gene_id,gene_name,type
10,ENSG00000135100,HNF1A,causal
18,ENSG00000166603,MC4R,causal
23,ENSG00000145730,PAM,causal
24,ENSG00000132849,PATJ,causal
28,ENSG00000152359,POC5,causal
33,ENSG00000164756,SLC30A8,causal
38,ENSG00000075035,WSCD2,causal
40,ENSG00000150967,ABCB9,strong
41,ENSG00000050820,BCAR1,strong
43,ENSG00000183049,CAMK1D,strong


In [54]:
predixcan_classifier_df.index.unique().shape

(5183,)

In [55]:
predixcan_classifier_df.shape

(5183, 3)

In [56]:
predixcan_classifier_df['predicted_class'].value_counts()

0    5176
1       7
Name: predicted_class, dtype: int64

In [57]:
predixcan_classifier_df['true_class'].value_counts()

0    5163
1      20
Name: true_class, dtype: int64

# Save classifier table

In [58]:
predixcan_classifier_df.shape

(5183, 3)

In [59]:
# remove nans
predixcan_classifier_df = predixcan_classifier_df.dropna()

In [60]:
predixcan_classifier_df.shape

(5183, 3)

In [61]:
assert predixcan_classifier_df.index.is_unique

In [62]:
predixcan_classifier_df.head()

Unnamed: 0,score,predicted_class,true_class
ENSG00000000457,0.000324,0,0
ENSG00000000460,0.00035,0,0
ENSG00000000938,0.000265,0,0
ENSG00000000971,0.0,0,0
ENSG00000001460,0.000339,0,0


In [63]:
predixcan_classifier_df.to_csv(
    os.path.join(output_dir, 't2d-fastenloc-torus-classifier_data.tsv.gz'),
    sep='\t', index=False
)