In [1]:
# Reload imported modules automatically before each cell is executed
# (autoreload mode 2 applies to all modules).
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import settings as conf

In [3]:
# Folder that receives the T2D gene tables written at the end of this notebook;
# it is created if it does not exist yet.
_output_dir_parts = ('roc_validation', 'classifier_tables', 't2d')
output_dir = os.path.join(conf.DELIVERABLES_DIR, *_output_dir_parts)
os.makedirs(output_dir, exist_ok=True)
display(output_dir)

'/mnt/phenomexcan_base/deliverables/roc_validation/classifier_tables/t2d'

# Load gene mappings

In [4]:
# In the cells below, genes_mapping_0 is indexed with gene IDs and
# genes_mapping_1 with gene symbols.
# pickle.load can execute arbitrary code: only point these paths at trusted files.
mapping_0_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-0.pkl')
with open(mapping_0_path, 'rb') as pkl_file:
    genes_mapping_0 = pickle.load(pkl_file)

mapping_1_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-1.pkl')
with open(mapping_1_path, 'rb') as pkl_file:
    genes_mapping_1 = pickle.load(pkl_file)

# Create list of genes related to T2D

Gene lists taken from NIH RFA-DK-19-012: https://grants.nih.gov/grants/guide/rfa-files/RFA-DK-19-012.html

In [5]:
# Gene symbols of the "causal" tier.
causal_genes = """
ABCC8, ANGPTL4, ANKH, APOE, CDKN1B, GCK, GCKR, GIPR, GLIS3, GLP1R, HNF1A, HNF1B,
HNF4A, IGF2, INS, IRS2, KCNJ11, LPL, MC4R, MNX1, MTNR1B, NEUROG3, NKX2-2, PAM,
PATJ, PAX4, PDX1, PLCB3, PNPLA3, POC5, PPARG, QSER1, RREB1, SLC16A11, SLC30A8,
SLC5A1, TBC1D4, TM6SF2, WFS1, WSCD2, ZNF771
""".replace(',', ' ').split()

# Guards against accidental edits of the list above.
assert len(causal_genes) == 41

# Symbols without an entry in genes_mapping_1 are left out of the ID list;
# collect them so they are shown instead of disappearing silently.
causal_genes_unmapped = [g for g in causal_genes if g not in genes_mapping_1]

causal_genes_ids = [genes_mapping_1[g] for g in causal_genes if g in genes_mapping_1]
display(len(causal_genes_ids))
display(causal_genes_unmapped)

40

In [6]:
# Gene symbols of the "strong" tier.
strong_genes = """
ABCB9, BCAR1, C2CD4B, CAMK1D, CCND2, DGKB, INSR, IRS1, IRX3, IRX5,
KLF14, KLHL42, LMNA, SLC2A2, STARD10, TCF7L2, ZMIZ1
""".replace(',', ' ').split()

# Guards against accidental edits of the list above.
assert len(strong_genes) == 17

# Symbols without an entry in genes_mapping_1 are left out of the ID list;
# collect them so they are shown instead of disappearing silently.
strong_genes_unmapped = [g for g in strong_genes if g not in genes_mapping_1]

strong_genes_ids = [genes_mapping_1[g] for g in strong_genes if g in genes_mapping_1]
display(len(strong_genes_ids))
display(strong_genes_unmapped)

17

In [7]:
# Gene symbols of the "moderate" tier.
moderate_genes = """
ADCY5, AGPAT2, AGTR2, AP3S2, BCL11A, CISD2, FAM63A, FOXA2, GPSM1, IGF2BP2, JAZF1,
KCNK17, MACF1, MADD, NKX6-3, PDE8B, PLIN1, SGSM2, SPRY2, UBE2E2, VPS13C
""".replace(',', ' ').split()

# Guards against accidental edits of the list above.
assert len(moderate_genes) == 21

# Symbols without an entry in genes_mapping_1 are left out of the ID list;
# collect them so they are shown instead of disappearing silently.
moderate_genes_unmapped = [g for g in moderate_genes if g not in genes_mapping_1]

moderate_genes_ids = [genes_mapping_1[g] for g in moderate_genes if g in genes_mapping_1]
display(len(moderate_genes_ids))
display(moderate_genes_unmapped)

19

In [8]:
# Gene symbols of the "possible" tier.
possible_genes = """
ANK1, ASCC2, CALCOCO2, FADS1, HMG20A, IL17REL, MRPS30, PRC1, PTRF, SCD5, SNAPC4, ST6GAL1, TP53INP1
""".replace(',', ' ').split()

# Guards against accidental edits of the list above.
assert len(possible_genes) == 13

# Symbols without an entry in genes_mapping_1 are left out of the ID list;
# collect them so they are shown instead of disappearing silently.
possible_genes_unmapped = [g for g in possible_genes if g not in genes_mapping_1]

possible_genes_ids = [genes_mapping_1[g] for g in possible_genes if g in genes_mapping_1]
display(len(possible_genes_ids))
display(possible_genes_unmapped)

13

In [9]:
# Gene symbols of the "weak" tier.
# NOTE(review): 'DNZL' is probably a typo of 'DNLZ', which would explain the one
# symbol that fails to map -- confirm against the source list. Correcting it
# changes the 'weak' count that a later cell asserts to be 5.
weak_genes = """
ABO, CARD9, CDK2AP1, CTNNAL1, DNZL, ITGB6
""".replace(',', ' ').split()

# Guards against accidental edits of the list above.
assert len(weak_genes) == 6

# Symbols without an entry in genes_mapping_1 are left out of the ID list;
# collect them so they are shown instead of disappearing silently.
weak_genes_unmapped = [g for g in weak_genes if g not in genes_mapping_1]

weak_genes_ids = [genes_mapping_1[g] for g in weak_genes if g in genes_mapping_1]
display(len(weak_genes_ids))
display(weak_genes_unmapped)

5

In [10]:
# Gene symbols of the "related traits" tier.
related_traits_genes = """
ADRA2A, AKT2, APPL1, BLK, BSCL2, CAV1, CEL, EIF2AK3, ERAP2, FOXP3, G6PC2, G6PD,
GATA4, GATA6, GCG, GRB10, IER3IP1, IGF1, KLF11, NAT2, NEUROD1, PAX6, PCBD1,
PCSK1, POLD1, PPP1R15B, PTF1A, RFX6, SIX2, SIX3, SLC19A2, TRMT10A, WARS, ZFP57
""".replace(',', ' ').split()

# Guards against accidental edits of the list above.
assert len(related_traits_genes) == 34

# Symbols without an entry in genes_mapping_1 are left out of the ID list;
# collect them so they are shown instead of disappearing silently.
related_traits_genes_unmapped = [g for g in related_traits_genes if g not in genes_mapping_1]

related_traits_genes_ids = [genes_mapping_1[g] for g in related_traits_genes if g in genes_mapping_1]
display(len(related_traits_genes_ids))
display(related_traits_genes_unmapped)

31

### All T2D genes together

In [11]:
# Concatenate the mapped IDs of every tier, keeping the tier order
# (causal, strong, moderate, possible, weak, related traits).
_tier_id_lists = (
    causal_genes_ids,
    strong_genes_ids,
    moderate_genes_ids,
    possible_genes_ids,
    weak_genes_ids,
    related_traits_genes_ids,
)
_all_gene_ids = [gene_id for tier_ids in _tier_id_lists for gene_id in tier_ids]

all_t2d_genes = pd.DataFrame({'gene_id': _all_gene_ids})

In [12]:
# Add the gene name of every row by looking its gene_id up in genes_mapping_0.
all_t2d_genes = all_t2d_genes.assign(
    gene_name=lambda frame: frame['gene_id'].apply(lambda gene_id: genes_mapping_0[gene_id]),
)

In [13]:
# Label each row with the tier its gene_id belongs to. The tiers are applied in
# this order, so an ID present in several tiers ends up with the last label.
for _tier_label, _tier_ids in (
    ('causal', causal_genes_ids),
    ('strong', strong_genes_ids),
    ('moderate', moderate_genes_ids),
    ('possible', possible_genes_ids),
    ('weak', weak_genes_ids),
    ('related_traits', related_traits_genes_ids),
):
    all_t2d_genes.loc[all_t2d_genes['gene_id'].isin(_tier_ids), 'type'] = _tier_label

In [14]:
# Number of rows per tier, checked against the number of IDs mapped for each tier.
type_counts = all_t2d_genes['type'].value_counts()
display(type_counts)

for _tier_label, _expected_rows in (
    ('causal', 40),
    ('strong', 17),
    ('moderate', 19),
    ('possible', 13),
    ('weak', 5),
    ('related_traits', 31),
):
    assert type_counts[_tier_label] == _expected_rows

causal            40
related_traits    31
moderate          19
strong            17
possible          13
weak               5
Name: type, dtype: int64

In [15]:
# (rows, columns) of the combined table; its columns are gene_id, gene_name and type.
all_t2d_genes.shape

(125, 3)

In [16]:
# Preview the first rows of the combined table.
all_t2d_genes.head()

Unnamed: 0,gene_id,gene_name,type
0,ENSG00000006071,ABCC8,causal
1,ENSG00000167772,ANGPTL4,causal
2,ENSG00000154122,ANKH,causal
3,ENSG00000130203,APOE,causal
4,ENSG00000111276,CDKN1B,causal


# Keep only causal, strong and moderate genes

In [17]:
# Drop the 'possible', 'weak' and 'related_traits' rows; only the three tiers
# listed here are kept.
_kept_types = ('causal', 'strong', 'moderate')
_is_kept = all_t2d_genes['type'].isin(_kept_types)
all_t2d_genes = all_t2d_genes[_is_kept]
display(all_t2d_genes.shape)

(76, 3)

## Save

### Internal

In [18]:
# Path of the pickled copy of the filtered table, inside output_dir.
t2d_genes_filename = os.path.join(output_dir, 't2d_genes.pkl.xz')
display(t2d_genes_filename)

'/mnt/phenomexcan_base/deliverables/roc_validation/classifier_tables/t2d/t2d_genes.pkl.xz'

In [19]:
# Pickle the filtered table. With pandas' default compression='infer', the
# '.xz' suffix of the path selects xz compression.
all_t2d_genes.to_pickle(t2d_genes_filename)

### Publishable

In [20]:
# Path of the tab-separated copy of the filtered table, inside output_dir.
# Note: this rebinds t2d_genes_filename, which held the pickle path until now.
t2d_genes_filename = os.path.join(output_dir, 't2d_genes.tsv.gz')
display(t2d_genes_filename)

'/mnt/phenomexcan_base/deliverables/roc_validation/classifier_tables/t2d/t2d_genes.tsv.gz'

In [21]:
# Write the filtered table as tab-separated text without the index column. With
# pandas' default compression='infer', the '.gz' suffix of the path selects gzip.
all_t2d_genes.to_csv(t2d_genes_filename, sep='\t', index=False)