# Preprocess PLIER pathway data and map gene symbols to Entrez IDs

In [59]:
import os
import urllib.request
import pandas as pd

import config as cfg
from scripts.symbol_to_entrez_id import symbol_to_entrez_id

## Get pathway .Rdata files from the PLIER GitHub repo

#### These are originally derived from MSigDB.

In [88]:
# Create the pathway data directory on first use.
if not os.path.exists(cfg.pathway_data):
    os.makedirs(cfg.pathway_data)

# Locations of the two pathway tables used by the rest of the notebook.
canonical_pathways = os.path.join(cfg.pathway_data, 'canonical_pathways.tsv')
oncogenic_pathways = os.path.join(cfg.pathway_data, 'oncogenic_pathways.tsv')

# Only shell out to R when at least one of the two tables is missing.
pathway_files_present = (os.path.exists(canonical_pathways)
                         and os.path.exists(oncogenic_pathways))
if not pathway_files_present:
    import subprocess
    # The script path is resolved relative to the current working directory.
    # NOTE(review): presumably the R script writes both TSV files above;
    # confirm against scripts/get_plier_pathway_data.R.
    r_script = os.path.join(os.getcwd(), 'scripts', 'get_plier_pathway_data.R')
    args = ['Rscript', r_script]
    # check_call raises CalledProcessError if Rscript exits non-zero.
    subprocess.check_call(args)

In [89]:
# Read both tab-separated pathway tables with identical parser settings.
canonical_df, oncogenic_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (canonical_pathways, oncogenic_pathways)
)
# Preview the first rows, restricted to the first five columns.
first_cols = canonical_df.columns.values[:5]
canonical_df.loc[:, first_cols].head()

Unnamed: 0,PID_CASPASE_PATHWAY,PID_P38ALPHABETADOWNSTREAMPATHWAY,REACTOME_APC_C_CDC20_MEDIATED_DEGRADATION_OF_MITOTIC_PROTEINS,REACTOME_REGULATION_OF_APOPTOSIS,REACTOME_IRON_UPTAKE_AND_TRANSPORT
UBE2Q1,0,0,0,0,0
UBE2Q2,0,0,0,0,0
PMM2,0,0,0,0,0
PMM1,0,0,0,0,0
NCBP1,0,0,0,0,0


In [90]:
# Preview the first rows of the oncogenic table, first five columns only.
first_cols = oncogenic_df.columns.values[:5]
oncogenic_df.loc[:, first_cols].head(n=5)

Unnamed: 0,MYC_UP.V1_UP,PDGF_UP.V1_UP,BMI1_DN.V1_DN,SIRNA_EIF4GI_UP,CYCLIN_D1_KE_.V1_UP
RNF14,0,1,0,0,0
DUOXA1,0,0,0,0,0
RNF17,0,0,0,0,0
RNF10,0,0,0,0,0
RNF11,0,0,0,0,0


## Map canonical pathway data

In [91]:
# The dataframe index holds the gene symbols to be translated.
gene_symbols = canonical_df.index.values
canonical_map = symbol_to_entrez_id(gene_symbols, verbose=True)

# Show the first five (symbol, Entrez ID) pairs as a sanity check.
preview_pairs = list(canonical_map.items())[:5]
for symbol, entrez_id in preview_pairs:
    print('{}\t{}'.format(symbol, entrez_id))

Querying for exact matches:
-- Matched 5832 of 6023 genes
Trying to manually map unmapped genes:
-- Matched 3 of 191 genes
Querying MyGene for aliases of 188 unmatched genes:
-- Found aliases for 187 of 188 genes
Querying for alias entrez IDs:
-- Matched 187 of 187 genes
RESULTS: matched 6022 of 6023 genes (0 duplicate Entrez IDs)
[]
ADA	100
CDH2	1000
AKT3	10000
MED6	10001
ACOT8	10005


In [92]:
# Attach the Entrez ID for each gene symbol in the index.
canonical_df = canonical_df.assign(eid=canonical_df.index.map(canonical_map))

# Symbols the mapper could not resolve are marked with the string 'N/A';
# keep them aside so they can be reported afterwards.
unmapped_genes = canonical_df[canonical_df['eid'] == 'N/A']

# Drop unmapped rows before re-indexing. dropna() alone is not enough here:
# 'N/A' is an ordinary string, not NaN, so it would survive and end up as an
# index label. NaN (symbol absent from the map) is filtered as well.
is_mapped = canonical_df['eid'].notna() & (canonical_df['eid'] != 'N/A')
canonical_df = canonical_df[is_mapped].set_index('eid')

# Clear the index name so the written TSV has no 'eid' header label.
# (Assigning None works across pandas versions; `del index.name` does not.)
canonical_df.index.name = None
canonical_df.iloc[:5, :5]

Unnamed: 0,PID_CASPASE_PATHWAY,PID_P38ALPHABETADOWNSTREAMPATHWAY,REACTOME_APC_C_CDC20_MEDIATED_DEGRADATION_OF_MITOTIC_PROTEINS,REACTOME_REGULATION_OF_APOPTOSIS,REACTOME_IRON_UPTAKE_AND_TRANSPORT
55585,0,0,0,0,0
92912,0,0,0,0,0
5373,0,0,0,0,0
5372,0,0,0,0,0
4686,0,0,0,0,0


In [94]:
# Report how many symbols had no Entrez ID, followed by the symbols themselves.
unmapped_symbols = ' '.join(unmapped_genes.index.values)
print('Number of gene symbols not mapped: {} ({})'.format(
    len(unmapped_genes), unmapped_symbols))

Number of gene symbols not mapped: 1 (CD97)


In [19]:
# Persist the Entrez-indexed canonical pathway table as tab-separated text.
# NOTE(review): the oncogenic table is written to 'oncogenic_mapped.tsv'
# (no '2' suffix) -- confirm the file name below is intentional.
canonical_out = os.path.join(cfg.pathway_data, 'canonical_mapped2.tsv')
canonical_df.to_csv(canonical_out, sep='\t')

## Map oncogenic pathway data

In [20]:
# The oncogenic dataframe index holds the gene symbols to be translated.
gene_symbols = oncogenic_df.index.values
oncogenic_map = symbol_to_entrez_id(gene_symbols, verbose=True)

# Show the first five (symbol, Entrez ID) pairs of the *oncogenic* mapping.
# (Previously this loop iterated canonical_map, so the preview did not
# reflect the mapping that was just computed.)
for i, (k, v) in enumerate(oncogenic_map.items()):
    if i >= 5:
        break
    print('{}\t{}'.format(k, v))

Querying for exact matches:
-- Matched 10288 of 11250 genes
Trying to manually map unmapped genes:
-- Matched 162 of 962 genes
Querying MyGene for aliases of 800 unmatched genes:
-- Found aliases for 707 of 800 genes
Querying for alias entrez IDs:
-- Matched 707 of 707 genes
RESULTS: matched 11157 of 11250 genes (0 duplicate Entrez IDs)
[]
ADA	100
CDH2	1000
AKT3	10000
MED6	10001
ACOT8	10005


In [21]:
# Attach the Entrez ID for each gene symbol in the index. A symbol missing
# from oncogenic_map raises KeyError here.
oncogenic_df['eid'] = [oncogenic_map[g] for g in oncogenic_df.index.values]

# Discard symbols the mapper marked as unresolved ('N/A'), then index the
# table by Entrez ID. Chaining avoids an inplace set_index on a filtered frame.
oncogenic_df = oncogenic_df[oncogenic_df['eid'] != 'N/A'].set_index('eid')

# Clear the index name so the written TSV has no 'eid' header label.
# (Assigning None works across pandas versions; `del index.name` does not.)
oncogenic_df.index.name = None

# Preview the first rows, restricted to the first five columns.
first_cols = oncogenic_df.columns.values[0:5]
oncogenic_df.loc[:, first_cols].head()

Unnamed: 0,MYC_UP.V1_UP,PDGF_UP.V1_UP,BMI1_DN.V1_DN,SIRNA_EIF4GI_UP,CYCLIN_D1_KE_.V1_UP
9604,0,1,0,0,0
90527,0,0,0,0,0
56163,0,0,0,0,0
9921,0,0,0,0,0
26994,0,0,0,0,0


In [22]:
# Persist the Entrez-indexed oncogenic pathway table as tab-separated text.
oncogenic_out = os.path.join(cfg.pathway_data, 'oncogenic_mapped.tsv')
oncogenic_df.to_csv(oncogenic_out, sep='\t')

## Generate randomized pathway data from set of all TCGA genes