# Preprocess PLIER pathway data and map gene symbols to Entrez IDs

In [3]:
import os
import urllib.request
import pandas as pd

import config as cfg
from scripts.symbol_to_entrez_id import symbol_to_entrez_id

#### Get pathway .RData files from the PLIER GitHub repo. These are derived from MSigDB. (TODO more detail)

In [7]:
# Make sure the directory that holds the pathway tables exists.
if not os.path.exists(cfg.pathway_data):
    os.makedirs(cfg.pathway_data)

# Locations of the two pathway membership tables used by the rest of the
# notebook.
canonical_pathways = os.path.join(cfg.pathway_data, 'canonical_pathways.tsv')
oncogenic_pathways = os.path.join(cfg.pathway_data, 'oncogenic_pathways.tsv')

# Only invoke R when at least one of the tables is missing; the R script is
# presumably what writes both .tsv files (verify against
# scripts/get_plier_pathway_data.R).
if not (os.path.exists(canonical_pathways)
        and os.path.exists(oncogenic_pathways)):
    import subprocess
    # The script path is resolved relative to the current working directory.
    r_script = os.path.join(os.getcwd(), 'scripts', 'get_plier_pathway_data.R')
    subprocess.check_call(['Rscript', r_script])

In [8]:
# Load both tab-separated pathway tables (as the preview shows, gene symbols
# end up as the row index and pathways as the columns).
canonical_df = pd.read_csv(canonical_pathways, sep='\t')
oncogenic_df = pd.read_csv(oncogenic_pathways, sep='\t')

# Preview the first five pathway columns of the canonical table.
first_cols = canonical_df.columns.values[:5]
canonical_df[first_cols].head()

Unnamed: 0,PID_CASPASE_PATHWAY,PID_P38ALPHABETADOWNSTREAMPATHWAY,REACTOME_APC_C_CDC20_MEDIATED_DEGRADATION_OF_MITOTIC_PROTEINS,REACTOME_REGULATION_OF_APOPTOSIS,REACTOME_IRON_UPTAKE_AND_TRANSPORT
UBE2Q1,0,0,0,0,0
UBE2Q2,0,0,0,0,0
PMM2,0,0,0,0,0
PMM1,0,0,0,0,0
NCBP1,0,0,0,0,0


In [9]:
# Preview the first five pathway columns of the oncogenic table.
first_cols = oncogenic_df.columns.values[:5]
oncogenic_df[first_cols].head()

Unnamed: 0,MYC_UP.V1_UP,PDGF_UP.V1_UP,BMI1_DN.V1_DN,SIRNA_EIF4GI_UP,CYCLIN_D1_KE_.V1_UP
RNF14,0,1,0,0,0
DUOXA1,0,0,0,0,0
RNF17,0,0,0,0,0
RNF10,0,0,0,0,0
RNF11,0,0,0,0,0


#### Map gene symbols to Entrez IDs, for compatibility with TCGA expression data

In [10]:
# The row labels of the canonical table are the gene symbols to translate.
gene_symbols = canonical_df.index.values
canonical_map = symbol_to_entrez_id(gene_symbols, verbose=True)

# Show the first five symbol -> Entrez ID pairs as a quick sanity check.
for symbol, entrez_id in list(canonical_map.items())[:5]:
    print('{}\t{}'.format(symbol, entrez_id))

Querying for exact matches:
-- Matched 5850 of 6023 genes
Trying to manually map unmapped genes:
-- Matched 3 of 173 genes
Querying MyGene for aliases of 170 unmatched genes:
-- Found aliases for 169 of 170 genes
Querying for alias entrez IDs:
-- Matched 169 of 169 genes
RESULTS: matched 6022 of 6023 genes (0 duplicate Entrez IDs)
ADA	100
CDH2	1000
AKT3	10000
MED6	10001
ACOT8	10005


In [11]:
# Attach the mapped Entrez ID to every row (rows are labelled by gene symbol).
canonical_df['eid'] = [canonical_map[g] for g in canonical_df.index.values]

# Drop rows whose mapping is the 'N/A' placeholder, then make the Entrez ID
# the row index. Chaining the two steps produces a fresh frame instead of
# mutating a filtered slice in place, which avoids SettingWithCopy warnings.
canonical_df = canonical_df[canonical_df['eid'] != 'N/A'].set_index('eid')

# Clear the index label so it is not written out as a header. Assigning None
# is used because `del canonical_df.index.name` raises AttributeError on
# pandas >= 1.0, where Index.name is a property without a deleter.
canonical_df.index.name = None

# Preview the first five pathway columns of the re-indexed table.
first_cols = canonical_df.columns.values[0:5]
canonical_df.loc[:, first_cols].head()

Unnamed: 0,PID_CASPASE_PATHWAY,PID_P38ALPHABETADOWNSTREAMPATHWAY,REACTOME_APC_C_CDC20_MEDIATED_DEGRADATION_OF_MITOTIC_PROTEINS,REACTOME_REGULATION_OF_APOPTOSIS,REACTOME_IRON_UPTAKE_AND_TRANSPORT
55585,0,0,0,0,0
92912,0,0,0,0,0
5373,0,0,0,0,0
5372,0,0,0,0,0
4686,0,0,0,0,0


In [12]:
# Compare the number of input symbols with the rows kept after mapping.
n_symbols = len(gene_symbols)
n_mapped = len(canonical_df)
print('Number of gene symbols not mapped: {}'.format(n_symbols - n_mapped))
print(n_symbols)
print(n_mapped)

Number of gene symbols not mapped: 1
6023
6022


In [13]:
# Save the Entrez-indexed canonical pathway table next to the raw tables.
canonical_mapped = os.path.join(cfg.pathway_data, 'canonical_mapped.tsv')
canonical_df.to_csv(canonical_mapped, sep='\t')

#### TODO: map oncogenic pathway data