Look for pathways (KEGG) associated with cancer and generate gene lists

In [4]:
import io
import os

import pandas as pd
from Bio.KEGG.KGML import KGML_parser

import rnaseq_lib as r
from rnaseq_lib.jupyter import log_progress as prog

## Pathway Table

In [5]:
# KEGG search parameters: the operation and database are passed unchanged to
# every search; one search is issued per term in `queries`.
operation = 'find'
database = 'pathway'
queries = ['cancer', 'carcinoma', 'melanoma']

In [6]:
# Run one KEGG search per query term and concatenate the raw response bodies
# into a single string, in the same order as `queries`.
response_bodies = []
for query in queries:
    response = r.web.kegg._kegg_search(
        operation=operation,
        database=database,
        query=query,
    )
    response_bodies.append(response.text)
text = ''.join(response_bodies)

In [7]:
# Show the concatenated search results (single-argument form prints the same
# text under the Python 2 print statement and the Python 3 print function).
print(text)

path:map05200	Pathways in cancer
path:map05202	Transcriptional misregulation in cancer
path:map05205	Proteoglycans in cancer
path:map05206	MicroRNAs in cancer
path:map05210	Colorectal cancer
path:map05212	Pancreatic cancer
path:map05213	Endometrial cancer
path:map05215	Prostate cancer
path:map05216	Thyroid cancer
path:map05219	Bladder cancer
path:map05222	Small cell lung cancer
path:map05223	Non-small cell lung cancer
path:map05224	Breast cancer
path:map05226	Gastric cancer
path:map05230	Central carbon metabolism in cancer
path:map05231	Choline metabolism in cancer
path:map05211	Renal cell carcinoma
path:map05217	Basal cell carcinoma
path:map05225	Hepatocellular carcinoma
path:map05218	Melanoma



Convert to dataframe

In [230]:
# Split the search results into one record per non-empty line; each line is
# tab-separated into the pathway ID and its description.
records = []
for entry in text.split('\n'):
    if not entry:
        continue
    records.append(entry.split('\t'))

path_df = pd.DataFrame.from_records(records, columns=['kegg', 'description'])

The `path:map` prefix _must be replaced with_ **hsa** (_Homo sapiens_) in order for downstream queries to work. 

In [231]:
# Rewrite the generic `path:map` prefix to the `hsa` prefix on every pathway ID
kegg_ids = path_df['kegg']
path_df['kegg'] = kegg_ids.str.replace('path:map', 'hsa')

Save

In [232]:
# Write the pathway table as a tab-separated file without the row index.
# Create the output directory first so the cell also works on a fresh checkout.
if not os.path.isdir('tables'):
    os.makedirs('tables')
# `index` is documented as a boolean; pass False explicitly rather than None.
path_df.to_csv('tables/pathway-ids.tsv', sep='\t', index=False)

## Pathway Genes

Use Biopython's KGML parser

In [290]:
# For every pathway in `path_df`: fetch its KGML, look up each gene entry, take
# the first token after NAME on each NAME line, and write the sorted set of
# those tokens to gene-lists/<description-with-dashes>.txt.
#
# NOTE(review): `_get` is not defined or imported in any visible cell --
# presumably a KEGG "get" helper left over from a removed cell; confirm its
# origin and import it explicitly so the notebook survives Restart & Run All.

# Make sure the output directory exists before any file is written
output_dir = 'gene-lists'
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# For each pathway entry
for i, row in prog(path_df.iterrows(), every=1):

    # Separate kegg ID and description of pathway
    kegg, description = row
    print('Finding genes for: {} - {}'.format(description, kegg))

    # Get pathway in KGML format
    kgml = _get(kegg, form='kgml').text

    # Wrap text in a file handle for KGML parser
    kgml_handle = io.BytesIO(kgml.encode('utf-8'))
    pathway = KGML_parser.read(kgml_handle)

    # Collect genes
    genes = set()
    for gene in pathway.genes:
        # Use a dedicated name here: reusing `text` would clobber the search
        # results that the "Convert to dataframe" cell reads.
        gene_entry = _get(gene.name).text
        for line in gene_entry.split('\n'):
            if line.startswith('NAME'):
                symbols = line.split()[1:]
                # Skip a NAME line that carries no value instead of raising IndexError
                if symbols:
                    genes.add(symbols[0].rstrip(','))

    print('\tFound {} genes'.format(len(genes)))

    # Output to file based on pathway description
    out_path = os.path.join(output_dir, '-'.join(description.split()) + '.txt')
    with open(out_path, 'w') as out_file:
        out_file.write('\n'.join(sorted(genes)))

Finding genes for: Pathways in cancer - hsa05200
	Found 462 genes
Finding genes for: Transcriptional misregulation in cancer - hsa05202
	Found 179 genes
Finding genes for: Proteoglycans in cancer - hsa05205
	Found 194 genes
Finding genes for: MicroRNAs in cancer - hsa05206
	Found 299 genes
Finding genes for: Colorectal cancer - hsa05210
	Found 72 genes
Finding genes for: Pancreatic cancer - hsa05212
	Found 75 genes
Finding genes for: Endometrial cancer - hsa05213
	Found 58 genes
Finding genes for: Prostate cancer - hsa05215
	Found 97 genes
Finding genes for: Thyroid cancer - hsa05216
	Found 37 genes
Finding genes for: Bladder cancer - hsa05219
	Found 41 genes
Finding genes for: Small cell lung cancer - hsa05222
	Found 84 genes
Finding genes for: Non-small cell lung cancer - hsa05223
	Found 66 genes
Finding genes for: Breast cancer - hsa05224
	Found 130 genes
Finding genes for: Gastric cancer - hsa05226
	Found 132 genes
Finding genes for: Central carbon metabolism in cancer - hsa05230
	