# RSEM Mapping
Subset a table of quantification values, output by RSEM, from Gencode/Ensemble names to HUGO

In [2]:
import os
import pandas as pd
import itertools
import random

In [3]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

In [4]:
tab_paths, results_paths = [], []
for root, sub, files in os.walk('.'):
    for f in files:
        if 'gene' in f:
            tab_paths.append(os.path.join(root, f))
        elif 'isoform' in f:
            results_paths.append(os.path.join(root, f))

In [5]:
id_map = pd.read_table('attrs.tsv', sep='\t')

In [6]:
id_map.head()

Unnamed: 0,geneId,geneName,geneType,geneStatus,transcriptId,transcriptName,transcriptType,transcriptStatus,havanaGeneId,havanaTranscriptId,ccdsId,level,transcriptClass
0,ENSG00000223972.5,DDX11L1,transcribed_unprocessed_pseudogene,KNOWN,ENST00000456328.2,DDX11L1-002,processed_transcript,KNOWN,OTTHUMG00000000961.2,OTTHUMT00000362751.1,,2,pseudo
1,ENSG00000223972.5,DDX11L1,transcribed_unprocessed_pseudogene,KNOWN,ENST00000450305.2,DDX11L1-001,transcribed_unprocessed_pseudogene,KNOWN,OTTHUMG00000000961.2,OTTHUMT00000002844.2,,2,pseudo
2,ENSG00000227232.5,WASH7P,unprocessed_pseudogene,KNOWN,ENST00000488147.1,WASH7P-001,unprocessed_pseudogene,KNOWN,OTTHUMG00000000958.1,OTTHUMT00000002839.1,,2,pseudo
3,ENSG00000278267.1,MIR6859-1,miRNA,KNOWN,ENST00000619216.1,MIR6859-1-201,miRNA,KNOWN,,,,3,nonCoding
4,ENSG00000243485.3,RP11-34P13.3,lincRNA,KNOWN,ENST00000473358.1,RP11-34P13.3-001,lincRNA,KNOWN,OTTHUMG00000000959.2,OTTHUMT00000002840.1,,2,nonCoding


In [5]:
gene_mappings = {x: y for x, y in itertools.izip(id_map['geneId'], id_map['geneName'])}
isoform_mappings = {x: y for x, y in itertools.izip(id_map['transcriptId'], id_map['transcriptName'])}

In [6]:
genes = {x: pd.read_table(x, sep='\t') for x in tab_paths }
isoforms = {x: pd.read_table(x, sep='\t') for x in results_paths}

In [7]:
genes[random.choice(genes.keys())].head()

Unnamed: 0,gene_id,01563ead-2437-4282-81b1-f0db63d72e9c
0,ENSG00000000003.14,0.0
1,ENSG00000000005.5,0.0
2,ENSG00000000419.12,1435.1225
3,ENSG00000000457.13,180.8233
4,ENSG00000000460.16,150.0782


In [8]:
isoforms[random.choice(isoforms.keys())].head()

Unnamed: 0,gene_id/transcript_id,01563ead-2437-4282-81b1-f0db63d72e9c
0,ENSG00000000003.14/ENST00000373020.8,0
1,ENSG00000000003.14/ENST00000494424.1,0
2,ENSG00000000003.14/ENST00000496771.5,0
3,ENSG00000000003.14/ENST00000612152.4,0
4,ENSG00000000003.14/ENST00000614008.4,0


In [9]:
def replace_gene_names(table, gene_mappings):
    gene_names = []
    keyerrors = 0.0
    for gene_id in table['gene_id']:
        try:    
            gene_names.append(gene_mappings[gene_id])
        except KeyError:
            keyerrors += 1
            # gene_names.append(gene_id)
            table.drop(table[table['gene_id'] == gene_id].index, inplace=True)
    print "Number of unmapped genes: {}, of {} total genes.".format(keyerrors, len(table['gene_id']))
    print "{}% of genes succesfully mapped.".format(100*round(1-(keyerrors/len(table['gene_id'])),4))
    table['gene_id'] = gene_names
    return table

In [10]:
def replace_isoform_names(table, isoform_mappings, tabs=True):
    isoform_names = []
    keyerrors = 0.0
    name = 'gene_id/transcript_id' if tabs else 'transcript_id'
    for transcript_id in table[name]:
        try:    
            if tabs:
                isoform_names.append(isoform_mappings[transcript_id.split('/')[1]])
            else:
                isoform_names.append(isoform_mappings[transcript_id])
        except KeyError:
            keyerrors += 1
            # isoform_names.append(transcript_id)
            table.drop(table[table[name]==transcript_id].index, inplace=True)
    print "Number of unmapped isoforms: {}, of {} total genes.".format(keyerrors, len(table[name]))
    print "{}% of isoforms succesfully mapped.".format(100*round(1-(keyerrors/len(table[name])),4))
    table[name] = isoform_names
    return table

In [11]:
for gene in genes:
    replace_gene_names(genes[gene], gene_mappings)

Number of unmapped genes: 50.0, of 60448 total genes.
99.92% of genes succesfully mapped.
Number of unmapped genes: 50.0, of 60448 total genes.
99.92% of genes succesfully mapped.
Number of unmapped genes: 50.0, of 60448 total genes.
99.92% of genes succesfully mapped.
Number of unmapped genes: 50.0, of 60448 total genes.
99.92% of genes succesfully mapped.
Number of unmapped genes: 50.0, of 60448 total genes.
99.92% of genes succesfully mapped.


In [12]:
for isoform in isoforms:
    if '.tab' in isoform:
        replace_isoform_names(isoforms[isoform], isoform_mappings)
    else:
        replace_isoform_names(isoforms[isoform], isoform_mappings, tabs=False)

Number of unmapped isoforms: 164.0, of 198455 total genes.
99.92% of isoforms succesfully mapped.
Number of unmapped isoforms: 164.0, of 198455 total genes.
99.92% of isoforms succesfully mapped.
Number of unmapped isoforms: 164.0, of 198455 total genes.
99.92% of isoforms succesfully mapped.
Number of unmapped isoforms: 164.0, of 198455 total genes.
99.92% of isoforms succesfully mapped.
Number of unmapped isoforms: 164.0, of 198455 total genes.
99.92% of isoforms succesfully mapped.


In [13]:
# os.mkdir(os.path.join(os.path.split(random.choice(genes.keys()))[0], 'HUGO_names'))
for gene in genes:
    fpath = os.path.join(os.path.splitext(gene)[0] + '.HUGO' + os.path.splitext(gene)[1])
    print fpath
    genes[gene].to_csv(fpath, sep='\t', index=False)

./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem.genes.norm_fpkm.HUGO.tab
./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem.genes.norm_tpm.HUGO.tab
./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem_genes.HUGO.results
./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem.genes.norm_counts.HUGO.tab
./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem.genes.raw_counts.HUGO.tab


In [14]:
for isoform in isoforms:
    fpath = os.path.join(os.path.splitext(isoform)[0] + '.HUGO' + os.path.splitext(isoform)[1])
    print fpath
    isoforms[isoform].to_csv(fpath, sep='\t', index=False)

./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem.isoform.raw_counts.HUGO.tab
./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem.isoform.norm_counts.HUGO.tab
./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem.isoform.norm_tpm.HUGO.tab
./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem_isoforms.HUGO.results
./01563ead-2437-4282-81b1-f0db63d72e9c/RSEM/01563ead-2437-4282-81b1-f0db63d72e9c.rsem.isoform.norm_fpkm.HUGO.tab
