In [1]:
import pandas as pd
from indra.databases import hgnc_client
from protmapper import ProtMapper
from protmapper import uniprot_client
from requests.exceptions import HTTPError
import csv

In [2]:
# Load the phosphosite spreadsheet. The first column is used as the row index;
# the mapping loop later reads that index value as the site id.
df = pd.read_excel(
    'data/MCF10A EGF TC LargevMini Comp 10_16.xlsx',
    index_col=0,
)

In [3]:
# Quantification columns carried through unchanged to the exported table,
# in the order they will appear in the output.
values_cols = [
    'default_126_sn_sum',
    'default_127n_sn_sum',
    'default_127c_sn_sum',
    'default_128n_sn_sum',
    'default_128c_sn_sum',
    'default_129n_sn_sum',
    'default_129c_sn_sum',
    'default_130n_sn_sum',
    'default_130c_sn_sum',
    'default_131_sn_sum',
    'default_131c_sn_sum',
]
# Columns describing the site itself. The mapping loop unpacks them
# positionally, so the order here matters.
site_cols = [
    'Protein Id',
    'gene_symbol',
    'Site Position',
    'Motif',
]

In [4]:
def normalize_gene_symbol(id_str, gene_name):
    """Return the HGNC name for a row's gene, or None if it cannot be resolved.

    The gene symbol is looked up first. If that lookup yields nothing, or
    yields a list (treated here as ambiguous), the UniProt accession embedded
    in ``id_str`` is used instead: its gene name is fetched and resolved to an
    HGNC ID.

    Parameters
    ----------
    id_str : str
        Pipe-delimited protein identifier whose second field is a UniProt
        accession, e.g. 'sp|O15066|KIF3B_HUMAN'. Anything after a '-' in the
        accession is discarded before the lookup.
    gene_name : str
        Gene symbol as given in the spreadsheet.

    Returns
    -------
    str or None
        The name HGNC reports for the resolved ID, or None when that ID has
        no UniProt entry according to ``hgnc_client``.
    """
    hgnc_id = hgnc_client.get_current_hgnc_id(gene_name)
    needs_fallback = hgnc_id is None or isinstance(hgnc_id, list)
    if needs_fallback:
        accession = id_str.split('|')[1]
        base_accession = accession.split('-')[0]
        fallback_symbol = uniprot_client.get_gene_name(base_accession)
        hgnc_id = hgnc_client.get_hgnc_id(fallback_symbol)
    # Only genes that HGNC can link to a UniProt entry are kept.
    if not hgnc_client.get_uniprot_id(hgnc_id):
        return None
    return hgnc_client.get_hgnc_name(hgnc_id)

In [5]:
def normalize_motif(motif):
    """Remove 'x' characters from a motif and compute the site position in it.

    Parameters
    ----------
    motif : str
        Motif string in which the site is assumed to be the 7th character and
        'x' is used as filler.

    Returns
    -------
    tuple
        ``(peptide, pos)`` where ``peptide`` is ``motif`` with every 'x'
        removed and ``pos`` is 7 minus the index at which ``peptide`` starts
        inside ``motif``. With only leading/trailing filler this is the
        1-based position of the site within ``peptide``.
    """
    peptide = motif.replace('x', '')
    # Number of characters in front of the peptide; 0 when nothing precedes it.
    # find() returns -1 if an 'x' sat inside the sequence, which yields pos 8.
    leading = motif.find(peptide)
    return (peptide, 7 - leading)

In [6]:
# Map each row's motif onto the human reference sequence and sort the rows
# into outcome buckets:
#   ok       - rows to export (position replaced by the mapped one if it differed)
#   ms_none  - the mapper returned no mapped position
#   ms_diff  - mapped position disagrees with the spreadsheet (also added to ok)
#   http_err - the mapper call raised an HTTPError
ok, ms_none, ms_diff, http_err = [], [], [], []
pm = ProtMapper()
cols = site_cols + values_cols
for row in df[cols].itertuples():
    # itertuples() yields the index value first, then the selected columns in
    # the order given by cols: the four site columns, then the value columns.
    site_id, id_str, gene, pos, motif = row[:5]
    values = row[5:]
    # Ids containing ';' are skipped; such rows appear to bundle several
    # sites (e.g. id '92211;92213;92212' with position '728;729;738').
    if ';' in str(site_id):
        continue
    norm_sym = normalize_gene_symbol(id_str, gene)
    norm_motif, norm_pos = normalize_motif(motif)
    # Rows whose gene cannot be normalized are dropped without being recorded.
    if norm_sym is None:
        continue
    site_info = (site_id, id_str, gene, pos, motif)
    try:
        ms = pm.map_peptide_to_human_ref(norm_sym, 'hgnc', norm_motif, norm_pos)
    except HTTPError:
        http_err.append(site_info)
        continue
    if ms.mapped_pos is None:
        ms_none.append(site_info)
        continue
    # Prefer the mapped position when it disagrees with the spreadsheet,
    # and remember the disagreement for later inspection.
    export_pos = pos
    if int(ms.mapped_pos) != int(pos):
        ms_diff.append(site_info)
        export_pos = ms.mapped_pos
    ok.append((site_id, id_str, gene, export_pos, motif) + tuple(values))

INFO: [2019-09-13 13:25:27] protmapper.uniprot_client - Loading Swissprot sequences...
INFO: [2019-09-13 13:25:29] protmapper.uniprot_client - Loading Uniprot isoform sequences...


In [14]:
# Write the successfully mapped sites to a tab-separated file: four leading
# columns (row id, gene, site, an empty effect column) followed by the
# quantification columns.
header = ['idcol', 'symcol', 'sitescol', 'effectcol'] + values_cols
csv_rows = [header]
for site_row in ok:
    site_id, id_str, gene, pos, motif = site_row[0:5]
    values = list(site_row[5:])
    # Row id joins the identifying fields with '|'. The site label is the
    # 7th motif character (the position normalize_motif treats as the site)
    # followed by the integer position.
    csv_row = ['%s|%s|%s|%s' % (site_id, id_str, gene, pos),
               gene, '%s%d' % (motif[6], int(pos)), ''] + values
    csv_rows.append(csv_row)
# newline='' lets the csv writer emit its own line terminators; without it,
# platforms that translate '\n' on write produce '\r\r\n' row endings.
with open('phosphosites.tsv', 'wt', newline='') as f:
    csvwriter = csv.writer(f, delimiter='\t')
    csvwriter.writerows(csv_rows)


In [8]:
# Leftover post-mortem debugging session (IPython %debug). The transcript
# recorded below inspects a row whose position is '728;729;738', which
# int(pos) cannot parse. The magic only does something right after an
# exception, so it is not part of the analysis.
# NOTE(review): consider deleting this cell before sharing the notebook.
%debug

> [0;32m<ipython-input-7-3a36b90ab305>[0m(7)[0;36m<module>[0;34m()[0m
[0;32m      5 [0;31m    [0mvalues[0m [0;34m=[0m [0mrow[0m[0;34m[[0m[0;36m5[0m[0;34m:[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m    row = ['%s|%s|%s|%s' % (site_id, id_str, gene, pos),
[0m[0;32m----> 7 [0;31m           gene, '%s%d' % (motif[6], int(pos)), ''] + values          
[0m[0;32m      8 [0;31m    [0mrows[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mrow[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      9 [0;31m[0;32mwith[0m [0mopen[0m[0;34m([0m[0;34m'phosphosites.tsv'[0m[0;34m,[0m [0;34m'wt'[0m[0;34m)[0m [0;32mas[0m [0mf[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> 
ipdb> site_id
'92211;92213;92212'
ipdb> id_str
'sp|O15066|KIF3B_HUMAN'
ipdb> gene
'KIF3B'
ipdb> pos
'728;729;738'
ipdb> exit
