In [None]:
import pandas as pd

from tqdm import tqdm_notebook as tqdm

## Read data

In [None]:
# Interaction table: fields are separated by single spaces.
df_ppi = pd.read_csv('data/9606.protein.links.v10.5.txt.gz', sep=' ')

# Alias table: tab-separated. The first line of the file is skipped and the
# column names are supplied explicitly instead of being read from the file.
df_map = pd.read_csv(
    'data/9606.protein.aliases.v10.5.txt.gz',
    sep='\t',
    skiprows=1,
    header=None,
    names=['string_protein_id', 'alias', 'source'],
)

In [None]:
# Preview the first rows of the interaction table.
df_ppi.head(n=5)

In [None]:
# Preview the first rows of the alias table.
df_map.head(n=5)

## ID mappings

### Map StringDB to Ensembl

In [None]:
# Keep only the alias rows whose source is exactly 'Ensembl'.
df_map_ens = df_map.loc[df_map['source'].eq('Ensembl')]

# Lookup table: STRING protein ID -> alias. A dict holds one value per key,
# so an ID that occurs on several rows ends up with a single alias.
string2ensemble = df_map_ens.set_index('string_protein_id')['alias'].to_dict()

In [None]:
# Preview the first rows of the Ensembl-only alias table.
df_map_ens.head(n=5)

### Map Ensembl to Entrez

In [None]:
def split_and_expand(df, id_col, val_col, sep='; '):
    """Turn a delimited multi-value column into long format, one value per row.

    Each entry of ``df[val_col]`` is split on ``sep``; the resulting pieces
    are melted so that every piece gets its own row next to its ``id_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It may contain any number of columns besides ``id_col``
        and ``val_col``; the extra columns are ignored in the output.
    id_col : label or list of labels
        Column(s) carried over unchanged as identifier (passed to
        ``pd.melt`` as ``id_vars``).
    val_col : label
        String column holding the ``sep``-delimited values.
    sep : str, default '; '
        Delimiter between the values inside ``val_col``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_col``, ``'variable'`` (position of the piece within the
        split) and ``'value'`` (the piece itself). Rows containing any
        missing value are dropped, which also removes the padding created
        for entries with fewer pieces than the longest one.
    """
    split_expanded = df[val_col].str.split(sep, expand=True)
    df_expanded = df.join(split_expanded)

    # Melt exactly the columns produced by the split. Deriving them from the
    # total column count minus two only works when `df` has precisely two
    # columns and makes melt fail with a KeyError for wider inputs.
    expanded_column_names = list(split_expanded.columns)
    df_long = pd.melt(df_expanded, id_vars=id_col, value_vars=expanded_column_names).dropna()
    return df_long

In [None]:
# Tab-separated ID-mapping table without a header row; columns are addressed
# by integer position in the cells that follow.
df_uni = pd.read_csv(
    'data/HUMAN_9606_idmapping_selected.tab.gz',
    sep='\t',
    header=None,
)

In [None]:
# Pull columns 2 and 18 and give them the names the split helper is called with.
# NOTE(review): column 18 is presumably the Ensembl gene ID field - confirm
# against the README of the ID-mapping file.
df_uni_sub = df_uni[[2, 18]].rename(columns={2: 'entrez', 18: 'ensembl'})
display(df_uni_sub.head())

df_uni_ens = split_and_expand(df_uni_sub, id_col='entrez', val_col='ensembl')
# Spot check: rows whose entrez value is the string '7529'.
df_uni_ens.loc[df_uni_ens['entrez'] == '7529']

In [None]:
# Pull columns 2 and 19 and give them the names the split helper is called with.
# NOTE(review): column 19 is presumably the Ensembl transcript ID field -
# confirm against the README of the ID-mapping file.
df_uni_sub = df_uni[[2, 19]].rename(columns={2: 'entrez', 19: 'ensembl'})
display(df_uni_sub.head())

df_uni_ens_trs = split_and_expand(df_uni_sub, id_col='entrez', val_col='ensembl')
# Spot check: rows whose entrez value is the string '7529'.
df_uni_ens_trs.loc[df_uni_ens_trs['entrez'] == '7529']

In [None]:
# Pull columns 2 and 20 and give them the names the split helper is called with.
# NOTE(review): column 20 is presumably the Ensembl protein ID field -
# confirm against the README of the ID-mapping file.
df_uni_sub = df_uni[[2, 20]].rename(columns={2: 'entrez', 20: 'ensembl'})
display(df_uni_sub.head())

df_uni_ens_pro = split_and_expand(df_uni_sub, id_col='entrez', val_col='ensembl')
# Spot check: rows whose entrez value is the string '7529'.
df_uni_ens_pro.loc[df_uni_ens_pro['entrez'] == '7529']

In [None]:
# Stack the three expanded tables on top of each other (row-wise).
df_uni_long = pd.concat((df_uni_ens, df_uni_ens_trs, df_uni_ens_pro))

# Lookup table: split-out identifier ('value') -> entrez. A dict holds one
# value per key, so an identifier seen on several rows keeps a single entry.
ensemble2entrez = df_uni_long.set_index('value')['entrez'].to_dict()

## Convert StringDB interactions to Entrez IDs

In [None]:
def convert(stringdb_id):
    """Translate a STRING protein ID into an Entrez ID.

    The ID is first looked up in the module-level ``string2ensemble``
    mapping; the result of that lookup is then looked up in the module-level
    ``ensemble2entrez`` mapping.

    Raises
    ------
    KeyError
        If either of the two dict lookups has no entry for its key.
    """
    return ensemble2entrez[string2ensemble[stringdb_id]]

In [None]:
# Resolve every distinct STRING ID once, instead of repeating both dict
# lookups for each endpoint of each interaction row.
string_ids = pd.concat([df_ppi['protein1'], df_ppi['protein2']]).unique()
string2entrez = {}
for string_id in tqdm(string_ids):
    try:
        string2entrez[string_id] = convert(string_id)
    except KeyError:
        # convert() found no entry in one of the two lookup tables; every
        # interaction involving this ID is dropped below and counted in the
        # summary rather than vanishing silently.
        continue

# Keep only interactions whose two partners both resolved (row order is preserved).
mapped_ids = list(string2entrez)
both_mapped = df_ppi['protein1'].isin(mapped_ids) & df_ppi['protein2'].isin(mapped_ids)
df_ppi_mapped = df_ppi[both_mapped]

converted_interactions = list(zip(
    df_ppi_mapped['protein1'].map(string2entrez),
    df_ppi_mapped['protein2'].map(string2entrez),
    df_ppi_mapped['combined_score'],
))
df_conv = pd.DataFrame(converted_interactions, columns=['protein1', 'protein2', 'combined_score'])

# Make the amount of data lost to missing ID mappings visible.
print('Converted {} of {} interactions; {} of {} STRING IDs had no Entrez mapping.'.format(
    len(df_conv), len(df_ppi), len(string_ids) - len(string2entrez), len(string_ids)))

In [None]:
# Preview the first rows of the converted interaction table.
df_conv.head(n=5)