In [None]:
import pandas as pd
import networkx as nx

from tqdm import tqdm_notebook as tqdm

## Helper functions

In [None]:
def split_and_expand(df, id_col, val_col, sep=','):
    """ Split dataframe cell along a separator and create an individual row per entry

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only `id_col` and `val_col` are used; any further
        columns are ignored.
    id_col : str or list of str
        Column(s) whose values are repeated for every entry that is split
        out of `val_col`.
    val_col : str
        String column whose cells hold `sep`-separated entries.
    sep : str, default ','
        Separator handed to `Series.str.split`.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns `id_col` and `val_col` holding one
        entry per row. Rows with missing values are dropped. Rows are ordered
        by entry position first (all first entries, then all second ones, ...).
    """
    df_tmp = df.reset_index(drop=True)

    # one column per split position; shorter cells are padded with missing values
    split_expanded = df_tmp[val_col].str.split(sep, expand=True)

    # Take the labels straight from the split result instead of deriving them
    # from the total column count, which only worked for two-column inputs.
    expanded_column_names = list(split_expanded.columns)

    # Only carry the id column(s) along: leaving `val_col` in the frame would
    # collide with `value_name=val_col`, which pandas>=2.0 rejects.
    id_cols = id_col if isinstance(id_col, list) else [id_col]
    df_expanded = df_tmp[id_cols].join(split_expanded)

    df_long = pd.melt(
        df_expanded,
        id_vars=id_col, value_vars=expanded_column_names,
        value_name=val_col
    ).dropna()

    # the split position ('variable') is only a melt artefact
    return df_long.drop('variable', axis=1)

# Quick sanity check on a toy frame: every comma-separated entry of 'B'
# should end up in its own row next to the matching 'A' value.
df_demo = pd.DataFrame({
    'A': [1, 2, 3],
    'B': ['foo,bar', 'baz', 'qux,fubar,hui'],
})
split_and_expand(df_demo, 'A', 'B')

## Read data

In [None]:
# input files
ppi_path = 'data/9606.protein.links.v10.5.txt.gz'
alias_path = 'data/9606.protein.aliases.v10.5.txt.gz'
alias_columns = ['string_protein_id', 'alias', 'source']

# the links file is space-separated
df_ppi = pd.read_table(ppi_path, sep=' ')

# the alias file's first line is skipped and explicit column names are supplied
df_map = pd.read_table(
    alias_path, skiprows=1, header=None, names=alias_columns)

In [None]:
# preview the first rows of the protein-link table
df_ppi.head()

In [None]:
# preview the first rows of the alias table
df_map.head()

## ID mappings

### Map StringDB to Ensembl

In [None]:
# keep only complete rows whose source is exactly 'Ensembl'
is_ensembl = df_map['source'] == 'Ensembl'
df_map_ens = df_map.loc[is_ensembl].dropna()

# lookup: StringDB protein id -> Ensembl alias (the last alias wins for repeated ids)
string2ensemble = df_map_ens.set_index('string_protein_id')['alias'].to_dict()

In [None]:
# two-column StringDB -> Ensembl table with short column names
df_map_str2ens = (
    df_map_ens[['string_protein_id', 'alias']]
    .rename(columns={'string_protein_id': 'stringdb', 'alias': 'ensembl'})
)

df_map_str2ens.head()

### Map Ensembl to Uniprot

In [None]:
# available Ensembl prefixes: distinct first four characters of the mapped ids
df_map_str2ens['ensembl'].str[:4].unique()

In [None]:
# Ensembl proteins
df_map_ens_prot = df_map_str2ens[df_map_str2ens['ensembl'].str.contains('ENSP')]
with open('results/ensembleids_prot.txt', 'w') as fd:
    fd.write('\n'.join(df_map_ens_prot['ensembl'].tolist()))
print(df_map_ens_prot.shape)
display(df_map_ens_prot.head(1))

# Ensembl transcripts
df_map_ens_trans = df_map_str2ens[df_map_str2ens['ensembl'].str.contains('ENST')]
with open('results/ensembleids_trans.txt', 'w') as fd:
    fd.write('\n'.join(df_map_ens_trans['ensembl'].tolist()))
print(df_map_ens_trans.shape)
display(df_map_ens_trans.head(1))

# split transcripts file, as uniprot only allows <2MB uploads
!split -b 1200000 results/ensembleids_trans.txt results/ensembleids_trans_sub_

# conversion with http://www.uniprot.org/uploadlists/
# EnsembleProtein: 65554/74741 mapped
# EnsembleTranscript: 85862/146549 mapped

In [None]:
# load the mapping tables that were downloaded from the web query
df_map_uni_prot, df_map_uni_trans_sub_aa, df_map_uni_trans_sub_ab = (
    pd.read_table(result_path)
    for result_path in (
        'results/ensemble_uniprot_map_prot.tsv.gz',
        'results/ensemble_uniprot_map_trans_sub_aa.tsv.gz',
        'results/ensemble_uniprot_map_trans_sub_ab.tsv.gz',
    )
)

# show the first row of each table
display(
    df_map_uni_prot.head(1),
    df_map_uni_trans_sub_aa.head(1),
    df_map_uni_trans_sub_ab.head(1),
)

In [None]:
# give all three result tables the same column names (renamed in place,
# the frames are reused under these names further down)
ens2uni_frames = [df_map_uni_prot, df_map_uni_trans_sub_aa, df_map_uni_trans_sub_ab]
for ens2uni_frame in ens2uni_frames:
    ens2uni_frame.columns = ('ensembl', 'uniprot')

# stack protein- and transcript-based results
df_map_ens2uni_pre = pd.concat(ens2uni_frames, axis=0)

# handle multiple Ensembl entries per row: one row per (uniprot, ensembl) pair
df_map_ens2uni = split_and_expand(df_map_ens2uni_pre, 'uniprot', 'ensembl')

print(df_map_ens2uni.shape)
df_map_ens2uni.head()

### Map Uniprot to Entrez

In [None]:
# write the UniProt ids of each result table to its own upload file, one id per line
uniprot_exports = [
    ('results/uniprot_prot.txt', df_map_uni_prot),
    ('results/uniprot_trans_sub_aa.txt', df_map_uni_trans_sub_aa),
    ('results/uniprot_trans_sub_ab.txt', df_map_uni_trans_sub_ab),
]
for export_path, uniprot_frame in uniprot_exports:
    with open(export_path, 'w') as fd:
        fd.write('\n'.join(uniprot_frame['uniprot'].tolist()))


# The id lists were converted manually with http://www.uniprot.org/uploadlists/
# Ensembl protein set:    13265/52380 mapped to 11263 Entrez
# Ensembl transcript set: 25504/66976 mapped 23309 Entrez

In [None]:
# load the UniProt -> Entrez tables that were downloaded from the web query
df_map_uni_entrez_prot = pd.read_table('results/uniprot_entrez_prot.tsv.gz')

# the transcript-based results arrive in two parts and are stacked
entrez_trans_parts = [
    pd.read_table(part_path)
    for part_path in (
        'results/uniprot_entrez_trans_sub_aa.tsv.gz',
        'results/uniprot_entrez_trans_sub_ab.tsv.gz',
    )
]
df_map_uni_entrez_trans = pd.concat(entrez_trans_parts, axis=0)

# shape and first row of each table
for entrez_frame in (df_map_uni_entrez_prot, df_map_uni_entrez_trans):
    print(entrez_frame.shape)
    display(entrez_frame.head(1))

In [None]:
# give both tables the same column names (renamed in place)
uni2ent_frames = [df_map_uni_entrez_prot, df_map_uni_entrez_trans]
for uni2ent_frame in uni2ent_frames:
    uni2ent_frame.columns = ('uniprot', 'entrez')

# stack protein- and transcript-based results
df_map_uni2ent = pd.concat(uni2ent_frames, axis=0)

print(df_map_uni2ent.shape)
df_map_uni2ent.head()

### Generate mapping dict

In [None]:
# Chain the three mapping tables: stringdb -> ensembl -> uniprot -> entrez.
# `merge` defaults to an inner join, so only ids present on both sides survive.
df_mapping = (
    df_map_str2ens
    .merge(df_map_ens2uni, on='ensembl')
    .merge(df_map_uni2ent, on='uniprot')
)

# clean multi-entries: keep only the first of any comma-separated Ensembl ids
# (`.str[0]` leaves missing values untouched instead of raising)
df_mapping['ensembl'] = df_mapping['ensembl'].str.split(',').str[0]

# remove duplicates
df_mapping = df_mapping.drop_duplicates()

# save result
# NOTE: the file is named *.tsv.gz, so it is written tab-separated, like the
# other .tsv.gz files in this notebook. It used to be written with the default
# comma separator; readers that relied on that must now pass sep='\t'.
df_mapping.to_csv(
    'results/gene_id_mapping.tsv.gz', sep='\t', index=False, compression='gzip')
print(df_mapping.shape)
df_mapping.head()

In [None]:
# lookup: StringDB id -> Entrez id (for repeated StringDB ids the last row wins)
stringdb_to_entrez = df_mapping.set_index('stringdb')['entrez']
gene_id_map = stringdb_to_entrez.to_dict()

# show one example entry
list(gene_id_map.items())[0]

## Convert stringDB

In [None]:
def convert(stringdb_id):
    """ Look up the Entrez id stored for a StringDB id in `gene_id_map`

    Raises a KeyError if `stringdb_id` has no entry in the mapping.
    """
    entrez_id = gene_id_map[stringdb_id]
    return entrez_id

In [None]:
# Translate both interaction partners to Entrez ids; interactions where either
# partner has no mapping are skipped.
converted_interactions = []
for interaction in tqdm(df_ppi.itertuples(), total=len(df_ppi)):
    try:
        entrez_pair = (convert(interaction.protein1), convert(interaction.protein2))
    except KeyError:
        # at least one partner is missing from the id mapping
        continue
    converted_interactions.append((*entrez_pair, interaction.combined_score))

conv_columns = ['protein1', 'protein2', 'combined_score']
df_conv = pd.DataFrame(converted_interactions, columns=conv_columns)

In [None]:
# persist the converted interactions as a gzip-compressed, tab-separated file
df_conv.to_csv('data/stringdb_entrez.tsv.gz', sep='\t', index=False, compression='gzip')

In [None]:
# compare table sizes before and after the id conversion
print(f'StringDB shape conversion: {df_ppi.shape} -> {df_conv.shape}')

# first rows of the original and the converted table
display(df_ppi.head(), df_conv.head())

## Network statistics

In [None]:
# both tables share the same edge-list layout
edgelist_spec = dict(
    source='protein1', target='protein2', edge_attr='combined_score')

# one graph from the original StringDB ids, one from the converted Entrez ids
graph_orig = nx.from_pandas_edgelist(df_ppi, **edgelist_spec)
graph_conv = nx.from_pandas_edgelist(df_conv, **edgelist_spec)

In [None]:
def graph_info(name, graph):
    """ Print a short summary (type, node/edge counts, average degree) of a graph

    The summary is built from the graph's own methods instead of `nx.info`,
    which was removed in NetworkX 3.0.

    Parameters
    ----------
    name : str
        Label printed in the header line.
    graph : networkx graph
        Object providing `number_of_nodes()`, `number_of_edges()` and `degree()`.
    """
    print(f'--- {name} ---')

    n_nodes = graph.number_of_nodes()
    n_edges = graph.number_of_edges()
    print(f'Type: {type(graph).__name__}')
    print(f'Number of nodes: {n_nodes}')
    print(f'Number of edges: {n_edges}')

    # the average degree is undefined for an empty graph
    if n_nodes > 0:
        total_degree = sum(degree for _, degree in graph.degree())
        print(f'Average degree: {total_degree / n_nodes:.4f}')

In [None]:
# print the summaries of both graphs, separated by a blank line
graph_info('Original graph', graph_orig)
print()
graph_info('Converted graph', graph_conv)