In [None]:
import pandas as pd
import networkx as nx

from tqdm import tqdm_notebook as tqdm

from gene_map import GeneMapper

## Read data

In [None]:
# STRING protein-protein interaction table (space-delimited).
df_ppi = pd.read_csv('data/9606.protein.links.v10.5.txt.gz', sep=' ')

# STRING alias table (tab-delimited): the first line of the file is skipped
# and the three columns are named explicitly.
df_map = pd.read_csv(
    'data/9606.protein.aliases.v10.5.txt.gz',
    sep='\t',
    skiprows=1,
    header=None,
    names=['string_protein_id', 'alias', 'source'],
)

In [None]:
# Preview the first rows of the raw interaction table.
df_ppi.head()

In [None]:
# Preview the first rows of the raw alias table.
df_map.head()

## ID mappings

### Map StringDB to Ensembl

In [None]:
# Keep only alias rows whose source is exactly 'Ensembl', dropping rows with
# any missing value.
df_map_ens = df_map.loc[df_map['source'] == 'Ensembl'].dropna()

# Lookup table: STRING protein id -> Ensembl alias.
string2ensemble = df_map_ens.set_index('string_protein_id')['alias'].to_dict()

In [None]:
# Two-column STRING -> Ensembl table with short column names.
df_map_str2ens = df_map_ens.loc[:, ['string_protein_id', 'alias']]
df_map_str2ens.columns = ('stringdb', 'ensembl')

print(df_map_str2ens.shape)
df_map_str2ens.head()

### Map Ensembl to Entrez GeneID

In [None]:
# Mapper from the local `gene_map` module; queried below to translate the
# Ensembl identifiers into Entrez GeneIDs.
gm = GeneMapper()

In [None]:
# Unique Ensembl identifiers to look up.
id_list = set(df_map_str2ens['ensembl'].tolist())

# Run the same query once per source id type (protein first, then
# transcript) and stack both result tables.
gm_res_pro, gm_res_trs = (
    gm.query(id_list, source_id_type=id_type, target_id_type='GeneID')
    for id_type in ('Ensembl_PRO', 'Ensembl_TRS')
)
gm_res = pd.concat([gm_res_pro, gm_res_trs])

In [None]:
# Size and first rows of the combined Ensembl_PRO + Ensembl_TRS query results.
print(gm_res.shape)
gm_res.head()

### Generate mapping dict

In [None]:
# Join the STRING -> Ensembl table with the Ensembl -> GeneID query results,
# then drop the redundant join key and give the target column a short name.
# Built as one chained expression so no frame is mutated in place.
df_mapping = (
    df_map_str2ens
    .merge(gm_res, left_on='ensembl', right_on='ID_from')
    .drop('ID_from', axis=1)
    .rename(columns={'ID_to': 'entrez'})
)

# Save the result. The file is named *.tsv.gz, so write it tab-separated
# (the default separator would silently produce a comma-separated file).
df_mapping.to_csv(
    'results/gene_id_mapping.tsv.gz', sep='\t', index=False, compression='gzip')
print(df_mapping.shape)
df_mapping.head()

In [None]:
# Lookup table: STRING protein id -> Entrez id.
gene_id_map = df_mapping.set_index('stringdb')['entrez'].to_dict()

# Show a single (key, value) pair as a sanity check.
list(gene_id_map.items())[0]

## Convert StringDB interactions to Entrez IDs

In [None]:
def convert(stringdb_id):
    """Return the Entrez id stored for a STRING protein identifier.

    The identifier is looked up in the notebook-level ``gene_id_map``
    dictionary; a ``KeyError`` propagates when there is no entry for it.
    """
    entrez_id = gene_id_map[stringdb_id]
    return entrez_id

In [None]:
# Translate both endpoints of every interaction. An interaction is skipped
# when either endpoint has no entry in the mapping (convert raises KeyError).
converted_interactions = []
for row in tqdm(df_ppi.itertuples(), total=len(df_ppi)):
    try:
        e1 = convert(row.protein1)
        e2 = convert(row.protein2)
    except KeyError:
        continue
    converted_interactions.append((e1, e2, row.combined_score))

df_conv = pd.DataFrame(
    converted_interactions,
    columns=['protein1', 'protein2', 'combined_score'],
)

In [None]:
# Persist the converted interaction table as a gzip-compressed,
# tab-separated file without the row index.
df_conv.to_csv('data/stringdb_entrez.tsv.gz', sep='\t', index=False, compression='gzip')

In [None]:
# Report the table shapes before and after the ID conversion, then preview
# both tables.
print(f'StringDB shape conversion: {df_ppi.shape} -> {df_conv.shape}')
display(df_ppi.head(), df_conv.head())

## Network statistics

In [None]:
# Both edge lists share the same column layout, so build the two graphs with
# identical keyword arguments.
_edgelist_columns = dict(
    source='protein1', target='protein2', edge_attr='combined_score')

graph_orig = nx.convert_matrix.from_pandas_edgelist(df_ppi, **_edgelist_columns)
graph_conv = nx.convert_matrix.from_pandas_edgelist(df_conv, **_edgelist_columns)

In [None]:
def graph_info(name, graph):
    """Print a short, titled summary of a graph.

    ``nx.info`` was removed in NetworkX 3.0, so the summary is built from the
    graph's own counting methods and works on any NetworkX version.

    Parameters
    ----------
    name : str
        Title printed above the summary.
    graph : object
        Graph exposing ``number_of_nodes()`` and ``number_of_edges()``.
    """
    n_nodes = graph.number_of_nodes()
    n_edges = graph.number_of_edges()

    print(f'--- {name} ---')
    print(f'Type: {type(graph).__name__}')
    print(f'Number of nodes: {n_nodes}')
    print(f'Number of edges: {n_edges}')
    # Each edge contributes to the degree of two endpoints; skip the average
    # for an empty graph to avoid dividing by zero.
    if n_nodes > 0:
        print(f'Average degree: {2 * n_edges / n_nodes:.4f}')

In [None]:
# Summarise the graph built from the raw StringDB table ...
graph_info('Original graph', graph_orig)
# ... a blank line keeps the two summaries visually apart ...
print()
# ... and the graph built from the converted interaction table.
graph_info('Converted graph', graph_conv)