In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import itertools
import time

## Load InBio_Map Raw Data
#### Source: https://www.intomics.com/inbio/map/#downloads
Downloaded: November 30, 2016  
Last Updated: September 12, 2016   
Note about scoring: According to the supplement of the associated paper (Li T, et al. A scored human protein–protein interaction network to catalyze genomic interpretation. Nature Methods 14, 61–64 (2017) doi:10.1038/nmeth.4083), column 15 (index=14) holds the edge scores. This column contains two `|`-separated values: the confidence score followed by the initial score. We use the confidence score (the first value), as it is a corrected version of the initial score and indicates the confidence that a particular interaction is real.

In [2]:
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
InBio_Map_Raw = pd.read_csv(wd+'Network_Data_Raw/InBio_Map_core_2016_09_12/core.psimitab',sep='\t', header=-1)
print 'Raw edge count in InBio_Map:', len(InBio_Map_Raw)

Raw edge count in InBio_Map: 625641


In [3]:
InBio_Map_Human_Only = InBio_Map_Raw[(InBio_Map_Raw[9]=='taxid:9606(Homo sapiens)') & (InBio_Map_Raw[10]=='taxid:9606(Homo sapiens)')]
print 'Human-Human only interactions in InBioMap:', len(InBio_Map_Human_Only)

Human-Human only interactions in InBioMap: 625641


In [4]:
# Collect every distinct identifier found in either interactor column (0 and 1),
# then coerce each identifier to a plain string.
interactors_a = set(InBio_Map_Human_Only[0])
interactors_b = set(InBio_Map_Human_Only[1])
InBio_Map_Human_Genes = list(map(str, interactors_a | interactors_b))

## Convert Genes

In [5]:
# Build the MyGene.Info query from the gene list. The helper returns the query
# string together with the identifiers it reports as valid and invalid.
query_string, valid_genes, invalid_genes = gct.query_constructor(
    InBio_Map_Human_Genes
)

17653 Valid Query Genes
0 Invalid Query Genes


In [6]:
# Gene naming system(s) that MyGene.Info should search the query terms in
scopes = 'uniprot'

# Naming systems for which gene names should be returned
fields = 'symbol, entrezgene'

In [7]:
# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
print len(match_list), 'Matched query results'

Batch query complete: 39.84 seconds
17984 Matched query results


In [8]:
# Turn the raw match results into a trimmed match table plus the
# query->symbol and query->Entrez mappings used by the conversion step.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(
    match_list, valid_genes
)

Queries without full matching results found: 419

233 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 76.78 seconds


## Construct Converted Network

In [9]:
# Pull the two interactor columns and the score column (index 14) out as plain lists
query_edgelist = InBio_Map_Human_Only[[0, 1, 14]].values.tolist()

# For each edge keep the part of each interactor id after the first ':' and
# the first '|'-separated value of the score field, converted to float.
query_edgelist_fmt = []
for node_a, node_b, score_field in query_edgelist:
    query_edgelist_fmt.append([
        node_a.split(':')[1],
        node_b.split(':')[1],
        float(score_field.split('|')[0]),
    ])

In [10]:
%%time
# Convert weighted edge list
InBioMap_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=True)

CPU times: user 1.89 s, sys: 197 ms, total: 2.09 s
Wall time: 1.87 s


In [11]:
# Clean the converted edge list. The helper's log reports the removal of
# self-edges, edges with un-mapped genes, and duplicate edges.
InBioMap_edgelist_symbol_filt = gct.filter_converted_edgelist(
    InBioMap_edgelist_symbol,
    weighted=True
)

625641 input edges
2498 self-edges removed
12249 edges with un-mapped genes removed
4896 duplicate edges removed
Edge list filtered: 3.15 seconds
605998 Edges remaining


In [12]:
# Save the filtered symbol-level edge list as a SIF file
symbol_sif_path = wd + 'Network_SIFs_Symbol/InBioMap_Symbol.sif'
gct.write_edgelist(InBioMap_edgelist_symbol_filt, symbol_sif_path, binary=False)

Edge list saved: 1.77 seconds


In [13]:
# Attempt a 90th-percentile score filter on the saved SIF file.
# Per this cell's output, the 90% score is 1.0 and no edges are retained.
InBioMap90_edgelist = dit.filter_weighted_network_sif(
    wd+'Network_SIFs_Symbol/InBioMap_Symbol.sif',
    nodeA_col=0, nodeB_col=1, score_col=2,
    q=0.9, delimiter='\t', verbose=True,
    save_path=wd+'Network_SIFs_Symbol/InBioMap90_Symbol.sif')

90.0% score: 1.0
0 / 605998 edges retained


In [14]:
# The filter function didn't work here because the max value makes up >90% of the edges. 
# We need to filter but keep all max edges instead
InBioMap_edgelist = pd.DataFrame(InBioMap_edgelist_symbol_filt, columns=['NodeA', 'NodeB', 'edgeScore'])
q_score = InBioMap_edgelist['edgeScore'].quantile(0.9)
InBioMap_edgelist_filt = InBioMap_edgelist[InBioMap_edgelist['edgeScore']>=q_score]
print InBioMap_edgelist_filt.shape[0], '/', InBioMap_edgelist.shape[0], 'edges kept, ', float(InBioMap_edgelist_filt.shape[0])/InBioMap_edgelist.shape[0]

151352 / 605998 edges kept,  0.249756599857


In [15]:
# Keeping every edge at the quantile score (1.0) retains about 25% of the edges,
# i.e. the network is effectively cut at the 75th percentile, hence the "75" in the
# file name. Only the two node columns are written (no score, index or header).
top_edges = InBioMap_edgelist_filt[['NodeA', 'NodeB']]
top_edges.to_csv(wd+'Network_SIFs_Symbol/InBioMap75_Symbol.sif', sep='\t', index=False, header=False)