In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import time

## Load HumanNet Raw Data
#### Source: http://www.functionalnet.org/humannet/HumanNet.v1.benchmark.txt
Downloaded: August 12, 2016  
No latest version date posted (last updated likely around 2011).  
Citation: Insuk Lee, U. Martin Blom, Peggy I. Wang, Jung Eun Shin, and Edward M. Marcotte.  
Genome Research 21(7):1109-21 (2011).

In [2]:
# Root directory that holds the raw and processed network files.
# NOTE(review): absolute, user-specific path -- point this at your own data directory before running.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# Read the tab-delimited raw file without treating any row as a header, so pandas
# assigns default integer column labels (real names are attached in the next cell).
# `header=None` is the documented way to request this; the previous `header=-1`
# is rejected by current pandas releases.
# NOTE(review): the markdown above cites HumanNet.v1.benchmark.txt as the source,
# while this cell reads HumanNet.v1.join.txt -- confirm which file is intended.
HumanNet_Raw = pd.read_csv(wd+'Network_Data_Raw/HumanNet.v1.join.txt', sep='\t', header=None)

In [3]:
# Build the column names from the evidence-code description file: the first and
# last lines of the file are skipped, and on every remaining line the text
# before ' = ' is used as the column name.
# The `with` block guarantees the file handle is closed once the lines are read
# (a bare `open()` left it open for the life of the kernel).
with open(wd+'Network_Data_Raw/HumanNet.v1.evidence_code.txt') as f:
    evidence_lines = f.read().splitlines()[1:-1]
HumanNet_headers = ['Gene 1', 'Gene 2']+[name.split(' = ')[0] for name in evidence_lines]
# Label the raw table: two gene columns followed by one column per evidence code.
HumanNet_Raw.columns = HumanNet_headers

In [4]:
# Collect every distinct gene identifier that appears in either edge column,
# then store the identifiers as strings.
gene1_ids = set(HumanNet_Raw['Gene 1'])
gene2_ids = set(HumanNet_Raw['Gene 2'])
HumanNet_Raw_Genes = [str(gene_id) for gene_id in gene1_ids.union(gene2_ids)]

In [5]:
# Get edge list of network: one [gene 1, gene 2, IntNet value] entry per row,
# with both gene identifiers converted to strings.
gene_pairs = HumanNet_Raw[['Gene 1','Gene 2']].astype(str)
query_edgelist = pd.concat([gene_pairs, HumanNet_Raw['IntNet']], axis=1).values.tolist()
# Single-argument print call: emits the same text under Python 2 and Python 3
# (the bare `print a, b` statement is a SyntaxError on Python 3).
print('{} HumanNet Edges'.format(len(query_edgelist)))

476399 HumanNet Edges


## Convert genes from Entrez ID to HUGO Symbol

In [6]:
# Build the query for the gene-ID lookup from the stringified gene identifiers.
# The call returns the query string (sent to the batch query below) together with
# two gene collections; presumably the identifiers accepted and rejected as
# queries, per the "Valid/Invalid Query Genes" counts it prints -- confirm against gct.
query_string, valid_genes, invalid_genes = gct.query_constructor(HumanNet_Raw_Genes)

16243 Valid Query Genes
0 Invalid Query Genes


In [7]:
# Set scopes (gene naming systems to search)
scopes = "entrezgene, retired"

# Set fields (systems from which to return gene names from)
fields = "symbol, entrezgene"

# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Single-argument print call: emits the same text under Python 2 and Python 3
# (the bare `print a, b` statement is a SyntaxError on Python 3).
print('{} Matched query results'.format(len(match_list)))

Batch query complete: 19.6 seconds
16243 Matched query results


In [8]:
# Turn the raw match results into a trimmed match table plus two lookups.
# `query_to_symbol` is the one used below to convert the edge list.
# NOTE(review): the lookups presumably map each query gene to its symbol and
# Entrez ID respectively (inferred from the names) -- confirm against gct.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 10

0 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 19.62 seconds


## Construct Converted Network

In [9]:
%%time
# Convert weighted edge list: pass the [gene 1, gene 2, IntNet] edges through the
# query -> symbol lookup. `weighted=True` is set because each edge carries a third
# (score) value in addition to the two gene identifiers.
# NOTE(review): how un-mapped genes are represented in the result is not visible
# here -- see the filtering step that follows.
HumanNet_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)

CPU times: user 1.54 s, sys: 260 ms, total: 1.8 s
Wall time: 1.69 s


In [10]:
# Filter converted edge list.
# Per the recorded output of this cell, the filter reports removing self-edges,
# edges with un-mapped genes, and duplicate edges from the converted list.
HumanNet_edgelist_symbol_filt = gct.filter_converted_edgelist(HumanNet_edgelist_symbol, weighted=True)

476399 input edges
7 self-edges removed
225 edges with un-mapped genes removed
208 duplicate edges removed
Edge list filtered: 4.15 seconds
475959 Edges remaining


In [11]:
# Write network to file: save the filtered, symbol-converted edge list as
# HumanNet_Symbol.sif under the working directory.
# NOTE(review): `binary=False` presumably keeps the edge score column in the
# output (the next cell reads a score from column 2 of this file) -- confirm against gct.
gct.write_edgelist(HumanNet_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/HumanNet_Symbol.sif', binary=False)

Edge list saved: 1.24 seconds


In [15]:
# Create filtered network: re-read the SIF written above (node columns 0 and 1,
# score in column 2, tab-delimited), filter it with q=0.9, and save the result to
# HumanNet90_Symbol.sif.
# Per the recorded output, q=0.9 corresponds to a "90.0% score" cut-off and about
# 10% of the edges (47595 / 475959) are retained.
# NOTE(review): this cell's execution count (15) does not follow the previous
# cell (11); re-run the notebook from a fresh kernel to confirm the result.
HumanNet90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/HumanNet_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, 
                                                      q=0.9, delimiter='\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/HumanNet90_Symbol.sif')

90.0% score: 2.17047289928
47595 / 475959 edges retained
