In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import time

## Load GeneMANIA Raw Data
#### Source: http://genemania.org/data/current/Homo_sapiens.COMBINED/COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt
Downloaded: July 28, 2016  
Last Updated: October 15, 2014	

In [2]:
# Root data directory; it ends with a trailing slash, so relative paths are appended directly
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# Load the raw GeneMANIA network table (tab-delimited; later cells read its
# 'Gene_A', 'Gene_B' and 'Weight' columns). No leading slash on the relative part,
# which avoids a doubled '//' in the resulting path.
GeneMANIA_Raw = pd.read_csv(wd+'Network_Data_Raw/GeneMANIA/GeneMANIA_2014_10_15.txt', sep='\t')

In [3]:
# Collect every distinct gene identifier that appears in either edge column
genes_a = set(GeneMANIA_Raw['Gene_A'])
genes_b = set(GeneMANIA_Raw['Gene_B'])
GeneMANIA_Raw_Genes = list(genes_a | genes_b)

In [4]:
# Get the edge list of the network as [Gene_A, Gene_B, Weight] rows
query_edgelist = GeneMANIA_Raw[['Gene_A','Gene_B', 'Weight']].values.tolist()
# A single pre-formatted argument prints the same text under Python 2 and Python 3
print('{} Total GeneMANIA Edges'.format(len(query_edgelist)))

7290094 Total GeneMANIA Edges


## Convert Genes (from Ensembl gene ID to gene symbol)

In [5]:
# Build the MyGene.Info query from the raw gene identifiers; the call also
# reports how many identifiers are valid / invalid for querying.
(query_string,
 valid_genes,
 invalid_genes) = gct.query_constructor(GeneMANIA_Raw_Genes)

19264 Valid Query Genes
0 Invalid Query Genes


In [6]:
# Scopes: the gene naming system(s) the query identifiers are searched against
scopes = 'ensemblgene'

# Fields: the gene naming systems to return for each match
fields = 'symbol, entrezgene'

In [7]:
# Query MyGene.Info with the constructed query string, scopes and fields
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# A single pre-formatted argument prints the same text under Python 2 and Python 3
print('{} Matched query results'.format(len(match_list)))

Batch query complete: 35.43 seconds
19266 Matched query results


In [8]:
# Build the trimmed match table plus the query->symbol and query->Entrez
# lookup maps from the raw MyGene.Info matches.
query_map_outputs = gct.construct_query_map_table(match_list, valid_genes)
match_table_trim, query_to_symbol, query_to_entrez = query_map_outputs

Queries without full matching results found: 1547

1 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 111.04 seconds


## Construct Converted Network

In [9]:
%%time
# Convert weighted edge list
GeneMANIA_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)

CPU times: user 18.5 s, sys: 1.36 s, total: 19.9 s
Wall time: 19.5 s


In [10]:
# Filter the converted edge list; the call reports the self-edges, edges with
# un-mapped genes, and duplicate edges it removes.
GeneMANIA_edgelist_symbol_filt = gct.filter_converted_edgelist(
    GeneMANIA_edgelist_symbol,
    weighted=True
)

7290094 input edges
22144 self-edges removed
665798 edges with un-mapped genes removed
508 duplicate edges removed
Edge list filtered: 39.33 seconds
6601644 Edges remaining


In [11]:
# Save the filtered, symbol-converted network to a SIF file
gct.write_edgelist(
    GeneMANIA_edgelist_symbol_filt,
    wd+'Network_SIFs_Symbol/GeneMANIA_Symbol.sif',
    binary=False
)

Edge list saved: 13.39 seconds


In [12]:
# Create the score-filtered network from the symbol SIF written above.
# NOTE(review): q=0.9 presumably keeps only edges scoring above the 0.9 score
# quantile and writes them to save_path -- confirm against data_import_tools.
symbol_sif = wd+'Network_SIFs_Symbol/GeneMANIA_Symbol.sif'
filtered_sif = wd+'Network_SIFs_Symbol/GeneMANIA90_Symbol.sif'
GeneMANIA90_edgelist = dit.filter_weighted_network_sif(
    symbol_sif, nodeA_col=0, nodeB_col=1, score_col=2,
    q=0.9, delimiter='\t', verbose=True, save_path=filtered_sif)

90.0% score: 0.00023
618546 / 6601644 edges retained
