In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import time

## Load GIANT Raw Data
#### Source: http://giant.princeton.edu/static//networks/all_tissues_top.gz
Downloaded: June 15, 2017  
Last Updated: N/A, but paper published in 2015  
Note about processing: This network is extremely large, even though it already contains only the top 10% of all edges. Therefore, we will further filter this 'top' functional network to its top 10% of edges, which should yield about 4 million edges. We will then take the top 10% of this filtered network (about 400k edges) to use as the 'filtered' version of this network.

In [4]:
# Root data directory for this project (note: it already ends with a trailing slash)
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# Load the tab-delimited GIANT "all tissues (top)" edge list.
# header=None treats the first line as data; negative header values such as -1
# are rejected by newer pandas ("For no header, use header=None instead").
GIANT_Raw = pd.read_csv(wd+'Network_Data_Raw/GIANT_All_Tissues_Top', sep='\t', header=None, low_memory=False)
# Name the three columns: the two interacting nodes and the edge probability
GIANT_Raw.columns = ['NodeA', 'NodeB', 'Prob']
# Report the number of raw interactions (single-argument print works on Python 2 and 3)
print('GIANT All Tissues (Top) Interactions: %d' % GIANT_Raw.shape[0])

GIANT All Tissues (Top) Interactions: 38903547


In [5]:
# Collect every distinct node identifier found on either end of a GIANT edge,
# casting each Entrez ID to a string first so it can be used as a query term
GIANT_Raw_Genes = [str(node_id) for node_id in set(GIANT_Raw['NodeA']).union(GIANT_Raw['NodeB'])]

##  Convert genes from Entrez ID to HUGO Symbol

In [6]:
# Build the query string from the gene ID list; the call also returns the genes it
# classified as valid and invalid for querying (it prints the count of each)
query_string, valid_genes, invalid_genes = gct.query_constructor(GIANT_Raw_Genes)

25689 Valid Query Genes
0 Invalid Query Genes


In [7]:
# Set scopes (gene naming systems in which the query IDs are searched)
scopes = "entrezgene, retired, alias"

# Set fields (gene naming systems to return for each match)
fields = "symbol, entrezgene"

# Query MyGene.Info in batch with the constructed query string
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Single-argument print is valid on both Python 2 and Python 3 and emits the same text
print('%d Matched query results' % len(match_list))

Batch query complete: 30.55 seconds
25690 Matched query results


In [8]:
# Build the query mapping table and the query->symbol lookup from the MyGene.Info matches
# (query_to_entrez is presumably the analogous query->Entrez ID lookup -- confirm).
# The call itself reports queries without full matches and queries with multiple matches.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 806

1 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 140.47 seconds


## Construct converted network and filter edges

In [9]:
# Turn the raw table into a plain list of [NodeA, NodeB, Prob] rows.
# NOTE(review): .values on columns of mixed numeric dtype presumably upcasts the node IDs
# to float, which would explain the int() cast applied to them when mapping -- confirm.
GIANT_Raw_edgelist = GIANT_Raw.values.tolist()

In [13]:
# Convert GIANT network edgelist: map both endpoints of every edge through
# query_to_symbol, order the pair, and keep the edge score as the third entry
GIANT_Raw_edgelist_symbol = []
for node_a, node_b, prob in GIANT_Raw_edgelist:
    # Node IDs are cast to int and then str to form the lookup key
    symbol_a = query_to_symbol[str(int(node_a))]
    symbol_b = query_to_symbol[str(int(node_b))]
    GIANT_Raw_edgelist_symbol.append(sorted([symbol_a, symbol_b]) + [prob])

In [14]:
# Filter GIANT network edgelist (weighted edges, self-edges dropped).
# The call reports how many self-edges, edges with un-mapped genes, and duplicate
# edges it removed, along with the number of edges remaining.
GIANT_edgelist_symbol_filt = gct.filter_converted_edgelist(GIANT_Raw_edgelist_symbol, remove_self_edges=True, weighted=True)

38903547 input edges
19204 self-edges removed
2417020 edges with un-mapped genes removed
151720 duplicate edges removed
Edge list filtered: 225.47 seconds
36315603 Edges remaining


## Filter to top 10% of edges by weight/score

In [16]:
# Load the filtered symbol edgelist into a table with named node and score columns
edge_table_columns = ['NodeA', 'NodeB', 'Score']
GIANT_edgelist_symbol_filt_table = pd.DataFrame(GIANT_edgelist_symbol_filt, columns=edge_table_columns)

In [20]:
# Filter edges by score quantile: compute the 90th-percentile score and keep only
# the edges whose score is strictly greater than it
q_score = GIANT_edgelist_symbol_filt_table['Score'].quantile(0.9)
# Single-argument print is valid on both Python 2 and Python 3 and emits the same text
print('90%% score: %s' % q_score)
GIANTtop_edgelist = GIANT_edgelist_symbol_filt_table[GIANT_edgelist_symbol_filt_table['Score']>q_score]

90% score: 0.207416


In [23]:
# Save weighted network for GIANT filtered to top 10% of downloaded edges to file.
# The path is built from wd (same location as before) so that it always matches the
# wd-based path the filtering step reads this file back from.
# Written tab-delimited with no header row and no index column.
GIANTtop_edgelist.to_csv(wd+'Network_SIFs_Symbol/GIANT_Symbol.sif', sep='\t', header=False, index=False)

In [24]:
# Create filtered network for GIANT: read the saved weighted SIF (nodes in columns 0 and 1,
# score in column 2), filter it at q=0.9, and write the result to the GIANT90 SIF file
giant_top_sif_path = wd+'Network_SIFs_Symbol/GIANT_Symbol.sif'
giant90_sif_path = wd+'Network_SIFs_Symbol/GIANT90_Symbol.sif'
GIANT90_edgelist = dit.filter_weighted_network_sif(
    giant_top_sif_path,
    nodeA_col=0, nodeB_col=1, score_col=2,
    q=0.9, delimiter='\t', verbose=True,
    save_path=giant90_sif_path
)

90.0% score: 0.574097
363128 / 3631554 edges retained
