In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import itertools
import time

## Load Reactome Raw Data
#### Source: http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz
#### File to download: The link labelled "Human protein-protein interaction pairs in tab-delimited format" appears to contain many more interactions than the file in MITAB format, so it is the file used for this network.
Downloaded: June 15, 2017    
Last Updated: April 20, 2017  

In [2]:
# Load the raw Reactome interaction table from the project data directory.
# NOTE(review): `wd` is an absolute, machine-specific path; it is reused when saving the final network.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# skiprows=1 skips the first line of the file (presumably a header/comment line -- confirm against the download).
# header=None yields integer column labels (0, 1, 2, ...), which later cells index by position.
# It replaces the original header=-1, an undocumented spelling of "no header" that newer pandas
# releases reject with a ValueError.
Reactome_Raw = pd.read_csv(wd+'Network_Data_Raw/Reactome_v60.interactions.txt', sep='\t', skiprows=1, header=None, low_memory=False)
# Single-argument print(...) produces the same output under Python 2 and Python 3.
print('Raw Edges in Reactome v60: %d' % len(Reactome_Raw))

Raw Edges in Reactome v60: 2523567


In [3]:
# Get edge list of network (filter for duplicate edges and self-edges).
# Columns 0 and 3 of the raw table are used as the two endpoints of each edge.
# NOTE: drop_duplicates() only removes rows that are identical in both columns in the same order,
# so an (A, B) row and a (B, A) row both survive this step.
query_edgelist_filt = Reactome_Raw[[0,3]].drop_duplicates()
# Single-argument print(...) produces the same output under Python 2 and Python 3.
print('%d Raw Reactome Edges after removing duplicate edges' % len(query_edgelist_filt))
# Keep only rows whose two endpoints differ (drops self-edges).
query_edgelist_filt2 = query_edgelist_filt[query_edgelist_filt[0]!=query_edgelist_filt[3]]
print('%d Raw Reactome Edges after removing duplicate and self-edges' % len(query_edgelist_filt2))
# Plain Python list of two-element [endpoint, endpoint] lists for the downstream conversion helpers.
query_edgelist = query_edgelist_filt2.values.tolist()

214432 Raw Reactome Edges after removing duplicate edges
210066 Raw Reactome Edges after removing duplicate and self-edges


In [4]:
# Collect every distinct identifier that appears at either end of a filtered edge
source_ids = set(query_edgelist_filt2[0])
target_ids = set(query_edgelist_filt2[3])
Reactome_Raw_Genes = list(source_ids | target_ids)

## Convert Genes from UniProtKB to Symbol

In [5]:
# Build the query string from the raw identifier list. Judging by the cell output, the helper also
# reports how many identifiers it considers valid vs. invalid (returned as valid_genes / invalid_genes).
query_string, valid_genes, invalid_genes = gct.query_constructor(Reactome_Raw_Genes)

8387 Valid Query Genes
0 Invalid Query Genes


In [6]:
# Set scopes (gene naming systems to search)
scopes = "uniprot"

# Set fields (gene naming systems to return for each match)
fields = "symbol, entrezgene"

# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Single-argument print(...) produces the same output under Python 2 and Python 3.
print('%d Matched query results' % len(match_list))

Batch query complete: 13.56 seconds
8518 Matched query results


In [7]:
# Build the mapping table and the query->symbol / query->Entrez lookups from the batch-query matches.
# NOTE(review): how partial and multiple matches are resolved is decided inside the helper -- see its
# printed summary in the cell output.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 511

102 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 17.83 seconds


## Construct Converted Network

In [8]:
# Strip the identifier prefix from both endpoints of every edge
query_edgelist_fmt = [
    [gct.get_identifier_without_prefix(node) for node in edge]
    for edge in query_edgelist
]

In [9]:
# Convert network edge list to symbol
# (presumably each endpoint is looked up in query_to_symbol; weighted=False matches the
# two-element, weight-free edges built in the previous cell -- confirm against the helper)
Reactome_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=False)

In [11]:
# Filter converted edge list
# (per the helper's printed summary, this removes self-edges, edges with un-mapped genes,
# and duplicate edges that appear after conversion)
Reactome_edgelist_symbol_filt = gct.filter_converted_edgelist(Reactome_edgelist_symbol, weighted=False)

210066 input edges
2708 self-edges removed
10886 edges with un-mapped genes removed
1970 duplicate edges removed
Edge list filtered: 0.51 seconds
194502 Edges remaining


In [12]:
# Save filtered, converted edge list to file
# (written as a .sif file under the `wd` data directory set in the loading cell;
# NOTE(review): the exact meaning of binary=True is defined by the helper -- presumably an unweighted edge list, confirm)
gct.write_edgelist(Reactome_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/Reactome_Symbol.sif', binary=True)

Edge list saved: 0.59 seconds
