In [1]:
import pandas as pd
import gene_conversion_tools as gct

## Load Reactome Raw Data
#### Source: http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz
#### File to download: the link labelled "Human protein-protein interaction pairs in tab-delimited format". This file appears to contain many more interactions than the MITAB-format file, so it is the one used for this network.
Downloaded: October 03, 2016  
Last Updated: June 27, 2016  

In [2]:
# Root directory that holds both the raw network download and the converted SIF outputs.
# NOTE(review): this is a machine-specific absolute path -- adjust before running elsewhere.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# Read the tab-delimited interactions file. The first line of the file is skipped
# (presumably a header/comment line -- TODO confirm) and no header row is parsed, so
# the columns receive integer labels (0, 1, 2, ...), which the cells below index by.
# header=None is the supported way to request "no header"; the legacy header=-1
# sentinel is rejected with a ValueError by newer pandas releases.
Reactome_Raw = pd.read_csv(wd+'Network_Data_Raw/Reactome_v57.interactions.txt', sep='\t', skiprows=1, header=None, low_memory=False)

In [3]:
# Get edge list of network (filter for duplicate edges and self-edges)
# Columns 0 and 3 are taken as the two endpoints of each interaction row
# (presumably the interactor identifier columns of the Reactome file -- TODO confirm).
# The print calls use a single pre-formatted string argument so that they emit the
# same text under both Python 2 and Python 3.
print('%d Raw Reactome Edges' % len(Reactome_Raw))

# drop_duplicates compares rows as ordered pairs, so (A, B) and (B, A) are both kept here.
query_edgelist_filt = Reactome_Raw[[0,3]].drop_duplicates()
print('%d Raw Reactome Edges after removing duplicate edges' % len(query_edgelist_filt))

# Keep only rows whose two endpoints differ (i.e. discard self-edges).
query_edgelist_filt2 = query_edgelist_filt[query_edgelist_filt[0]!=query_edgelist_filt[3]]
print('%d Raw Reactome Edges after removing duplicate and self-edges' % len(query_edgelist_filt2))

# Plain list of [endpoint, endpoint] pairs for the conversion helpers used later.
query_edgelist = query_edgelist_filt2.values.tolist()

2751782 Raw Reactome Edges
207150 Raw Reactome Edges after removing duplicate edges
203104 Raw Reactome Edges after removing duplicate and self-edges


In [4]:
# Collect every distinct identifier that appears at either end of a filtered edge.
Reactome_Raw_Genes = list(
    set(query_edgelist_filt2[0]) | set(query_edgelist_filt2[3])
)

## Convert Genes

In [5]:
# Hand the raw identifier list to gct.query_constructor, asking it to exclude the
# 'CHEBI' prefix. It returns the query string (sent to gct.query_batch below) plus
# the lists of valid and invalid query genes.
query_string, valid_genes, invalid_genes = gct.query_constructor(
    Reactome_Raw_Genes,
    exclude_prefixes=['CHEBI'],
)

7794 Valid Query Genes
0 Invalid Query Genes


In [6]:
# Scopes: the gene naming system(s) the query identifiers are searched against.
scopes = 'uniprot'

# Fields: the naming systems whose gene names should be returned for each match.
fields = 'symbol, entrezgene'

In [7]:
# Query MyGene.Info
# Sends the constructed query string through gct.query_batch using the scopes and
# fields chosen above, then reports how many result records came back.
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Single pre-formatted argument keeps the printed text identical on Python 2 and 3.
print('%d Matched query results' % len(match_list))

Batch query complete: 40.53 seconds
7889 Matched query results


In [8]:
# Turn the raw match records into a trimmed match table plus two lookup maps
# (query -> symbol and query -> entrez) that the edge-list conversion cells use.
(match_table_trim,
 query_to_symbol,
 query_to_entrez) = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 552

68 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 16.41 seconds


## Construct Converted Network

In [9]:
# Run both endpoints of every edge through gct.get_identifier_without_prefix so the
# node names presumably line up with the keys of the query mapping dictionaries.
# Each edge is a two-element [endpoint, endpoint] list built from columns 0 and 3.
query_edgelist_fmt = [
    [gct.get_identifier_without_prefix(node) for node in edge]
    for edge in query_edgelist
]

In [10]:
# Gene-symbol version of the network: convert the edge endpoints with the
# query -> symbol map, filter the converted edge list, and save it as a SIF file.
symbol_sif_path = wd + 'Network_SIFs_Symbol/Reactome_Symbol.sif'

Reactome_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol)
Reactome_edgelist_symbol_filt = gct.filter_converted_edgelist(Reactome_edgelist_symbol)

gct.write_edgelist(Reactome_edgelist_symbol_filt, symbol_sif_path)

203104 input edges
13420 self-edges removed
21509 edges with un-mapped genes removed
1822 duplicate edges removed
Edge list filtered: 0.18 seconds
166353 Edges remaining
Edge list saved: 0.3 seconds


In [11]:
# Entrez-ID version of the network: convert the edge endpoints with the
# query -> entrez map, filter the converted edge list, and save it as a SIF file.
entrez_sif_path = wd + 'Network_SIFs_Entrez/Reactome_Entrez.sif'

Reactome_edgelist_entrez = gct.convert_edgelist(query_edgelist_fmt, query_to_entrez)
Reactome_edgelist_entrez_filt = gct.filter_converted_edgelist(Reactome_edgelist_entrez)

gct.write_edgelist(Reactome_edgelist_entrez_filt, entrez_sif_path)

203104 input edges
15116 self-edges removed
20739 edges with un-mapped genes removed
1822 duplicate edges removed
Edge list filtered: 0.16 seconds
165427 Edges remaining
Edge list saved: 0.15 seconds
