In [1]:
import pandas as pd
import gene_conversion_tools as gct

## Load Pathway Commons Raw Data (All interactions)
#### Source: http://www.pathwaycommons.org/archives/PC2/v8/PathwayCommons.8.All.BINARY_SIF.hgnc.txt.sif.gz
Downloaded: July 2, 2016  
Last Updated: May 03, 2016  
Citation: Pathway Commons, a web resource for biological pathway data. Cerami E et al. Nucleic Acids Research (2011).

In [2]:
# Base data directory (absolute path specific to the original author's environment).
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# Tab-delimited file read without a header row, so columns get integer labels 0, 1, 2, ...
# header=None is the documented "no header" value; negative values such as -1 are
# rejected with a ValueError by current pandas releases.
PC_Raw = pd.read_csv(
    wd+'Network_Data_Raw/PathwayCommons.8.All.BINARY_SIF.hgnc.txt.sif',
    sep='\t',
    header=None,
)

In [3]:
# Collect every distinct identifier that appears in either node column (0 or 2)
PC_Raw_Genes = list(set(PC_Raw[0]) | set(PC_Raw[2]))

In [4]:
# Get edge list of network: columns 0 and 2 of each row become one [node_a, node_b] pair
query_edgelist = PC_Raw[[0, 2]].values.tolist()
# Single-argument print(...) produces the same text under Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('{} PC Edges'.format(len(query_edgelist)))

1073936 PC Edges


## Convert Genes

In [5]:
# Build the query string and split the gene list into valid / invalid identifiers.
# NOTE(review): presumably identifiers starting with 'CHEBI' are routed to invalid_genes -- confirm in gct.
query_string, valid_genes, invalid_genes = gct.query_constructor(
    PC_Raw_Genes,
    exclude_prefixes=['CHEBI'],
)

20073 Valid Query Genes
8266 Invalid Query Genes


In [8]:
# invalid_genes is non-empty, so filter the edge list against it before conversion
query_edgelist_filt = gct.filter_query_edgelist(
    query_edgelist,
    invalid_genes,
)


190725 / 1073936 edges with invalid nodes removed


In [9]:
# Scopes: the gene naming systems to search the query identifiers against
scopes = 'symbol, entrezgene, retired, alias'

# Fields: the naming systems in which matched genes are returned
fields = 'symbol, entrezgene'

In [10]:
# Query MyGene.Info with the constructed query string, searching `scopes` and returning `fields`
match_list = gct.query_batch(
    query_string,
    scopes=scopes,
    fields=fields,
)
# Single-argument print(...) produces the same text under Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('{} Matched query results'.format(len(match_list)))

Batch query complete: 40.13 seconds
22230 Matched query results


In [11]:
# Build the trimmed match table plus the two lookup mappings used for network conversion
# (query -> symbol and query -> entrez).
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(
    match_list,
    valid_genes,
)

Queries without full matching results found: 15

1271 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 122.5 seconds


## Construct Converted Network

In [14]:
# Symbol network: convert the filtered query edge list, filter the converted edges, then write the SIF
PC_edgelist_symbol = gct.convert_edgelist(
    query_edgelist_filt,
    query_to_symbol,
)
PC_edgelist_symbol_filt = gct.filter_converted_edgelist(PC_edgelist_symbol)
gct.write_edgelist(
    PC_edgelist_symbol_filt,
    wd+'Network_SIFs_Symbol/PathwayCommons_Symbol.sif',
)

883211 input edges
30 self-edges removed
940 edges with un-mapped genes removed
122639 duplicate edges removed
Edge list filtered: 0.93 seconds
759602 Edges remaining
Edge list saved: 1.27 seconds


In [16]:
# Entrez network: convert the filtered query edge list, filter the converted edges, then write the SIF
PC_edgelist_entrez = gct.convert_edgelist(
    query_edgelist_filt,
    query_to_entrez,
)
PC_edgelist_entrez_filt = gct.filter_converted_edgelist(PC_edgelist_entrez)
gct.write_edgelist(
    PC_edgelist_entrez_filt,
    wd+'Network_SIFs_Entrez/PathwayCommons_Entrez.sif',
)

883211 input edges
30 self-edges removed
1018 edges with un-mapped genes removed
122638 duplicate edges removed
Edge list filtered: 0.73 seconds
759525 Edges remaining
Edge list saved: 0.56 seconds
