In [1]:
import pandas as pd
import gene_conversion_tools as gct

## Load PID Raw Data
#### Source: http://www.pathwaycommons.org/archives/PC2/v8/PathwayCommons.8.pid.BINARY_SIF.hgnc.txt.sif.gz
Downloaded: September 29, 2016  
Last Updated (via Pathway Commons v8 datasources.txt file): July 27, 2015

In [2]:
# Root data directory: the raw download is read from here and the converted
# SIF files are written beneath it.
# NOTE(review): absolute, user-specific path -- point this at your own data
# directory before re-running.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'

# The file is tab-separated and is read without a header row, so columns get
# integer labels (0, 1, 2, ...). header=None is the documented way to say
# "no header"; header=-1 is rejected by current pandas releases.
PID_Raw = pd.read_csv(
    wd + 'Network_Data_Raw/PathwayCommons.8.pid.BINARY_SIF.hgnc.txt.sif',
    sep='\t',
    header=None,
)

In [3]:
# Collect every distinct node name that appears at either end of an
# interaction (column 0 or column 2 of the raw table).
nodes_a = set(PID_Raw[0])
nodes_b = set(PID_Raw[2])
PID_Raw_Genes = list(nodes_a | nodes_b)

In [4]:
# Build the edge list: one [node_a, node_b] pair per row of the raw table,
# taken from columns 0 and 2 (column 1 is not used here).
query_edgelist = PID_Raw[[0, 2]].values.tolist()

# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print('%d PID Edges' % len(query_edgelist))

28076 PID Edges


## Convert Genes

In [5]:
# Split the node names into valid / invalid query genes and build the query
# string for the lookup. Names with the 'CHEBI' prefix are passed as
# exclusions (presumably ChEBI small-molecule IDs rather than genes --
# confirm against gene_conversion_tools).
excluded_prefixes = ['CHEBI']
query_string, valid_genes, invalid_genes = gct.query_constructor(
    PID_Raw_Genes,
    exclude_prefixes=excluded_prefixes,
)

2588 Valid Query Genes
38 Invalid Query Genes


In [6]:
# Some node names were flagged as invalid queries (invalid_genes is
# non-empty), so filter the edge list against them before converting.
query_edgelist_filt = gct.filter_query_edgelist(
    query_edgelist,
    invalid_genes,
)


584 / 28076 edges with invalid nodes removed


In [7]:
# Fields: the gene naming systems to return names from
fields = "symbol, entrezgene"

# Scopes: the gene naming systems to search
scopes = "symbol, entrezgene, retired, alias"

In [8]:
# Send the batched query to MyGene.Info, searching the naming systems in
# `scopes` and requesting the ones in `fields`.
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)

# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print('%d Matched query results' % len(match_list))

Batch query complete: 5.95 seconds
3035 Matched query results


In [9]:
# Turn the raw matches into a trimmed match table plus two query mappings:
# one to gene symbols and one to Entrez IDs (used by the conversion cells
# that follow).
(match_table_trim,
 query_to_symbol,
 query_to_entrez) = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 1

278 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 2.97 seconds


## Construct Converted Network

In [10]:
# Gene-symbol version of the network: convert the node names, filter the
# converted edge list, then write it out as a SIF file.
symbol_sif_path = wd + 'Network_SIFs_Symbol/PID_Symbol.sif'
PID_edgelist_symbol = gct.convert_edgelist(
    query_edgelist_filt, query_to_symbol)
PID_edgelist_symbol_filt = gct.filter_converted_edgelist(
    PID_edgelist_symbol)
gct.write_edgelist(PID_edgelist_symbol_filt, symbol_sif_path)

27492 input edges
0 self-edges removed
2 edges with un-mapped genes removed
6014 duplicate edges removed
Edge list filtered: 0.02 seconds
21476 Edges remaining
Edge list saved: 0.55 seconds


In [12]:
# Entrez-ID version of the network: convert the node names, filter the
# converted edge list, then write it out as a SIF file.
entrez_sif_path = wd + 'Network_SIFs_Entrez/PID_Entrez.sif'
PID_edgelist_entrez = gct.convert_edgelist(
    query_edgelist_filt, query_to_entrez)
PID_edgelist_entrez_filt = gct.filter_converted_edgelist(
    PID_edgelist_entrez)
gct.write_edgelist(PID_edgelist_entrez_filt, entrez_sif_path)

27492 input edges
0 self-edges removed
2 edges with un-mapped genes removed
6014 duplicate edges removed
Edge list filtered: 0.02 seconds
21476 Edges remaining
Edge list saved: 0.03 seconds
