In [1]:
import os

import pandas as pd
import gene_conversion_tools as gct

## Load Reactome-Functional Interactions Raw Data
#### Source: http://reactomews.oicr.on.ca:8080/caBigR3WebApp2015/FIsInGene_031516_with_annotations.txt.zip
Downloaded: July 28, 2016  
Last Updated: March 15, 2016  

In [2]:
# Root data directory. The default is the original author's location, which is
# machine-specific; set the NETWORK_ANALYSIS_DATA_DIR environment variable to
# run the notebook against a different copy of the data.
# os.path.join(..., '') guarantees the trailing separator that the plain string
# concatenations on `wd` rely on, and leaves the default value unchanged.
wd = os.path.join(
    os.environ.get('NETWORK_ANALYSIS_DATA_DIR',
                   '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'),
    '')

# Load the raw Reactome functional-interaction table (tab-delimited text file)
Reactome_FIs_Raw = pd.read_csv(wd + 'Network_Data_Raw/FIsInGene_031516_with_annotations.txt', sep='\t')

In [17]:
# Collect every distinct gene name that appears in either interaction column
gene1_names = set(Reactome_FIs_Raw['Gene1'])
gene2_names = set(Reactome_FIs_Raw['Gene2'])
Reactome_FIs_Raw_Genes = list(gene1_names | gene2_names)

In [18]:
# Build the network edge list: one [Gene1, Gene2] pair per row of the raw table
query_edgelist = Reactome_FIs_Raw[['Gene1', 'Gene2']].values.tolist()
# A single-argument print(...) emits the same text under the Python 2 print
# statement and the Python 3 print function, so this cell runs on both.
print('{0} {1}'.format(len(query_edgelist), "Total Reactome Functional Interactions"))

229300 Total Reactome Functional Interactions


## Convert Genes

In [19]:
# Split the gene names into valid and invalid queries and build the query
# string; names with the 'CHEBI' prefix are excluded and the invalid names
# are printed for inspection.
query_string, valid_genes, invalid_genes = gct.query_constructor(
    Reactome_FIs_Raw_Genes,
    exclude_prefixes=['CHEBI'],
    print_invalid_genes=True,
)

12149 Valid Query Genes
28 Invalid Query Genes:
['IGKV A18', 'P27 KIP1', 'KIF5B-RET(NM_020975)_K23', 'KIF5B-RET(NM_020975)_K22', 'ARID1A VARIANT PROTEIN', 'APC VARIANT PROTEIN', 'KIF5B-RET(NM_020630)_K23', 'KIF5B-RET(NM_020630)_K22', 'KIF5B-RET(NM_020630)_K16', 'KIF5B-RET(NM_020630)_K15', 'PRKAR1A/RARA FUSION', 'KIF5B-RET(NM_020975)_K16', 'MIR CL-10', 'BCR/ABL FUSION', 'HS24/P52', 'TNPO2 VARIANT PROTEIN', 'ATP2B2 VARIANT PROTEIN', '<DELTA>FAS/APO-1/CD95', 'EIF4G1 VARIANT PROTEIN', 'ITGA7 VARIANT PROTEIN', 'PIK4CA VARIANT PROTEIN', 'EPHB2 VARIANT PROTEIN', 'STEM CELL FACTOR: SCF', 'MDM2 ISOFORM KB9', 'TNC VARIANT PROTEIN', 'KIF5B-RET(NM_020630)_K24', 'HTR<BETA>1', 'PLCG1 VARIANT PROTEIN']


In [20]:
# Drop every edge that touches one of the invalid gene names
query_edgelist_filt = gct.filter_query_edgelist(query_edgelist, invalid_genes)

564 / 229300 edges with invalid nodes removed


In [21]:
# Scopes: the gene naming systems the MyGene.Info query searches against
scopes = 'symbol, retired, alias'

# Fields: the naming systems whose identifiers are returned for each match
fields = 'symbol, entrezgene'

In [22]:
# Send the batch query to MyGene.Info using the scopes/fields chosen above
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# A single-argument print(...) emits the same text under the Python 2 print
# statement and the Python 3 print function, so this cell runs on both.
print('{0} {1}'.format(len(match_list), 'Matched query results'))

Batch query complete: 24.7 seconds
13806 Matched query results


In [23]:
# Build the trimmed match table and the query->symbol / query->Entrez lookups
# from the MyGene.Info results for the valid query genes
(match_table_trim,
 query_to_symbol,
 query_to_entrez) = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 135

994 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 46.16 seconds


## Construct Converted Network

In [24]:
# Gene-symbol network: convert the filtered edges, filter the converted edge
# list, and write the result as a SIF file
symbol_sif_path = wd + 'Network_SIFs_Symbol/Reactome-FIs_Symbol.sif'
Reactome_FIs_edgelist_symbol = gct.convert_edgelist(query_edgelist_filt, query_to_symbol)
Reactome_FIs_edgelist_symbol_filt = gct.filter_converted_edgelist(Reactome_FIs_edgelist_symbol)
gct.write_edgelist(Reactome_FIs_edgelist_symbol_filt, symbol_sif_path)

228736 input edges
36 self-edges removed
1944 edges with un-mapped genes removed
432 duplicate edges removed
Edge list filtered: 0.23 seconds
226324 Edges remaining
Edge list saved: 0.45 seconds


In [25]:
# Entrez-ID network: convert the filtered edges, filter the converted edge
# list, and write the result as a SIF file
entrez_sif_path = wd + 'Network_SIFs_Entrez/Reactome-FIs_Entrez.sif'
Reactome_FIs_edgelist_entrez = gct.convert_edgelist(query_edgelist_filt, query_to_entrez)
Reactome_FIs_edgelist_entrez_filt = gct.filter_converted_edgelist(Reactome_FIs_edgelist_entrez)
gct.write_edgelist(Reactome_FIs_edgelist_entrez_filt, entrez_sif_path)

228736 input edges
36 self-edges removed
1963 edges with un-mapped genes removed
432 duplicate edges removed
Edge list filtered: 0.21 seconds
226305 Edges remaining
Edge list saved: 0.2 seconds
