In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import itertools
import time

## Load Reactome-Functional Interactions Raw Data
#### Source: http://reactomews.oicr.on.ca:8080/caBigR3WebApp2016/FIsInGene_022717_with_annotations.txt.zip
Downloaded: June 15, 2017  
Last Updated: February 27, 2017  
Note about processing: Most of the edges appear to be given as gene symbols, but many of those names seem to be invalid. We therefore use the gene conversion tools to filter out edges with invalid names as best we can.

In [2]:
# Root data directory. Later cells build their input/output paths by
# concatenating onto `wd`, so the trailing slash must stay.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'

# Load the tab-delimited Reactome FI download; each row is counted as one edge below.
Reactome_FIs_Raw = pd.read_csv(wd+'Network_Data_Raw/FIsInGene_022717_with_annotations.txt', sep='\t')

# A single pre-formatted argument prints the same text whether `print` is the
# Python 2 statement or the Python 3 function (the bare statement form is a
# SyntaxError on Python 3).
print('Raw edges in ReactomeFI: %s' % Reactome_FIs_Raw.shape[0])

Raw edges in ReactomeFI: 230243


In [3]:
# Collect every distinct name that appears in either endpoint column of the table
gene1_names = set(Reactome_FIs_Raw['Gene1'])
gene2_names = set(Reactome_FIs_Raw['Gene2'])
Reactome_FIs_Raw_Genes = list(gene1_names | gene2_names)

In [4]:
# Separate the gene names into valid and invalid ones based on their text format.
# exclude_prefixes=['CHEBI'] presumably drops names carrying that prefix -- confirm
# against gct.query_constructor. print_invalid_genes=True lists the rejected names.
query_string, valid_genes, invalid_genes = gct.query_constructor(
    Reactome_FIs_Raw_Genes,
    exclude_prefixes=['CHEBI'],
    print_invalid_genes=True
)

12254 Valid Query Genes
23 Invalid Query Genes:
['YWHAE/FAM22B FUSION', 'RUNX1/C20ORF112 FUSION', 'IGKV A18', 'APC VARIANT PROTEIN', 'STAG1 VARIANT PROTEIN', 'MIR CL-10', 'BETA 2-MICROGLOBULIN', 'BCR/ABL FUSION', 'ATP2B2 VARIANT PROTEIN', 'ITGA7 VARIANT PROTEIN', '<ALPHA><BETA>CREB-1', 'CD40 LIGAND', 'NUMA1 VARIANT PROTEIN', 'PIK4CA VARIANT PROTEIN', 'EPHB2 VARIANT PROTEIN', 'RUNX1/CBFA2T2 FUSION', 'TNC VARIANT PROTEIN', 'PIK3C2B VARIANT PROTEIN', 'PLCG1 VARIANT PROTEIN', 'WUGSC:H_GS165O14.2', 'PIK3CA VARIANT PROTEIN', 'YWHAE/FAM22A FUSION', 'PDHA1/LOC79064']


In [5]:
# Flatten the raw table into a plain list of [Gene1, Gene2, Score] rows
edge_columns = ['Gene1', 'Gene2', 'Score']
query_edgelist = Reactome_FIs_Raw[edge_columns].values.tolist()

In [6]:
# Remove the edges that involve any of the invalid gene names found above
query_edgelist_filt = gct.filter_query_edgelist(
    query_edgelist,
    invalid_genes
)

820 / 230243 edges with invalid nodes removed


In [7]:
# Run the edge-list cleanup on the remaining edges. The recorded output reports
# counts for self-edges, un-mapped genes and duplicate edges. weighted=True is
# passed because every edge carries a Score value.
ReactomeFI_edgelist_filt = gct.filter_converted_edgelist(
    query_edgelist_filt,
    weighted=True
)

229423 input edges
0 self-edges removed
0 edges with un-mapped genes removed
0 duplicate edges removed
Edge list filtered: 1.95 seconds
229423 Edges remaining


In [8]:
# Persist the cleaned symbol edge list as a SIF file.
# binary=False presumably keeps the Score column in the output (a later cell
# reads this file with score_col=2) -- confirm against gct.write_edgelist.
gct.write_edgelist(
    ReactomeFI_edgelist_filt,
    wd+'Network_SIFs_Symbol/ReactomeFI_Symbol.sif',
    binary=False
)

Edge list saved: 0.68 seconds


In [36]:
# Attempt to build a top-scoring sub-network by thresholding the saved SIF at q=0.9.
# NOTE(review): the recorded run of this cell retained 0 of 229423 edges, so the
# file at save_path was presumably written with no edges -- confirm before any
# downstream use of ReactomeFI90_edgelist_Symbol.sif.
ReactomeFI90_edgelist = dit.filter_weighted_network_sif(
    wd+'Network_SIFs_Symbol/ReactomeFI_Symbol.sif',
    nodeA_col=0,
    nodeB_col=1,
    score_col=2,
    q=0.9,
    delimiter='\t',
    verbose=True,
    save_path=wd+'Network_SIFs_Symbol/ReactomeFI90_edgelist_Symbol.sif'
)

90.0% score: 1.0
0 / 229423 edges retained


In [37]:
# The library filter kept no edges: the 0.9-quantile score (1.0 in the recorded
# run) is shared by the large majority of edges (~86.5% per the output below, not
# >90%), and the library cut-off presumably excludes ties -- TODO confirm.
# Re-filter here with '>=' so that every edge at the cut-off score is kept.
ReactomeFI_edgelist = pd.DataFrame(ReactomeFI_edgelist_filt, columns=['NodeA', 'NodeB', 'Score'])
q_score = ReactomeFI_edgelist['Score'].quantile(0.9)
keep_mask = ReactomeFI_edgelist['Score'] >= q_score
ReactomeFI_edgelist_filt2 = ReactomeFI_edgelist[keep_mask]

# Report how many edges survive and the fraction they represent.
n_kept = ReactomeFI_edgelist_filt2.shape[0]
n_total = ReactomeFI_edgelist.shape[0]
# float() keeps this a true division under Python 2. Pre-formatting the line
# into one argument makes print valid as both the Python 2 statement and the
# Python 3 function while emitting the same text as before under Python 2.
print('%s / %s edges kept,  %s' % (n_kept, n_total, float(n_kept)/n_total))

198541 / 229423 edges kept,  0.86539274615


In [None]:
# More than 85% of the edges share the 'maximum score', so filtering further by score is not meaningful