In [1]:
import pandas as pd
import itertools
import gene_conversion_tools as gct

## Load MultiNet Raw Data
#### Source: http://homes.gersteinlab.org/Khurana-PLoSCompBio-2013/
Downloaded: August 12, 2016  
Last Updated: March 17, 2013
Processing Notes: MultiNet labels which interactions are protein-protein interactions (PPI) and which are not. For this initial analysis, we examine all of the interaction information in MultiNet. However, it is simple enough to parse only the PPI interactions from the data, and this can be done in future work if necessary.

In [2]:
# Root data directory; the output cells further down also write beneath it.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'

# Tab-separated MultiNet interaction table (2013-03-17 release, per the file name).
multinet_raw_path = wd + 'Network_Data_Raw/Multinet.interactions.network_presence_2013_03_17.txt'
MultiNet_Raw = pd.read_csv(multinet_raw_path, sep='\t')

In [3]:
# Split every INTERACTION_NAME on '_' to get the node labels of that edge.
# NOTE(review): assumes no gene label itself contains '_' - confirm against the raw file.
query_edgelist = []
for interaction_name in MultiNet_Raw['INTERACTION_NAME']:
    query_edgelist.append(interaction_name.split('_'))

# Collect the distinct node labels across all edges to use as query genes.
unique_gene_labels = set()
for edge_nodes in query_edgelist:
    unique_gene_labels.update(edge_nodes)
MultiNet_Raw_Genes = list(unique_gene_labels)

## Convert Genes

In [4]:
# Partition the raw gene labels into valid and invalid query genes and build the
# query string that is later handed to the MyGene.info batch query.
# print_invalid_genes=True makes the helper list the rejected labels in the output.
# NOTE(review): the validity criteria are defined inside gct.query_constructor - confirm there.
query_string, valid_genes, invalid_genes = gct.query_constructor(MultiNet_Raw_Genes, print_invalid_genes=True)

14440 Valid Query Genes
5 Invalid Query Genes:
['RP6-213H19.1', 'SULT1A3.1', 'VARSL.5', 'CN5H6.4', 'NOP5/NOP58']


In [5]:
# invalid_genes is non-empty for this network, so filter the raw edge list
# against it before conversion; the helper reports how many edges with
# invalid nodes were removed.
query_edgelist_filt = gct.filter_query_edgelist(query_edgelist, invalid_genes)


24 / 109598 edges with invalid nodes removed


In [6]:
# Scopes: the gene naming systems to search when matching each query gene.
_scope_names = ("symbol", "retired", "alias")
scopes = ", ".join(_scope_names)

# Fields: the naming systems from which matched gene names are returned.
_field_names = ("symbol", "entrezgene")
fields = ", ".join(_field_names)

In [7]:
# Query MyGene.Info with the batch query string, searching the naming systems
# listed in `scopes` and requesting the name types listed in `fields`.
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)

# Report the number of returned matches. A single pre-formatted argument is
# used instead of the Python 2-only `print a, b` statement: the emitted text is
# identical under Python 2, and the cell no longer fails to parse under Python 3.
print('%d Matched query results' % len(match_list))

Batch query complete: 40.63 seconds
16364 Matched query results


In [8]:
# Build the query->symbol and query->Entrez lookups (plus a trimmed match table)
# from the MyGene.info match results; valid_genes is passed alongside them.
# NOTE(review): how unmatched and multiply-matched queries are resolved is decided
# inside gct.construct_query_map_table - confirm there before relying on the maps.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 269

1069 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 61.02 seconds


## Construct Converted Network

In [9]:
# Convert the filtered query edge list using the query->symbol lookup.
MultiNet_edgelist_symbol = gct.convert_edgelist(query_edgelist_filt, query_to_symbol)
# Clean the converted edges; per the cell output the helper reports removed
# self-edges, edges with un-mapped genes, and duplicate edges.
MultiNet_edgelist_symbol_filt = gct.filter_converted_edgelist(MultiNet_edgelist_symbol)
# Save the cleaned gene-symbol network as a SIF file under the data directory.
gct.write_edgelist(MultiNet_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/MultiNet_Symbol.sif')

109574 input edges
30 self-edges removed
1572 edges with un-mapped genes removed
37 duplicate edges removed
Edge list filtered: 0.12 seconds
107935 Edges remaining
Edge list saved: 0.22 seconds


In [10]:
# Convert the filtered query edge list using the query->Entrez lookup.
MultiNet_edgelist_entrez = gct.convert_edgelist(query_edgelist_filt, query_to_entrez)
# Clean the converted edges; per the cell output the helper reports removed
# self-edges, edges with un-mapped genes, and duplicate edges.
MultiNet_edgelist_entrez_filt = gct.filter_converted_edgelist(MultiNet_edgelist_entrez)
# Save the cleaned Entrez-ID network as a SIF file under the data directory.
gct.write_edgelist(MultiNet_edgelist_entrez_filt, wd+'Network_SIFs_Entrez/MultiNet_Entrez.sif')

109574 input edges
30 self-edges removed
1579 edges with un-mapped genes removed
37 duplicate edges removed
Edge list filtered: 0.31 seconds
107928 Edges remaining
Edge list saved: 0.13 seconds
