In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import itertools
import time

## Load Mentha Raw Data
#### Source (MITAB): http://mentha.uniroma2.it/doDownload.php?file=2017-06-12_MITAB-2.5.zip
Downloaded: June 15, 2017  
Last Updated: June 12, 2017  
Notes for processing: This file should contain only human-human protein interactions, but this should be checked and filtered if needed.  
A note about scoring: Mentha assigns each interaction a score called the 'mentha-score'; this is the score we will use to filter the network.

In [2]:
# Root directory holding the raw and processed network data for this project
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# The Mentha MITAB dump has no header row, so pandas should assign integer
# column labels (later cells index columns 0, 1, 9, 10 and 14 by position).
# header=None is the supported way to say "no header"; header=-1 is rejected
# by current pandas releases.
Mentha_Raw = pd.read_csv(wd+'Network_Data_Raw/mentha_2017_06_12', sep='\t', header=None)
# Single-argument print form emits the same text on Python 2 and Python 3
print('Raw edge count in Mentha: %d' % len(Mentha_Raw))

Raw edge count in Mentha: 1114184


In [3]:
# Keep only human-human interactions: columns 9 and 10 carry the taxonomy
# annotation of the two interactors, and both must be the human taxid (9606)
Mentha_Human_only = Mentha_Raw[
    (Mentha_Raw[9] == 'taxid:9606(Homo sapiens)')
    & (Mentha_Raw[10] == 'taxid:9606(Homo sapiens)')
]
# Single-argument print form emits the same text on Python 2 and Python 3
print('Human-Human only interactions in Mentha: %d' % len(Mentha_Human_only))

Human-Human only interactions in Mentha: 531726


In [4]:
# Collect every distinct interactor ID appearing in either interactor column
# (columns 0 and 1) of the human-only table
Human_Mentha_Genes = list(set(Mentha_Human_only[0]) | set(Mentha_Human_only[1]))

## Convert Network Genes to symbol from UniProt Accession ID

In [5]:
# Build the MyGene.Info query from the network's gene list. The helper returns
# the query string plus the genes it considers valid / invalid for querying
# (original note: genes with an 'intact' prefix are removed -- that logic lives
# inside gct.query_constructor; confirm there).
query_string, valid_genes, invalid_genes = gct.query_constructor(
    Human_Mentha_Genes
)

18626 Valid Query Genes
0 Invalid Query Genes


In [6]:
# Set scopes (gene naming systems to search): the network IDs are UniProt accessions
scopes = "uniprot"

# Set fields (naming systems to return for each matched gene)
fields = "symbol, entrezgene"

# Query MyGene.Info with the query string built in the previous cell
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Single-argument print form emits the same text on Python 2 and Python 3
print('%d Matched query results' % len(match_list))

Batch query complete: 62.69 seconds
18932 Matched query results


In [7]:
# Turn the raw MyGene.Info matches into a trimmed match table plus two lookup
# maps (query ID -> symbol, query ID -> Entrez ID) for the valid query genes
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(
    match_list,
    valid_genes,
)

Queries without full matching results found: 1198
The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.

207 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 83.92 seconds


## Construct Converted Network

In [8]:
# Reduce the human-only table to (interactor A, interactor B, score) rows --
# columns 0, 1 and 14 -- drop exact duplicate rows, and convert to a list of lists
query_edgelist = (
    Mentha_Human_only[[0, 1, 14]]
    .drop_duplicates()
    .values
    .tolist()
)

In [9]:
# Normalise each edge: strip the identifier prefix (e.g. 'uniprot:') from both
# interactors via the gct helper, and parse the score as the float that follows
# the last ':' in the score field
query_edgelist_fmt = [
    [
        gct.get_identifier_without_prefix(node_a),
        gct.get_identifier_without_prefix(node_b),
        float(raw_score.rsplit(':', 1)[-1]),
    ]
    for node_a, node_b, raw_score in query_edgelist
]

In [10]:
# Map both endpoints of every weighted edge to gene symbols using the
# query -> symbol lookup built from the MyGene.Info results
Mentha_edgelist_symbol = gct.convert_edgelist(
    query_edgelist_fmt,
    query_to_symbol,
    weighted=True,
)

In [11]:
# Clean the symbol-level edge list; per the helper's printed report this drops
# self-edges, edges with un-mapped genes, and duplicate edges
Mentha_edgelist_symbol_filt = gct.filter_converted_edgelist(
    Mentha_edgelist_symbol,
    weighted=True,
)

327857 input edges
3247 self-edges removed
8219 edges with un-mapped genes removed
53515 duplicate edges removed
Edge list filtered: 1.61 seconds
262876 Edges remaining


In [12]:
# Persist the filtered, symbol-converted edge list as a SIF file;
# binary=False is passed because the edges carry a score column
# (presumably this makes the helper write the weights -- confirm in gct)
gct.write_edgelist(
    Mentha_edgelist_symbol_filt,
    wd+'Network_SIFs_Symbol/Mentha_Symbol.sif',
    binary=False,
)

Edge list saved: 0.79 seconds


In [13]:
# Create the score-filtered network from the saved weighted SIF: node columns
# 0 and 1, score column 2, threshold at the q=0.9 score quantile (presumably
# only edges scoring above that quantile are kept -- confirm in dit), and write
# the result to Mentha90_Symbol.sif
Mentha90_edgelist = dit.filter_weighted_network_sif(
    wd+'Network_SIFs_Symbol/Mentha_Symbol.sif',
    nodeA_col=0,
    nodeB_col=1,
    score_col=2,
    q=0.9,
    delimiter='\t',
    verbose=True,
    save_path=wd+'Network_SIFs_Symbol/Mentha90_Symbol.sif',
)

90.0% score: 0.454
22886 / 262876 edges retained
