In [1]:
import os

import numpy as np
import pandas as pd

from network_evaluation_tools import data_import_tools as dit
from network_evaluation_tools import gene_conversion_tools as gct

## Load HINT Raw Data
#### Source: http://hint.yulab.org/batch.html
Downloaded: June 15, 2017  
The last update date is not listed, but the database is currently on version 4 (updated early 2017). The two binary interactomes for High-Quality (HQ) and Co-Complex (CC) interactions were downloaded and merged into a single interactome for HINT.  
Citation: Das J and Yu H. HINT: High-quality protein interactomes and their applications in understanding human disease. BMC Systems Biology, 2012 Jul 30;6(1):92.

In [2]:
# Root data directory for the project. The original lab path remains the
# default, so behavior is unchanged unless NETWORK_DATA_DIR is set; setting it
# lets the notebook run on another machine without editing this cell.
# Joining with '' guarantees the trailing separator that the `wd + '...'`
# concatenations in this notebook rely on.
wd = os.path.join(
    os.environ.get(
        'NETWORK_DATA_DIR',
        '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'),
    '')

# Raw HINT v4 human interaction tables (tab-delimited): the binary file and
# the complex file, loaded separately so they can be merged downstream.
HINT_Bin_Raw = pd.read_csv(wd+'Network_Data_Raw/HINT_v4_binary_HomoSapiens.txt', sep='\t')
HINT_Com_Raw = pd.read_csv(wd+'Network_Data_Raw/HINT_v4_complex_HomoSapiens.txt', sep='\t')

In [5]:
# Stack the binary and complex tables into a single interaction table.
HINT_Raw = pd.concat([HINT_Bin_Raw, HINT_Com_Raw])
# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('Concatenated list of edges: {}'.format(HINT_Raw.shape))
# drop_duplicates() with no `subset` only removes rows that are identical in
# every column, so the same protein pair may still appear on several rows.
HINT_Raw = HINT_Raw.drop_duplicates()
print('After duplicate edges removed: {}'.format(HINT_Raw.shape))

Concatenated list of edges: (181699, 9)
After duplicate edges removed: (181375, 9)


In [7]:
# Collect every distinct UniProt ID that appears at either end of an interaction.
# NOTE(review): the "HPRD" prefix looks like a leftover from another notebook --
# the values come from HINT_Raw. The name is kept because the query-construction
# cell refers to it.
HPRD_Raw_Genes_Uniprot = set(HINT_Raw['Uniprot_A']) | set(HINT_Raw['Uniprot_B'])

## Convert Genes from UniProt Accession ID to gene symbols

In [9]:
# Build the batch query from the UniProt ID set. The helper returns the query
# string plus the IDs it accepted and rejected (it reports counts of "Valid"
# and "Invalid" query genes in the recorded output).
(query_string,
 valid_genes,
 invalid_genes) = gct.query_constructor(HPRD_Raw_Genes_Uniprot)

15784 Valid Query Genes
0 Invalid Query Genes


In [10]:
# Set scopes (gene naming systems to search)
scopes = "uniprot"

# Set fields (systems from which to return gene names from)
fields = "symbol, entrezgene"

# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('{} Matched query results'.format(len(match_list)))

Batch query complete: 19.17 seconds
16001 Matched query results


In [11]:
# Turn the raw match list into a trimmed table plus two lookups keyed by query ID.
# NOTE(review): the names suggest query -> gene symbol and query -> Entrez ID;
# confirm against gene_conversion_tools.
(match_table_trim,
 query_to_symbol,
 query_to_entrez) = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 670

163 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 59.26 seconds


## Construct Converted Network

In [13]:
# Reduce the interaction table to a plain list of [Uniprot_A, Uniprot_B] pairs.
HINT_edgelist = (
    HINT_Raw
    .loc[:, ['Uniprot_A', 'Uniprot_B']]
    .values
    .tolist()
)

In [16]:
# Convert the UniProt edge list using the query -> symbol lookup.
# weighted=False: the edge list holds only the two endpoint columns.
HINT_edgelist_symbol = gct.convert_edgelist(
    HINT_edgelist,
    query_to_symbol,
    weighted=False,
)

In [19]:
# Clean the converted edge list. Per the recorded output, the helper reports
# removing self-edges, edges with un-mapped genes, and duplicate edges.
HINT_edgelist_symbol_filt = gct.filter_converted_edgelist(
    HINT_edgelist_symbol
)

181375 input edges
4730 self-edges removed
2861 edges with un-mapped genes removed
18325 duplicate edges removed
Edge list filtered: 0.33 seconds
155459 Edges remaining


In [20]:
# Write the filtered symbol-level edge list to the SIF output folder under `wd`.
gct.write_edgelist(
    HINT_edgelist_symbol_filt,
    wd+'Network_SIFs_Symbol/HINT_Symbol.sif',
)

Edge list saved: 0.26 seconds
