In [9]:
import pandas as pd
import numpy as np
import gene_conversion_tools as gct

## Load HINT Raw Data
#### Source: http://hint.yulab.org/batch.html
Downloaded: October 03, 2016  
Last update not listed, but currently on version 3. Each of the four binary interactome types listed (High-Quality (HQ), High-Throughput (HT), Literature Curated (LC), Co-Complex (CC)) are downloaded and merged into a single interactome for HINT. Co-complex data will be processed such that each complex is a completely connected module.   
Citation: Das J and Yu H. HINT: High-quality protein interactomes and their applications in understanding human disease. BMC Systems Biology, 2012 Jul 30;6(1):92.

In [10]:
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
HINT_HQ_Raw = pd.read_csv(wd+'Network_Data_Raw/HomoSapiens_binary_hq.txt',sep='\t')
HINT_HT_Raw = pd.read_csv(wd+'Network_Data_Raw/HomoSapiens_binary_ht.txt',sep='\t')
HINT_LC_Raw = pd.read_csv(wd+'Network_Data_Raw/HomoSapiens_binary_lc.txt',sep='\t')
HINT_CC_Raw = pd.read_csv(wd+'Network_Data_Raw/HomoSapiens_cocomp_hq.txt',sep='\t')

In [11]:
print HINT_HQ_Raw.shape, HINT_HT_Raw.shape, HINT_LC_Raw.shape, HINT_CC_Raw.shape

HINT_Raw = pd.concat([HINT_HQ_Raw, HINT_HT_Raw, HINT_LC_Raw, HINT_CC_Raw])
print 'Concatenated list of edges:', HINT_Raw.shape
HINT_Raw = HINT_Raw.drop_duplicates()
print 'After duplicate edges removed:', HINT_Raw.shape

(55200, 7) (44222, 7) (13504, 7) (25383, 7)
Concatenated list of edges: (138309, 7)
After duplicate edges removed: (80101, 7)


In [12]:
# Use all gene label information and choose best fit later
HPRD_Raw_Genes_Symbol = set(HINT_Raw['Gene_A']).union(set(HINT_Raw['Gene_B']))
HPRD_Raw_Genes_Uniprot = set(HINT_Raw['Uniprot_A']).union(set(HINT_Raw['Uniprot_B']))
HPRD_Raw_Genes_EntrezID = set(HINT_Raw['Alt_A']).union(set(HINT_Raw['Alt_A']))
HPRD_Raw_Genes_EntrezID = [str(int(val)) for val in HPRD_Raw_Genes_EntrezID if not np.isnan(val)]
HPRD_Raw_Genes = set(list(HPRD_Raw_Genes_Symbol)+list(HPRD_Raw_Genes_Uniprot)+HPRD_Raw_Genes_EntrezID)

In [13]:
# Get edge list of network
query_edgelist = HINT_Raw[['Uniprot_A', 'Alt_A','Gene_A','Uniprot_B', 'Alt_B','Gene_B']].values.tolist()
print len(HINT_Raw), "Total HINT Edges"

80101 Total HINT Edges


## Convert Genes

In [14]:
query_string, valid_genes, invalid_genes = gct.query_constructor(HPRD_Raw_Genes, print_invalid_genes=True)

36145 Valid Query Genes
56 Invalid Query Genes:
['CGB|CGB5|CGB7|CGB8', 'GAGE2D|GAGE8', 'AMY1A|AMY1B|AMY1C', 'GOLGA8F|GOLGA8G', 'SERF1A|SERF1B', 'HIST1H2BC|HIST1H2BE|HIST1H2BF|HIST1H2BG|HIST1H2BI', 'GAGE12B|GAGE12C|GAGE12D|GAGE12E', 'HSFX1|HSFX2', 'HSFY1|HSFY2', 'DEFB4A|DEFB4B', 'GTF2H2C|GTF2H2C_2', 'TUBA3C|TUBA3D', 'MAGEA2|MAGEA2B', 'SSX2|SSX2B', 'HIST1H2AB|HIST1H2AE', 'MAGED4|MAGED4B', 'HIST2H3A|HIST2H3C|HIST2H3D', 'LGALS7|LGALS7B', 'HIST2H2AA3|HIST2H2AA4', 'DEFA1|DEFA1B', 'XAGE1A|XAGE1B|XAGE1C|XAGE1D|XAGE1E', 'TEX28|TEX28P1|TEX28P2', 'SMN1|SMN2', 'SLX1A|SLX1B', 'CTAG1A|CTAG1B', 'IFNA1|IFNA13', 'BCR/ABL|FUSION', 'BOLA2|BOLA2B', 'C4B|C4B_2', 'HIST1H4A|HIST1H4B|HIST1H4C|HIST1H4D|HIST1H4E|HIST1H4F|HIST1H4H|HIST1H4I|HIST1H4J|HIST1H4K|HIST1H4L|HIST2H4A|HIST2H4B|HIST4H4', 'MIR|CL-10', 'CALM1|CALM2|CALM3', 'C2ORF27A|C2ORF27B', 'FAM74A4|FAM74A6', 'CCL4L1|CCL4L2', 'H3F3A|H3F3B', 'FAM156A|FAM156B', 'FAM74A1|FAM74A2', 'OK/KNS-CL.4', 'OK/KNS-CL.1', 'MAGEA9|MAGEA9B', 'CCL3L1|CCL3L3', 'RBMY1F|RBMY1

In [15]:
# Set scopes (gene naming systems to search)
scopes = "uniprot, symbol, entrezgene, retired, alias"

# Set fields (systems from which to return gene names from)
fields = "symbol, entrezgene"

In [16]:
# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
print len(match_list), 'Matched query results'

Batch query complete: 98.94 seconds
37949 Matched query results


In [17]:
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 698

1154 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 536.06 seconds


## Construct Converted Network

In [18]:
# Separate each edge in to two lists of node identifiers to be passed into conversion function
query_edgelist_split = []
for edge in query_edgelist:
    if np.isnan(edge[1]):
        node1 = [edge[0], edge[2]]
    else:
        node1 = [edge[0], str(int(edge[1])), edge[2]]
    if np.isnan(edge[4]):
        node2 = [edge[3], edge[4]]
    else:
        node2 = [edge[3], str(int(edge[4])), edge[5]]
    query_edgelist_split.append([node1, node2])

In [25]:
%%time
HINT_edgelist_symbol = [sorted([gct.convert_custom_namelist(edge[0],'symbol',match_table_trim),
                                gct.convert_custom_namelist(edge[1],'symbol',match_table_trim)]) for edge in query_edgelist_split]

CPU times: user 31min 38s, sys: 3.37 s, total: 31min 41s
Wall time: 31min 45s


In [26]:
%%time
HINT_edgelist_entrez = [sorted([gct.convert_custom_namelist(edge[0],'entrez',match_table_trim),
                                gct.convert_custom_namelist(edge[1],'entrez',match_table_trim)]) for edge in query_edgelist_split]

CPU times: user 31min 35s, sys: 3.07 s, total: 31min 38s
Wall time: 31min 43s


In [27]:
HINT_edgelist_symbol_filt = gct.filter_converted_edgelist(HINT_edgelist_symbol)
gct.write_edgelist(HINT_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/HINT_Symbol.sif')

80101 input edges
4478 self-edges removed
1004 edges with un-mapped genes removed
10440 duplicate edges removed
Edge list filtered: 0.1 seconds
64179 Edges remaining
Edge list saved: 0.16 seconds


In [28]:
HINT_edgelist_entrez_filt = gct.filter_converted_edgelist(HINT_edgelist_entrez)
gct.write_edgelist(HINT_edgelist_entrez_filt, wd+'Network_SIFs_Entrez/HINT_Entrez.sif')

80101 input edges
4477 self-edges removed
1027 edges with un-mapped genes removed
10440 duplicate edges removed
Edge list filtered: 0.09 seconds
64157 Edges remaining
Edge list saved: 0.08 seconds
