In [1]:
import pandas as pd
import itertools
import gene_conversion_tools as gct

## Load iRefIndex Raw Data
#### Source (MITAB): http://irefindex.org/download/irefindex/data/archive/release_14.0/psi_mitab/MITAB2.6/9606.mitab.07042015.txt.zip
Downloaded: July 28, 2016  
Last Updated: April 20, 2015  
Notes for processing: This is the file for human protein interactions; however, not all interactions are human-human interactions, so these need to be filtered. Also, all identifiers without a RefSeq or UniProt ID are excluded. Custom processing for this network is described below.
### From iRefIndex Mapping Documentation Page:
"We have made a file which provides mappings between iRefIndex identifiers and popular external identifiers. The current files contain all UniProt and RefSeq identifiers known to the current version of iRefIndex as documented on the sources page. For specific source documentation, see the sources for each released version.  
  
Other database identifiers are provided as database/accession pairs only when the iRefIndex identifier (ROGID) does not have a corresponding UniProt or RefSeq record with an identical sequence."  
  
Therefore: Interactions containing an ROGID identifier will be removed

In [3]:
# Root of the project data directory (absolute, machine-specific path).
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# Read the tab-separated iRefIndex MITAB export into a DataFrame.
# NOTE(review): the source URL in the notes above names 9606.mitab.07042015.txt.zip,
# while the file read here is 9606.mitab.04072015.txt -- confirm which name is correct.
iRefIndex_Raw = pd.read_csv(wd+'Network_Data_Raw/iRefIndex/9606.mitab.04072015.txt',sep='\t')
# Single-argument print call: same output under Python 2 and Python 3.
print('Raw edge count in iRefIndex: %d' % len(iRefIndex_Raw))

 Raw edge count in iRefIndex: 673100


## Custom Processing of Raw iRefIndex Data

#### Keep only human-human interactions

In [13]:
# Keep only rows where both interactors carry the human taxon label.
human_taxon = 'taxid:9606(Homo sapiens)'
both_human = (iRefIndex_Raw['taxa'] == human_taxon) & (iRefIndex_Raw['taxb'] == human_taxon)
iRef_Human_only = iRefIndex_Raw[both_human]
# Single-argument print call: same output under Python 2 and Python 3.
print('Human-Human only interactions in iRefIndex: %d' % len(iRef_Human_only))

Human-Human only interactions in iRefIndex: 485030


#### Parse all genes in filtered iRef and keep only RefSeq/UniProtKB labelled interactions

In [14]:
# Unique interactor identifiers appearing on either side of any human-human edge
Human_iRef_Genes = list(set(iRef_Human_only['#uidA']) | set(iRef_Human_only['uidB']))

In [16]:
# Collect the distinct identifier prefixes (the text before the first ':'),
# in order of first appearance in Human_iRef_Genes.
prefixes = []
seen_prefixes = set()  # set membership test instead of scanning the list each time
for gene in Human_iRef_Genes:
    prefix = gene.split(':')[0]
    if prefix not in seen_prefixes:
        seen_prefixes.add(prefix)
        prefixes.append(prefix)
# Parenthesised single argument: same output under Python 2 and Python 3.
print(prefixes)

['uniprotkb', 'refseq', 'rogid']


In [19]:
# Edge list of the human-only network as [uidA, uidB] pairs
query_edgelist = iRef_Human_only[['#uidA','uidB']].values.tolist()
# Single-argument print call: same output under Python 2 and Python 3.
print('%d Human iRefIndex Edges' % len(query_edgelist))

485030 Human iRefIndex Edges


## Convert Genes

In [18]:
# Build the gene query to submit to the MyGene.Info API. Identifiers carrying the
# 'rogid' prefix are excluded (see the iRefIndex mapping notes at the top of the notebook).
query_string, valid_genes, invalid_genes = gct.query_constructor(
    Human_iRef_Genes,
    exclude_prefixes=['rogid'],
)

23906 Valid Query Genes
945 Invalid Query Genes


In [20]:
# invalid_genes is non-empty, so filter the edge list against it before conversion
query_edgelist_filt = gct.filter_query_edgelist(
    query_edgelist,
    invalid_genes,
)

6305 / 485030 edges with invalid nodes removed


In [23]:
# Gene naming systems the query identifiers are searched against
scopes = 'uniprot, refseq'

# Naming systems whose gene names should be returned for each match
fields = 'symbol, entrezgene'

In [24]:
# Submit the batch query to MyGene.Info using the scopes/fields chosen above
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Single-argument print call: same output under Python 2 and Python 3.
print('%d Matched query results' % len(match_list))

Batch query complete: 52.63 seconds
24105 Matched query results


In [25]:
# Build the match table and the two lookups used later to convert the edge list
# (query -> symbol and query -> Entrez).
(match_table_trim,
 query_to_symbol,
 query_to_entrez) = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 6122

150 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 196.61 seconds


## Construct Converted Network

In [28]:
# Pass both endpoints of every filtered edge through
# gct.get_identifier_without_prefix (by its name, this drops the 'db:' prefix).
query_edgelist_filt_fmt = []
for edge in query_edgelist_filt:
    node_a = gct.get_identifier_without_prefix(edge[0])
    node_b = gct.get_identifier_without_prefix(edge[1])
    query_edgelist_filt_fmt.append([node_a, node_b])

In [30]:
# Convert the edge list with the query -> symbol mapping, filter it, and write it as a SIF.
# Per the recorded output, the filter step removes self-edges, edges with
# un-mapped genes, and duplicate edges.
iRefIndex_edgelist_symbol = gct.convert_edgelist(
    query_edgelist_filt_fmt, query_to_symbol
)
iRefIndex_edgelist_symbol_filt = gct.filter_converted_edgelist(
    iRefIndex_edgelist_symbol
)
gct.write_edgelist(
    iRefIndex_edgelist_symbol_filt,
    wd+'Network_SIFs_Symbol/iRefIndex_Symbol.sif'
)

478725 input edges
34079 self-edges removed
131249 edges with un-mapped genes removed
179474 duplicate edges removed
Edge list filtered: 0.5 seconds
133923 Edges remaining
Edge list saved: 0.28 seconds


In [31]:
# Convert the edge list with the query -> Entrez mapping, filter it, and write it as a SIF.
# Per the recorded output, the filter step removes self-edges, edges with
# un-mapped genes, and duplicate edges.
iRefIndex_edgelist_entrez = gct.convert_edgelist(
    query_edgelist_filt_fmt, query_to_entrez
)
iRefIndex_edgelist_entrez_filt = gct.filter_converted_edgelist(
    iRefIndex_edgelist_entrez
)
gct.write_edgelist(
    iRefIndex_edgelist_entrez_filt,
    wd+'Network_SIFs_Entrez/iRefIndex_Entrez.sif'
)

478725 input edges
34096 self-edges removed
131404 edges with un-mapped genes removed
179359 duplicate edges removed
Edge list filtered: 0.37 seconds
133866 Edges remaining
Edge list saved: 0.11 seconds
