In [1]:
import pandas as pd
import itertools
import gene_conversion_tools as gct

## Load IntAct Raw Data
#### Source (PSI-MITAB): ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
Downloaded: October 03, 2016  
Last Updated: September 08, 2016  
Notes for processing: All interactions listed here need to be filtered for human-human interactions. Because the raw file is large, we filter the interactions once and save the human-only interactions to a separate file; that smaller file can then be loaded on its own to save memory.

In [2]:
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
IntAct_Raw = pd.read_csv(wd+'Network_Data_Raw/IntAct/2016-09-08_intact.txt', sep='\t')
print 'Raw edge count in IntAct:', len(IntAct_Raw)

  interactivity=interactivity, compiler=compiler, result=result)


Raw edge count in IntAct: 653104


## Custom Processing of Raw IntAct Data

In [3]:
# Show the column labels of the raw IntAct table (rendered as this cell's output).
IntAct_Raw.columns

Index([u'#ID(s) interactor A', u'ID(s) interactor B',
       u'Alt. ID(s) interactor A', u'Alt. ID(s) interactor B',
       u'Alias(es) interactor A', u'Alias(es) interactor B',
       u'Interaction detection method(s)', u'Publication 1st author(s)',
       u'Publication Identifier(s)', u'Taxid interactor A',
       u'Taxid interactor B', u'Interaction type(s)', u'Source database(s)',
       u'Interaction identifier(s)', u'Confidence value(s)',
       u'Expansion method(s)', u'Biological role(s) interactor A',
       u'Biological role(s) interactor B',
       u'Experimental role(s) interactor A',
       u'Experimental role(s) interactor B', u'Type(s) interactor A',
       u'Type(s) interactor B', u'Xref(s) interactor A',
       u'Xref(s) interactor B', u'Interaction Xref(s)',
       u'Annotation(s) interactor A', u'Annotation(s) interactor B',
       u'Interaction annotation(s)', u'Host organism(s)',
       u'Interaction parameter(s)', u'Creation date', u'Update date',
       u'Checksu

#### Keep only human-human interactions

In [4]:
# Keep only the rows in which BOTH interactors carry the human taxid annotation
# (exact match on the full annotation string), then write the de-duplicated rows
# to a tab-separated file. IntAct_Human_Only itself is not de-duplicated here;
# only the copy written to disk is.
# Note: wd already ends with '/', so the output path contains a doubled slash.
human_taxid = 'taxid:9606(human)|taxid:9606(Homo sapiens)'
interactor_a_is_human = IntAct_Raw['Taxid interactor A'] == human_taxid
interactor_b_is_human = IntAct_Raw['Taxid interactor B'] == human_taxid
IntAct_Human_Only = IntAct_Raw[interactor_a_is_human & interactor_b_is_human]
human_only_unique = IntAct_Human_Only.drop_duplicates()
human_only_unique.to_csv(wd+'/Network_Data_Processed/IntAct_Human_Only.csv', sep='\t', index=False)

## Load filtered IntAct Data

In [5]:
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
IntAct_Human_Only = pd.read_csv(wd+'/Network_Data_Processed/IntAct_Human_Only.csv', sep='\t')
print 'Human-Human Only Edges in IntAct:', len(IntAct_Human_Only)

Human-Human Only Edges in IntAct: 247565


In [6]:
# Collect every distinct identifier that appears as interactor A or interactor B.
interactor_a_ids = set(IntAct_Human_Only['#ID(s) interactor A'])
interactor_b_ids = set(IntAct_Human_Only['ID(s) interactor B'])
Human_IntAct_Genes = list(interactor_a_ids | interactor_b_ids)

## Convert Genes

In [7]:
# Build the query for the MyGene.Info API from the IntAct identifiers.
# 'intact' is passed as an excluded prefix -- presumably identifiers carrying that
# prefix are reported in invalid_genes; confirm in gene_conversion_tools.
excluded_prefixes = ['intact']
query_string, valid_genes, invalid_genes = gct.query_constructor(
    Human_IntAct_Genes,
    exclude_prefixes=excluded_prefixes,
)

19166 Valid Query Genes
1139 Invalid Query Genes


In [8]:
# Gene naming system(s) searched when matching the query identifiers
scopes = 'uniprot'

# Gene naming systems whose names are returned for each match
fields = 'symbol, entrezgene'

In [9]:
# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
print len(match_list), 'Matched query results'

Batch query complete: 41.09 seconds
19374 Matched query results


In [10]:
# Build the query mapping outputs from the match results and the valid query genes.
# The helper returns three values; judging by the names they are a trimmed match
# table plus query->symbol and query->Entrez lookups (confirm in gene_conversion_tools).
query_map_outputs = gct.construct_query_map_table(match_list, valid_genes)
match_table_trim, query_to_symbol, query_to_entrez = query_map_outputs

Queries without full matching results found: 4326

150 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 107.59 seconds


## Construct Converted Network

In [11]:
# Unique (interactor A, interactor B) identifier pairs, as a list of two-element lists.
interactor_columns = ['#ID(s) interactor A', 'ID(s) interactor B']
unique_interactor_pairs = IntAct_Human_Only[interactor_columns].drop_duplicates()
query_edgelist = unique_interactor_pairs.values.tolist()

In [12]:
# Filter the edge list against invalid_genes; the recorded output for this cell
# reports how many edges with invalid nodes were removed.
query_edgelist_filt = gct.filter_query_edgelist(query_edgelist,invalid_genes)

5841 / 161035 edges with invalid nodes removed


In [13]:
# Apply gct.get_identifier_without_prefix to both endpoints of every filtered edge.
strip_prefix = gct.get_identifier_without_prefix
query_edgelist_filt_fmt = []
for edge in query_edgelist_filt:
    query_edgelist_filt_fmt.append([strip_prefix(edge[0]), strip_prefix(edge[1])])

In [14]:
# Symbol network: convert the edge list with the query->symbol mapping, filter the
# converted edges, and write the result to the symbol SIF file.
symbol_sif_path = wd+'Network_SIFs_Symbol/IntAct_Symbol.sif'
IntAct_edgelist_symbol = gct.convert_edgelist(
    query_edgelist_filt_fmt,
    query_to_symbol,
)
IntAct_edgelist_symbol_filt = gct.filter_converted_edgelist(IntAct_edgelist_symbol)
gct.write_edgelist(IntAct_edgelist_symbol_filt, symbol_sif_path)

155194 input edges
3088 self-edges removed
19794 edges with un-mapped genes removed
17281 duplicate edges removed
Edge list filtered: 0.17 seconds
115031 Edges remaining
Edge list saved: 0.28 seconds


In [15]:
# Entrez network: convert the edge list with the query->Entrez mapping, filter the
# converted edges, and write the result to the Entrez SIF file.
entrez_sif_path = wd+'Network_SIFs_Entrez/IntAct_Entrez.sif'
IntAct_edgelist_entrez = gct.convert_edgelist(
    query_edgelist_filt_fmt,
    query_to_entrez,
)
IntAct_edgelist_entrez_filt = gct.filter_converted_edgelist(IntAct_edgelist_entrez)
gct.write_edgelist(IntAct_edgelist_entrez_filt, entrez_sif_path)

155194 input edges
3123 self-edges removed
20041 edges with un-mapped genes removed
17263 duplicate edges removed
Edge list filtered: 0.13 seconds
114767 Edges remaining
Edge list saved: 0.12 seconds
