In [1]:
import pandas as pd
import itertools
import gene_conversion_tools as gct

## Load DIP Raw Data
#### Source (MITAB): http://dip.doe-mbi.ucla.edu/dip/File.cgi?FN=2016/tab25/Hsapi20160731.txt
Downloaded: August 12, 2016  
Last Updated: July 31, 2016  
Notes for download: Website requires registration. Register for the site to download the file from the link.  
Notes for processing: This is the file for human protein interactions; however, not all interactions are human-human interactions, so these need to be filtered. Also, all interactors without a RefSeq or UniProt ID are excluded. Custom processing for this network is described below.

In [2]:
# Root directory that holds the raw and processed network data.
# NOTE(review): absolute, user-specific path -- point this at your own copy of the data before running.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# Read the tab-separated DIP export into a DataFrame (one row per reported interaction)
DIP_Raw = pd.read_csv(wd+'Network_Data_Raw/DIP/Hsapi20160731.txt', sep='\t')
# Single-argument print call: prints the same text on Python 2 and Python 3
print('Raw edge count in DIP: %d' % len(DIP_Raw))

Raw edge count in DIP: 7752


## Custom Processing of Raw DIP Data

#### Keep only human-human interactions

In [3]:
# Taxonomy label that marks a human interactor in the 'Taxid interactor' columns
human_taxid = 'taxid:9606(Homo sapiens)'
# Keep an interaction only when BOTH of its interactors carry the human label
interactor_A_is_human = DIP_Raw['Taxid interactor A'] == human_taxid
interactor_B_is_human = DIP_Raw['Taxid interactor B'] == human_taxid
DIP_Human_only = DIP_Raw[interactor_A_is_human & interactor_B_is_human]
# Single-argument print call: prints the same text on Python 2 and Python 3
print('Human-Human only interactions in DIP: %d' % len(DIP_Human_only))

Human-Human only interactions in DIP: 5537


#### Parse all genes in filtered DIP and keep only RefSeq/UniProtKB labelled interactions

In [4]:
# Collect every distinct interactor label that appears on either side of an interaction
interactor_A_labels = set(DIP_Human_only['ID interactor A'])
interactor_B_labels = set(DIP_Human_only['ID interactor B'])
Human_DIP_Genes = list(interactor_A_labels | interactor_B_labels)

In [5]:
# Break each '|'-delimited interactor label into its individual identifiers
Human_DIP_Genes_split = [label.split('|') for label in Human_DIP_Genes]

# Flatten the per-label lists into one list of identifiers. Anything after a comma is dropped:
# this handles the one example where a UniProt identifier is labelled "uniprotkb:Q13936,159"
Human_DIP_Genes_full_list = [
    identifier.split(',')[0]
    for identifiers in Human_DIP_Genes_split
    for identifier in identifiers
]

## Convert Genes

In [6]:
# Build the MyGene.Info query from the identifier list.
# Identifiers carrying the 'DIP' prefix are excluded from the query.
query_string, valid_genes, invalid_genes = gct.query_constructor(
    Human_DIP_Genes_full_list,
    exclude_prefixes=['DIP']
)

4996 Valid Query Genes
3261 Invalid Query Genes


In [7]:
# Gene naming systems to search (comma-separated string)
scopes = ", ".join(["uniprot", "refseq", "symbol", "entrezgene", "retired", "alias"])

# Naming systems from which gene names are returned (comma-separated string)
fields = ", ".join(["symbol", "entrezgene"])

In [8]:
# Query MyGene.Info with the constructed query string, searching `scopes` and returning `fields`
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Single-argument print call: prints the same text on Python 2 and Python 3
print('%d Matched query results' % len(match_list))

Batch query complete: 14.49 seconds
5052 Matched query results


In [9]:
# Build the query mapping table and the query->symbol / query->entrez mappings from the match results
query_map_outputs = gct.construct_query_map_table(match_list, valid_genes)
match_table_trim, query_to_symbol, query_to_entrez = query_map_outputs

Queries without full matching results found: 105

72 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 7.36 seconds


## Construct Converted Network

In [10]:
# This is a custom gene conversion function written due to the parsing required for gene interactor labels
# Returns best matched symbol or entrez id from each DIP interactor string (if applicable)
def convert_DIP_string(string, field):
    """Convert one DIP interactor label to its best-scoring symbol or Entrez ID.

    Parameters
    ----------
    string : str
        Interactor label holding one or more identifiers separated by '|'.
    field : str
        'symbol' to return the 'Symbol' column, 'entrez' to return the
        'EntrezID' column of ``match_table_trim``.

    Returns
    -------
    The 'Symbol' / 'EntrezID' value of the highest-'Score' row among the
    label's identifiers that have a non-null value in that column. Returns
    None when no identifier has such a row, or when ``field`` is neither
    'symbol' nor 'entrez'. On a score tie the row of the identifier listed
    first in ``string`` wins.

    Notes
    -----
    Reads the module-level ``match_table_trim``.
    NOTE(review): assumes ``match_table_trim`` is indexed by the un-prefixed
    query identifier (strings) -- confirm against
    gct.construct_query_map_table.
    """
    # presumably strips the 'uniprotkb:' / 'refseq:' style prefix -- verify against gct
    names = [gct.get_identifier_without_prefix(name) for name in string.split('|')]

    # Both fields share the same logic; only the column read differs
    column = {'symbol': 'Symbol', 'entrez': 'EntrezID'}.get(field)
    if column is None:
        # Unknown field: keep the historical behaviour of returning None
        return None

    # Look up only identifiers present in the table. The removed `.ix[names]` indexer
    # produced all-NaN rows for absent identifiers, which the null filter below discarded
    # anyway; `.loc` on present labels keeps the same rows in the same order.
    known_names = [name for name in names if name in match_table_trim.index]
    if not known_names:
        return None
    matches = match_table_trim.loc[known_names]

    # Keep only rows that actually have a value for the requested column
    conversion = matches[matches[column].notnull()]
    if conversion.shape[0]==0:
        return None

    # Return the value from the first row carrying the maximum score
    max_score = conversion['Score'].max()
    return conversion[conversion['Score']==max_score].iloc[0][column]

In [11]:
def _none_first(identifier):
    """Sort key that places None (un-mapped) before any real identifier.

    Python 3 cannot order None against a string or number; comparing the
    leading boolean first avoids that comparison while giving the same
    None-first order that Python 2's default sort produces.
    """
    return (identifier is not None, identifier)

# One [interactor A, interactor B] pair of raw DIP labels per human-human interaction
DIP_Human_only_edges = DIP_Human_only[['ID interactor A', 'ID interactor B']].values.tolist()
# Convert both ends of every edge, sorting each pair so the node order within an edge is consistent
DIP_edgelist_symbol = [sorted([convert_DIP_string(node, 'symbol') for node in edge], key=_none_first) for edge in DIP_Human_only_edges]
DIP_edgelist_entrez = [sorted([convert_DIP_string(node, 'entrez') for node in edge], key=_none_first) for edge in DIP_Human_only_edges]

In [12]:
# Filter the symbol edge list, then save the result as a SIF file
symbol_sif_path = wd + 'Network_SIFs_Symbol/DIP_Symbol.sif'
DIP_edgelist_symbol_filt = gct.filter_converted_edgelist(DIP_edgelist_symbol)
gct.write_edgelist(DIP_edgelist_symbol_filt, symbol_sif_path)

5537 input edges
510 self-edges removed
307 edges with un-mapped genes removed
26 duplicate edges removed
Edge list filtered: 0.01 seconds
4694 Edges remaining
Edge list saved: 0.01 seconds


In [13]:
# Filter the Entrez ID edge list, then save the result as a SIF file
entrez_sif_path = wd + 'Network_SIFs_Entrez/DIP_Entrez.sif'
DIP_edgelist_entrez_filt = gct.filter_converted_edgelist(DIP_edgelist_entrez)
gct.write_edgelist(DIP_edgelist_entrez_filt, entrez_sif_path)

5537 input edges
510 self-edges removed
308 edges with un-mapped genes removed
26 duplicate edges removed
Edge list filtered: 0.01 seconds
4693 Edges remaining
Edge list saved: 0.01 seconds
