In [86]:
from network_evaluation_tools import gene_conversion_tools as gct
import re
import pandas as pd
import itertools
import time

## Load ConsensusPathDB Raw Data
#### Source: http://cpdb.molgen.mpg.de/download/ConsensusPathDB_human_PPI.gz
Downloaded: June 15, 2017  
Last Updated: January 11, 2017  
We only downloaded the protein interaction part of this database.  
#### From the website:  
"The protein interaction part of the integrated network is available for download. Due to several licensing issues, we are not allowed to release the complete integrated network (including signaling, metabolism and gene regulation).
Rows with one protein only describe self-interactions, and rows with more than two proteins describe complex interactions. In cases when proteins are annotated only with genomic identifiers but no protein identifiers in the according source databases, and if the genomic identifiers map to more than one UniProt entry, the according UniProt entry names are concatenated (e.g. RL40_HUMAN.RS27A_HUMAN.UBIQ_HUMAN) as it is unclear which of the gene products interact."  
#### Note about processing gene names:
The gene names here are all listed as UniProtKB-IDs (UniProt entry names); however, MyGene.Info does not map these IDs. Therefore, we first use the UniProtKB mapping table (ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz) to convert each UniProtKB-ID to its UniProt Accession number (the first column of the mapping table), and then convert the accessions to gene names with MyGene.Info. We will only be using the human mappings. UniProtKB-IDs with Gene Names mapped to the same UniProt Accession number in the first column will be mapped together.
#### Note about 'interaction confidence'
This column could potentially be used to filter interactions. However, because we include the co-complex interactions in the list of interactors, and these do not have an interaction confidence, we do not use this score to filter ConsensusPathDB.

In [109]:
# Load the raw ConsensusPathDB protein-interaction table (tab-delimited).
# NOTE(review): wd is an absolute, user-specific directory; point it at your own data directory before re-running.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# skiprows=1 drops the first line of the file so that the next line is read as the header row
# (presumably the first line is a version/comment line -- confirm against the downloaded file).
CPDB_Raw = pd.read_csv(wd+'Network_Data_Raw/ConsensusPathDB_human_PPI_v32',sep='\t',skiprows=1)
# Single-argument print(...) emits the same text under both Python 2 and Python 3.
print('%d raw interactions in ConsensusPathDB' % CPDB_Raw.shape[0])

291415 raw interactions in ConsensusPathDB


In [106]:
# Pull the participants string of every interaction row out of the CPDB table
CPDB_Raw_Interactions = CPDB_Raw['interaction_participants'].tolist()

In [107]:
# Remove self-edges from CPDB interactions:
# split every row into its participants and keep only rows with more than one participant.
# The pattern splits on ',' and on '.'; per the CPDB note above, '.' joins concatenated
# UniProt entry names, so each name in such an entry becomes its own participant.
# It is written as a raw string so that '\.' is not parsed as an (invalid) string escape,
# and compiled once instead of being re-parsed for every row.
participant_delimiter = re.compile(r',|\.')
CPDB_Raw_Interactions_filt = []
for interaction in CPDB_Raw_Interactions:
    interaction_split = participant_delimiter.split(interaction)
    if len(interaction_split) > 1:
        CPDB_Raw_Interactions_filt.append(interaction_split)

In [110]:
# Extract binary interactions from lists of interactors:
# every participant list is expanded into all of its unordered pairs, so a
# multi-protein complex contributes a clique.
CPDB_binary_interactions = [list(itertools.combinations(gene_list, 2)) for gene_list in CPDB_Raw_Interactions_filt]
# chain.from_iterable flattens the per-row pair lists without unpacking one call argument per row.
full_CPDB_interaction_list = list(itertools.chain.from_iterable(CPDB_binary_interactions))
# Single-argument print(...) emits the same text under both Python 2 and Python 3.
print('Binary, non-self interactions in ConsensusPathDB v32: %d' % len(full_CPDB_interaction_list))

Binary, non-self interactions in ConsensusPathDB v32: 2523991


## Processing UniProtKB ID mapping files

In [116]:
# Load UniProt idmapping File (tab-delimited, read without a header row).
# header=None tells pandas the first line is data, so the columns get integer labels
# (0, 1 and 2 are used below). header=-1 is rejected by current pandas releases.
# The file lives in the shared data directory, so the path is built from wd
# instead of repeating the absolute path.
UniProt_ID_map = pd.read_csv(wd+'HUMAN_9606_idmapping.dat', sep='\t', header=None)

In [117]:
# Build the UniProtKB-ID -> UniProt accession lookup.
# Mapping-table columns: 0 = UniProt accession, 1 = identifier type, 2 = identifier value.
# Keep only the 'UniProtKB-ID' rows and index them by the identifier value.
UniProt_ID_map_filt = UniProt_ID_map.loc[UniProt_ID_map[1] == 'UniProtKB-ID'].set_index(2)
UniProtKB_ID_map = UniProt_ID_map_filt[0].to_dict()

In [123]:
# Convert CPDB interaction list to UniProt Accessions.
# An edge is kept only if BOTH endpoints have an entry in UniProtKB_ID_map;
# each converted edge is stored with its two accessions in sorted order.
CPDB_UniProtID_edgelist = []
for edge in full_CPDB_interaction_list:
    # Logical `and` (short-circuiting) rather than bitwise `&` on the two membership tests.
    if edge[0] in UniProtKB_ID_map and edge[1] in UniProtKB_ID_map:
        converted_edge = sorted([UniProtKB_ID_map[edge[0]], UniProtKB_ID_map[edge[1]]])
        CPDB_UniProtID_edgelist.append(converted_edge)
# Single-argument print(...) emits the same text under both Python 2 and Python 3.
print('%d interactions converted to UniProt Accession IDs' % len(CPDB_UniProtID_edgelist))

2501019 interactions converted to UniProt Accession IDs


In [126]:
# Remove self-edges and duplicate edges after conversion
# Filter the accession-level edge list with the gene_conversion_tools helper.
# Per its printed report, it counts and removes self-edges, edges with un-mapped genes,
# and duplicate edges (several UniProtKB-IDs can map to the same accession).
CPDB_UniProtID_edgelist_filt = gct.filter_converted_edgelist(CPDB_UniProtID_edgelist)

2501019 input edges
26 self-edges removed
0 edges with un-mapped genes removed
788438 duplicate edges removed
Edge list filtered: 2.11 seconds
1712555 Edges remaining


In [127]:
# Unique UniProt accessions appearing at either end of any converted edge.
# NOTE(review): this reads the un-filtered edge list, not CPDB_UniProtID_edgelist_filt -- confirm that is intended.
CPDB_Converted_Genes = list(set(
    accession
    for converted_edge in CPDB_UniProtID_edgelist
    for accession in converted_edge
))

## Convert UniProt Accession IDs to HUGO Gene Symbol / Entrez ID

In [131]:
# Build the MyGene.Info query from the unique UniProt accessions.
# The helper returns the query string plus the lists of valid and invalid query genes
# (it prints the size of each list).
query_string, valid_genes, invalid_genes = gct.query_constructor(CPDB_Converted_Genes)

18642 Valid Query Genes
0 Invalid Query Genes


In [132]:
# Set scopes (gene naming systems to search): the query terms are UniProt accessions
scopes = "uniprot"
# Set fields (systems from which to return gene names from)
fields = "symbol, entrezgene"
# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Single-argument print(...) emits the same text under both Python 2 and Python 3.
print('%d Matched query results' % len(match_list))

Batch query complete: 28.73 seconds
18976 Matched query results


In [133]:
# Build the query mapping table and the query -> symbol / query -> Entrez lookups
# from the MyGene.Info matches. Per its printed report, the helper also counts queries
# without full matching results and queries with multiple matches
# (presumably resolving the latter to one match each -- confirm in gene_conversion_tools).
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 1280

228 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 81.04 seconds


In [137]:
# Convert UniProt Accession ID CPDB edgelist to gene symbols
# using the query -> symbol lookup built from the MyGene.Info results.
CPDB_edgelist_symbol = gct.convert_edgelist(CPDB_UniProtID_edgelist_filt, query_to_symbol)
# Filter again after conversion: per the printed report this drops self-edges,
# edges with un-mapped genes, and duplicate edges.
CPDB_edgelist_symbol_filt = gct.filter_converted_edgelist(CPDB_edgelist_symbol)

1712555 input edges
1620 self-edges removed
32579 edges with un-mapped genes removed
29930 duplicate edges removed
Edge list filtered: 1.12 seconds
1648426 Edges remaining


In [138]:
# Save CPDB as gene symbol network:
# write the filtered symbol-level edge list to a .sif file under the wd data directory.
gct.write_edgelist(CPDB_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/ConsensusPathDB_Symbol.sif')

Edge list saved: 2.2 seconds
