In [41]:
import pandas as pd
import itertools
import time
import gene_conversion_tools as gct

## Load ConsensusPathDB Raw Data
#### Source: http://cpdb.molgen.mpg.de/download/ConsensusPathDB_human_PPI.gz
Downloaded: August 16, 2016  
Last Updated: September 01, 2015  
We only downloaded the protein interaction part of this database.  
#### From the website:  
"The protein interaction part of the integrated network is available for download. Due to several licensing issues, we are not allowed to release the complete integrated network (including signaling, metabolism and gene regulation).
Rows with one protein only describe self-interactions, and rows with more than two proteins describe complex interactions. In cases when proteins are annotated only with genomic identifiers but no protein identifiers in the according source databases, and if the genomic identifiers map to more than one UniProt entry, the according UniProt entry names are concatenated (e.g. RL40_HUMAN.RS27A_HUMAN.UBIQ_HUMAN) as it is unclear which of the gene products interact."  
#### Note about processing gene names:
The gene names here are all listed as UniProtKB-IDs; however, MyGene.Info does not map these ID names. Therefore, we will use the UniProtKB mapping table (ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.2015_03.gz) to convert from the given IDs to UniProtKB accession numbers (UniProtKB-ACs). Following the README of ID Mapping found here (ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README), we will be using the first two columns. However, given the size of this file, we will first save the first two columns to a separate file via UNIX, and then keep only the UniProtKB-IDs that are for human (ending in '_HUMAN'). This new table will be used to map UniProtKB-IDs to the UniProtKB-ACs that will be used in the MyGene.Info batch query system. All edges containing an ID that could not be converted to an AC are removed.

In [2]:
# Root directory that every input and output path in this notebook is built from.
# NOTE(review): this is a hard-coded absolute path on one machine; it must be edited to run elsewhere.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# Read the tab-separated ConsensusPathDB PPI download, skipping its first line
CPDB_Raw = pd.read_csv(
    wd + 'Network_Data_Raw/ConsensusPathDB_human_PPI',
    skiprows=1,
    sep='\t'
)

In [3]:
# Remove self-edges: a participant field without a comma names a single protein,
# so only rows whose participant field contains a comma are kept.
multi_participant_rows = CPDB_Raw['interaction_participants'].str.contains(',')
CPDB_Raw_Interactions = CPDB_Raw.loc[multi_participant_rows].reset_index(drop=True)

In [4]:
# Extract binary interactions from the comma-separated participant lists.
# Each row's participants are expanded into every unordered pair, so a
# complex with k members contributes k*(k-1)/2 edges.
CPDB_interactions_lists = CPDB_Raw_Interactions['interaction_participants'].tolist()
CPDB_interactions_split = [participants.split(',') for participants in CPDB_interactions_lists]
CPDB_binary_interactions = []
for participant_group in CPDB_interactions_split:
    CPDB_binary_interactions.append(list(itertools.combinations(participant_group, 2)))
# Flatten the per-row pair lists into one list of 2-tuples
full_CPDB_interaction_list = list(itertools.chain.from_iterable(CPDB_binary_interactions))

## Processing UniProtKB ID mapping files

In [13]:
# Extract gene list: every distinct identifier that appears in any binary interaction
raw_identifier_set = set()
for interaction_pair in full_CPDB_interaction_list:
    raw_identifier_set.update(interaction_pair)
CPDB_Raw_Genes = list(raw_identifier_set)

In [48]:
starttime=time.time()
g=open(wd+'UniProt_ID_Map_filt.2015_03_HUMAN','w')
with open(wd+'UniProt_ID_Map_filt.2015_03') as f:
    for line in f:
        if line.endswith('_HUMAN\n'):
            g.write(line)
print 'Filtered UniProtIDs as human only:', time.time()-starttime, 'seconds'
g.close()

Filtered UniProtIDs as human only: 32.0485138893 seconds


In [58]:
# Load the human-only, space-separated mapping file. It has no header row, so
# header=None is used and pandas labels the columns 0 and 1. (The previous
# header=-1 spelling is rejected by current pandas versions.)
UniProt_mapping_table = pd.read_csv(wd+'UniProt_ID_Map_filt.2015_03_HUMAN',sep=' ',header=None)
# Lookup from column 1 to column 0; it is used below to replace the UniProtKB-IDs
# found in ConsensusPathDB with the values stored in column 0.
# If a column-1 value occurs more than once, the last row wins.
UniProt_Map = UniProt_mapping_table.set_index(1)[0].to_dict()

In [85]:
query_edgelist = pd.DataFrame(data=full_CPDB_interaction_list).drop_duplicates().values.tolist()
print len(query_edgelist), "Raw ConsensusPathDB Edges"

607144 Raw ConsensusPathDB Edges


In [83]:
# Translate both endpoints of every edge through UniProt_Map.
# An edge is kept only when both of its endpoints are present in the map.
query_edgelist2 = []
for raw_edge in query_edgelist:
    source_id, target_id = raw_edge[0], raw_edge[1]
    if source_id in UniProt_Map and target_id in UniProt_Map:
        query_edgelist2.append([UniProt_Map[source_id], UniProt_Map[target_id]])

In [84]:
print len(query_edgelist2), "Converted ConsensusPathDB Edges"

595934 Converted ConsensusPathDB Edges


In [86]:
# Every distinct identifier that appears in the converted edge list
converted_identifier_set = set()
for converted_edge in query_edgelist2:
    converted_identifier_set.update(converted_edge)
CPDB_Converted_Genes = list(converted_identifier_set)

## Convert Genes

In [87]:
# Build the MyGene.Info query from the converted identifiers.
# The helper's return value is unpacked into three parts: the query string that is
# later passed to gct.query_batch, plus the genes it reports as valid and invalid
# (its printed summary gives the count of each).
query_string, valid_genes, invalid_genes = gct.query_constructor(CPDB_Converted_Genes)

17209 Valid Query Genes
0 Invalid Query Genes


In [74]:
# Filter the converted edge list against invalid_genes; the helper's printed
# summary reports how many edges with invalid nodes were removed.
# This step only matters when len(invalid_genes)>0.
# NOTE(review): the conversion cells further down pass query_edgelist2 rather than
# query_edgelist_filt, so this filtered list appears to be unused - confirm that is intended.
query_edgelist_filt = gct.filter_query_edgelist(query_edgelist2, invalid_genes)

0 / 0 edges with invalid nodes removed


In [89]:
# MyGene.Info query settings, passed to gct.query_batch below
scopes = "uniprot"             # scopes: gene naming systems to search
fields = "symbol, entrezgene"  # fields: systems from which to return gene names

In [90]:
# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
print len(match_list), 'Matched query results'

Batch query complete: 35.98 seconds
17427 Matched query results


In [91]:
# Turn the MyGene.Info match records into lookup structures for the valid genes.
# query_to_symbol and query_to_entrez are the mappings handed to gct.convert_edgelist
# in the network-construction cells below. The helper's printed summary reports
# queries without full matching results and queries with multiple matches.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 1124

161 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 77.22 seconds


## Construct Converted Network

In [92]:
# Build the gene-symbol version of the network and save it as a SIF file
symbol_sif_path = wd + 'Network_SIFs_Symbol/ConsensusPathDB_Symbol.sif'
# Map both endpoints of every converted edge through query_to_symbol
CPDB_edgelist_symbol = gct.convert_edgelist(query_edgelist2, query_to_symbol)
# Clean the mapped edge list; the helper's printed summary reports removal of
# self-edges, edges with un-mapped genes, and duplicate edges.
CPDB_edgelist_symbol_filt = gct.filter_converted_edgelist(CPDB_edgelist_symbol)
gct.write_edgelist(CPDB_edgelist_symbol_filt, symbol_sif_path)

595934 input edges
1211 self-edges removed
15708 edges with un-mapped genes removed
57393 duplicate edges removed
Edge list filtered: 0.57 seconds
521622 Edges remaining
Edge list saved: 0.87 seconds


In [93]:
# Build the Entrez-ID version of the network and save it as a SIF file
entrez_sif_path = wd + 'Network_SIFs_Entrez/ConsensusPathDB_Entrez.sif'
# Map both endpoints of every converted edge through query_to_entrez
CPDB_edgelist_entrez = gct.convert_edgelist(query_edgelist2, query_to_entrez)
# Clean the mapped edge list; the helper's printed summary reports removal of
# self-edges, edges with un-mapped genes, and duplicate edges.
CPDB_edgelist_entrez_filt = gct.filter_converted_edgelist(CPDB_edgelist_entrez)
gct.write_edgelist(CPDB_edgelist_entrez_filt, entrez_sif_path)

595934 input edges
1257 self-edges removed
16768 edges with un-mapped genes removed
57340 duplicate edges removed
Edge list filtered: 0.47 seconds
520569 Edges remaining
Edge list saved: 0.35 seconds
