In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import time

## Load STRING Raw Data
#### Source: http://string-db.org/download/protein.links.v10.5.txt.gz
#### Source (detailed): http://string-db.org/download/protein.links.detailed.v10.5.txt.gz
#### File to download: 'protein.links.v10.5.txt.gz' is the condensed version of the 'detailed' file: it lists each interaction with a single score, whereas the detailed file also documents the types of interaction and the support for each interaction. The detailed file can be used for filtering in the future if desired, but no filtering on those categories is applied currently.
Downloaded: June 15, 2016  
Last Updated: May 14, 2017	
Processing note: This data needs to be filtered for human-only interactions (protein identifiers prefixed with the taxon ID 9606). Because the raw file is very long and large, it is streamed line by line and only the human-human edges are written out. The resulting human-human interaction file is then loaded for further processing.

In [4]:
# Load and filter STRING for only human-human protein interactions.
# The raw file is streamed line by line (it is too large to load whole); each
# kept edge is rewritten as "<proteinA>\t<proteinB>\t<score>" with the taxon
# prefix stripped from both protein identifiers.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
starttime = time.time()
raw_links_path = wd+'Network_Data_Raw/STRING/protein.links.v10.5.txt'
human_links_path = wd+'Network_Data_Raw/STRING/STRING_human_v10.5.txt'
# Both handles are managed by `with` so they are closed even if the loop fails.
# The input is opened first so a missing raw file does not truncate an
# existing filtered output.
with open(raw_links_path) as f, open(human_links_path, 'w') as g:
    for line in f:
        # Strip the line terminator first; otherwise it stays attached to the
        # score field and every record is followed by an empty line.
        edge = line.rstrip('\r\n').split(' ')
        if len(edge) < 3:
            # Blank or truncated line: nothing to convert.
            continue
        # Identifiers look like "<taxon>.<protein>"; matching on '9606.' (with
        # the dot) avoids accepting other taxon IDs that merely begin with 9606.
        if edge[0].startswith('9606.') and edge[1].startswith('9606.'):
            g.write(edge[0].split('.')[1]+'\t'+edge[1].split('.')[1]+'\t'+edge[2]+'\n')
# Single pre-formatted string prints identically to the original print statement.
print('Filtered human-human STRING interactions only: %s seconds' % (time.time()-starttime))

Filtered human-human STRING interactions only: 1793.17046094 seconds


## Load human-filtered STRING edges

In [2]:
# Read the human-only, tab-separated edge file produced by the filtering step.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# The file has no header row. `header=None` is the documented way to say so
# (`header=-1` is rejected by newer pandas); column names are supplied directly.
STRING_Raw = pd.read_csv(wd+'Network_Data_Raw/STRING/STRING_human_v10.5.txt',
                         sep='\t', header=None, names=['NodeA', 'NodeB', 'Score'])
print('Raw Edges in STRING v10.5: %s' % len(STRING_Raw))

Raw Edges in STRING v10.5: 11353056


In [3]:
# Drop rows that are exact duplicates across NodeA, NodeB and Score, then
# report how many edges are left.
STRING_Raw_filt = STRING_Raw.drop_duplicates()
n_dedup_edges = len(STRING_Raw_filt)
print('Edges in STRING v10.5 after dropping duplicates: %s' % n_dedup_edges)

Edges in STRING v10.5 after dropping duplicates: 11353056


In [4]:
# Every distinct identifier that appears at either end of an edge.
STRING_Genes = list(set(STRING_Raw_filt['NodeA']) | set(STRING_Raw_filt['NodeB']))

The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.


In [5]:
# Turn the de-duplicated frame into a plain list of [NodeA, NodeB, Score] rows.
edge_columns = ['NodeA', 'NodeB', 'Score']
query_edgelist = STRING_Raw_filt[edge_columns].values.tolist()

## Convert Genes from Ensembl Protein to Hugo Symbol

In [6]:
# Build the query from the STRING identifiers; the helper also returns the
# identifiers it accepted and the ones it rejected.
(query_string,
 valid_genes,
 invalid_genes) = gct.query_constructor(STRING_Genes)

19576 Valid Query Genes
0 Invalid Query Genes


In [7]:
# Gene naming system the query identifiers are written in
scopes = "ensemblprotein"

# Gene naming systems to return for each matched identifier
fields = "symbol, entrezgene"

# Send the batch query to MyGene.Info and report how many results came back
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
print('%s Matched query results' % len(match_list))

Batch query complete: 23.11 seconds
19578 Matched query results


In [8]:
# Build the match table plus the query->symbol and query->entrez lookups
# from the raw match results.
(match_table_trim,
 query_to_symbol,
 query_to_entrez) = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 1584

1 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 115.61 seconds


## Construct Converted Network

In [9]:
%%time
# Convert weighted edge list
STRING_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)

CPU times: user 26.7 s, sys: 2.74 s, total: 29.5 s
Wall time: 29.2 s


In [10]:
# Clean the converted edge list (the helper reports the self-edges,
# un-mapped genes and duplicate edges it removes)
STRING_edgelist_symbol_filt = gct.filter_converted_edgelist(
    STRING_edgelist_symbol,
    weighted=True,
)

11353056 input edges
30268 self-edges removed
1043874 edges with un-mapped genes removed
5143146 duplicate edges removed
Edge list filtered: 77.42 seconds
5135768 Edges remaining


In [11]:
# Save the filtered symbol-level network, keeping the score column
# (binary=False)
symbol_sif_path = wd+'Network_SIFs_Symbol/STRING_Symbol.sif'
gct.write_edgelist(STRING_edgelist_symbol_filt, symbol_sif_path, binary=False)

Edge list saved: 8.28 seconds


In [12]:
# Build the score-filtered network from the saved SIF and write it next to it.
# NOTE(review): q=0.9 presumably keeps only edges above the 90th score
# percentile -- confirm against dit.filter_weighted_network_sif.
full_sif_path = wd+'Network_SIFs_Symbol/STRING_Symbol.sif'
top_sif_path = wd+'Network_SIFs_Symbol/STRING90_Symbol.sif'
STRING90_edgelist = dit.filter_weighted_network_sif(
    full_sif_path,
    nodeA_col=0,
    nodeB_col=1,
    score_col=2,
    q=0.9,
    delimiter='\t',
    verbose=True,
    save_path=top_sif_path,
)

90.0% score: 497.0
513035 / 5135768 edges retained
