In [1]:
import pandas as pd
import time
import gene_conversion_tools as gct

## Load STRING Raw Data
#### Source: http://string-db.org/download/protein.links.v10.txt.gz
#### Source (detailed): http://string-db.org/download/protein.links.detailed.v10.txt.gz
#### File to download: 'protein.links.v10.txt.gz' is the plain edge-list version of the 'detailed' file. The detailed file additionally documents the types of interactions and the support for each interaction. It could be used for filtering in the future if desired, but we do not currently filter on those categories.
Downloaded: July 28, 2016  
Last Updated: Apr 16, 2016	
Processing note: This data needs to be filtered for human-only interactions. The raw file is very large, so we stream it line by line and write only the human-human edges to a new file. That smaller human-human interaction file is then loaded for further processing.

In [19]:
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
starttime=time.time()
g=open(wd+'Network_Data_Processed/STRING_human_v10.txt','w')
with open(wd+'Network_Data_Raw/STRING/protein.links.v10.txt') as f:
    for line in f:
        edge = line.split(' ')
        if edge[0].startswith('9606') and edge[1].startswith('9606'):
            g.write(edge[0].split('.')[1]+'\t'+edge[1].split('.')[1]+'\n')
print 'Filtered human-human STRING interactions only:', time.time()-starttime, 'seconds'
g.close()

Filtered human-human STRING interactions only: 1891.00401616 seconds


## Load human-filtered STRING edges

In [3]:
# Load the human-only edge list written by the filtering step above.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
# The file has no header row. `header=None` is the documented way to say so and
# yields integer column labels 0 and 1; `header=-1` is rejected by current pandas.
STRING_Raw = pd.read_csv(wd+'/Network_Data_Processed/STRING_human_v10.txt',sep='\t',header=None)

In [4]:
# Unique identifiers appearing in either column of the edge list.
source_ids = set(STRING_Raw[0])
target_ids = set(STRING_Raw[1])
STRING_Raw_Genes = list(source_ids | target_ids)

In [5]:
# Edge list as a plain list of [node_a, node_b] pairs taken from columns 0 and 1.
edge_columns = STRING_Raw[[0, 1]]
query_edgelist = edge_columns.values.tolist()

## Convert Genes

In [6]:
# Build the query from the unique gene list using the project's gct helper.
# The helper returns three values, unpacked here in the order it provides them;
# it also prints counts of valid and invalid query genes (see cell output).
constructed_query = gct.query_constructor(STRING_Raw_Genes)
query_string, valid_genes, invalid_genes = constructed_query

19247 Valid Query Genes
0 Invalid Query Genes


In [7]:
# Scopes: the identifier system the query IDs are written in and should be
# searched under (here Ensembl protein IDs).
scopes = "ensemblprotein"

# Fields: the naming systems to return for each matched query
# (gene symbol and Entrez gene ID).
fields = "symbol, entrezgene"

In [8]:
# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
print len(match_list), 'Matched query results'

Batch query complete: 29.56 seconds
19257 Matched query results


In [9]:
# Turn the raw match records into a trimmed match table plus two lookups
# (query -> symbol, query -> Entrez), as named by the unpacked results below.
query_map_results = gct.construct_query_map_table(match_list, valid_genes)
match_table_trim, query_to_symbol, query_to_entrez = query_map_results

Queries without full matching results found: 1427

3 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 122.22 seconds


## Construct Converted Network

In [10]:
# Gene-symbol network: convert the edge list with the query -> symbol mapping,
# filter it (the helper reports self-edges, un-mapped genes and duplicates
# removed), and save the result as a SIF file.
symbol_sif_path = wd+'Network_SIFs_Symbol/STRING_Symbol.sif'
STRING_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol)
STRING_edgelist_symbol_filt = gct.filter_converted_edgelist(STRING_edgelist_symbol)
gct.write_edgelist(STRING_edgelist_symbol_filt, symbol_sif_path)

8548002 input edges
18386 self-edges removed
720410 edges with un-mapped genes removed
3907475 duplicate edges removed
Edge list filtered: 9.39 seconds
3901731 Edges remaining
Edge list saved: 6.23 seconds


In [11]:
# Entrez-ID network: convert the edge list with the query -> Entrez mapping,
# filter it (the helper reports self-edges, un-mapped genes and duplicates
# removed), and save the result as a SIF file.
entrez_sif_path = wd+'Network_SIFs_Entrez/STRING_Entrez.sif'
STRING_edgelist_entrez = gct.convert_edgelist(query_edgelist, query_to_entrez)
STRING_edgelist_entrez_filt = gct.filter_converted_edgelist(STRING_edgelist_entrez)
gct.write_edgelist(STRING_edgelist_entrez_filt, entrez_sif_path)

8548002 input edges
31218 self-edges removed
919628 edges with un-mapped genes removed
3801409 duplicate edges removed
Edge list filtered: 7.34 seconds
3795747 Edges remaining
Edge list saved: 2.5 seconds
