In [1]:
import pandas as pd
import itertools
import gene_conversion_tools as gct

## Load BioGRID Raw Data
#### Source (TAB2): http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.141/BIOGRID-ORGANISM-3.4.141.tab2.zip
Downloaded: October 03, 2016  
Last Updated: October 01, 2016  
Notes for download: There is a new version of BioGRID released on the first of every month. Download the organism-specific files to extract only human interactions from the database.  
Notes for processing: This is the file for human protein interactions; however, not all interactions may be human-human interactions, so the others need to be filtered out. Additionally, we may create an alternate version of BioGRID that contains only the physical (PPI) interactions. Entrez IDs will be used for conversion.

In [2]:
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
BioGRID_Raw = pd.read_csv(wd+'Network_Data_Raw/BioGRID/BIOGRID-ORGANISM-3.4.141.tab2/BIOGRID-ORGANISM-Homo_sapiens-3.4.141.tab2.txt',sep='\t')
print 'Raw edge count in DIP:', len(BioGRID_Raw)

Raw edge count in DIP: 328724


In [3]:
# BioGRID records both physical and genetic interactions; tally the edges per
# experimental system type to see how the two categories compare.
system_type_column = BioGRID_Raw['Experimental System Type']
system_type_column.value_counts()

physical    327061
genetic       1663
Name: Experimental System Type, dtype: int64

In [4]:
# Tally the organism ID recorded for interactor A on every edge; more than one
# ID in the result means the file is not purely human.
organism_a_column = BioGRID_Raw['Organism Interactor A']
organism_a_column.value_counts()

9606      306824
10090      18150
11676       1581
10116        564
559292       344
10376        244
37296        151
11103        126
10298        119
9986         107
8355          66
10359         64
7227          62
9913          60
3702          57
284812        34
6239          33
11723         25
9031          20
333760        17
11709         16
7955          15
9544           9
36329          6
9823           5
83332          4
9615           3
3988           3
32603          2
32604          2
9598           2
10245          2
10335          1
352472         1
237631         1
10310          1
9103           1
39947          1
121224         1
Name: Organism Interactor A, dtype: int64

In [5]:
# Same tally for interactor B: count the edges per organism ID on the second
# side of each interaction.
organism_b_column = BioGRID_Raw['Organism Interactor B']
organism_b_column.value_counts()

9606      323411
10090       2488
559292      1028
10116        701
11676        314
3702         125
9913         100
10298         96
7227          80
37296         50
10376         50
9986          48
11103         41
8355          37
9031          23
10359         20
284812        18
10029         17
6239          15
9823          13
7955          13
11723          4
32603          4
9544           4
9615           3
11709          3
9598           2
83332          2
333760         2
180454         2
10310          2
3988           2
10141          2
36329          1
9796           1
7668           1
10245          1
Name: Organism Interactor B, dtype: int64

#### Since there are so few genetic interactions relative to physical interactions, we will not filter these edges out. However, we will remove all interactions that are not labelled as human-human interactions.

## Custom Processing of Raw BioGRID Data

#### Keep only human-human interactions

In [6]:
BioGRID_Human_Only = BioGRID_Raw[(BioGRID_Raw['Organism Interactor A']==9606) & (BioGRID_Raw['Organism Interactor B']==9606)]
print 'Human-Human only interactions in BioGRID 3.4.141:', len(BioGRID_Human_Only)

Human-Human only interactions in BioGRID 3.4.141: 301511


#### Parse all Entrez genes in filtered BioGRID

In [7]:
# Gather every distinct Entrez ID that appears on either side of a
# human-human edge, then turn each ID into a string.
interactor_a_ids = set(BioGRID_Human_Only['Entrez Gene Interactor A'])
interactor_b_ids = set(BioGRID_Human_Only['Entrez Gene Interactor B'])
BioGRID_Human_Genes = [str(entrez_id) for entrez_id in interactor_a_ids | interactor_b_ids]

## Convert Genes

In [8]:
# Construct the query to be submitted to the MyGene.Info API from the BioGRID Entrez IDs.
# The helper hands back the query string together with the genes it kept and the genes it rejected.
# NOTE(review): this comment used to say "remove all genes with 'DIP' prefix" — presumably carried
# over from the DIP notebook; confirm against gct.query_constructor whether that still applies.
query_string, valid_genes, invalid_genes = gct.query_constructor(BioGRID_Human_Genes)

16102 Valid Query Genes
0 Invalid Query Genes


In [9]:
# Fields: the naming systems whose gene names should be returned for each match
fields = 'symbol, entrezgene'

# Scopes: the gene naming systems the submitted IDs are searched against
scopes = 'entrezgene, retired, alias'

In [10]:
# Query MyGene.Info
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
print len(match_list), 'Matched query results'

Batch query complete: 23.49 seconds
16102 Matched query results


In [11]:
# Turn the raw MyGene.Info matches into lookup structures for the valid query genes.
# query_to_symbol and query_to_entrez are the mappings passed to gct.convert_edgelist
# further down to build the symbol and Entrez versions of the network.
# NOTE(review): presumably each maps a queried ID to its gene symbol / Entrez ID, and
# match_table_trim is the tabular form of the same matches — confirm in gene_conversion_tools.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

Queries without full matching results found: 3

0 Queries with mutliple matches found

Query mapping table/dictionary construction complete: 21.64 seconds


## Construct Converted Network

In [12]:
# Pull the two Entrez ID columns out as a plain list of [A, B] pairs
interactor_columns = ['Entrez Gene Interactor A', 'Entrez Gene Interactor B']
query_edgelist = BioGRID_Human_Only[interactor_columns].values.tolist()

# Stringify both endpoints of every edge
query_edgelist_fmt = [[str(interactor_a), str(interactor_b)] for interactor_a, interactor_b in query_edgelist]

In [13]:
# Gene-symbol version of the network: convert each edge with the query->symbol
# mapping, filter the converted edges, and save the result as a SIF file.
symbol_sif_path = wd+'Network_SIFs_Symbol/BioGRID_Symbol.sif'

BioGRID_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol)
BioGRID_edgelist_symbol_filt = gct.filter_converted_edgelist(BioGRID_edgelist_symbol)
gct.write_edgelist(BioGRID_edgelist_symbol_filt, symbol_sif_path)

301511 input edges
4372 self-edges removed
9 edges with un-mapped genes removed
80390 duplicate edges removed
Edge list filtered: 0.53 seconds
216740 Edges remaining
Edge list saved: 0.38 seconds


In [14]:
# Entrez version of the network: convert each edge with the query->Entrez
# mapping, filter the converted edges, and save the result as a SIF file.
entrez_sif_path = wd+'Network_SIFs_Entrez/BioGRID_Entrez.sif'

BioGRID_edgelist_entrez = gct.convert_edgelist(query_edgelist_fmt, query_to_entrez)
BioGRID_edgelist_entrez_filt = gct.filter_converted_edgelist(BioGRID_edgelist_entrez)
gct.write_edgelist(BioGRID_edgelist_entrez_filt, entrez_sif_path)

301511 input edges
4372 self-edges removed
9 edges with un-mapped genes removed
80390 duplicate edges removed
Edge list filtered: 0.34 seconds
216740 Edges remaining
Edge list saved: 0.19 seconds
