In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
import pandas as pd
import itertools

## Load BioGRID Raw Data
#### Source (TAB2): http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.149/BIOGRID-ORGANISM-3.4.149.tab2.zip
Downloaded: June 15, 2017  
Last Updated: June 01, 2017  
Notes for download: A new version of BioGRID is released on the first of every month. Download the organism-specific files to extract only human interactions from the database.  
Notes for processing: This is the file for human protein interactions; however, not all of its interactions are human-human, so the non-human ones need to be filtered out. There is a "Score" column that could be used for filtering, but most of its values appear to be missing, so it is ignored when processing BioGRID.

In [3]:
# Root of the project data tree; the raw BioGRID download is read from beneath it.
wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'
BioGRID_raw_file = wd+'Network_Data_Raw/BioGRID/BIOGRID-ORGANISM-3.4.149.tab2/BIOGRID-ORGANISM-Homo_sapiens-3.4.149.tab2.txt'
# The file is tab-delimited. low_memory=False turns off pandas' chunked parsing,
# which can otherwise infer mixed dtypes within a column.
BioGRID_Raw = pd.read_csv(BioGRID_raw_file, sep='\t', low_memory=False)
# Single-argument print call: emits the same text under Python 2 and Python 3.
print('Raw edge count in BioGRID: %d' % len(BioGRID_Raw))

Raw edge count in BioGRID: 394749


In [4]:
# Tally the records by 'Experimental System Type' to show that not all interactions
# in BioGRID are physical PPI, though the overwhelming majority are
BioGRID_Raw['Experimental System Type'].value_counts()

physical    392779
genetic       1970
Name: Experimental System Type, dtype: int64

In [27]:
# Interactor A is not always human: show the five most frequent organism IDs
# in the 'Organism Interactor A' column (9606 is the ID kept by the human-human filter)
BioGRID_Raw['Organism Interactor A'].value_counts().head()

9606      372979
10090      17963
11676       1591
10116        570
559292       355
Name: Organism Interactor A, dtype: int64

In [28]:
# Interactor B is not always human either: show the five most frequent organism IDs
# in the 'Organism Interactor B' column (9606 is the ID kept by the human-human filter)
BioGRID_Raw['Organism Interactor B'].value_counts().head()

9606      389334
10090       2543
559292      1045
10116        708
11676        318
Name: Organism Interactor B, dtype: int64

#### Since there are so few genetic interactions relative to physical interactions, we will not filter these edges. However, we will filter all interactions that are not labelled as human-human interactions

#### Keep only human-human interactions

In [9]:
# Keep a row only when BOTH interactors carry organism ID 9606 (human-human interactions).
interactor_A_is_human = BioGRID_Raw['Organism Interactor A'] == 9606
interactor_B_is_human = BioGRID_Raw['Organism Interactor B'] == 9606
BioGRID_Human_Only = BioGRID_Raw[interactor_A_is_human & interactor_B_is_human]
# Single-argument print call: emits the same text under Python 2 and Python 3.
print('Human-Human only interactions in BioGRID 3.4.149: %d' % len(BioGRID_Human_Only))

Human-Human only interactions in BioGRID 3.4.149: 367564


In [29]:
# Look for missing symbol names in column A: select every interactor-A symbol
# equal to '-' (an empty result means none are missing)
symbol_A_is_missing = BioGRID_Human_Only['Official Symbol Interactor A'] == '-'
BioGRID_Human_Only.loc[symbol_A_is_missing, 'Official Symbol Interactor A']

Series([], Name: Official Symbol Interactor A, dtype: object)

In [30]:
# Look for missing symbol names in column B: select every interactor-B symbol
# equal to '-' (an empty result means none are missing)
symbol_B_is_missing = BioGRID_Human_Only['Official Symbol Interactor B'] == '-'
BioGRID_Human_Only.loc[symbol_B_is_missing, 'Official Symbol Interactor B']

Series([], Name: Official Symbol Interactor B, dtype: object)

In [32]:
# Convert the table of interactions to an edge list of [symbol A, symbol B] pairs (no scores given).
# No gene symbol conversion is necessary because the network is already given in symbol format.
symbol_columns = ['Official Symbol Interactor A', 'Official Symbol Interactor B']
BioGRID_edgelist = BioGRID_Human_Only[symbol_columns].values.tolist()
# Single-argument print call: emits the same text under Python 2 and Python 3.
print('Edges in BioGRID: %d' % len(BioGRID_edgelist))

Edges in BioGRID: 367564


In [33]:
# Put the two symbols of every edge into sorted order ahead of filtering
BioGRID_edgelist_sorted = list(map(sorted, BioGRID_edgelist))

In [34]:
# Filter the sorted edge list for self-edges and duplicate edges
# (the helper prints a summary of how many edges of each kind it removed).
# NOTE(review): presumably the prior per-edge sort is what lets reversed pairs
# (A,B)/(B,A) be recognized as duplicates -- confirm against gct.filter_converted_edgelist
BioGRID_edgelist_filt = gct.filter_converted_edgelist(BioGRID_edgelist_sorted)

367564 input edges
4598 self-edges removed
0 edges with un-mapped genes removed
104709 duplicate edges removed
Edge list filtered: 0.29 seconds
258257 Edges remaining


In [37]:
# Write the filtered edge list out as a SIF file
outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'
BioGRID_sif_path = outdir+'BioGRID_Symbol.sif'
gct.write_edgelist(BioGRID_edgelist_filt, BioGRID_sif_path)

Edge list saved: 0.21 seconds
