## Exploring BioGRID COVID-19 dataset

In this notebook I explore the BioGRID coronavirus interaction dataset, filter it by species, and build a graph object that is exported as JSON for visualization in d3.js.

In [1]:
import pandas as pd
import networkx as nx
from networkx.readwrite import json_graph
import json
import re
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# raised by later cells (including pandas or deprecation warnings) is hidden.
# Consider narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# Load the tab-separated BioGRID coronavirus interaction table into a DataFrame.
interact = pd.read_csv(
    "./data/BIOGRID-CORONAVIRUS-3.5.185.tab3.txt",
    sep="\t",
)

In [3]:
# List the column names of the loaded interaction table.
interact.columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Publication Source',
       'Organism ID Interactor A', 'Organism Name Interactor A',
       'Organism ID Interactor B', 'Organism Name Interactor B', 'Throughput',
       'Score', 'Modification', 'Qualifications', 'Tags', 'Source Database',
       'SWISS-PROT Accessions Interactor A', 'TREMBL Accessions Interactor A',
       'REFSEQ Accessions Interactor A', 'SWISS-PROT Accessions Interactor B',
       'TREMBL Accessions Interactor B', 'REFSEQ Accessions Interactor B',
       'Ontology Term IDs', 'Ontology Term Names', 'Ontology Term Categories',
       'Ontology Ter

In [4]:
# Preview the first rows of the interaction table.
interact.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,REFSEQ Accessions Interactor A,SWISS-PROT Accessions Interactor B,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types
0,798421,1387,3661,107777,109869,-,-,CREBBP,IRF3,CBP|KAT3A|RSTS,...,NP_001073315|NP_004371,Q14653,E2GIM7|E2GIM8|E2GIM5|E2GIM9|M0QYT9|E2GIM6,NP_001562|NP_001184056|NP_001184057|NP_0011840...,-,-,-,-,-,-
1,798422,3661,1387,109869,107777,-,-,IRF3,CREBBP,-,...,NP_001562|NP_001184056|NP_001184057|NP_0011840...,Q92793,-,NP_001073315|NP_004371,-,-,-,-,-,-
2,908541,59272,59272,121864,121864,UNQ868/PRO1885,UNQ868/PRO1885,ACE2,ACE2,ACEH,...,NP_068576,Q9BYF1,-,NP_068576,-,-,-,-,-,-
3,935460,3661,3661,109869,109869,-,-,IRF3,IRF3,-,...,NP_001562|NP_001184056|NP_001184057|NP_0011840...,Q14653,E2GIM7|E2GIM8|E2GIM5|E2GIM9|M0QYT9|E2GIM6,NP_001562|NP_001184056|NP_001184057|NP_0011840...,-,-,-,-,-,-
4,935461,7187,57506,113039,121570,-,-,TRAF3,MAVS,CAP-1|CAP1|CD40bp|CRAF1|IIAE5|LAP1,...,NP_663777|NP_003291|NP_001186356|NP_663778,Q7Z434,M1P2Z0,NP_065797|NP_001193420,-,-,-,-,-,-


In [5]:
# Organism IDs accepted for interactor B; they are compared as strings.
# NOTE(review): presumably NCBI taxonomy IDs (9606 = human,
# 2697049 = SARS-CoV-2) — confirm against the BioGRID documentation.
interest = ['9606', '2697049']
# Keep only rows where interactor A has organism ID '2697049' and
# interactor B's organism ID is one of the values in `interest`.
interact_subset = interact.query("`Organism ID Interactor A`=='2697049' & `Organism ID Interactor B`.isin(@interest)")

In [6]:
# (rows, columns) of the filtered interaction table.
interact_subset.shape

(354, 37)

In [7]:
# Build a node label for each interactor as "<organism id>_<official symbol>".
#
# `assign` returns a new frame instead of writing columns into the frame that
# came out of `query`, which avoids chained-assignment warnings. The labels are
# built with vectorized string concatenation rather than a row-wise `apply`,
# which also works when the filtered frame is empty.
interact_subset = interact_subset.assign(
    Protein_A=(
        interact_subset['Organism ID Interactor A'].astype(str)
        + '_'
        + interact_subset['Official Symbol Interactor A'].astype(str)
    ),
    Protein_B=(
        interact_subset['Organism ID Interactor B'].astype(str)
        + '_'
        + interact_subset['Official Symbol Interactor B'].astype(str)
    ),
)

In [8]:
# Reduce the filtered table to the two node-label columns; each remaining row
# is one interaction between Protein_A and Protein_B.
edge_columns = ['Protein_A', 'Protein_B']
onlyInter = interact_subset.loc[:, edge_columns]
onlyInter

Unnamed: 0,Protein_A,Protein_B
14,2697049_E,9606_AP3B1
15,2697049_E,9606_BRD4
16,2697049_E,9606_BRD2
17,2697049_E,9606_CWC27
18,2697049_E,9606_ZC3H18
...,...,...
483,2697049_nsp12,2697049_nsp7
484,2697049_nsp12,2697049_nsp8
485,2697049_nsp12,2697049_nsp7
486,2697049_nsp12,2697049_nsp8


In [9]:
# Turn the edge list into a networkx graph: one node per protein label and
# one edge per Protein_A / Protein_B pair.
G = nx.from_pandas_edgelist(
    onlyInter,
    source='Protein_A',
    target='Protein_B',
)

In [10]:
# Map every node in the graph to its degree (number of incident edges).
degrees = {node: degree for node, degree in G.degree()}

In [11]:
# Factor applied to every node degree before export.
# NOTE(review): presumably tuned for node sizing in the d3.js view — confirm.
DEGREE_SCALE = 1.5

# Rebuild the mapping from the graph instead of multiplying the existing dict
# in place: the in-place loop scaled the values again on every re-run of this
# cell, whereas this form always yields degree * DEGREE_SCALE.
degrees = {node: degree * DEGREE_SCALE for node, degree in G.degree()}

In [12]:
# Serialize the graph to node-link form and annotate every node with the
# attributes the visualization needs.
data = json_graph.node_link_data(G)
for node in data['nodes']:
    node_id = node['id']
    # Node ids have the form "<organism id>_<symbol>"; everything before the
    # first underscore is the numeric organism id. `str.partition` replaces
    # the former regex, whose '\_' was an invalid string escape sequence.
    node['species'] = int(node_id.partition('_')[0])
    # Scaled degree computed in the earlier cell, keyed by the same node id.
    node['degree'] = degrees[node_id]

# Write the annotated graph to disk as JSON.
with open('./data/biogrid.json', 'w') as fp:
    json.dump(data, fp)