In [1]:
import dgl
import numpy as np
import pandas as pd
import networkx as nx
from dgl.data.utils import save_graphs
from utils import read_dict_file

Using backend: pytorch


In [2]:
# nodes
genes = pd.read_csv('../data/clean/genes.nodes', header=None)
diseases = pd.read_csv('../data/clean/diseases.nodes', header=None)
drugs = pd.read_csv('../data/clean/drugs.nodes', header=None)

# edges
drug_disease = pd.read_csv('../data/clean/drug-disease-treat.edges')
drug_gene = {
    'carrier'  : pd.read_csv('../data/clean/drug-gene-carrier.edges'),
    'enzyme'   : pd.read_csv('../data/clean/drug-gene-enzyme.edges'),
    'target'   : pd.read_csv('../data/clean/drug-gene-target.edges'),
    'transport': pd.read_csv('../data/clean/drug-gene-transport.edges')
}
gene_gene = pd.read_csv('../data/clean/gene-gene-association.edges', header=None)
gene_disease = pd.read_csv('../data/clean/gene-disease-association.edges', header=None)
drug_drug = pd.read_csv('../data/clean/drug-drug-side_effects.edges', header=None)

### Mapping to numbers

In [3]:
# nodes
gene_num_dict = {gene: num for (gene, num) in zip(genes[0], range(genes.shape[0]))}
drug_num_dict = {drug: num for (drug, num) in zip(drugs[0], range(drugs.shape[0]))}
disease_num_dict = {disease: num for (disease, num) in zip(diseases[0], range(diseases.shape[0]))}

In [4]:
# update drug-disease
drug_disease.head()

Unnamed: 0,DRUGBANK_ID,UMLS_CUI
0,DB14655,C0006142
1,DB12319,C0018099
2,DB12319,C0740394
3,DB13989,C0020538
4,DB06608,C0024530


In [5]:
drug_disease['drug_num'] = drug_disease['DRUGBANK_ID'].map(drug_num_dict)
drug_disease['disease_num'] = drug_disease['UMLS_CUI'].map(disease_num_dict)
drug_disease

Unnamed: 0,DRUGBANK_ID,UMLS_CUI,drug_num,disease_num
0,DB14655,C0006142,8031,261
1,DB12319,C0018099,7520,759
2,DB12319,C0740394,7520,4617
3,DB13989,C0020538,7910,937
4,DB06608,C0024530,4783,1198
...,...,...,...,...
7903,DB15575,C0007112,8083,295
7904,DB11712,C0392164,7377,3947
7905,DB11799,C0019693,7410,880
7906,DB12486,C0401149,7558,4076


In [6]:
# update drug-gene
for action in drug_gene:
    drug_gene[action]['gene_num'] = drug_gene[action]['GENES'].map(gene_num_dict)
    drug_gene[action]['drug_num'] = drug_gene[action]['DRUGS'].map(drug_num_dict)
drug_gene['carrier'].head()

Unnamed: 0,DRUGS,GENES,gene_num,drug_num
0,DB00023,SERPINA7,14440,21
1,DB00059,SERPINA7,14440,55
2,DB00070,ALB,690,64
3,DB00126,ALB,690,114
4,DB00132,FABP7,5120,120


In [7]:
# update gene-gene
gene_gene['gene1_num'] = gene_gene[0].map(gene_num_dict)
gene_gene['gene2_num'] = gene_gene[1].map(gene_num_dict)
gene_gene.head()

Unnamed: 0,0,1,2,gene1_num,gene2_num
0,ARF5,AP4S1,binding,1083,986
1,ARF5,BET1,binding,1083,1701
2,ARF5,YKT6,binding,1083,17878
3,ARF5,VAMP4,binding,1083,17499
4,ARF5,CYTH4,activation,1083,3832


In [8]:
# update gene-disease
gene_disease['gene_num'] = gene_disease[0].map(gene_num_dict)
gene_disease['disease_num'] = gene_disease[1].map(disease_num_dict)
gene_disease.head()

Unnamed: 0,0,1,gene_num,disease_num
0,A1BG,C0036341,12,1716
1,NAT1,C0001973,10183,54
2,NAT1,C0005684,10183,230
3,NAT1,C0033578,10183,1594
4,NAT1,C0376358,10183,3928


In [9]:
# update drug-drug
drug_drug['drug1_num'] = drug_drug[0].map(drug_num_dict)
drug_drug['drug2_num'] = drug_drug[1].map(drug_num_dict)
drug_drug.head()

Unnamed: 0,0,1,drug1_num,drug2_num
0,DB00415,DB00813,398,788
1,DB01236,DB01223,1202,1189
2,DB00601,DB00303,580,288
3,DB08298,DB00811,6302,786
4,DB01173,DB00675,1140,652


### Construct graph using dgl

In [10]:
graph_dict = {}

In [11]:
# drug-disease
drug_disease_list = [(drug_num, disease_num) for (drug_num, disease_num) in 
                    zip(drug_disease['drug_num'], drug_disease['disease_num'])]
print(drug_disease_list[:10])
graph_dict['drug', 'treat', 'disease'] = drug_disease_list

[(8031, 261), (7520, 759), (7520, 4617), (7910, 937), (4783, 1198), (8046, 3385), (7691, 441), (7711, 384), (7719, 1433), (1363, 1991)]


In [12]:
# drug-gene
for action in drug_gene:
    drug_gene_list = [(drug_num, gene_num) for (drug_num, gene_num) in 
                     zip(drug_gene[action]['drug_num'], drug_gene[action]['gene_num'])]
    graph_dict['drug', action, 'gene'] = drug_gene_list
graph_dict['drug', 'carrier', 'gene'][:10]

[(21, 14440),
 (55, 14440),
 (64, 690),
 (114, 690),
 (120, 5120),
 (124, 5880),
 (125, 690),
 (140, 690),
 (141, 5880),
 (142, 5120)]

In [13]:
# gene-gene
gene_gene_list = [(gene_num1, gene_num2) for (gene_num1, gene_num2) in 
                    zip(gene_gene['gene1_num'], gene_gene['gene2_num'])]
gene_gene_list[:10]

[(1083, 986),
 (1083, 1701),
 (1083, 17878),
 (1083, 17499),
 (1083, 3832),
 (1083, 3319),
 (1083, 967),
 (1083, 976),
 (1083, 1081),
 (1083, 13159)]

In [14]:
gene_interaction_types = gene_gene[2].unique()
gene_interaction_types

array(['binding', 'activation', 'catalysis', 'reaction', 'expression',
       'inhibition', 'ptmod'], dtype=object)

In [15]:
# subset gene_gene according to interaction type
for interaction in gene_interaction_types:
    subset = gene_gene[gene_gene[2] == interaction]
    subset_list = [(gene_num1, gene_num2) for (gene_num1, gene_num2) in 
                    zip(subset['gene1_num'], subset['gene2_num'])]
    graph_dict[('gene', interaction, 'gene')] = subset_list

In [16]:
# gene-disease
gene_disease_list = [(gene_num, disease_num) for (gene_num, disease_num) in 
                    zip(gene_disease['gene_num'], gene_disease['disease_num'])]
print(gene_disease_list[:10])
graph_dict['gene', 'associate', 'disease'] = gene_disease_list

[(12, 1716), (10183, 54), (10183, 230), (10183, 1594), (10183, 3928), (10186, 230), (10186, 261), (10186, 1594), (10186, 3848), (10186, 3928)]


In [17]:
# drug-drug
drug_drug_list = [(drug1_num, drug2_num) for (drug1_num, drug2_num) in 
                    zip(drug_drug['drug1_num'], drug_drug['drug2_num'])]
print(drug_drug_list[:10])
graph_dict['drug', 'side_effect', 'drug'] = drug_drug_list

[(398, 788), (1202, 1189), (580, 288), (6302, 786), (1140, 652), (513, 870), (218, 702), (1170, 1070), (4404, 478), (1986, 724)]


In [19]:
for key in graph_dict:
    print(key)

('drug', 'treat', 'disease')
('drug', 'carrier', 'gene')
('drug', 'enzyme', 'gene')
('drug', 'target', 'gene')
('drug', 'transport', 'gene')
('gene', 'binding', 'gene')
('gene', 'activation', 'gene')
('gene', 'catalysis', 'gene')
('gene', 'reaction', 'gene')
('gene', 'expression', 'gene')
('gene', 'inhibition', 'gene')
('gene', 'ptmod', 'gene')
('gene', 'associate', 'disease')
('drug', 'side_effect', 'drug')


In [20]:
g = dgl.heterograph(graph_dict)

In [21]:
g.nodes

<dgl.view.HeteroNodeView at 0xa2746e410>

In [32]:
def save_dict_file(dictvar, file):
    with open(file,'w') as f:
        f.write(str(dictvar))

In [33]:
# save dict to a file
save_dict_file(graph_dict, '../data/clean/graph_dict.txt')

In [3]:
def read_dict_file(file):
    dic = ''
    with open(file,'r') as f:
        for line in f.readlines():
            dic = line # string
    dic = eval(dic)
    return(dic)

In [2]:
test_graph_dict = read_dict_file('../data/clean/graph_dict.txt')
print(test_graph_dict.keys())

dict_keys([('drug', 'treat', 'disease'), ('drug', 'carrier', 'gene'), ('drug', 'enzyme', 'gene'), ('drug', 'target', 'gene'), ('drug', 'transport', 'gene'), ('gene', 'binding', 'gene'), ('gene', 'activation', 'gene'), ('gene', 'catalysis', 'gene'), ('gene', 'reaction', 'gene'), ('gene', 'expression', 'gene'), ('gene', 'inhibition', 'gene'), ('gene', 'ptmod', 'gene'), ('gene', 'associate', 'disease'), ('drug', 'side_effect', 'drug')])


In [8]:
dgl.heterograph(test_graph_dict).to_networkx()

AssertionError: 