In [3]:
import dgl
import numpy as np
import pandas as pd
import networkx as nx
from dgl.data.utils import save_graphs

In [4]:
# Switch for drug-drug side-effect data: True reads drugs.nodes and appends the
# drug-drug edges to the saved array; False reads drugs.wse.nodes and saves the
# array under the "graph.wse" name instead.
include_side_effects = True

In [5]:
# --- node tables -----------------------------------------------------------
# Read with header=None, so the identifier column is labelled 0.
genes = pd.read_csv('../data/clean/genes.nodes', header=None)
diseases = pd.read_csv('../data/clean/diseases.nodes', header=None)
# The ".wse" drug list is the one used when side effects are switched off.
drugs_path = '../data/clean/drugs.nodes' if include_side_effects else '../data/clean/drugs.wse.nodes'
drugs = pd.read_csv(drugs_path, header=None)

# --- edge tables -----------------------------------------------------------
# drug-disease and drug-gene files are read with their own header row.
drug_disease = pd.read_csv('../data/clean/drug-disease-treat.edges')
# One table per drug-gene action, keyed by the action name.
drug_gene = {
    action: pd.read_csv('../data/clean/drug-gene-{}.edges'.format(action))
    for action in ('carrier', 'enzyme', 'target', 'transport')
}
# The remaining edge files are read with header=None (integer column labels).
gene_gene = pd.read_csv('../data/clean/gene-gene-association.edges', header=None)
gene_disease = pd.read_csv('../data/clean/gene-disease-association.edges', header=None)
drug_drug = pd.read_csv('../data/clean/drug-drug-side_effects.edges', header=None)

In [6]:
# Number of rows in each node table; these also serve as the id offsets when
# node names are mapped to integers.
num_drugs, num_diseases, num_genes = (len(table) for table in (drugs, diseases, genes))

In [7]:
# Report the size of each node table. The gene count lives in `num_genes`;
# the previously referenced `num_gene` is not defined anywhere in the notebook
# and raises NameError on a fresh kernel.
print('number of drugs: {} \nnumber of diseases: {} \nnumber of gene {}'.format(num_drugs, num_diseases, num_genes))

number of drugs: 8094 
number of diseases: 11171 
number of gene 18643


### Mapping to numbers

In [12]:
# nodes
# Give every node a globally unique integer id by laying the three id ranges
# end to end: drugs occupy [0, num_drugs), diseases follow, genes come last.
# enumerate(start=...) yields exactly one id per row, so no separate upper
# bound is needed (the old upper bound used `num_gene`, an undefined name).
disease_offset = num_drugs
gene_offset = num_drugs + num_diseases

drug_num_dict = {drug: num for num, drug in enumerate(drugs[0])}
disease_num_dict = {disease: num for num, disease in enumerate(diseases[0], start=disease_offset)}
gene_num_dict = {gene: num for num, gene in enumerate(genes[0], start=gene_offset)}

In [13]:
# Id stored for the last entry of the drug mapping (dicts keep insertion order).
[*drug_num_dict.values()][-1]

3778

In [14]:
# update drug-disease
# Preview the drug-disease edge table before its id columns are (re)assigned below.
drug_disease.head()

Unnamed: 0,DRUGBANK_ID,UMLS_CUI,drug_num,disease_num
0,DB14655,C0006142,8031,8355
1,DB12319,C0018099,7520,8853
2,DB12319,C0740394,7520,12711
3,DB13989,C0020538,7910,9031
4,DB06608,C0024530,4783,9292


In [16]:
# Translate DrugBank ids and UMLS CUIs into the integer node ids.
# Names absent from a lookup dict come back as NaN from Series.map.
for id_column, name_column, lookup in (
    ('drug_num', 'DRUGBANK_ID', drug_num_dict),
    ('disease_num', 'UMLS_CUI', disease_num_dict),
):
    drug_disease[id_column] = drug_disease[name_column].map(lookup)
drug_disease.head()

Unnamed: 0,DRUGBANK_ID,UMLS_CUI,drug_num,disease_num
0,DB14655,C0006142,8031,8355
1,DB12319,C0018099,7520,8853
2,DB12319,C0740394,7520,12711
3,DB13989,C0020538,7910,9031
4,DB06608,C0024530,4783,9292


In [17]:
# update drug-gene
# Every action table gets the same two id columns, filled from the node lookups.
for action, edges in drug_gene.items():
    edges['gene_num'] = edges['GENES'].map(gene_num_dict)
    edges['drug_num'] = edges['DRUGS'].map(drug_num_dict)
drug_gene['carrier'].head()

Unnamed: 0,DRUGS,GENES,gene_num,drug_num
0,DB00023,SERPINA7,33701,21
1,DB00059,SERPINA7,33701,55
2,DB00070,ALB,19955,64
3,DB00126,ALB,19955,114
4,DB00132,FABP7,24382,120


In [18]:
# update gene-gene
# Columns 0 and 1 hold the two gene names of each interaction.
for id_column, name_column in (('gene1_num', 0), ('gene2_num', 1)):
    gene_gene[id_column] = gene_gene[name_column].map(gene_num_dict)
gene_gene.head()

Unnamed: 0,0,1,2,gene1_num,gene2_num
0,ARF5,AP4S1,binding,20348,20251
1,ARF5,BET1,binding,20348,20966
2,ARF5,YKT6,binding,20348,37138
3,ARF5,VAMP4,binding,20348,36759
4,ARF5,CYTH4,activation,20348,23094


In [19]:
# update gene-disease
# Column 0 is looked up in the gene dict, column 1 in the disease dict.
gene_ids = gene_disease[0].map(gene_num_dict)
disease_ids = gene_disease[1].map(disease_num_dict)
gene_disease['gene_num'] = gene_ids
gene_disease['disease_num'] = disease_ids
gene_disease.head()

Unnamed: 0,0,1,gene_num,disease_num
0,A1BG,C0019209,19277,8946
1,A1BG,C0036341,19277,9810
2,A2M,C0002395,19279,8159
3,A2M,C0007102,19279,8386
4,A2M,C0009375,19279,8498


In [20]:
# update drug-drug
# Both endpoints of a drug-drug edge are drugs, so one lookup dict serves both.
for id_column, name_column in (('drug1_num', 0), ('drug2_num', 1)):
    drug_drug[id_column] = drug_drug[name_column].map(drug_num_dict)
drug_drug.head()

Unnamed: 0,0,1,drug1_num,drug2_num
0,DB00415,DB00813,398,788
1,DB01236,DB01223,1202,1189
2,DB00601,DB00303,580,288
3,DB08298,DB00811,6302,786
4,DB01173,DB00675,1140,652


### Construct graph as a np.array.
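Each row of the array describes one edge:

- first column: id of the first entity
- second column: id of the relation
- third column: id of the second entity

First, map each relation type to an integer id.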

In [21]:
# Integer relation ids written to the middle column of the edge array:
#   0      drug-disease "treat"
#   1-4    drug-gene actions, in the key order of `drug_gene`
#   5-11   gene-gene interaction types, in order of first appearance
#   12     gene-disease
#   13     drug-drug side effect
treat_rel = 0
drug_gene_dict = dict(zip(drug_gene.keys(), range(1, 5)))
gene_interactions = gene_gene[2].unique()
gene_gene_dict = dict(zip(gene_interactions, range(5, 12)))
# zip() stops at the shorter input, so an eighth interaction type would get no
# id and its edges would silently vanish from gene_gene_arr. Fail loudly instead.
assert len(gene_gene_dict) == len(gene_interactions), (
    'more gene-gene interaction types than reserved relation ids (5-11)')
cause = 12
se = 13

In [22]:
# drug disease
# One (drug id, relation id, disease id) row per treat edge.
treat_triples = [
    (src, treat_rel, dst)
    for src, dst in zip(drug_disease['drug_num'], drug_disease['disease_num'])
]
drug_disease_arr = np.array(treat_triples)

In [24]:
# drug gene
# Collect the rows of all action tables, tagging each with its action's relation id.
drug_gene_rows = []
for action, edges in drug_gene.items():
    rel_id = drug_gene_dict[action]
    drug_gene_rows += [
        [src, rel_id, dst]
        for src, dst in zip(edges['drug_num'], edges['gene_num'])
    ]
drug_gene_arr = np.array(drug_gene_rows)

In [25]:
# gene-gene
# Rows are emitted grouped by interaction type, in the order of gene_gene_dict.
gene_gene_rows = []
for interaction, rel_id in gene_gene_dict.items():
    matching = gene_gene.loc[gene_gene[2].eq(interaction)]
    gene_gene_rows.extend(
        [head, rel_id, tail]
        for head, tail in zip(matching['gene1_num'], matching['gene2_num'])
    )
gene_gene_arr = np.array(gene_gene_rows)

In [26]:
# gene disease
# One [gene id, relation id, disease id] row per gene-disease edge.
cause_triples = [
    [gene_id, cause, disease_id]
    for gene_id, disease_id in zip(gene_disease['gene_num'], gene_disease['disease_num'])
]
gene_disease_arr = np.array(cause_triples)

In [27]:
# drug-drug
# One [drug id, relation id, drug id] row per side-effect edge.
side_effect_triples = [
    [first_id, se, second_id]
    for first_id, second_id in zip(drug_drug['drug1_num'], drug_drug['drug2_num'])
]
drug_drug_arr = np.array(side_effect_triples)

In [28]:
# Stack the per-relation edge arrays; side-effect edges join only when enabled.
edge_blocks = [drug_disease_arr, drug_gene_arr, gene_gene_arr, gene_disease_arr]
if include_side_effects:
    edge_blocks.append(drug_drug_arr)
graph_arr = np.concatenate(edge_blocks)

In [29]:
# np.save adds the ".npy" suffix when the name does not already end with it,
# so the two targets are graph.npy and graph.wse.npy.
graph_path = '../data/clean/graph' if include_side_effects else '../data/clean/graph.wse'
np.save(graph_path, graph_arr)
# graph_arr = np.load('../data/clean/graph.npy')

### Construct graph using dgl

In [82]:
# Edge lists for the DGL graph, keyed by (source type, relation, destination type);
# each value is a list of (source id, destination id) pairs filled in by the cells below.
graph_dict = {}

In [83]:
# drug-disease
# Pair up source and destination ids, peek at the first ten, then register them.
drug_disease_list = list(zip(drug_disease['drug_num'], drug_disease['disease_num']))
print(drug_disease_list[:10])
graph_dict['drug', 'treat', 'disease'] = drug_disease_list

[(8031, 8355), (7520, 8853), (7520, 12711), (7910, 9031), (4783, 9292), (8046, 11479), (7691, 8535), (7711, 8478), (7719, 9527), (1363, 10085)]


In [84]:
# drug-gene
# Each action becomes its own relation between the 'drug' and 'gene' node types.
for action, edges in drug_gene.items():
    graph_dict['drug', action, 'gene'] = list(zip(edges['drug_num'], edges['gene_num']))
graph_dict['drug', 'carrier', 'gene'][:10]

[(21, 33705),
 (55, 33705),
 (64, 19955),
 (114, 19955),
 (120, 24385),
 (124, 25145),
 (125, 19955),
 (140, 19955),
 (141, 25145),
 (142, 24385)]

In [85]:
# gene-gene
# All gene-gene id pairs, regardless of interaction type.
gene_gene_list = list(zip(gene_gene['gene1_num'], gene_gene['gene2_num']))
gene_gene_list[:10]

[(20348, 20251),
 (20348, 20966),
 (20348, 37143),
 (20348, 36764),
 (20348, 23097),
 (20348, 22584),
 (20348, 20232),
 (20348, 20241),
 (20348, 20346),
 (20348, 32424)]

In [86]:
# Distinct interaction labels found in column 2 of the gene-gene table,
# in order of first appearance.
gene_interaction_types = gene_gene[2].unique()
gene_interaction_types

array(['binding', 'activation', 'catalysis', 'reaction', 'expression',
       'inhibition', 'ptmod'], dtype=object)

In [87]:
# Register one ('gene', <interaction>, 'gene') relation per interaction type,
# each holding only the id pairs whose row carries that label.
for interaction in gene_interaction_types:
    matching = gene_gene.loc[gene_gene[2].eq(interaction)]
    graph_dict[('gene', interaction, 'gene')] = list(
        zip(matching['gene1_num'], matching['gene2_num'])
    )

In [88]:
# gene-disease
# Pair gene ids with disease ids, peek at the first ten, then register them.
gene_disease_list = list(zip(gene_disease['gene_num'], gene_disease['disease_num']))
print(gene_disease_list[:10])
graph_dict['gene', 'associate', 'disease'] = gene_disease_list

[(19277, 9810), (29448, 8148), (29448, 8324), (29448, 9688), (29448, 12022), (29451, 8324), (29451, 8355), (29451, 9688), (29451, 11942), (29451, 12022)]


In [89]:
# drug-drug
drug_drug_list = list(zip(drug_drug['drug1_num'], drug_drug['drug2_num']))
print(drug_drug_list[:10])
# Honour the same switch that gates drug_drug_arr in the array build: with side
# effects disabled the drug lookup comes from drugs.wse.nodes, so drugs outside
# that list map to NaN and these edges must not enter the DGL edge dictionary.
if include_side_effects:
    graph_dict['drug', 'side_effect', 'drug'] = drug_drug_list

[(398, 788), (1202, 1189), (580, 288), (6302, 786), (1140, 652), (513, 870), (218, 702), (1170, 1070), (4404, 478), (1986, 724)]


In [90]:
# List every (source type, relation, destination type) triple registered so far.
for edge_type in graph_dict.keys():
    print(edge_type)

('drug', 'treat', 'disease')
('drug', 'carrier', 'gene')
('drug', 'enzyme', 'gene')
('drug', 'target', 'gene')
('drug', 'transport', 'gene')
('gene', 'binding', 'gene')
('gene', 'activation', 'gene')
('gene', 'catalysis', 'gene')
('gene', 'reaction', 'gene')
('gene', 'expression', 'gene')
('gene', 'inhibition', 'gene')
('gene', 'ptmod', 'gene')
('gene', 'associate', 'disease')
('drug', 'side_effect', 'drug')


In [20]:
# Build the heterogeneous graph from the {(src type, relation, dst type): edge list} mapping.
# NOTE(review): the ids in graph_dict are the global offsets assigned earlier
# (disease ids start at num_drugs, gene ids after the diseases). DGL appears to
# number nodes per type from 0 and to infer each type's size from the largest id
# it sees — confirm this does not create unused placeholder nodes per type.
g = dgl.heterograph(graph_dict)

In [21]:
# Display the node view object of the heterograph.
g.nodes

<dgl.view.HeteroNodeView at 0xa2746e410>

In [92]:
def save_dict_file(dictvar, file):
    """Write the ``str()`` representation of `dictvar` to the text file `file`.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file, mode='w') as out:
        text = str(dictvar)
        out.write(text)

In [93]:
# save dict to a file
# Writes the str() form of graph_dict as text. Note the target path is the same
# whether or not include_side_effects is set.
save_dict_file(graph_dict, '../data/clean/graph_dict.txt')

In [94]:
def read_dict_file(file):
    """Load a Python literal (such as the saved edge dictionary) from a text file.

    Parameters
    ----------
    file : str or path-like
        Path of a text file holding the ``str()`` form of a dict, as written
        by ``save_dict_file``.

    Returns
    -------
    The object described by the file's contents.

    Raises
    ------
    SyntaxError
        If the file is empty or is not valid Python syntax.
    ValueError
        If the contents are anything other than a plain literal (for example
        a function call).
    """
    import ast  # local import keeps this cell self-contained

    # Read the whole file rather than keeping only its last line, so stray
    # surrounding whitespace or newlines cannot hide the content.
    with open(file, 'r') as f:
        text = f.read().strip()
    # SECURITY: this used to call eval(), which executes whatever code the file
    # contains. literal_eval accepts only literals (dicts, tuples, lists,
    # numbers, strings, ...), which covers everything save_dict_file writes.
    return ast.literal_eval(text)

In [95]:
# Round-trip check: reload the dictionary that was just saved and list its keys.
test_graph_dict = read_dict_file('../data/clean/graph_dict.txt')
print(test_graph_dict.keys())

dict_keys([('drug', 'treat', 'disease'), ('drug', 'carrier', 'gene'), ('drug', 'enzyme', 'gene'), ('drug', 'target', 'gene'), ('drug', 'transport', 'gene'), ('gene', 'binding', 'gene'), ('gene', 'activation', 'gene'), ('gene', 'catalysis', 'gene'), ('gene', 'reaction', 'gene'), ('gene', 'expression', 'gene'), ('gene', 'inhibition', 'gene'), ('gene', 'ptmod', 'gene'), ('gene', 'associate', 'disease'), ('drug', 'side_effect', 'drug')])


In [None]:
# Summarise the reloaded dictionary as the number of edges per relation;
# echoing the dictionary itself would print every edge tuple into the notebook.
{edge_type: len(edges) for edge_type, edges in test_graph_dict.items()}