In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from dgl import heterograph
import dgl.function as fn
import dgl.utils as dgl_utils
from functools import partial
from dgl.nn.pytorch import RelGraphConv


import numpy as np
import pygraphviz as pgv

from sklearn.model_selection import train_test_split
import networkx as nx

from collections import Counter, OrderedDict
import matplotlib.pyplot as plt
import time
import utils
from base import BaseRGCN

from sklearn.preprocessing import OneHotEncoder

from prune import prune_graph
from sklearn.metrics import accuracy_score

Load graph as a dictionary

### Build Model

In [42]:
class EmbeddingLayer(nn.Module):
    def __init__(self, num_nodes, h_dim):
        super(EmbeddingLayer, self).__init__()
        # create an embedding value [0,1] for each node of the graph
        self.embedding = torch.nn.Embedding(num_nodes, h_dim)

    def forward(self, g, h, r, norm):
        return self.embedding(h.squeeze())

class RGCN(BaseRGCN):
    def build_input_layer(self):
        return EmbeddingLayer(self.num_nodes, self.h_dim)

    def build_hidden_layer(self, idx):
        # build a number of hidden layer according to the parameter
        # add a relu activation function except for the last layer
        act = F.relu if idx < self.num_hidden_layers - 1 else None
        return RelGraphConv(self.h_dim, self.h_dim, self.num_rels, 'basis',
                self.num_bases, activation=act, self_loop=True,
                dropout=self.dropout)

class LinkPredict(nn.Module):
    def __init__(self, in_dim, h_dim, num_rels, num_bases=-1,
                 num_hidden_layers=2, dropout=0, use_cuda=False, reg_param=0):
        """
        Arguments:
        - in_dim (int) -- input feature size
        - h_dim  (int) -- output feature size
        - num_rels (int) -- # relations
        - num_bases (int) -- # bases
        - num_hidden_layers (int) -- # hidden layers
        - dropout (float) -- [0,1] dropout rate
        - use_cuda (bool)
        - reg_param (float) -- regularization parameter
        """
        super(LinkPredict, self).__init__()
        # build RGCN layer
        # 2 x num_rels as both directions are considered
        self.rgcn = RGCN(in_dim, h_dim, h_dim, num_rels * 2, num_bases,
                         num_hidden_layers, dropout, use_cuda)
        # define regularization
        self.reg_param = reg_param
        # define relations and normalize them
        self.w_relation = nn.Parameter(torch.Tensor(num_rels, h_dim))
        nn.init.xavier_uniform_(self.w_relation,
                                gain=nn.init.calculate_gain('relu'))

    def calc_score(self, embedding, triplets):
        # apply DistMult for scoring
        # embedding contains the embedding values of the node after the 
        #   propagation within the RGCN Block layer
        # triplets contains all triples resulting from the negative sampling process
        s = embedding[triplets[:,0]]
        r = self.w_relation[triplets[:,1]]
        o = embedding[triplets[:,2]]
        score = torch.sum(s * r * o, dim=1)
        return score

    def forward(self, g, h, r, norm):
        return self.rgcn.forward(g, h, r, norm)

    def regularization_loss(self, embedding):
        return torch.mean(embedding.pow(2)) + torch.mean(self.w_relation.pow(2))

    def get_loss(self, g, embed, triplets, labels):
        # triplets is a list of data samples (positive and negative)
        # each row in the triplets is a 3-tuple of (source, relation, destination)
        
        # The score is computed with the value-by-value multiplication of
        #   the embedding values of data produced by the negative sampling process
        #   and sum them using the vertical dimension
        
        score = self.calc_score(embed, triplets)
        predict_loss = F.binary_cross_entropy_with_logits(score, labels)
        reg_loss = self.regularization_loss(embed)
        return predict_loss + self.reg_param * reg_loss

def node_norm_to_edge_norm(g, node_norm):
    g = g.local_var()
    # convert to edge norm
    g.ndata['norm'] = node_norm
    g.apply_edges(lambda edges : {'norm' : edges.dst['norm']})
    return g.edata['norm']

### Load the graph

In [17]:
graph = np.load('../data/clean/graph.wse.npy')
pruned = prune_graph(graph, disease_degree_thresh=0,
                drug_degree_thresh=50,
                gene_degree_thresh=200)

Initial # edges: 1159831
Pruning given degree threshold of 0 (disease), 50 (drug), and 200 (gene)...
Given 2 hops, 10366 neighboring nodes are obtained.
Removing 17455 nodes...
The final graph contains:
- 892239 edges
- 18181 nodes: 8916 diseases, 8876 genes, 389 drugs
- 311 covid-19-associated genes (out of 312)
- 62 covid-19 associated genes + drug targets (out of 62)


In [55]:
graph.max()

37892

In [56]:
num_nodes = len(list(set(np.unique(graph[:,0])).union(set(np.unique(graph[:,2])))))
num_rels = np.unique(pruned[:,1]).shape[0]
num_edges = pruned.shape[0]

In [34]:
train_val, test_data = train_test_split(pruned, test_size=0.2, random_state=0)
train_data, val_data = train_test_split(train_val, test_size=0.2, random_state=0)

train_val_data = train_val[train_val[:,1] == 0]
val_data = torch.LongTensor(val_data[val_data[:,1] == 0])
test_data = torch.LongTensor(test_data[test_data[:,1] == 0])

### Only focus on drug-disease edge for validation

In [57]:
test_graph, test_rel, test_norm = utils.build_test_graph(
        num_nodes, num_rels, train_val_data)

Test graph:
# nodes: 37893, # edges: 4152


In [45]:
test_deg = test_graph.in_degrees(
                range(test_graph.number_of_nodes())).float().view(-1, 1)
test_node_id = torch.arange(0, num_nodes, dtype=torch.long).view(-1, 1)
test_rel = torch.from_numpy(test_rel)
test_norm = node_norm_to_edge_norm(test_graph, torch.from_numpy(test_norm).view(-1, 1))

In [46]:
# model params
model_param = {
    'in_dim'   : num_nodes,
    'h_dim'    : 100, # output feature size
    'num_rels' : num_rels,
    'dropout'  : 0.2,
    'use_cuda' : True,
    'reg_param': 0.01
}
use_cuda = model_param['use_cuda']
torch.cuda.set_device(0)

In [47]:
# create the model
model = LinkPredict(in_dim   = model_param['in_dim'],
                    h_dim    = model_param['h_dim'],
                    num_rels = model_param['num_rels'],
                    dropout  = model_param['dropout'],
                    use_cuda = model_param['use_cuda'],
                    reg_param= model_param['reg_param'])
if use_cuda:
    model.cuda()

In [48]:
# build adj list and calculate degrees for sampling
adj_list, degrees = utils.get_adj_and_degrees(num_nodes, train_data)

In [49]:
# optimizer
optimizer = torch.optim.Adam(model.parameters())

In [50]:
epoch = 0
epoch_mult_eval = 100 # multiplication of n epochs to indicate when to evaluate
best_mrr = 0
best_acc = 0
forward_time = []
backward_time = []
eval_batch = 1000
model_state_file = 'model_state.pth'

In [52]:
sample_graph_param = {
    'sample_size'  : 20000, # edges to sample
    'split_size'   : 0.5,
    'negative_rate': 5,
}

In [53]:
print("start training...")
while epoch < 5001:
    model.train()
    epoch += 1

    # Perform edge neighborhood sampling to generate training graph and data
    # The training stage is performed on a sample graph (not the entire graph)
    g, node_id, edge_type, node_norm, data, labels = \
        utils.generate_sampled_graph_and_labels(
            train_data, 
            sample_graph_param['sample_size'],
            sample_graph_param['split_size'],
            num_rels, 
            adj_list,
            degrees,
            sample_graph_param['negative_rate'])
    
    print("Finished edge sampling")
    
    # set node/edge feature
    node_id = torch.from_numpy(node_id).view(-1, 1).long()
    edge_type = torch.from_numpy(edge_type)
    edge_norm = node_norm_to_edge_norm(g, torch.from_numpy(node_norm).view(-1, 1))
    data, labels = torch.from_numpy(data), torch.from_numpy(labels)
    deg = g.in_degrees(range(g.number_of_nodes())).float().view(-1, 1)
    if use_cuda:
        node_id, deg = node_id.cuda(), deg.cuda()
        edge_type, edge_norm = edge_type.cuda(), edge_norm.cuda()
        data, labels = data.cuda(), labels.cuda()

    t0 = time.time()
    embed = model(g, node_id, edge_type, edge_norm)
    loss = model.get_loss(g, embed, data, labels)
    t1 = time.time()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip gradients
    optimizer.step()
    t2 = time.time()

    forward_time.append(t1 - t0)
    backward_time.append(t2 - t1)
    print("Epoch {:04d} | Loss {:.4f} | Best MRR {:.4f} | Forward {:.4f}s | Backward {:.4f}s".
              format(epoch, loss.item(), best_mrr, forward_time[-1], backward_time[-1]))

    optimizer.zero_grad()

    # validation
    if epoch % epoch_mult_eval == 0:
        print('Run evaluation...')
        # perform validation on CPU because full graph is too large
        if use_cuda: 
#             model.cpu()
            model.eval()
            embed = model(test_graph, test_node_id.cuda(), test_rel.cuda(), test_norm.cuda())
#             mrr = utils.calc_mrr(embed, model.w_relation, torch.LongTensor(train_val_data),
#                                  val_data, test_data, hits=[1, 3, 10], eval_bz=eval_batch,
#                                  eval_p='filtered')
            pred = (torch.sigmoid(model.calc_score(embed, data)) > 0.5).cpu()
            acc = accuracy_score(labels.cpu(),pred)
            print('The accuracy is {}. \n'.format(acc))
        # save best model
#         if mrr > best_mrr:
#             best_mrr = mrr
#             best_mrr = mrr
#             torch.save({'state_dict': model.state_dict(), 'epoch': epoch},
#                        model_state_file)
        if use_cuda:
            model.cuda()

        

start training...
# sampled nodes: 7999
# sampled edges: 20000
# nodes: 7999, # edges: 20000
Finished edge sampling
Epoch 0001 | Loss 2.3253 | Best MRR 0.0000 | Forward 0.3424s | Backward 0.0883s
# sampled nodes: 8056
# sampled edges: 20000
# nodes: 8056, # edges: 20000
Finished edge sampling
Epoch 0002 | Loss 2.0774 | Best MRR 0.0000 | Forward 0.0097s | Backward 0.0508s
# sampled nodes: 8018
# sampled edges: 20000
# nodes: 8018, # edges: 20000
Finished edge sampling
Epoch 0003 | Loss 1.9058 | Best MRR 0.0000 | Forward 0.0096s | Backward 0.0512s
# sampled nodes: 8013
# sampled edges: 20000
# nodes: 8013, # edges: 20000
Finished edge sampling
Epoch 0004 | Loss 1.7831 | Best MRR 0.0000 | Forward 0.0099s | Backward 0.0416s
# sampled nodes: 8044
# sampled edges: 20000
# nodes: 8044, # edges: 20000
Finished edge sampling
Epoch 0005 | Loss 1.6930 | Best MRR 0.0000 | Forward 0.0097s | Backward 0.0419s
# sampled nodes: 8050
# sampled edges: 20000
# nodes: 8050, # edges: 20000
Finished edge sam

KeyboardInterrupt: 

In [None]:
import utils
adj_list, degrees = utils.get_adj_and_degrees(num_nodes, val_data.numpy())

g, node_id, edge_type, node_norm, data, labels = \
    utils.generate_sampled_graph_and_labels(
        val_data.numpy(), 
        1400,
        0.1,
        num_rels, 
        adj_list,
        degrees,
        5)

In [57]:
embed = model(test_graph, test_node_id.cuda(), test_rel.cuda(), test_norm.cuda())

In [58]:
pred = (torch.sigmoid(model.calc_score(embed, data)) > 0.5).cpu()

In [59]:
labels

tensor([1., 1., 1.,  ..., 0., 0., 0.], device='cuda:0')

In [62]:
accuracy_score(labels.cpu(),pred )

0.83085

### Binary Classification

In [147]:
test_triplets = test_data
model.cpu()
embedding = model(test_graph, test_node_id, test_rel, test_norm)
w = model.w_relation
eval_bz = 100
hits=[1, 3, 10]

In [148]:
embedding.shape

torch.Size([37908, 100])

In [149]:
torch.sigmoid(model.calc_score(embedding, test_triplets)).shape

torch.Size([786])

In [165]:
with torch.no_grad():
    s = test_triplets[:, 0]
    r = test_triplets[:, 1]
    o = test_triplets[:, 2]
    test_size = test_triplets.shape[0]

    # perturb subject
    (ranks_s, score_s) = perturb_and_get_raw_rank(embedding, w, o, r, s, test_size, eval_bz)
    # perturb object
    ranks_o, score_0 = perturb_and_get_raw_rank(embedding, w, s, r, o, test_size, eval_bz)

    ranks = torch.cat([ranks_s, ranks_o])
    ranks += 1 # change to 1-indexed

    mrr = torch.mean(1.0 / ranks.float())
    print("MRR (raw): {:.6f}".format(mrr.item()))

    for hit in hits:
        avg_count = torch.mean((ranks <= hit).float())
        print("Hits (raw) @ {}: {:.6f}".format(hit, avg_count.item()))

batch 0 / 8
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])


RuntimeError: expected device cpu but got device cuda:0

In [161]:
score_s

NameError: name 'score_s' is not defined

In [163]:
def perturb_and_get_raw_rank(embedding, w, a, r, b, test_size, batch_size=100):
    """ Perturb one element in the triplets
    """
    n_batch = (test_size + batch_size - 1) // batch_size
    ranks = []
    for idx in range(n_batch):
        print("batch {} / {}".format(idx, n_batch))
        batch_start = idx * batch_size
        batch_end = min(test_size, (idx + 1) * batch_size)
        batch_a = a[batch_start: batch_end]
        batch_r = r[batch_start: batch_end]
        print(batch_r)
        emb_ar = embedding[batch_a] * w[batch_r]
        emb_ar = emb_ar.transpose(0, 1).unsqueeze(2) # size: D x E x 1
        emb_c = embedding.transpose(0, 1).unsqueeze(1) # size: D x 1 x V
        # out-prod and reduce sum
        out_prod = torch.bmm(emb_ar, emb_c) # size D x E x V
        score = torch.sum(out_prod, dim=0) # size E x V
        score = torch.sigmoid(score)
        target = b[batch_start: batch_end]
        ranks.append(sort_and_rank(score, target))
    return torch.cat(ranks), score

In [164]:
def sort_and_rank(score, target):
    _, indices = torch.sort(score, dim=1, descending=True)
    indices = torch.nonzero(indices == target.view(-1, 1))
    indices = indices[:, 1].view(-1)
    return indices

In [205]:
score

NameError: name 'score' is not defined