In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from dgl import heterograph
import dgl.function as fn
import dgl.utils as dgl_utils
from functools import partial
from dgl.nn.pytorch import RelGraphConv
from dgl.contrib.data import load_data

import numpy as np
import pygraphviz as pgv

from sklearn.model_selection import train_test_split

import time

import utils
from base import BaseRGCN

load graph from dictionary

In [2]:
# graph_dict = utils.read_dict_file('../data/clean/graph_dict.txt')
# g = heterograph(graph_dict)

In [3]:
def plot_graph(nxg):
    ag = pgv.AGraph(strict=False, directed=True)
    for u, v, k in nxg.edges(keys=True):
        ag.add_edge(u, v, label=k)
    ag.layout('dot')
    ag.draw('graph.png')
    ag.edge_attr['front_size']=0.1

plot_graph(g.metagraph)

NameError: name 'g' is not defined

edge types

### Build Model

In [13]:
class EmbeddingLayer(nn.Module):
    def __init__(self, num_nodes, h_dim):
        super(EmbeddingLayer, self).__init__()
        self.embedding = torch.nn.Embedding(num_nodes, h_dim)

    def forward(self, g, h, r, norm):
        return self.embedding(h.squeeze())

class RGCN(BaseRGCN):
    def build_input_layer(self):
        return EmbeddingLayer(self.num_nodes, self.h_dim)

    def build_hidden_layer(self, idx):
        act = F.relu if idx < self.num_hidden_layers - 1 else None
        return RelGraphConv(self.h_dim, self.h_dim, self.num_rels, 'basis',
                self.num_bases, activation=act, self_loop=True,
                dropout=self.dropout)

class LinkPredict(nn.Module):
    def __init__(self, in_dim, h_dim, num_rels, num_bases=-1,
                 num_hidden_layers=1, dropout=0, use_cuda=False, reg_param=0):
        super(LinkPredict, self).__init__()
        self.rgcn = RGCN(in_dim, h_dim, h_dim, num_rels * 2, num_bases,
                         num_hidden_layers, dropout, use_cuda)
        self.reg_param = reg_param
        self.w_relation = nn.Parameter(torch.Tensor(num_rels, h_dim))
        nn.init.xavier_uniform_(self.w_relation,
                                gain=nn.init.calculate_gain('relu'))

    def calc_score(self, embedding, triplets):
        # DistMult
        s = embedding[triplets[:,0]]
        r = self.w_relation[triplets[:,1]]
        o = embedding[triplets[:,2]]
        score = torch.sum(s * r * o, dim=1)
        return score

    def forward(self, g, h, r, norm):
        return self.rgcn.forward(g, h, r, norm)

    def regularization_loss(self, embedding):
        return torch.mean(embedding.pow(2)) + torch.mean(self.w_relation.pow(2))

    def get_loss(self, g, embed, triplets, labels):
        # triplets is a list of data samples (positive and negative)
        # each row in the triplets is a 3-tuple of (source, relation, destination)
        score = self.calc_score(embed, triplets)
        predict_loss = F.binary_cross_entropy_with_logits(score, labels)
        reg_loss = self.regularization_loss(embed)
        return predict_loss + self.reg_param * reg_loss

def node_norm_to_edge_norm(g, node_norm):
    g = g.local_var()
    # convert to edge norm
    g.ndata['norm'] = node_norm
    g.apply_edges(lambda edges : {'norm' : edges.dst['norm']})
    return g.edata['norm']

In [14]:
data = load_data('FB15k-237')

# entities: 14541
# relations: 237
# edges: 272115


In [6]:
num_nodes = data.num_nodes
train_data = data.train
valid_data = data.valid
test_data = data.test
num_rels = data.num_rels

### load the graph

In [15]:
graph = np.load('../data/clean/graph.npy')

In [16]:
num_nodes = len(list(set(np.unique(graph[:,0])).union(set(np.unique(graph[:,2])))))
num_rels = np.unique(graph[:,1]).shape[0]
num_edges = graph.shape[0]

In [17]:
big_graph, small_graph = train_test_split(graph, test_size=0.2, random_state=0)

In [18]:
train_val, test_data = train_test_split(small_graph, test_size=0.2, random_state=0)

In [19]:
train_data, val_data = train_test_split(train_val, test_size=0.2, random_state=0)

In [21]:
n_hidden = 100
n_bases = 100
n_layers = 2
dropout = 0.2
regularization =  0.01

use_cuda = False

In [22]:
model = LinkPredict(num_nodes,
                        n_hidden,
                        num_rels,
                        num_bases=n_bases,
                        num_hidden_layers=n_layers,
                        dropout=dropout,
                        use_cuda=use_cuda,
                        reg_param=regularization)
if use_cuda:
    model.cuda()

In [23]:
test_graph, test_rel, test_norm = utils.build_test_graph(
        num_nodes, num_rels, train_data)

Test graph:
# nodes: 37908, # edges: 310304


  norm = 1.0 / in_deg


In [24]:
test_deg = test_graph.in_degrees(
                range(test_graph.number_of_nodes())).float().view(-1,1)
test_node_id = torch.arange(0, num_nodes, dtype=torch.long).view(-1, 1)
test_rel = torch.from_numpy(test_rel)
test_norm = node_norm_to_edge_norm(test_graph, torch.from_numpy(test_norm).view(-1, 1))

In [25]:
# build adj list and calculate degrees for sampling
adj_list, degrees = utils.get_adj_and_degrees(num_nodes, train_data)

In [26]:
# optimizer
optimizer = torch.optim.Adam(model.parameters())

model_state_file = 'model_state.pth'
forward_time = []
backward_time = []

In [29]:
# training loop
print("start training...")

epoch = 0
best_mrr = 0

graph_batch_size = 500
graph_split_size = 0.5
negative_sample = 10
edge_sampler = 'uniform'
grad_norm = 1.0
evaluate_every = 1
n_epochs = 10
eval_batch_size = 100
eval_protocol = 'filtered'

while True:
    model.train()
    epoch += 1

    # perform edge neighborhood sampling to generate training graph and data
    g, node_id, edge_type, node_norm, data, labels = \
        utils.generate_sampled_graph_and_labels(
            train_data, graph_batch_size, graph_split_size,
            num_rels, adj_list, degrees, negative_sample,
            edge_sampler)
    print("Done edge sampling")

    # set node/edge feature
    node_id = torch.from_numpy(node_id).view(-1, 1).long()
    edge_type = torch.from_numpy(edge_type)
    edge_norm = node_norm_to_edge_norm(g, torch.from_numpy(node_norm).view(-1, 1))
    data, labels = torch.from_numpy(data), torch.from_numpy(labels)
    deg = g.in_degrees(range(g.number_of_nodes())).float().view(-1, 1)
    if use_cuda:
        node_id, deg = node_id.cuda(), deg.cuda()
        edge_type, edge_norm = edge_type.cuda(), edge_norm.cuda()
        data, labels = data.cuda(), labels.cuda()

    t0 = time.time()
    embed = model(g, node_id, edge_type, edge_norm)
    loss = model.get_loss(g, embed, data, labels)
    t1 = time.time()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm) # clip gradients
    optimizer.step()
    t2 = time.time()

    forward_time.append(t1 - t0)
    backward_time.append(t2 - t1)
    print("Epoch {:04d} | Loss {:.4f} | Best MRR {:.4f} | Forward {:.4f}s | Backward {:.4f}s".
          format(epoch, loss.item(), best_mrr, forward_time[-1], backward_time[-1]))

    optimizer.zero_grad()

    # validation
    if epoch % evaluate_every == 0:
        # perform validation on CPU because full graph is too large
        if use_cuda:
            model.cpu()

        model.eval()
        print("start eval")
        embed = model(test_graph, test_node_id, test_rel, test_norm)
        mrr = utils.calc_mrr(embed, model.w_relation, torch.LongTensor(train_data),
                             val_data, test_data, hits=[1, 3, 10], eval_bz=eval_batch_size,
                             eval_p=eval_protocol)
        # save best model
        if mrr < best_mrr:
            if epoch >= n_epochs:
                break
        else:
            best_mrr = mrr
            torch.save({'state_dict': model.state_dict(), 'epoch': epoch},
                       model_state_file)
        if use_cuda:
            model.cuda()

print("training done")
print("Mean forward time: {:4f}s".format(np.mean(forward_time)))
print("Mean Backward time: {:4f}s".format(np.mean(backward_time)))

print("\nstart testing:")
# use best model checkpoint
checkpoint = torch.load(model_state_file)
if use_cuda:
    model.cpu() # test on CPU
model.eval()
model.load_state_dict(checkpoint['state_dict'])
print("Using best epoch: {}".format(checkpoint['epoch']))
embed = model(test_graph, test_node_id, test_rel, test_norm)
utils.calc_mrr(embed, model.w_relation, torch.LongTensor(train_data), valid_data,
               test_data, hits=[1, 3, 10], eval_bz=eval_batch_size, eval_p=eval_protocol)

start training...
# sampled nodes: 925
# sampled edges: 500
# nodes: 925, # edges: 500
Done edge sampling
Epoch 0001 | Loss 2.0072 | Best MRR 0.0000 | Forward 0.5069s | Backward 0.0610s
start eval


TypeError: expected Tensor as element 1 in argument 0, but got numpy.ndarray

In [None]:
model(test_graph, test_node_id, test_rel, test_norm)

### Try Pytorch-Geometric

In [3]:
from sklearn.metrics import accuracy_score
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Entities
from torch_geometric.nn import RGCNConv

In [4]:
big_graph, small_graph = train_test_split(graph, test_size=0.2, random_state=0)

In [5]:
num_nodes = len(list(set(np.unique(graph[:,0])).union(set(np.unique(graph[:,2])))))
num_rels = np.unique(graph[:,1]).shape[0]
num_edges = graph.shape[0]

In [6]:
num_nodes

37908

In [7]:
edge_index = torch.zeros((2, num_edges))
edge_index[0,:] = torch.tensor(graph[:,0])
edge_index[1,:] = torch.tensor(graph[:,2])
edge_type = torch.tensor(graph[:,1])
edge_norm = torch.ones(num_edges)

In [8]:
device = torch.device('cuda')
edge_index = edge_index.to(device).long()
edge_type = edge_type.to(device).long()
edge_norm = edge_norm.to(device).long()

In [9]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = RGCNConv(
            num_nodes-1, 1, num_rels, num_bases=10)
        self.conv2 = RGCNConv(
            16, 3, num_rels, num_bases=10)

    def forward(self, edge_index, edge_type):
        x = F.relu(self.conv1(None, edge_index, edge_type))
        
        x = self.conv2(x, edge_index, edge_type)
        return F.log_softmax(x, dim=1)

model = Net().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005)

In [10]:
edge_type.shape

torch.Size([1212123])

In [None]:
model.train()
model(edge_index, edge_type)

In [43]:
len(list(set(graph[:,0]).union(set(graph[:,2]))))

37908

In [58]:
edge_index.max()

tensor(37907, device='cuda:0')