## Link Prediction 

- Treat the edges in the graph as **positive examples**
- Sample a number of non-existent edges (i.e., node pairs with no edges between them) as **negative examples**


In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp


In [1]:
import dgl.data

dataset = dgl.data.CoraGraphDataset()
g = dataset[0]

Using backend: pytorch


  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [3]:
u, v = g.edges()

In [15]:
len(u)

10556

In [4]:
u 

tensor([   0,    0,    0,  ..., 2707, 2707, 2707])

In [5]:
v

tensor([ 633, 1862, 2582,  ...,  598, 1473, 2706])

In [8]:
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
print(eids)

[1156 2720 3437 ... 1774 1393 9947]


In [9]:
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size

In [10]:
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

In [11]:
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy()))) # this is a sparse adjacency matrix

In [14]:
adj.todense().shape

(2708, 2708)

In [16]:
np.eye(g.number_of_nodes())

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [17]:
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
print(adj_neg)

[[0. 1. 1. ... 1. 1. 1.]
 [1. 0. 0. ... 1. 1. 1.]
 [1. 0. 0. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 0. 1. 1.]
 [1. 1. 1. ... 1. 0. 0.]
 [1. 1. 1. ... 1. 0. 0.]]


In [18]:
neg_u, neg_v = np.where(adj_neg != 0)

In [19]:
print(neg_u)
print(neg_v)

[   0    0    0 ... 2707 2707 2707]
[   1    2    3 ... 2703 2704 2705]


In [22]:
len(neg_u)

7320000

In [20]:

neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [21]:
print(len(train_pos_u), len(train_neg_u))
print(len(test_pos_u), len(test_neg_u))

9501 9501
1055 1055


In [23]:
train_g = dgl.remove_edges(g, eids[:test_size])

In [24]:
from dgl.nn import SAGEConv 

class GraphSage(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSage, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')
    
    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [25]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [44]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # print(h.shape)
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # g.edata['score'] shape is [9501, 1]
            # print(g.edata['score'].shape)
            return g.edata['score'][:, 0]

In [45]:
model = GraphSage(train_g.ndata['feat'].shape[1], 16)

In [46]:
pred = DotPredictor()

In [47]:
def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)


In [48]:
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

In [49]:
all_logits = []
for e in range(100):
    # forward
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

In epoch 0, loss: 0.6929579973220825
In epoch 5, loss: 0.66300368309021
In epoch 10, loss: 0.5759940147399902
In epoch 15, loss: 0.5208228230476379
In epoch 20, loss: 0.49238890409469604
In epoch 25, loss: 0.4600619673728943
In epoch 30, loss: 0.4374469518661499
In epoch 35, loss: 0.4156125783920288
In epoch 40, loss: 0.39391374588012695
In epoch 45, loss: 0.3731802701950073
In epoch 50, loss: 0.35113808512687683
In epoch 55, loss: 0.32943618297576904
In epoch 60, loss: 0.30703336000442505
In epoch 65, loss: 0.28436124324798584
In epoch 70, loss: 0.2617570757865906
In epoch 75, loss: 0.23912028968334198
In epoch 80, loss: 0.2165376991033554
In epoch 85, loss: 0.19412106275558472
In epoch 90, loss: 0.17237377166748047
In epoch 95, loss: 0.15139953792095184


In [51]:
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))


AUC 0.8464311223916802
