In [1]:
import numpy as np

from torch_wikidata import Wikidata5m

import torch
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T

from sklearn.metrics import roc_auc_score

In [2]:
transform = T.Compose([
    T.NormalizeFeatures(),
    T.RandomLinkSplit(num_val=0, num_test=0.001, is_undirected=True, add_negative_train_samples=False),
])

dataset = Wikidata5m("datasets/", transform=transform)

train_data, _, test_data = dataset[0]

Started download


Downloading https://www.dropbox.com/s/6sbhm0rwo4l73jq/wikidata5m_transductive.tar.gz?dl=1
Downloading https://www.dropbox.com/s/7jp4ib8zo3i6m10/wikidata5m_text.txt.gz?dl=1
Downloading https://www.dropbox.com/s/lnbhc8yuhit4wm5/wikidata5m_alias.tar.gz?dl=1


Unpacking
Finished download


Processing...
Done!


In [13]:
train_data, test_data

(Data(x=[4796490, 1], edge_index=[2, 5900870], edge_label=[2950435], edge_label_index=[2, 2950435]),
 Data(x=[4796490, 1], edge_index=[2, 5900870], edge_label=[5906], edge_label_index=[2, 5906]))

### Loaders

In [24]:
from torch_geometric.loader import LinkNeighborLoader

batch_size = 512
n_neighbors = [10] * 2

train_loader = LinkNeighborLoader(
    train_data,
    num_neighbors=n_neighbors,
    batch_size=batch_size,
    edge_label_index=train_data.edge_label_index,
    neg_sampling_ratio=1.0
)

test_loader = LinkNeighborLoader(
    test_data,
    num_neighbors=n_neighbors,
    batch_size=batch_size * 2,
    edge_label_index=test_data.edge_label_index,
    neg_sampling_ratio=1.0
)

## Network

In [15]:
class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

In [16]:
device = 'cpu'

model = Net(train_data.num_features, 512, 512).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

In [17]:
next(iter(train_loader))

Data(x=[26605, 1], edge_index=[2, 37633], edge_label=[1024], edge_label_index=[2, 1024])

In [18]:
from tqdm import tqdm

def train(loader: LinkNeighborLoader):
    model.train()
    n_iters = 10 #int(train_data.edge_index.size(-1) / batch_size)
    for i in tqdm(range(n_iters)):
        optimizer.zero_grad()
        batch = next(iter(loader))

        z = model.encode(batch.x, batch.edge_index)
        out = model.decode(z, batch.edge_label_index).view(-1)
        loss = criterion(out, batch.edge_label)
        loss.backward()
        optimizer.step()
    return loss

In [19]:
@torch.no_grad()
def test(dataloader):
    model.eval()
    aucs = 0
    n_iters = 10 #int(train_data.edge_index.size(-1) / batch_size)
    for i in tqdm(range(n_iters)):
        batch = next(iter(dataloader))
        z = model.encode(batch.x, batch.edge_index)
        out = model.decode(z, batch.edge_label_index).view(-1).sigmoid()
        aucs += roc_auc_score(batch.edge_label.cpu().numpy(), out.cpu().numpy())
    return aucs / n_iters

In [26]:
avg_test_acc = 0

for epoch in range(1, 3):
    loss = train(train_loader)
    acc = test(test_loader)
    avg_test_acc += acc
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test: {avg_test_acc/epoch:.4f}')

100%|██████████| 10/10 [00:08<00:00,  1.13it/s]
100%|██████████| 10/10 [00:07<00:00,  1.31it/s]


Epoch: 001, Loss: 0.5778, Test: 0.7619


100%|██████████| 10/10 [00:09<00:00,  1.09it/s]
100%|██████████| 10/10 [00:08<00:00,  1.24it/s]

Epoch: 002, Loss: 0.5552, Test: 0.7607



