In [1]:
import numpy as np

from torch_wikidata import Wikidata5m

import torch
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T

from sklearn.metrics import roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transform = T.Compose([
    T.NormalizeFeatures(),
    T.RandomLinkSplit(num_val=0, num_test=0.01, is_undirected=True, add_negative_train_samples=False),
])

dataset = Wikidata5m("datasets/", transform=transform)

train_data, _, test_data = dataset[0]

In [3]:
train_data, test_data

(Data(x=[4796490, 1], edge_index=[2, 5847710], edge_label=[2923855], edge_label_index=[2, 2923855]),
 Data(x=[4796490, 1], edge_index=[2, 5847710], edge_label=[59066], edge_label_index=[2, 59066]))

## Load embeddings and transform them to torch matrix

In [4]:
features = np.load("sentence_features.npy")

In [5]:
X = np.ones((train_data.x.shape[0], features.shape[1]))

In [6]:
X[:features.shape[0], :] = features

In [7]:
torch_features = torch.from_numpy(X)

In [8]:
torch_features = torch_features.to(torch.float)

In [9]:
torch_features.dtype

torch.float32

In [10]:
train_data.x = torch_features
test_data.x = torch_features

### Loaders

In [11]:
from torch_geometric.loader import LinkNeighborLoader

batch_size = 512
n_neighbors = [10] * 2

def make_link_loaders(train_data, test_data) -> [LinkNeighborLoader, LinkNeighborLoader]:
    train_loader = LinkNeighborLoader(
        train_data,
        num_neighbors=n_neighbors,
        batch_size=batch_size,
        edge_label_index=train_data.edge_label_index,
        neg_sampling_ratio=1.0,
        shuffle=True
    )

    test_loader = LinkNeighborLoader(
        test_data,
        num_neighbors=n_neighbors,
        batch_size=batch_size,
        edge_label_index=test_data.edge_label_index,
        neg_sampling_ratio=1.0,
        shuffle=True
    )
    return train_loader, test_loader

train_loader, test_loader = make_link_loaders(train_data, test_data)

## Network

In [12]:
class GCNNet(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

In [13]:
model = GCNNet(train_data.num_features, 256, 256).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

## Model training

In [14]:
from tqdm import tqdm

def train(loader: LinkNeighborLoader):
    model.train()
    loss = 0
    for batch in tqdm(loader):
        optimizer.zero_grad()
        batch.to(device)

        z = model.encode(batch.x, batch.edge_index)
        out = model.decode(z, batch.edge_label_index).view(-1)
        loss = criterion(out, batch.edge_label)
        loss.backward()
        optimizer.step()
    return loss

In [15]:
@torch.no_grad()
def test(loader):
    model.eval()
    aucs = 0
    for batch in tqdm(loader):
        batch.to(device)
        
        z = model.encode(batch.x, batch.edge_index)
        out = model.decode(z, batch.edge_label_index).view(-1).sigmoid()
        aucs += roc_auc_score(batch.edge_label.cpu().numpy(), out.cpu().numpy())
    return aucs / len(loader)

In [16]:
test_accs = []
losses = []

avg_test_auc = 0
total_epochs = 1

In [17]:
for epoch in range(1, 6):
    loss = train(train_loader)
    auc = test(test_loader)
    avg_test_auc += auc
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test AUC: {avg_test_auc/total_epochs:.4f}')
    test_accs.append(auc)
    losses.append(loss)
    total_epochs += 1

100%|██████████| 5711/5711 [07:57<00:00, 11.97it/s]
100%|██████████| 116/116 [00:06<00:00, 17.31it/s]


Epoch: 001, Loss: 0.4291, Test AUC: 0.6488


100%|██████████| 5711/5711 [08:02<00:00, 11.83it/s]
100%|██████████| 116/116 [00:06<00:00, 17.50it/s]


Epoch: 002, Loss: 0.4418, Test AUC: 0.6484


100%|██████████| 5711/5711 [08:04<00:00, 11.78it/s]
100%|██████████| 116/116 [00:06<00:00, 17.36it/s]


Epoch: 003, Loss: 0.4358, Test AUC: 0.6486


100%|██████████| 5711/5711 [08:07<00:00, 11.72it/s]
100%|██████████| 116/116 [00:06<00:00, 17.41it/s]


Epoch: 004, Loss: 0.4387, Test AUC: 0.6491


100%|██████████| 5711/5711 [08:07<00:00, 11.71it/s]
100%|██████████| 116/116 [00:06<00:00, 17.41it/s]

Epoch: 005, Loss: 0.4559, Test AUC: 0.6487





In [18]:
import plotly.express as px

In [23]:
losses

[array(0.42912337, dtype=float32),
 array(0.44178107, dtype=float32),
 array(0.43575042, dtype=float32),
 array(0.43867433, dtype=float32),
 array(0.45592526, dtype=float32)]

In [25]:
losses = [loss for loss in losses]
test_accs = [acc for acc in test_accs]

In [26]:
fig = px.line(x=range(len(losses)), y=losses, title="Loss during training")
fig.update_layout(xaxis_title="Epochs", yaxis_title="Loss")
fig.write_image("images/loss_fig_embeddings.png")
fig.show()

ValueError: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido


In [None]:
fig = px.line(x=range(len(test_accs)), y=test_accs, title="Test ROC AUC score")
fig.update_layout(xaxis_title="Epochs", yaxis_title="ROC AUC score")
fig.write_image("images/test_fig_embeddings.png")
fig.show()