In [57]:
# pyg-team
# from: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/link_pred.py

In [58]:
import os.path as ospath
import numpy as np
import pandas as pd

import torch
from torch.nn import Linear
from sklearn.metrics import roc_auc_score
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling
from torch_geometric.data import Data, Dataset
import torch_geometric.transforms as Tr

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## preprocessing

In [59]:
prefix1 = 'dataset1'
prefix2 = 'dataset2'
prefix3 = 'dataset3'

df_train1 = pd.read_csv(f'{prefix1}/train.csv')
df_test1 = pd.read_csv(f'{prefix1}/test.csv')
df_content1 = pd.read_csv(f'{prefix1}/content.csv', sep='\t', header=None, index_col=0)

df_train2 = pd.read_csv(f'{prefix2}/train.csv')
df_test2 = pd.read_csv(f'{prefix2}/test.csv')
df_content2 = pd.read_csv(f'{prefix2}/content.csv', sep='\t', header=None, index_col=0)

df_train3 = pd.read_csv(f'{prefix3}/train.csv')
df_test3 = pd.read_csv(f'{prefix3}/test.csv')
df_content3 = pd.read_csv(f'{prefix3}/content.csv', sep='\t', header=None, index_col=0)

df_content1 = df_content1.sort_index()
df_content2 = df_content2.sort_index()
df_content3 = df_content3.sort_index()

In [60]:
def print_shape(df_train, df_test, df_content):
    print(f"{'Train Shape':<15}: {df_train.shape}")
    print(f"{'Test Shape':<15}: {df_test.shape}")
    print(f"{'Content Shape':<15}: {df_content.shape}")

print_shape(df_train1, df_test1, df_content1)

Train Shape    : (8686, 4)
Test Shape     : (2172, 3)
Content Shape  : (2708, 1433)


In [61]:
def df2data(df_train, df_test, df_content):
    X = torch.tensor(df_content.values, dtype=torch.float32)
    train_edge_index = torch.tensor(df_train.iloc[:, 1:3].values).T
    train_edge_label_index = torch.tensor(df_train.loc[(df_train['label'] == 1), ['from', 'to']].values).T
    train_edge_label = torch.ones(size=(train_edge_label_index.shape[1], ))

    test_edge_label_index = torch.tensor(df_test.iloc[:, 1:3].values).T
    return (Data(x=X, edge_index=train_edge_index, edge_label_index=train_edge_label_index, edge_label=train_edge_label), 
            Data(x=X, edge_index=train_edge_index, edge_label_index=test_edge_label_index))

In [62]:
train_data1, test_data1 = df2data(df_train1, df_test1, df_content1)
train_data2, test_data2 = df2data(df_train2, df_test2, df_content2)
train_data3, test_data3 = df2data(df_train3, df_test3, df_content3)

# Tr = from torch_geometric
transform = Tr.Compose([
    Tr.NormalizeFeatures(),
    Tr.ToDevice(device),
    # Tr.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
    #                    add_negative_train_samples=False)
])
train_data1, test_data1 = transform(train_data1), transform(test_data1)
train_data2, test_data2 = transform(train_data2), transform(test_data2)
train_data3, test_data3 = transform(train_data3), transform(test_data3)

In [63]:
[train_data1, test_data1]

[Data(x=[2708, 1433], edge_index=[2, 8686], edge_label_index=[2, 4324], edge_label=[4324]),
 Data(x=[2708, 1433], edge_index=[2, 8686], edge_label_index=[2, 2172])]

In [68]:
class Net(torch.nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        c1_channels = 128
        c2_channels = 256
        lin1_channels = 128
        lin2_channels = 64
        self.conv1 = GCNConv(in_channels,  c1_channels)
        self.conv2 = GCNConv(c1_channels, c2_channels)
        self.lin1 = Linear(c2_channels, lin1_channels)
        self.lin2 = Linear(lin1_channels, lin2_channels)

    def encode(self, x, edge_index):
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.lin1(h).relu()
        h = self.lin2(h)
        return h

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

def fit(train_data, model, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)

    # We perform a new round of negative sampling for every training epoch:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1), method='sparse')

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss

# @torch.no_grad()
# def test(data):
#     model.eval()
#     z = model.encode(data.x, data.edge_index)
#     out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
#     return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())

def train_loop(train_data):
    model = Net(train_data.x.shape[1]).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
    criterion = torch.nn.BCEWithLogitsLoss()
    for epoch in range(1, 500):
        loss = fit(train_data, model, optimizer, criterion)
        print(f'Epoch: {epoch:04d}, Loss: {loss:.4f}')
    return model

model1 = train_loop(train_data1)
model2 = train_loop(train_data2)
model3 = train_loop(train_data3)
# z = model.encode(test_data.x, test_data.edge_index)
# final_edge_index = model.decode_all(z)

Epoch: 0001, Loss: 0.6982
Epoch: 0002, Loss: 0.6935
Epoch: 0003, Loss: 0.7006
Epoch: 0004, Loss: 0.6931
Epoch: 0005, Loss: 0.6933
Epoch: 0006, Loss: 0.6935
Epoch: 0007, Loss: 0.6934
Epoch: 0008, Loss: 0.6934
Epoch: 0009, Loss: 0.6933
Epoch: 0010, Loss: 0.6930
Epoch: 0011, Loss: 0.6923
Epoch: 0012, Loss: 0.6915
Epoch: 0013, Loss: 0.6897
Epoch: 0014, Loss: 0.6868
Epoch: 0015, Loss: 0.6814
Epoch: 0016, Loss: 0.6740
Epoch: 0017, Loss: 0.6675
Epoch: 0018, Loss: 0.6594
Epoch: 0019, Loss: 0.6476
Epoch: 0020, Loss: 0.7020
Epoch: 0021, Loss: 0.6789
Epoch: 0022, Loss: 0.6842
Epoch: 0023, Loss: 0.6652
Epoch: 0024, Loss: 0.6477
Epoch: 0025, Loss: 0.6532
Epoch: 0026, Loss: 0.6673
Epoch: 0027, Loss: 0.6668
Epoch: 0028, Loss: 0.6546
Epoch: 0029, Loss: 0.6520
Epoch: 0030, Loss: 0.6522
Epoch: 0031, Loss: 0.6528
Epoch: 0032, Loss: 0.6438
Epoch: 0033, Loss: 0.6371
Epoch: 0034, Loss: 0.6303
Epoch: 0035, Loss: 0.6336
Epoch: 0036, Loss: 0.6304
Epoch: 0037, Loss: 0.6284
Epoch: 0038, Loss: 0.6265
Epoch: 0039,

In [45]:
def predict(test_data, model):
    z = model.encode(test_data.x, test_data.edge_index)
    test_pred = torch.sigmoid(model.decode(z, test_data.edge_label_index))
    return test_pred

In [46]:
test1_pred = predict(test_data1, model1)
test2_pred = predict(test_data2, model2)
test3_pred = predict(test_data3, model3)

In [47]:
pd.DataFrame({
    'id': df_test1['id'],
    'prob': test1_pred.tolist()
}).to_csv('test1_upload.csv', index=False)

pd.DataFrame({
    'id': df_test2['id'],
    'prob': test2_pred.tolist()
}).to_csv('test2_upload.csv', index=False)

pd.DataFrame({
    'id': df_test3['id'],
    'prob': test3_pred.tolist()
}).to_csv('test3_upload.csv', index=False)