In [None]:
# Paper: Link Prediction Based on Graph Neural Networks (NeurIPS 2018)
# Example: https://github.com/rusty1s/pytorch_geometric/blob/99a496e077a4d41417c7d927df7730fd984004b9/examples/seal_link_pred.py#L90

In [36]:
import math
import random
import os.path as osp
from itertools import chain

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime

from itertools import repeat, product
from torch import Tensor

import numpy as np
from sklearn.metrics import roc_auc_score
from scipy.sparse.csgraph import shortest_path
import torch
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss
from torch.nn import ModuleList, Linear, Conv1d, MaxPool1d
from torch.nn.parameter import Parameter

from torch.utils.data import DataLoader

import torch_geometric
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv, global_sort_pool
from torch_geometric.data import Data, InMemoryDataset, DataLoader, Dataset
from torch_geometric.utils import (negative_sampling, add_self_loops,
                                   train_test_split_edges, k_hop_subgraph,
                                   to_scipy_sparse_matrix, to_undirected, 
                                   contains_self_loops, add_remaining_self_loops, 
                                   is_undirected, segregate_self_loops)

In [25]:
# Prefer the second GPU when the machine actually has one; otherwise use the
# first GPU, and fall back to the CPU when CUDA is unavailable. Requesting
# 'cuda:1' on a single-GPU machine would fail as soon as a tensor is moved.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# Define

In [55]:
def to_list(x):
    """Return ``x`` unchanged if it is a tuple or list, else wrap it in a list."""
    return x if isinstance(x, (tuple, list)) else [x]


def files_exist(files):
    """Return True only if ``files`` is non-empty and every path in it exists."""
    if len(files) == 0:
        return False
    for path in files:
        if not osp.exists(path):
            return False
    return True

In [261]:
class MyDataset(InMemoryDataset):
    """SEAL-style link-prediction dataset built from the CSV files in ``<root>/raw``.

    Every candidate link becomes one example: the ``num_hops``-hop enclosing
    subgraph around its two endpoints, with one-hot DRNL labels prepended to
    the node features read from ``content.csv``.

    Args:
        num_hops: Radius of the enclosing subgraph extracted around each link.
        root: Dataset root; raw CSVs are read from ``<root>/raw`` and the
            processed tensors are cached in ``<root>/processed``.
        split: Split to load: ``"train"``, ``"valid"`` or ``"test"``.
        val_ratio: Fraction of the (undirected) training links held out as the
            ``"valid"`` split. The default ``0.0`` keeps every training link
            for training and produces no validation file. When it is positive
            the held-out positive links are also removed from the observed
            graph used to build every subgraph (test included).

    Raises:
        ValueError: If ``split`` is unknown, ``val_ratio`` is outside
            ``[0, 1)``, or ``split="valid"`` is requested with ``val_ratio=0``.

    Note:
        The cache file names do not depend on ``num_hops``; delete
        ``<root>/processed`` after changing it.
    """

    SPLITS = ('train', 'valid', 'test')
    # Fixed seed so the optional validation hold-out is reproducible.
    SPLIT_SEED = 0

    def __init__(self, num_hops, root=None, split="train", val_ratio=0.0):
        if split not in self.SPLITS:
            raise ValueError(
                "split must be one of {}, got {!r}".format(self.SPLITS, split))
        if not 0.0 <= val_ratio < 1.0:
            raise ValueError(
                "val_ratio must be in [0, 1), got {!r}".format(val_ratio))
        if split == 'valid' and val_ratio == 0:
            raise ValueError(
                "split='valid' requires val_ratio > 0 (no validation data is "
                "produced otherwise)")

        # Both attributes are read by the hooks that super().__init__ triggers
        # (processed_file_names / process), so set them first.
        self.num_hops = num_hops
        self.val_ratio = float(val_ratio)
        super().__init__(root=root)

        self.data, self.slices = torch.load(self._processed_path(split))

    def _processed_name(self, split):
        """File name of one processed split (tagged with the hold-out ratio, if any)."""
        suffix = '' if self.val_ratio == 0 else '_val{:g}'.format(self.val_ratio)
        return 'SEAL_data_{}{}.pt'.format(split, suffix)

    def _processed_path(self, split):
        """Absolute path of one processed split inside ``processed_dir``."""
        return os.path.join(self.processed_dir, self._processed_name(split))

    @property
    def raw_file_names(self):
        # Every file that process() reads, so a missing one is detected up front.
        return ["content.csv", "train.csv", "test.csv"]

    @property
    def processed_file_names(self):
        # Only list files that process() really writes. Listing a file that is
        # never written makes the cache check fail and forces a full
        # re-processing on every instantiation.
        splits = [s for s in self.SPLITS if s != 'valid' or self.val_ratio > 0]
        return [self._processed_name(s) for s in splits]

    def download(self):
        """The raw CSV files cannot be downloaded; report what is missing."""
        raise FileNotFoundError(
            "Expected raw files {} in {}".format(self.raw_file_names, self.raw_dir))

    @staticmethod
    def _links_from_frame(frame):
        """Build a ``[2, num_links]`` long tensor from the ``from``/``to`` columns."""
        return torch.tensor([frame["from"].tolist(), frame["to"].tolist()],
                            dtype=torch.long)

    def _hold_out(self, link_index, num_nodes, generator):
        """Split undirected links into ``(kept, held_out)`` edge indices.

        ``link_index`` is expected to contain both directions of every link
        and no self-loops; the split is done on one direction only so that a
        link and its reverse always end up on the same side.
        """
        pairs = link_index[:, link_index[0] < link_index[1]]
        # Flooring keeps at least one pair on the training side (val_ratio < 1).
        num_val = int(self.val_ratio * pairs.size(1))
        perm = torch.randperm(pairs.size(1), generator=generator)
        held_out = to_undirected(pairs[:, perm[:num_val]], num_nodes=num_nodes)
        kept = to_undirected(pairs[:, perm[num_val:]], num_nodes=num_nodes)
        return kept, held_out

    def process(self):
        """Read the raw CSVs, extract one subgraph per link and cache each split."""
        # Read node features. Rows are ordered by the id in column 0.
        # NOTE(review): assumes ids are contiguous 0..N-1 so that the row
        # position equals the node id used in the edge lists - confirm.
        content = pd.read_csv(os.path.join(self.raw_dir, "content.csv"), delimiter="\t", header=None)
        content = content.sort_values(by=[0]).loc[:, 1:].to_numpy()
        # Cast once so the features match the float one-hot labels they are
        # concatenated with (and the model's float32 weights).
        content = torch.from_numpy(content).to(torch.float)
        num_nodes = content.size(0)

        # Read train edge list.
        train = pd.read_csv(os.path.join(self.raw_dir, "train.csv"))

        train_pos_edge = self._links_from_frame(train[train["label"] == 1])
        train_pos_edge, _ = torch_geometric.utils.remove_self_loops(train_pos_edge)
        train_pos_edge = to_undirected(train_pos_edge, num_nodes=num_nodes)

        train_neg_edge = self._links_from_frame(train[train["label"] == 0])
        train_neg_edge = to_undirected(train_neg_edge, num_nodes=num_nodes)
        # Self-loops are assumed to always be linked, so they are never
        # negative examples (DRNL cannot label them either).
        train_neg_edge, _ = torch_geometric.utils.remove_self_loops(train_neg_edge)

        val_pos_edge = val_neg_edge = None
        if self.val_ratio > 0:
            generator = torch.Generator().manual_seed(self.SPLIT_SEED)
            train_pos_edge, val_pos_edge = self._hold_out(
                train_pos_edge, num_nodes, generator)
            train_neg_edge, val_neg_edge = self._hold_out(
                train_neg_edge, num_nodes, generator)

        # The observed graph: positive training links only.
        self.data = Data(x=content, edge_index=train_pos_edge, num_nodes=num_nodes)
        self.__max_z__ = 0

        train_list = (
            self.extract_enclosing_subgraphs(
                train_pos_edge, train_pos_edge, 1, num_nodes=num_nodes)
            + self.extract_enclosing_subgraphs(
                train_neg_edge, train_pos_edge, 0, num_nodes=num_nodes))

        val_list = []
        if self.val_ratio > 0:
            val_list = (
                self.extract_enclosing_subgraphs(
                    val_pos_edge, train_pos_edge, 1, num_nodes=num_nodes)
                + self.extract_enclosing_subgraphs(
                    val_neg_edge, train_pos_edge, 0, num_nodes=num_nodes))
            if not val_list:
                raise ValueError(
                    "val_ratio={!r} holds out no link; use a larger ratio".format(
                        self.val_ratio))

        # Read test edge list. Self-loops are dropped, so the test split can
        # hold fewer examples than test.csv has rows. The label 0 is only a
        # placeholder.
        test = pd.read_csv(os.path.join(self.raw_dir, "test.csv"))
        test_edge = self._links_from_frame(test)
        test_edge, _ = torch_geometric.utils.remove_self_loops(test_edge)

        test_list = self.extract_enclosing_subgraphs(
            test_edge, train_pos_edge, 0, num_nodes=num_nodes)

        # Convert labels to one-hot features. This has to happen after every
        # subgraph is labelled so that __max_z__ is final.
        for data in chain(train_list, val_list, test_list):
            z = F.one_hot(data.z, self.__max_z__ + 1).to(torch.float)
            data.x = torch.cat([z, data.x], 1)

        torch.save(self.collate(train_list), self._processed_path('train'))
        if val_list:
            torch.save(self.collate(val_list), self._processed_path('valid'))
        torch.save(self.collate(test_list), self._processed_path('test'))

    def extract_enclosing_subgraphs(self, link_index, edge_index, y, num_nodes, scale="single"):
        """Build one ``Data`` object per link in ``link_index``.

        Args:
            link_index: ``[2, num_links]`` tensor of candidate links.
            edge_index: Observed graph the subgraphs are cut from.
            y: Label stored on every returned example.
            num_nodes: Number of nodes in the full graph.
            scale: Unused; kept for interface compatibility.

        Returns:
            List of ``Data`` objects with ``x`` (node features of the
            subgraph), ``z`` (DRNL labels), ``edge_index`` and ``y``.
        """
        data_list = []
        for src, dst in link_index.t().tolist():
            sub_nodes, sub_edge_index, mapping, _ = k_hop_subgraph(
                [src, dst], self.num_hops, edge_index, relabel_nodes=True, num_nodes=num_nodes)
            # Positions of the two endpoints inside the relabelled subgraph.
            src, dst = mapping.tolist()

            # Remove target link from the subgraph (both directions).
            mask1 = (sub_edge_index[0] != src) | (sub_edge_index[1] != dst)
            mask2 = (sub_edge_index[0] != dst) | (sub_edge_index[1] != src)
            sub_edge_index = sub_edge_index[:, mask1 & mask2]

            # Calculate node labeling.
            z = self.drnl_node_labeling(sub_edge_index, src, dst,
                                        num_nodes=sub_nodes.size(0))

            data = Data(x=self.data.x[sub_nodes], z=z,
                        edge_index=sub_edge_index, y=y)
            data_list.append(data)

        return data_list

    def drnl_node_labeling(self, edge_index, src, dst, num_nodes=None):
        """Double-radius node labeling (DRNL) of one enclosing subgraph.

        Distances to ``src`` are measured with ``dst`` removed from the graph
        and vice versa. The two endpoints get label 1 and NaN labels
        (presumably nodes unreachable from an endpoint) get label 0. Also
        raises ``self.__max_z__`` to the largest label seen so far.

        Returns:
            Long tensor with one label per subgraph node.
        """
        # Order the endpoints so the index arithmetic below (dst - 1) holds.
        src, dst = (dst, src) if src > dst else (src, dst)
        adj = to_scipy_sparse_matrix(edge_index, num_nodes=num_nodes).tocsr()

        idx = list(range(src)) + list(range(src + 1, adj.shape[0]))
        adj_wo_src = adj[idx, :][:, idx]

        idx = list(range(dst)) + list(range(dst + 1, adj.shape[0]))
        adj_wo_dst = adj[idx, :][:, idx]

        # src < dst, so removing dst does not shift the index of src.
        dist2src = shortest_path(adj_wo_dst, directed=False, unweighted=True,
                                 indices=src)
        dist2src = np.insert(dist2src, dst, 0, axis=0)
        dist2src = torch.from_numpy(dist2src)

        # Removing src shifts dst one position to the left.
        dist2dst = shortest_path(adj_wo_src, directed=False, unweighted=True,
                                 indices=dst - 1)
        dist2dst = np.insert(dist2dst, src, 0, axis=0)
        dist2dst = torch.from_numpy(dist2dst)

        dist = dist2src + dist2dst
        dist_over_2, dist_mod_2 = dist // 2, dist % 2

        z = 1 + torch.min(dist2src, dist2dst)
        z += dist_over_2 * (dist_over_2 + dist_mod_2 - 1)
        z[src] = 1.
        z[dst] = 1.
        z[torch.isnan(z)] = 0.

        self.__max_z__ = max(int(z.max()), self.__max_z__)

        return z.to(torch.long)


# Model

In [58]:
class DGCNN(torch.nn.Module):
    """DGCNN graph classifier: stacked graph convolutions, sort pooling, 1-D convs, MLP.

    Args:
        hidden_channels: Width of every hidden graph-convolution layer.
        num_layers: Number of hidden graph-convolution layers (one extra
            single-channel layer is appended for the sort-pooling key).
        GNN: Graph-convolution class, called as ``GNN(in_channels, out_channels)``.
        k: Number of nodes kept by sort pooling. A value below 1 is read as a
            percentile of the node counts in ``dataset`` (with a floor of 10).
        dataset: Dataset used to size the model (``num_features`` and, for a
            percentile ``k``, the per-graph ``num_nodes``). Defaults to the
            notebook-level ``train_dataset`` for backward compatibility.

    Raises:
        ValueError: If ``k`` is too small for the 1-D convolution stack.
    """

    def __init__(self, hidden_channels, num_layers, GNN=GCNConv, k=0.6, dataset=None):
        super(DGCNN, self).__init__()

        if dataset is None:
            # Historical behaviour: size the model from the global training set.
            dataset = train_dataset

        if k < 1:  # Transform percentile to number.
            num_nodes = sorted([data.num_nodes for data in dataset])
            k = num_nodes[int(math.ceil(k * len(num_nodes))) - 1]
            k = max(10, k)
        self.k = int(k)

        self.convs = ModuleList()
        self.convs.append(GNN(dataset.num_features, hidden_channels))
        for i in range(0, num_layers - 1):
            self.convs.append(GNN(hidden_channels, hidden_channels))
        self.convs.append(GNN(hidden_channels, 1))

        conv1d_channels = [16, 32]
        total_latent_dim = hidden_channels * num_layers + 1
        conv1d_kws = [total_latent_dim, 5]
        # Kernel == stride == per-node feature width: one output step per node.
        self.conv1 = Conv1d(1, conv1d_channels[0], conv1d_kws[0],
                            conv1d_kws[0])
        self.maxpool1d = MaxPool1d(2, 2)
        self.conv2 = Conv1d(conv1d_channels[0], conv1d_channels[1],
                            conv1d_kws[1], 1)
        # Sequence length after max-pooling, then after the second convolution.
        dense_dim = int((self.k - 2) / 2 + 1)
        dense_dim = (dense_dim - conv1d_kws[1] + 1) * conv1d_channels[1]
        if dense_dim <= 0:
            raise ValueError(
                "k={} keeps too few nodes for the 1-D convolutions; "
                "use k >= 10".format(self.k))
        self.lin1 = Linear(dense_dim, 128)
        self.lin2 = Linear(128, 1)

    def forward(self, x, edge_index, batch):
        """Return one logit per graph in the batch (shape ``[num_graphs, 1]``)."""
        xs = [x]
        for conv in self.convs:
            xs += [torch.tanh(conv(xs[-1], edge_index))]
        # Concatenate the outputs of every conv layer (the raw input is dropped).
        x = torch.cat(xs[1:], dim=-1)

        # Global pooling.
        x = global_sort_pool(x, batch, self.k)

        x = x.unsqueeze(1)  # [num_graphs, 1, k * hidden]
        x = F.relu(self.conv1(x))
        x = self.maxpool1d(x)

        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # [num_graphs, dense_dim]

        # MLP.
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)

        return x


In [88]:
def train(loader):
    """Run one training epoch over ``loader`` and return the mean loss per graph.

    Uses the notebook-level ``model``, ``optimizer`` and ``device``.
    """
    model.train()
    criterion = BCEWithLogitsLoss()  # built once instead of once per batch

    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        logits = model(data.x, data.edge_index, data.batch)
        loss = criterion(logits.view(-1), data.y.to(torch.float))
        loss.backward()
        optimizer.step()
        # The loss is a batch mean; weight it by batch size before averaging.
        total_loss += loss.item() * data.num_graphs

    # Average over the dataset this loader iterates, not over a global that
    # may refer to a different dataset.
    return total_loss / len(loader.dataset)

In [89]:
@torch.no_grad()
def test(loader):
    """Return the ROC-AUC of the notebook-level ``model`` over ``loader``."""
    model.eval()

    scores, labels = [], []
    for batch in loader:
        batch = batch.to(device)
        out = model(batch.x, batch.edge_index, batch.batch)

        # Logits -> probabilities, flattened to one score per graph.
        scores.append(torch.sigmoid(out).view(-1).cpu())
        labels.append(batch.y.view(-1).cpu().to(torch.float))

    return roc_auc_score(torch.cat(labels), torch.cat(scores))

In [90]:
@torch.no_grad()
def predict(loader):
    """Return a list with one NumPy array of link probabilities per batch."""
    model.eval()

    batch_probs = []
    for batch in loader:
        batch = batch.to(device)
        out = model(batch.x, batch.edge_index, batch.batch)

        # Logits -> probabilities, flattened to one value per graph.
        batch_probs.append(torch.sigmoid(out).view(-1).cpu().numpy())

    return batch_probs

# Usage

In [25]:
from torch_geometric.datasets import Planetoid

In [None]:
# NOTE(review): `SEALDataset` is not defined in the visible part of this
# notebook; it presumably comes from the PyG SEAL example linked at the top.
cora = Planetoid(root=os.getcwd(), name='Cora')

# One SEAL dataset per split, all using 2-hop enclosing subgraphs.
cora_train_dataset, cora_val_dataset, cora_test_dataset = [
    SEALDataset(cora, num_hops=2, split=split_name)
    for split_name in ('train', 'val', 'test')
]

# Only the training loader shuffles.
cora_train_loader = DataLoader(cora_train_dataset, batch_size=32, shuffle=True)
cora_val_loader, cora_test_loader = [
    DataLoader(split_dataset, batch_size=32)
    for split_dataset in (cora_val_dataset, cora_test_dataset)
]

In [12]:
# Display the dataset repr as a quick sanity check.
cora_train_dataset

SEALDataset(17952)

In [94]:
# Prefer the second GPU when the machine actually has one; otherwise use the
# first GPU, and fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# NOTE(review): DGCNN sizes itself from the notebook-level `train_dataset`,
# which therefore has to exist before this cell runs.
model = DGCNN(hidden_channels=32, num_layers=3).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)

In [None]:
# Train for 50 epochs. The reported test AUC is refreshed only on epochs where
# the validation AUC reaches a new best.
best_val_auc, test_auc = 0, 0
for epoch in range(1, 51):
    loss, val_auc = train(cora_train_loader), test(cora_val_loader)
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        test_auc = test(cora_test_loader)
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
          f'Test: {test_auc:.4f}')

# My Usage

In [254]:
def load_checkpoint(filepath, device):
    """Build a DGCNN and its Adam optimizer, restoring both from ``filepath`` if it exists.

    Args:
        filepath: Path of a checkpoint holding the keys ``'model_stat'`` and
            ``'optimizer_stat'``.
        device: Device the model is placed on and the checkpoint is mapped to.

    Returns:
        ``(model, optimizer)``; freshly initialised when no checkpoint exists.
    """
    model = DGCNN(hidden_channels=32, num_layers=3).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    if os.path.exists(filepath):
        print("pretrained finded")
        # map_location lets a checkpoint saved on one device (e.g. a GPU that
        # is not present here) be restored on the requested one.
        checkpoint = torch.load(filepath, map_location=device)
        model.load_state_dict(checkpoint['model_stat'])
        optimizer.load_state_dict(checkpoint['optimizer_stat'])
    else:
        print("use a new optimizer")

    return model, optimizer

In [255]:
# Base directory under which the results folder is created.
root = os.getcwd()

In [256]:
# Name of the data folder under hw2_data/ and of the results sub-folder.
dataset = "dataset4"

In [257]:
# Timestamp (minute resolution) that names this run's folder and files.
date_time = datetime.strftime(datetime.now(), "%Y-%m-%d_%H-%M")

# <root>/results/<dataset>/<timestamp>: checkpoints and predictions go here.
save = os.path.join(root, "results", dataset, date_time)

# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(save, exist_ok=True)

In [262]:
# Train and test splits share the same root folder and 2-hop neighbourhoods.
# NOTE(review): the validation dataset/loader are disabled here, yet the
# training cell further down still reads `val_loader`.
train_dataset, test_dataset = [
    MyDataset(num_hops=2, root=os.path.join(os.getcwd(), "hw2_data", dataset), split=split_name)
    for split_name in ("train", "test")
]

# Only the training loader shuffles; test order must follow test.csv.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

Processing...
torch.Size([2, 131]) torch.Size([67]) torch.Size([67])
torch.Size([2, 114]) torch.Size([63]) torch.Size([63])
torch.Size([2, 47]) torch.Size([18]) torch.Size([18])
torch.Size([2, 295]) torch.Size([118]) torch.Size([118])
torch.Size([2, 44]) torch.Size([19]) torch.Size([19])
torch.Size([2, 141]) torch.Size([68]) torch.Size([68])
torch.Size([2, 4]) torch.Size([4]) torch.Size([4])
torch.Size([2, 25]) torch.Size([11]) torch.Size([11])
torch.Size([2, 138]) torch.Size([69]) torch.Size([69])
torch.Size([2, 12]) torch.Size([8]) torch.Size([8])
torch.Size([2, 20]) torch.Size([11]) torch.Size([11])
torch.Size([2, 253]) torch.Size([105]) torch.Size([105])
torch.Size([2, 308]) torch.Size([122]) torch.Size([122])
torch.Size([2, 34]) torch.Size([17]) torch.Size([17])
torch.Size([2, 321]) torch.Size([132]) torch.Size([132])
torch.Size([2, 205]) torch.Size([87]) torch.Size([87])
torch.Size([2, 39]) torch.Size([16]) torch.Size([16])
torch.Size([2, 231]) torch.Size([99]) torch.Size([99])
t

torch.Size([2, 185]) torch.Size([88]) torch.Size([88])
torch.Size([2, 14]) torch.Size([7]) torch.Size([7])
torch.Size([2, 33]) torch.Size([18]) torch.Size([18])
torch.Size([2, 50]) torch.Size([21]) torch.Size([21])
torch.Size([2, 165]) torch.Size([79]) torch.Size([79])
torch.Size([2, 252]) torch.Size([105]) torch.Size([105])
torch.Size([2, 165]) torch.Size([76]) torch.Size([76])
torch.Size([2, 56]) torch.Size([22]) torch.Size([22])
torch.Size([2, 183]) torch.Size([88]) torch.Size([88])
torch.Size([2, 66]) torch.Size([29]) torch.Size([29])
torch.Size([2, 197]) torch.Size([86]) torch.Size([86])
torch.Size([2, 11]) torch.Size([8]) torch.Size([8])
torch.Size([2, 185]) torch.Size([88]) torch.Size([88])
torch.Size([2, 11]) torch.Size([8]) torch.Size([8])
torch.Size([2, 135]) torch.Size([71]) torch.Size([71])
torch.Size([2, 22]) torch.Size([11]) torch.Size([11])
torch.Size([2, 207]) torch.Size([89]) torch.Size([89])
torch.Size([2, 321]) torch.Size([132]) torch.Size([132])
torch.Size([2, 24]) 

KeyboardInterrupt: 

In [238]:
# Display the training dataset repr as a quick sanity check.
train_dataset

MyDataset(11106)

In [239]:
# Inspect the first training example (one enclosing subgraph).
train_dataset[0]

Data(edge_index=[2, 53], x=[24, 3749], y=[1], z=[24])

In [240]:
# Display the test dataset repr as a quick sanity check.
test_dataset

MyDataset(1868)

In [241]:
# Prefer the second GPU when the machine actually has one; otherwise use the
# first GPU, and fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# Resume from this run's checkpoint if one exists, else start from scratch.
model, optimizer = load_checkpoint(os.path.join(save, "{}.pth".format(date_time)), device)

use a new optimizer


In [242]:
# Train for 100 epochs and checkpoint whenever the validation AUC improves.
# NOTE(review): `val_loader` is only defined in commented-out code in the
# loader cell above, so this cell raises NameError on a fresh kernel.
best_val_auc, test_auc = 0, 0
for epoch in range(1, 101):
    loss, val_auc = train(train_loader), test(val_loader)

    if val_auc > best_val_auc:
        best_val_auc = val_auc

        # Model and optimizer state are saved together so training can resume.
        checkpoint = {
            'model_stat': model.state_dict(),
            'optimizer_stat': optimizer.state_dict(),
        }
        torch.save(checkpoint, os.path.join(save, "{}.pth".format(date_time)))
        print("\nSave Model")

    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_auc:.4f}')


Save Model
Epoch: 01, Loss: 0.6766, Val: 0.7851

Save Model
Epoch: 02, Loss: 0.5205, Val: 0.8344

Save Model
Epoch: 03, Loss: 0.4566, Val: 0.8615

Save Model
Epoch: 04, Loss: 0.3857, Val: 0.8800

Save Model
Epoch: 05, Loss: 0.3166, Val: 0.8926

Save Model
Epoch: 06, Loss: 0.2695, Val: 0.9021

Save Model
Epoch: 07, Loss: 0.2257, Val: 0.9063

Save Model
Epoch: 08, Loss: 0.2010, Val: 0.9121

Save Model
Epoch: 09, Loss: 0.1841, Val: 0.9172

Save Model
Epoch: 10, Loss: 0.1692, Val: 0.9199

Save Model
Epoch: 11, Loss: 0.1567, Val: 0.9206

Save Model
Epoch: 12, Loss: 0.1508, Val: 0.9256

Save Model
Epoch: 13, Loss: 0.1385, Val: 0.9285

Save Model
Epoch: 14, Loss: 0.1300, Val: 0.9308
Epoch: 15, Loss: 0.1193, Val: 0.9302

Save Model
Epoch: 16, Loss: 0.1157, Val: 0.9342
Epoch: 17, Loss: 0.1096, Val: 0.9338
Epoch: 18, Loss: 0.1020, Val: 0.9328
Epoch: 19, Loss: 0.0998, Val: 0.9336

Save Model
Epoch: 20, Loss: 0.0927, Val: 0.9362
Epoch: 21, Loss: 0.0967, Val: 0.9356
Epoch: 22, Loss: 0.0883, Val: 0

In [243]:
# Reload this run's checkpoint (same path the training loop saves to).
model, optimizer = load_checkpoint(os.path.join(save, "{}.pth".format(date_time)), device)

pretrained finded


In [244]:
# Load the raw test links and add an empty `prob` column to fill in later.
test_pred = pd.read_csv(
    os.path.join(os.getcwd(), "hw2_data", dataset, "raw", "test.csv")
).assign(prob=np.nan)

# Self-loops are given probability 1 here; the dataset drops them, so the
# model never scores them.
test_pred.loc[test_pred["from"] == test_pred["to"], "prob"] = 1

In [245]:
# Score every test subgraph, flatten the per-batch arrays into one vector and
# keep three decimals.
pred = np.round(np.concatenate(predict(test_loader)), 3)
pred

array([1.   , 1.   , 1.   , ..., 0.   , 0.999, 1.   ], dtype=float32)

In [246]:
# Fill the rows that still have no probability with the model scores, in file
# order. The original loop started at an undefined `start` index.
unscored = test_pred["prob"].isna()
if unscored.any():
    if int(unscored.sum()) != len(pred):
        raise ValueError(
            "{} rows need a prediction but {} predictions are available".format(
                int(unscored.sum()), len(pred)))
    test_pred.loc[unscored, "prob"] = pred

In [247]:
# Submission table: only the link id and its predicted probability.
upload = test_pred.loc[:, ["id", "prob"]]
upload

Unnamed: 0,id,prob
0,E3064,1.000
1,E298,1.000
2,E3512,1.000
3,E5670,1.000
4,E5005,0.066
...,...,...
1881,E9179,0.015
1882,E5003,1.000
1883,E5081,0.000
1884,E4705,0.999


In [248]:
# Save the full prediction table (every test.csv column plus `prob`).
test_pred.to_csv(os.path.join(save, "{}.csv".format(date_time)), index=False)

In [249]:
# Save the submission file (`id` and `prob` only).
upload.to_csv(os.path.join(save, "upload.csv"), index=False)