In [1]:
import argparse
import time
import os, sys
import os.path as osp
from shutil import copy
import copy as cp
from tqdm import tqdm
import pdb

import numpy as np
from sklearn.metrics import roc_auc_score
import scipy.sparse as ssp
import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader

from torch_sparse import coalesce
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.data import Data, Dataset, InMemoryDataset, DataLoader
from torch_geometric.utils import to_networkx, to_undirected

from ogb.linkproppred import PygLinkPropPredDataset, Evaluator

from helpers import SEALDataset

from utils import *
from models import *

In [2]:
dataset = PygLinkPropPredDataset(name='ogbl-collab')
# splits the dataset into train, valid, test
split_edge = dataset.get_edge_split()
data = dataset[0]

In [7]:
dataset.root

'dataset/ogbl_collab'

In [33]:
# the indices of the pos & neg val/test edges
pos_val_edge, neg_val_edge = get_pos_neg_edges('valid', split_edge, 
                                                   data.edge_index, 
                                                   data.num_nodes)
pos_test_edge, neg_test_edge = get_pos_neg_edges('test', split_edge, 
                                                     data.edge_index, 
                                                     data.num_nodes)

In [36]:
split_edge

{'train': {'edge': tensor([[150989, 224881],
          [150989, 224881],
          [180078, 199043],
          ...,
          [ 60425, 221741],
          [135758, 221741],
          [140614, 207232]]),
  'weight': tensor([2, 1, 1,  ..., 1, 1, 1]),
  'year': tensor([2004, 2002, 2015,  ..., 2006, 2006, 1984])},
 'valid': {'edge': tensor([[ 49077, 199043],
          [148278, 147915],
          [100860, 136989],
          ...,
          [145209,  15870],
          [ 32751,  15870],
          [ 32552,  15870]]),
  'weight': tensor([1, 1, 2,  ..., 1, 1, 1]),
  'year': tensor([2018, 2018, 2018,  ..., 2018, 2018, 2018]),
  'edge_neg': tensor([[ 38605,  43566],
          [213250, 225425],
          [132174, 208876],
          ...,
          [ 54891, 229399],
          [ 19173, 212241],
          [162412, 204275]])},
 'test': {'edge': tensor([[112509, 220958],
          [  2599, 186854],
          [100860, 136989],
          ...,
          [ 47058, 190305],
          [216257, 190305],
          

In [11]:
torch.transpose(pos_val_edge, 0, 1)

tensor([[207097, 101712],
        [ 22267,  68958],
        [ 80185, 201524],
        ...,
        [ 14221, 143446],
        [196117, 122040],
        [ 12604, 231049]])

In [4]:
# constants
PATH = dataset.root + '_seal{}'.format('')
NUM_HOPS = 1
VAL_PERCENT = 100
USE_COALESCE = True
NODE_LABEL = 'drnl'
RATIO_PER_HOP = 1.0
MAX_PER_HOP = None
DIRECTED = False

In [5]:
NUM_WORKERS = 16
BATCH_SIZE = 32

In [6]:
val_dataset = eval('SEALDataset')(
    PATH, 
    data, 
    split_edge, 
    num_hops=NUM_HOPS, 
    percent=VAL_PERCENT, 
    split='valid', 
    use_coalesce= USE_COALESCE, 
    node_label= NODE_LABEL, 
    ratio_per_hop=RATIO_PER_HOP, 
    max_nodes_per_hop=MAX_PER_HOP, 
    directed=DIRECTED, 
)

In [53]:
len(val_loader)

5003

In [13]:
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

In [14]:
for (i, dataset) in enumerate(val_loader):
    if i != 5000: continue
    print(i)
    print(dataset)
    print(dataset.num_graphs)
    print(dataset.edge_index)
    print(dataset.z)
    print(dataset.y)
    print(dataset.batch)
    break

KeyboardInterrupt: 

In [15]:
device = f'cuda:0' if torch.cuda.is_available() else 'cpu'

In [16]:
model = DGCNN(32, 4, 1000, 0.6, val_dataset, True, use_feature=True, 
                      node_embedding=None).to(device)

In [39]:
model.eval()

y_pred, y_true = [], []
counter = 0
for data in tqdm(val_loader, ncols=70):
    counter += 1
    data = data.to(device)
    x = data.x 
    edge_weight = data.edge_weight
    node_id = None
    logits = model(data.z, data.edge_index, data.batch, x, edge_weight, node_id)
    y_pred.append(logits.view(-1).cpu())
    y_true.append(data.y.view(-1).cpu().to(torch.float))
#     if counter == 756 : break
val_pred, val_true = torch.cat(y_pred), torch.cat(y_true)
pos_val_pred = val_pred[val_true==1]
neg_val_pred = val_pred[val_true==0]

  0%|                                | 6/5003 [00:02<37:12,  2.24it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 10.76 GiB total capacity; 9.83 GiB already allocated; 20.56 MiB free; 9.88 GiB reserved in total by PyTorch)

In [31]:
pos_val_edge = split_edge['valid']['edge']

tensor([[207097,  22267,  80185,  ...,  14221, 196117,  12604],
        [101712,  68958, 201524,  ..., 143446, 122040, 231049]])

In [32]:
pos_val_edge

tensor([[ 49077, 199043],
        [148278, 147915],
        [100860, 136989],
        ...,
        [145209,  15870],
        [ 32751,  15870],
        [ 32552,  15870]])

In [42]:
pos_test_edge = split_edge['test']['edge']
neg_test_edge = split_edge['test']['edge_neg']

pos_val_edge = split_edge['valid']['edge']
neg_val_edge = split_edge['valid']['edge_neg']

In [41]:
len(pos_val_pred)

24192

In [46]:
pos_val_edge

tensor([[ 49077, 199043],
        [148278, 147915],
        [100860, 136989],
        ...,
        [145209,  15870],
        [ 32751,  15870],
        [ 32552,  15870]])

In [19]:
pos_val_pred.unsqueeze(1)

tensor([[-0.0876],
        [-0.0886],
        [-0.0951],
        ...,
        [-0.0923],
        [-0.0970],
        [-0.0893]], grad_fn=<UnsqueezeBackward0>)

In [26]:
pos_val_edge[:len(pos_val_pred),:]

tensor([[207097,  22267,  80185,  ...,  14221, 196117,  12604],
        [101712,  68958, 201524,  ..., 143446, 122040, 231049]])

In [44]:
torch.cat((pos_val_edge[:len(pos_val_pred),:], pos_val_pred.unsqueeze(1)), dim=1)

tensor([[ 4.9077e+04,  1.9904e+05, -8.7603e-02],
        [ 1.4828e+05,  1.4792e+05, -8.8639e-02],
        [ 1.0086e+05,  1.3699e+05, -9.5121e-02],
        ...,
        [ 1.6877e+05,  7.6310e+04, -9.2254e-02],
        [ 1.6335e+04,  7.6310e+04, -9.6994e-02],
        [ 3.7801e+04,  1.2516e+05, -8.9277e-02]], grad_fn=<CatBackward>)

In [26]:
pos_val_edge[:len(pos_val_pred),:]

tensor([[ 49077, 199043],
        [148278, 147915],
        [100860, 136989],
        ...,
        [168770,  76310],
        [ 16335,  76310],
        [ 37801, 125162]])

In [27]:
pos_val_pred

tensor([0.0323, 0.0302, 0.0439,  ..., 0.0299, 0.0282, 0.0345],
       grad_fn=<IndexBackward>)

In [50]:
neg_val_pred

tensor([], grad_fn=<IndexBackward>)

In [None]:
test_dataset = eval('SEALDataset')(
    PATH, 
    data, 
    split_edge, 
    num_hops=NUM_HOPS, 
    percent=VAL_PERCENT, 
    split='test', 
    use_coalesce= USE_COALESCE, 
    node_label= NODE_LABEL, 
    ratio_per_hop=RATIO_PER_HOP, 
    max_nodes_per_hop=MAX_PER_HOP, 
    directed=DIRECTED, 
)

In [3]:
torch.load('./results/ogbl-collab_20210111184253/run1_model_checkpoint1_val_pred.pt')

tensor([[ 4.9077e+04,  1.9904e+05,  9.7513e+00],
        [ 1.4828e+05,  1.4792e+05,  1.7606e+01],
        [ 1.0086e+05,  1.3699e+05,  1.1062e+01],
        ...,
        [ 5.4891e+04,  2.2940e+05, -6.3907e+00],
        [ 1.9173e+04,  2.1224e+05, -7.0197e+00],
        [ 1.6241e+05,  2.0428e+05, -7.4761e+00]])