In [1]:
import dgl
from dgl.data.utils import load_graphs

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random

from model.model import HTGNN, NodePredictor
from utils.pytorchtools import EarlyStopping
from utils.data import load_COVID_data, load_MAG_data

# Seed every random number generator this notebook imports so that reruns are
# repeatable. Python's built-in `random` module is imported alongside numpy and
# torch, so it is seeded here too; otherwise anything drawing from it would
# differ from run to run.
random.seed(0)
dgl.seed(0)
np.random.seed(0)
torch.manual_seed(0)
# Seed the current CUDA device and then every visible CUDA device.
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the serialized graph list and its accompanying label dictionary.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on a machine with this exact directory layout. Other cells in this notebook
# use paths relative to the working directory (e.g. 'data/...').
glist, label_dict = load_graphs('/home/jiazhengli/xdgnn/HTGNN/acm_graph.bin')

In [3]:
# Display the whole list of loaded graphs.
glist

[Graph(num_nodes={'author': 5912, 'paper': 3025, 'subject': 57},
       num_edges={('author', 'author-paper', 'paper'): 9936, ('paper', 'paper-author', 'author'): 9936, ('paper', 'paper-subject', 'subject'): 3025, ('subject', 'subject-paper', 'paper'): 3025},
       metagraph=[('author', 'paper', 'author-paper'), ('paper', 'author', 'paper-author'), ('paper', 'subject', 'paper-subject'), ('subject', 'paper', 'subject-paper')])]

In [9]:
# Arithmetic mean of six hard-coded values.
# NOTE(review): the source of these numbers is not recorded in the notebook —
# presumably per-run scores copied in by hand; confirm and document provenance.
(0.758 + 0.771 + 0.806 + 0.832 + 0.856 + 0.871)/6

0.8156666666666667

In [60]:
# Load the graph list stored in the ogbn file.
# NOTE: this rebinds `glist` / `label_dict`, which an earlier cell bound to the
# ACM graph file, so later cells see whichever load ran last.
glist, label_dict = load_graphs('data/ogbn_graphs.bin')

In [50]:
# Display the first graph in the loaded list.
glist[0]

Graph(num_nodes={'author': 17764, 'field_of_study': 14635, 'institution': 2276, 'paper': 27112},
      num_edges={('author', 'affiliated_with', 'institution'): 40307, ('author', 'writes', 'paper'): 101109, ('paper', 'cites', 'paper'): 19926, ('paper', 'has_topic', 'field_of_study'): 285074},
      metagraph=[('author', 'institution', 'affiliated_with'), ('author', 'paper', 'writes'), ('paper', 'paper', 'cites'), ('paper', 'field_of_study', 'has_topic')])

In [61]:
# Display the second graph in the loaded list, for comparison with the first.
glist[1]

Graph(num_nodes={'author': 17764, 'field_of_study': 15186, 'institution': 2276, 'paper': 28584},
      num_edges={('author', 'affiliated_with', 'institution'): 40307, ('author', 'writes', 'paper'): 130156, ('paper', 'cites', 'paper'): 21276, ('paper', 'has_topic', 'field_of_study'): 299817},
      metagraph=[('author', 'institution', 'affiliated_with'), ('author', 'paper', 'writes'), ('paper', 'paper', 'cites'), ('paper', 'field_of_study', 'has_topic')])

In [3]:
# Reload the ogbn graph list and build the train / validation / test splits.
glist, label_dict = load_graphs('data/ogbn_graphs.bin')
device = torch.device('cuda')  # assumes a CUDA device is available — TODO confirm / add CPU fallback
time_window = 6  # forwarded to load_MAG_data; presumably the number of snapshots per sample — TODO confirm

# load_MAG_data returns six values: features and labels for each of the three splits.
train_feats, train_labels, val_feats, val_labels, test_feats, test_labels = load_MAG_data(glist, time_window, device)


loading mp2vec
generating train, val, test sets 


In [5]:
# Number of entries in the validation feature collection.
len(val_feats)

1

In [54]:
# Load the COVID graph list and build the train / validation / test splits.
# NOTE: this rebinds glist, time_window and all six split variables that the
# MAG-loading cell also assigns.
device = torch.device('cuda')  # NOTE(review): assigned but not passed to load_COVID_data below
glist, _ = load_graphs('data/covid_graphs.bin')  # the second return value is discarded
time_window = 7  # forwarded to load_COVID_data; presumably the number of snapshots per sample — TODO confirm

train_feats, train_labels, val_feats, val_labels, test_feats, test_labels = load_COVID_data(glist, time_window)


In [56]:
# Display the first graph in the loaded list.
glist[0]

Graph(num_nodes={'county': 3223, 'state': 51},
      num_edges={('county', 'affiliate_r', 'state'): 3141, ('county', 'nearby_county', 'county'): 22176, ('state', 'affiliate', 'county'): 3141, ('state', 'nearby_state', 'state'): 269},
      metagraph=[('county', 'state', 'affiliate_r'), ('county', 'county', 'nearby_county'), ('state', 'county', 'affiliate'), ('state', 'state', 'nearby_state')])

In [9]:
# IDs of all 'state' nodes in the first training graph.
train_feats[0].nodes('state')

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
       dtype=torch.int32)

In [3]:
# Extract the 2-hop in-subgraph of the first training graph, seeded at
# 'state' node 2. store_ids=True is passed so the subgraph keeps a record of
# the original IDs; `inverse_indices` is the second value returned by DGL.
sg, inverse_indices = dgl.khop_in_subgraph(train_feats[0], {'state': 2}, k=2, store_ids=True)

In [5]:
# Edge count of the extracted subgraph, with no edge type specified.
sg.num_edges()

13377

In [6]:
# Same 2-hop in-subgraph extraction, now seeded at 'state' nodes 1 and 2.
# This rebinds `sg` and `inverse_indices` from the single-seed extraction.
sg, inverse_indices = dgl.khop_in_subgraph(train_feats[0], {'state': [1,2]}, k=2, store_ids=True)

In [14]:
# Map each edge type name to the half-open range (start, end) that its edges
# would occupy if the edges of every type were laid out back to back, in
# `sg.canonical_etypes` order.
# The dictionary is keyed by the edge type name alone, so two canonical edge
# types that share a name would overwrite each other.
dic = {}
offset = 0
for canonical_etype in sg.canonical_etypes:
    _, etype, _ = canonical_etype
    # Ask the graph for the count directly instead of slicing a sub-graph first.
    n_edges = sg.num_edges(canonical_etype)
    dic[etype] = (offset, offset + n_edges)
    offset += n_edges


In [15]:
# Display the edge-type -> (start, end) offset ranges built in the previous cell.
dic

{'affiliate_r_t0': (0, 29),
 'affiliate_r_t1': (29, 58),
 'affiliate_r_t2': (58, 87),
 'affiliate_r_t3': (87, 116),
 'affiliate_r_t4': (116, 145),
 'affiliate_r_t5': (145, 174),
 'affiliate_r_t6': (174, 203),
 'nearby_county_t0': (203, 334),
 'nearby_county_t1': (334, 465),
 'nearby_county_t2': (465, 596),
 'nearby_county_t3': (596, 727),
 'nearby_county_t4': (727, 858),
 'nearby_county_t5': (858, 989),
 'nearby_county_t6': (989, 1120),
 'affiliate_t0': (1120, 1149),
 'affiliate_t1': (1149, 1178),
 'affiliate_t2': (1178, 1207),
 'affiliate_t3': (1207, 1236),
 'affiliate_t4': (1236, 1265),
 'affiliate_t5': (1265, 1294),
 'affiliate_t6': (1294, 1323),
 'nearby_state_t0': (1323, 1324),
 'nearby_state_t1': (1324, 1325),
 'nearby_state_t2': (1325, 1326),
 'nearby_state_t3': (1326, 1327),
 'nearby_state_t4': (1327, 1328),
 'nearby_state_t5': (1328, 1329),
 'nearby_state_t6': (1329, 1330)}

In [25]:
# Zero-initialised counter with one entry per edge type name in the subgraph.
c = dict.fromkeys(sg.etypes, 0)

In [27]:
# Toy heterograph with two relations into 'game' nodes:
#   user --plays--> game          (4 edges)
#   developer --develops--> game  (2 edges)
plays_src, plays_dst = torch.tensor([0, 1, 1, 2]), torch.tensor([0, 0, 1, 1])
develops_src, develops_dst = torch.tensor([0, 1]), torch.tensor([0, 1])
graph_data = {
    ('user', 'plays', 'game'): (plays_src, plays_dst),
    ('developer', 'develops', 'game'): (develops_src, develops_dst),
}
g = dgl.heterograph(graph_data)

# Remove the 'plays' edges with IDs 0 and 1; store_ids=True is passed so the
# result keeps the original edge IDs.
plays_to_drop = torch.tensor([0, 1])
g = dgl.remove_edges(g, plays_to_drop, 'plays', store_ids=True)

# Source, destination and edge-ID tensors of the remaining 'plays' edges.
g.edges('all', etype='plays')

(tensor([1, 2]), tensor([1, 1]), tensor([0, 1]))

In [29]:
# Edge count of the toy graph after the removal, with no edge type specified.
g.num_edges()

4

In [28]:
# Edge data stored under dgl.EID, shown per canonical edge type. The recorded
# output lists IDs 2 and 3 for 'plays', i.e. the edges that survived the removal.
g.edata[dgl.EID]

{('developer', 'develops', 'game'): tensor([0, 1]),
 ('user', 'plays', 'game'): tensor([2, 3])}

# GNNExplainer

In [2]:
# Instantiate DGL's built-in Cora dataset. The recorded output shows that the
# archive was downloaded and cached under ~/.dgl on this run.
dataset = dgl.data.CoraGraphDataset()

Downloading /home/jiazhengli/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...
Extracting file to /home/jiazhengli/.dgl/cora_v2
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.


In [13]:
# Node feature tensor 'feat' of the dataset's first graph, cast to float.
# NOTE(review): the recorded output below is a torch.Size, which this expression
# does not produce, so the saved output looks stale. Re-run the cell, or append
# `.shape` if only the dimensions are wanted.
dataset[0].ndata['feat'].float()

torch.Size([2708, 1433])

In [57]:
# Load the COVID graph list and build the train / validation / test splits.
# NOTE(review): this cell repeats an earlier COVID-loading cell in this notebook
# verbatim; keeping a single copy would avoid the two drifting apart.
device = torch.device('cuda')  # NOTE(review): assigned but not passed to load_COVID_data below
glist, _ = load_graphs('data/covid_graphs.bin')  # the second return value is discarded
time_window = 7  # forwarded to load_COVID_data; presumably the number of snapshots per sample — TODO confirm

train_feats, train_labels, val_feats, val_labels, test_feats, test_labels = load_COVID_data(glist, time_window)

In [58]:
# Display the first graph in the COVID graph list.
glist[0]

Graph(num_nodes={'county': 3223, 'state': 51},
      num_edges={('county', 'affiliate_r', 'state'): 3141, ('county', 'nearby_county', 'county'): 22176, ('state', 'affiliate', 'county'): 3141, ('state', 'nearby_state', 'state'): 269},
      metagraph=[('county', 'state', 'affiliate_r'), ('county', 'county', 'nearby_county'), ('state', 'county', 'affiliate'), ('state', 'state', 'nearby_state')])

In [59]:
# Display the second graph in the COVID graph list, for comparison with the first.
glist[1]

Graph(num_nodes={'county': 3223, 'state': 51},
      num_edges={('county', 'affiliate_r', 'state'): 3141, ('county', 'nearby_county', 'county'): 22176, ('state', 'affiliate', 'county'): 3141, ('state', 'nearby_state', 'state'): 269},
      metagraph=[('county', 'state', 'affiliate_r'), ('county', 'county', 'nearby_county'), ('state', 'county', 'affiliate'), ('state', 'state', 'nearby_state')])

In [18]:
# Take the first test graph for inspection.
# NOTE: this rebinds `g`, which an earlier cell used for the toy user/game graph.
g = test_feats[0]
g

Graph(num_nodes={'county': 3223, 'state': 51},
      num_edges={('county', 'affiliate_r_t0', 'state'): 3141, ('county', 'affiliate_r_t1', 'state'): 3141, ('county', 'affiliate_r_t2', 'state'): 3141, ('county', 'affiliate_r_t3', 'state'): 3141, ('county', 'affiliate_r_t4', 'state'): 3141, ('county', 'affiliate_r_t5', 'state'): 3141, ('county', 'affiliate_r_t6', 'state'): 3141, ('county', 'nearby_county_t0', 'county'): 22176, ('county', 'nearby_county_t1', 'county'): 22176, ('county', 'nearby_county_t2', 'county'): 22176, ('county', 'nearby_county_t3', 'county'): 22176, ('county', 'nearby_county_t4', 'county'): 22176, ('county', 'nearby_county_t5', 'county'): 22176, ('county', 'nearby_county_t6', 'county'): 22176, ('state', 'affiliate_t0', 'county'): 3141, ('state', 'affiliate_t1', 'county'): 3141, ('state', 'affiliate_t2', 'county'): 3141, ('state', 'affiliate_t3', 'county'): 3141, ('state', 'affiliate_t4', 'county'): 3141, ('state', 'affiliate_t5', 'county'): 3141, ('state', 'affiliate

In [40]:
# Shape of the node data stored under key 't0' for 'county' nodes.
g.ndata['t0']['county'].shape

torch.Size([3223, 1])

In [46]:
# Node type names of the graph.
g.ntypes

['county', 'state']

In [42]:
# All (source type, edge type, destination type) triples of the graph.
g.canonical_etypes

[('county', 'affiliate_r_t0', 'state'),
 ('county', 'affiliate_r_t1', 'state'),
 ('county', 'affiliate_r_t2', 'state'),
 ('county', 'affiliate_r_t3', 'state'),
 ('county', 'affiliate_r_t4', 'state'),
 ('county', 'affiliate_r_t5', 'state'),
 ('county', 'affiliate_r_t6', 'state'),
 ('county', 'nearby_county_t0', 'county'),
 ('county', 'nearby_county_t1', 'county'),
 ('county', 'nearby_county_t2', 'county'),
 ('county', 'nearby_county_t3', 'county'),
 ('county', 'nearby_county_t4', 'county'),
 ('county', 'nearby_county_t5', 'county'),
 ('county', 'nearby_county_t6', 'county'),
 ('state', 'affiliate_t0', 'county'),
 ('state', 'affiliate_t1', 'county'),
 ('state', 'affiliate_t2', 'county'),
 ('state', 'affiliate_t3', 'county'),
 ('state', 'affiliate_t4', 'county'),
 ('state', 'affiliate_t5', 'county'),
 ('state', 'affiliate_t6', 'county'),
 ('state', 'nearby_state_t0', 'state'),
 ('state', 'nearby_state_t1', 'state'),
 ('state', 'nearby_state_t2', 'state'),
 ('state', 'nearby_state_t3', 'st

In [19]:
# Convert the heterograph into a single homogeneous graph. The recorded output
# shows the result carries '_ID' and '_TYPE' fields on both nodes and edges.
hg = dgl.to_homogeneous(g)
hg

Graph(num_nodes=3274, num_edges=201089,
      ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int32), '_TYPE': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int32), '_TYPE': Scheme(shape=(), dtype=torch.int64)})

In [38]:
# Per-node data stored under dgl.NTYPE on the homogeneous graph (integer codes
# in the recorded output).
hg.ndata[dgl.NTYPE]

tensor([0, 0, 0,  ..., 1, 1, 1])

# New dataset

In [63]:
from ogb.linkproppred import DglLinkPropPredDataset

# Load the ogbl-collab link-prediction dataset (the recorded output shows a
# download and preprocessing pass on this run).
# NOTE: this rebinds `dataset`, which an earlier cell bound to the Cora dataset.
dataset = DglLinkPropPredDataset(name='ogbl-collab')

# Pull the three edge splits out of the split dictionary in one pass.
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = (
    split_edge[part] for part in ("train", "valid", "test")
)

# The dataset's entry at index 0 is bound to `graph`.
graph = dataset[0]

Downloading http://snap.stanford.edu/ogb/data/linkproppred/collab.zip


Downloaded 0.11 GB: 100%|██████████| 117/117 [00:09<00:00, 11.81it/s]


Extracting dataset/collab.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 44.89it/s]


Converting graphs into DGL objects...


100%|██████████| 1/1 [00:00<00:00, 412.66it/s]

Saving...





In [64]:
# Display a summary of the ogbl-collab graph (node/edge counts and data schemes).
graph

Graph(num_nodes=235868, num_edges=2358104,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32)}
      edata_schemes={'year': Scheme(shape=(1,), dtype=torch.int64), 'weight': Scheme(shape=(1,), dtype=torch.int64)})

In [None]:
def load_acm(remove_self_loop):
    """Download the ACM3025 pickle and build two meta-path based graphs from it.

    Args:
        remove_self_loop: if truthy, subtract the identity matrix from the
            'PAP' and 'PLP' adjacency matrices before the graphs are built.

    Returns:
        A 10-tuple ``(gs, features, labels, num_classes, train_idx, val_idx,
        test_idx, train_mask, val_mask, test_mask)`` where ``gs`` is the list
        ``[author_g, subject_g]`` built from 'PAP' and 'PLP' respectively.
    """
    # Local imports keep this cell self-contained: none of these names are
    # brought into scope by the notebook's import cell.
    import pickle
    from pprint import pprint

    from scipy import sparse
    from dgl.data.utils import download, get_download_dir, _get_dgl_url

    url = 'dataset/ACM3025.pkl'
    data_path = get_download_dir() + '/ACM3025.pkl'
    download(_get_dgl_url(url), path=data_path)

    # NOTE(security): unpickling executes arbitrary code from the file; this is
    # only acceptable because the file comes from the DGL dataset URL.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    labels = torch.from_numpy(data['label'].todense()).long()
    features = torch.from_numpy(data['feature'].todense()).float()
    # Take the class count from the label matrix width, then replace each label
    # row by the column index of its non-zero entry.
    num_classes = labels.shape[1]
    labels = labels.nonzero()[:, 1]

    if remove_self_loop:
        num_nodes = data['label'].shape[0]
        data['PAP'] = sparse.csr_matrix(data['PAP'] - np.eye(num_nodes))
        data['PLP'] = sparse.csr_matrix(data['PLP'] - np.eye(num_nodes))

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g = dgl.from_scipy(data['PAP'])
    subject_g = dgl.from_scipy(data['PLP'])
    gs = [author_g, subject_g]

    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    num_nodes = author_g.number_of_nodes()
    # NOTE(review): `get_binary_mask` is not defined or imported anywhere in the
    # visible notebook; it must be provided before this function can run.
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    print('dataset loaded')
    # Report the fraction of nodes covered by each split mask.
    pprint({
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    })

    return (gs, features, labels, num_classes, train_idx, val_idx, test_idx,
            train_mask, val_mask, test_mask)

In [11]:
# Mean of six hard-coded values, computed through the array's own method.
# NOTE(review): the source of these numbers is not recorded in the notebook.
r = np.array([0.803, 0.841, 0.861, 0.877, 0.897, 0.907])
r.mean()

0.8643333333333333