In [1]:
'''
This code opens the Microsoft Academic Graph Dataset and trains HGT
Based on code provided by original HGT paper
'''
import torch
from hgt import *
from hgt_utils import *
from model import *
from torch_geometric.loader import DataLoader
from ogb.nodeproppred import PygNodePropPredDataset
from ogb.nodeproppred import Evaluator
import multiprocessing as mp
import numpy as np
import time
import pandas as pd
import matplotlib as plt
import seaborn as sb
import sys

print("Microsoft Academic Graph Dataset Experiment")

Microsoft Academic Graph Dataset Experiment


In [2]:
print("Retrieving Data from Open Graph Benchmark ...")

# Get dataset using Pytorch Geometric Loader
dataset = PygNodePropPredDataset(name='ogbn-mag')
print("... Retrieval complete")
data = dataset[0] # pyg graph object
print(data)

Retrieving Data from Open Graph Benchmark ...
... Retrieval complete
Data(
  num_nodes_dict={
    author=1134649,
    field_of_study=59965,
    institution=8740,
    paper=736389,
  },
  edge_index_dict={
    (author, affiliated_with, institution)=[2, 1043998],
    (author, writes, paper)=[2, 7145660],
    (paper, cites, paper)=[2, 5416271],
    (paper, has_topic, field_of_study)=[2, 7505078],
  },
  x_dict={ paper=[736389, 128] },
  node_year={ paper=[736389, 1] },
  edge_reltype={
    (author, affiliated_with, institution)=[1043998, 1],
    (author, writes, paper)=[7145660, 1],
    (paper, cites, paper)=[5416271, 1],
    (paper, has_topic, field_of_study)=[7505078, 1],
  },
  y_dict={ paper=[736389, 1] }
)


In [3]:
input_dim = data.x_dict['paper'][0].shape[0]
hidden_dim = 512
num_node_types = len(data.num_nodes_dict)
num_edge_types = len(data.edge_index_dict)
num_heads = 8
num_layers = 4
dropout = 0.2
classifier_output_dim = 349

In [4]:
print("Creating Model")
hgt_GNN = HGTModel(input_dim,                           # input_dim
                   hidden_dim,                          # hidden_dim
                   num_node_types,                      # num_node_types
                   num_edge_types,                      # num_edge_types
                   num_heads,                           # num_heads
                   num_layers,                          # num_layers
                   dropout,                             # dropout
                   prev_norm = True,                    # normalization on all but last layer
                   last_norm = False,                   # normalization on last layer
                   use_rte = False)                     # use relative temporal encoding 
classifier = Classifier(256, 359)
HGT_classifier = nn.Sequential(hgt_GNN, classifier)

print(HGT_classifier)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(HGT_classifier.parameters(), lr=0.001)

Creating Model
Sequential(
  (0): HGTModel(
    (adapt_features): ModuleList(
      (0-3): 4 x Linear(in_features=128, out_features=512, bias=True)
    )
    (hgt_layers): ModuleList(
      (0-3): 4 x HGTLayer()
    )
    (drop): Dropout(p=0.2, inplace=False)
  )
  (1): Classifier(n_hid=256, n_out=359)
)


In [None]:
node_features = data.x_dict['paper'] 
node_types = torch.zeros(data.num_nodes_dict['paper'], dtype=torch.long) # paper node type
edge_index = data.edge_index_dict[('paper', 'cites', 'paper')]
edge_types = data.edge_reltype[(('paper', 'cites', 'paper'))]  
edge_time = None # temporal data not available
target = data.y_dict['paper']

# Generate batch
# Use neighbor sampler for fixed size sampled subgraphs
train_loader = NeighborSampler(data.edge_index_dict, 
                                node_idx=data.train_mask,
                                sizes=[10,10], 
                                batch_size=1024,
                                shuffle=True,
                                num_workers=12)

for batch in train_loader:
    batch_node_feat = node_features[batch.n_id]
    batch_node_type = node_types[batch.n_id]
    batch_edge_index = batch.edge_index
    batch_edge_type = edge_types[batch.edge_index]
    batch_target = target[batch.n_id]