In [1]:
'''
Microsoft Academic Graph (ogbn-mag) experiment: opens the dataset and trains HGT.
Based on the code provided with the original HGT paper.
'''
import torch
# NOTE(review): the three star imports below are presumably what bring `nn`, `sp`,
# `normalize`, `HGTModel` and `Classifier` into scope for the later cells -- none of
# those names is imported explicitly in the visible cells; confirm against the modules.
from hgt import *
from hgt_utils import *
from local_access import *
from ogb.nodeproppred import PygNodePropPredDataset  # dataset loader used in the preprocessing cell
from ogb.nodeproppred import Evaluator                # instantiated in the preprocessing cell
from graph import Graph                               # container for edge lists and node features
import argparse
import numpy as np

# Banner so the experiment is identifiable in the cell output.
print("Microsoft Academic Graph Dataset Experiment")

Microsoft Academic Graph Dataset Experiment


In [None]:
'''
Data preprocessing.
ogbn-mag only ships input features for paper nodes, so the other node types get
the average of the features of the nodes they are connected to (papers for most
types; institutions are averaged over their authors further down).
'''
print("Begin Data Preprocessing")
print("")
print("Retrieving Data from Open Graph Benchmark ...")

# The loader and the evaluator are both keyed by the same OGB dataset name.
OGB_DATASET_NAME = 'ogbn-mag'

# Fetch the dataset through the PyTorch Geometric loader
dataset = PygNodePropPredDataset(name=OGB_DATASET_NAME)
print("... Retrieval complete")

evaluator = Evaluator(name=OGB_DATASET_NAME)
data = dataset[0]  # the PyG graph object stored in the dataset


# Fill the Graph object's nested edge lists from the PyG edge_index tensors
print("Populating edge lists into Graph object")
edge_index_dict = data.edge_index_dict
graph = Graph()
edg = graph.edge_list
years = data.node_year['paper'].t().numpy()[0]
# One pass per relation triple, e.g. ('author', 'affiliated_with', 'institution')
for relation, edge_index in edge_index_dict.items():
    print(relation) # print relation name
    # edge_index holds source ids in row 0 and target ids in row 1, e.g.
    #   tensor([[      0,       1,       2,  ..., 1134645, 1134647, 1134648],
    #           [    845,     996,    3197,  ...,    5189,    4668,    4668]])
    s_type, r_type, t_type = relation
    # Every edge is stored twice: target -> source under the relation name,
    # and source -> target under the 'rev_'-prefixed name.
    forward = edg[t_type][s_type][r_type]
    backward = edg[s_type][t_type]['rev_' + r_type]
    # Endpoint types are fixed per relation, so decide once where the year comes from.
    source_is_paper = s_type == 'paper'
    target_is_paper = t_type == 'paper'
    for s_id, t_id in zip(edge_index[0].tolist(), edge_index[1].tolist()):
        if source_is_paper:
            year = years[s_id]
        elif target_is_paper:
            year = years[t_id]
        else:
            year = None  # neither endpoint is a paper, so there is no year to attach
        forward[t_id][s_id] = year
        backward[s_id][t_id] = year

# Rebuild the edge list as plain dicts and accumulate per-node degrees
print("Reformatting edge lists and computing node degrees")
edg = {}
deg = {node_type: np.zeros(count) for node_type, count in data.num_nodes_dict.items()}
for tgt_type, by_src_type in graph.edge_list.items():
    edg[tgt_type] = {}
    for src_type, by_relation in by_src_type.items():
        edg[tgt_type][src_type] = {}
        for rel_type, adjacency in by_relation.items():
            kept = {}
            for tgt_id, neighbours in adjacency.items():
                # nodes with no recorded neighbour are left out of the rebuilt list
                if len(neighbours) == 0:
                    continue
                kept[tgt_id] = dict(neighbours)  # {neighbour id: year}
                # degree = number of neighbours, summed over every relation of the node
                deg[tgt_type][tgt_id] += len(kept[tgt_id])
            edg[tgt_type][src_type][rel_type] = kept
            print(tgt_type, src_type, rel_type, len(kept))
graph.edge_list = edg # swap the rebuilt edge list into the Graph object

# Constructing node feature vectors for each node type in graph
print("Constructing node feature vectors for each node type in graph")
paper_node_features = data.x_dict['paper'].numpy() # data into numpy
# Paper features = raw features with log10(degree) appended as one extra column.
# NOTE(review): a node whose degree stayed 0 gets -inf from np.log10, here and in the
# per-type features below -- confirm every node has at least one edge.
graph.node_feature['paper'] = np.concatenate((paper_node_features, np.log10(deg['paper'].reshape(-1, 1))), axis=-1)
# These are node types: {'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389}
for node_type in data.num_nodes_dict:
    print(node_type)
    # Papers are handled above; institutions are built from author features in a later step.
    if node_type not in ['paper', 'institution']:
        # [node id, paper id] pairs gathered over every relation linking this type to papers
        paper_links = graph.edge_list[node_type]['paper']
        i = [
            [t, s]
            for neighbours_by_node in paper_links.values()
            for t, paper_ids in neighbours_by_node.items()
            for s in paper_ids
        ]
        # This guard has to run once, after ALL relations were collected. Nested inside the
        # relation loop its `continue` only advanced that loop, so a type without any paper
        # neighbour fell through and crashed on `i.shape[1]` below.
        if len(i) == 0:
            continue
        i = np.array(i).T  # row 0: node ids, row 1: paper ids
        v = np.ones(i.shape[1])
        # Sparse (num nodes of this type) x (num papers) matrix with a 1 per edge.
        # `normalize` comes from a star import; presumably it row-normalises so that the
        # product below is the mean of the neighbouring paper features -- TODO confirm.
        m = normalize(sp.coo_matrix((v, i), \
            shape=(data.num_nodes_dict[node_type], data.num_nodes_dict['paper'])))
        out = m.dot(paper_node_features)
        graph.node_feature[node_type] = np.concatenate((out, np.log10(deg[node_type].reshape(-1, 1))), axis=-1)

# Constructing node feature vectors for institution nodes
print("Constructing Node features for institutions")
# Author features without their trailing log-degree column
author_inputs = graph.node_feature['author'][:, :-1]
affiliations = graph.edge_list['institution']['author']
# 2 x num_edges index array: row 0 = institution ids, row 1 = author ids
inst_author_index = np.array([
    [inst_id, author_id]
    for authors_by_inst in affiliations.values()
    for inst_id, author_ids in authors_by_inst.items()
    for author_id in author_ids
]).T
edge_weights = np.ones(inst_author_index.shape[1])
inst_shape = (data.num_nodes_dict['institution'], data.num_nodes_dict['author'])
# `normalize` comes from a star import; presumably it row-normalises so the product
# below averages the author features per institution -- TODO confirm.
inst_by_author = normalize(sp.coo_matrix((edge_weights, inst_author_index), shape=inst_shape))
inst_inputs = inst_by_author.dot(author_inputs)
inst_log_degree = np.log10(deg['institution'].reshape(-1, 1))
graph.node_feature['institution'] = np.concatenate((inst_inputs, inst_log_degree), axis=-1)

def _paper_mask(num_papers, paper_ids):
    """Return a boolean vector of length num_papers that is True exactly at paper_ids."""
    mask = np.zeros(num_papers, dtype=bool)
    mask[paper_ids] = True
    return mask

# Target values of the paper nodes, flattened to one entry per paper
y = data.y_dict['paper'].t().numpy()[0]

# Train / validation / test paper indices as provided by the dataset
print("Splitting dataset into train, val and test")
split_idx = dataset.get_idx_split()
train_paper, valid_paper, test_paper = (
    split_idx[part]['paper'].numpy() for part in ('train', 'valid', 'test')
)

# Attach targets, split indices and publication years to the Graph object
graph.y = y
graph.train_paper = train_paper
graph.valid_paper = valid_paper
graph.test_paper  = test_paper
graph.years       = years

print("Creating Masks")
num_papers = len(graph.node_feature['paper'])
graph.train_mask = _paper_mask(num_papers, graph.train_paper)
graph.valid_mask = _paper_mask(num_papers, graph.valid_paper)
graph.test_mask = _paper_mask(num_papers, graph.test_paper)

# The Graph object is fully populated at this point
print("Preprocessing complete")

In [None]:
'''
Model construction: the HGT network followed by a classifier, chained in nn.Sequential
'''
print("Creating Model")
INPUT_DIM = len(graph.node_feature['paper'][0])  # width of one paper feature row
HIDDEN_DIM = 256                                 # shared by the HGT output and the classifier input
NUM_HEADS = 8
NUM_LAYERS = 4
DROPOUT = 0.2

hgt_GNN = HGTModel(
    INPUT_DIM,                    # input_dim
    HIDDEN_DIM,                   # hidden_dim
    len(graph.get_types()),       # num_node_types
    len(graph.get_meta_graph()),  # num_edge_types
    NUM_HEADS,                    # num_heads
    NUM_LAYERS,                   # num_layers
    DROPOUT,                      # dropout
    prev_norm=True,               # normalization on all but last layer
    last_norm=False,              # normalization on last layer
    use_rte=True,                 # use relative temporal encoding
)

# Classifier output size: largest value in graph.y plus one
num_classes = graph.y.max() + 1
classifier = Classifier(HIDDEN_DIM, num_classes)

HGT_classifier = nn.Sequential(hgt_GNN, classifier)
print(HGT_classifier)