In [1]:
'''
This code opens the Microsoft Academic Graph Dataset and trains HGT
Based on code provided by original HGT paper
'''
import torch
# NOTE(review): the three star imports below are presumably what bring `nn`,
# `sp`, `normalize`, `HGTModel`, `Classifier`, `sample_subgraph`, `feature_MAG`
# and `to_torch` into scope -- none of them is imported explicitly anywhere in
# this notebook. Confirm, and prefer explicit imports.
from hgt import *
from hgt_utils import *
from local_access import *
from ogb.nodeproppred import PygNodePropPredDataset
from ogb.nodeproppred import Evaluator
from graph import Graph
import multiprocessing as mp
import argparse
import numpy as np
import time
import pandas as pd
# NOTE(review): this binds the top-level `matplotlib` package to the name
# `plt`. The package itself has no `show()` function; the pyplot API lives in
# `matplotlib.pyplot`.
import matplotlib as plt
import seaborn as sb

# Banner so the notebook output shows which experiment is running.
print("Microsoft Academic Graph Dataset Experiment")

Microsoft Academic Graph Dataset Experiment


In [None]:
# Data Preprocessing
# ogbn-mag ships input features for paper nodes only, so every other node type
# gets the average of the features of the nodes it is connected to.
print("Begin Data Preprocessing")
print()
print("Retrieving Data from Open Graph Benchmark ...")

# Fetch the dataset through the PyTorch Geometric loader.
dataset = PygNodePropPredDataset(name='ogbn-mag')
print("... Retrieval complete")

# The first (index 0) entry of the dataset is the PyG graph object.
data = dataset[0]
# OGB evaluator used later to compute accuracy.
evaluator = Evaluator(name='ogbn-mag')


# Copy every PyG edge into the Graph object's nested edge lists, in both
# directions, tagging each edge with a paper's year when a paper is involved.
print("Populating edge lists into Graph object")
edge_index_dict = data.edge_index_dict
graph = Graph()
edg = graph.edge_list
years = data.node_year['paper'].t().numpy()[0]
# edge_index_dict is keyed by (source type, relation, target type) triples,
# e.g. ('author', 'affiliated_with', 'institution').
for key in edge_index_dict:
    print(key) # print relation name
    # edges is a 2 x num_edges tensor; after transposing, each row is one
    # (source id, target id) pair, e.g.
    #   tensor([[      0,       1,       2,  ..., 1134645, 1134647, 1134648],
    #           [    845,     996,    3197,  ...,    5189,    4668,    4668]])
    edges = edge_index_dict[key]
    s_type, r_type, t_type = key
    elist = edg[t_type][s_type][r_type]           # indexed [target id][source id]
    rlist = edg[s_type][t_type]['rev_' + r_type]  # indexed [source id][target id]
    # The node types are fixed per relation, so decide once which endpoint
    # (if any) supplies the year.
    source_is_paper = s_type == 'paper'
    target_is_paper = t_type == 'paper'
    for s_id, t_id in edges.t().tolist():
        if source_is_paper:
            year = years[s_id]
        elif target_is_paper:
            year = years[t_id]
        else:
            year = None
        elist[t_id][s_id] = year
        rlist[s_id][t_id] = year

# Rebuild the edge lists as plain nested dicts, dropping nodes that have no
# neighbours under a relation, and accumulate every node's degree on the way.
print("Reformatting edge lists and computing node degrees")
edg = {}
# One zero-initialised degree counter per node, grouped by node type.
deg = {key : np.zeros(data.num_nodes_dict[key]) for key in data.num_nodes_dict}
for k1 in graph.edge_list:
    by_k1 = edg.setdefault(k1, {})
    for k2 in graph.edge_list[k1]:
        by_k2 = by_k1.setdefault(k2, {})
        for k3 in graph.edge_list[k1][k2]:
            by_k3 = by_k2.setdefault(k3, {})
            adjacency = graph.edge_list[k1][k2][k3]
            for e1 in adjacency:
                neighbours = adjacency[e1]
                if len(neighbours) == 0:
                    continue

                # Shallow copy of this node's neighbour -> year mapping.
                by_k3[e1] = {e2: neighbours[e2] for e2 in neighbours}
                deg[k1][e1] += len(by_k3[e1])
            print(k1, k2, k3, len(by_k3))
graph.edge_list = edg # inserting new edge list into Graph object

# Constructing node feature vectors for each node type in graph
print("Constructing node feature vectors for each node type in graph")
paper_node_features = data.x_dict['paper'].numpy() # data into numpy
# Full paper features = raw features with log10(degree) appended as a last column.
# NOTE(review): log10 yields -inf for a node whose degree is 0 -- confirm that
# every node has at least one edge.
graph.node_feature['paper'] = np.concatenate((paper_node_features, np.log10(deg['paper'].reshape(-1, 1))), axis=-1)
# These are node types: {'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389}
for node_type in data.num_nodes_dict:
    print(node_type)
    # Papers already have features; institutions are handled separately from
    # their authors because they are not read from paper edges here.
    if node_type not in ['paper', 'institution']:
        # Collect one [node id, paper id] pair per edge, over all relations
        # that connect this node type to papers.
        pairs = []
        for rel_type in graph.edge_list[node_type]['paper']:
            for t in graph.edge_list[node_type]['paper'][rel_type]:
                for s in graph.edge_list[node_type]['paper'][rel_type][t]:
                    pairs.append([t, s])
        # This guard must sit outside the relation loop: with no edges at all
        # the array below would be 1-D and `.shape[1]` would raise IndexError,
        # so skip the node type instead.
        if len(pairs) == 0:
            continue
        i = np.array(pairs).T
        v = np.ones(i.shape[1])
        # Sparse (num nodes of this type) x (num papers) adjacency, passed
        # through the project's `normalize` helper (presumably row
        # normalisation, which makes the product below an average -- confirm).
        m = normalize(sp.coo_matrix((v, i), \
            shape=(data.num_nodes_dict[node_type], data.num_nodes_dict['paper'])))
        out = m.dot(paper_node_features)
        graph.node_feature[node_type] = np.concatenate((out, np.log10(deg[node_type].reshape(-1, 1))), axis=-1)

# Constructing node feature vectors for institution nodes from their authors
print("Constructing Node features for institutions")
# Author features without their trailing log-degree column.
cv = graph.node_feature['author'][:, :-1]
institution_authors = graph.edge_list['institution']['author']
# One [institution id, author id] pair per edge, over every relation.
i = np.array([
    [j, t]
    for _rel in institution_authors
    for j in institution_authors[_rel]
    for t in institution_authors[_rel][j]
]).T
v = np.ones(i.shape[1])
adjacency_shape = (data.num_nodes_dict['institution'], data.num_nodes_dict['author'])
m = normalize(sp.coo_matrix((v, i), shape=adjacency_shape))
out = m.dot(cv)
# Append log10(degree) as the last feature column, as for the other node types.
log_degree = np.log10(deg['institution'].reshape(-1, 1))
graph.node_feature['institution'] = np.concatenate((out, log_degree), axis=-1)

# Paper labels taken from y_dict, flattened to a 1-D numpy array
y = data.y_dict['paper'].t().numpy()[0]

# Splitting dataset into training, validation and testing
print("Splitting dataset into train, val and test")
split_idx = dataset.get_idx_split()
train_paper, valid_paper, test_paper = (
    split_idx[part]['paper'].numpy() for part in ('train', 'valid', 'test')
)

# Attach labels, split indices and years to the Graph object so the sampling
# code can reach them through `graph`.
graph.y = y
graph.train_paper = train_paper
graph.valid_paper = valid_paper
graph.test_paper  = test_paper
graph.years       = years

print("Creating Masks")
# One boolean mask per split over all paper nodes: True where the paper
# belongs to that split.
for part in ('train', 'valid', 'test'):
    mask = np.zeros(len(graph.node_feature['paper']), dtype=bool)
    mask[getattr(graph, part + '_paper')] = True
    setattr(graph, part + '_mask', mask)

# Preprocessing graph object is now complete
print("Preprocessing complete")

In [None]:
# Creating Model
print("Creating Model")

in_dim = len(graph.node_feature['paper'][0])   # input_dim: width of one paper feature row
hid_dim = 256                                  # hidden_dim, shared by the GNN and the classifier
hgt_GNN = HGTModel(
    in_dim,
    hid_dim,
    len(graph.get_types()),        # num_node_types
    len(graph.get_meta_graph()),   # num_edge_types
    8,                             # num_heads
    4,                             # num_layers
    0.2,                           # dropout
    prev_norm = True,              # normalization on all but last layer
    last_norm = False,             # normalization on last layer
    use_rte = False,               # use relative temporal encoding
)
# Labels are used as class indices, so the number of classes is max label + 1.
num_classes = graph.y.max()+1
classifier = Classifier(hid_dim, num_classes)

# NOTE(review): nn.Sequential.forward accepts a single input, so this wrapper
# is useful for parameters() / state_dict() / train(), but it cannot itself be
# called with the several graph tensors the GNN takes.
HGT_classifier = nn.Sequential(hgt_GNN, classifier)

print(HGT_classifier)

In [None]:
# Training configuration, optimiser and learning-rate schedule
batch_number = 32 # number of sampled graphs for each epoch

batch_size = 128 # number of target paper nodes drawn for each sampled graph

num_epochs = 10

num_workers = 8 # size of the multiprocessing pool used for sampling

clip = 1.0 # max gradient norm used for gradient clipping

sample_depth = 6 # passed to sample_subgraph as sampled_depth

sample_width = 520 # passed to sample_subgraph as sampled_number

plot = False # True or false to plot data

# NOTE(review): hard-coded absolute, user-specific path. It is handed to
# torch.save, which needs a *file* path -- confirm this is not a directory and
# consider making it configurable / relative.
save_directory = r"C:\Users\johns\OneDrive\Desktop\HGT_Data\models"
# device = torch.device() DEVICE CONTROL POSSIBLE
# Candidate seed nodes for sampling: every paper node id.
target_nodes = np.arange(len(graph.node_feature['paper']))

# Negative Log Likelihood Loss
criterion = nn.NLLLoss()

# Get list of model parameters w/ associated names
parameters_optimizer = list(HGT_classifier.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in parameters_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in parameters_optimizer if any(nd in n for nd in no_decay)],     'weight_decay': 0.0}
]

# AdamW optimizer w/specified parameter groups and epsilon value
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, eps=1e-06)

# Create a OneCycleLR learning rate scheduler.
# One optimiser/scheduler step is taken per sampled graph, i.e. batch_number
# steps per epoch, so the whole cycle spans batch_number * num_epochs steps
# (+1 spare step). Using batch_size here would stretch the cycle well past the
# end of training, so the schedule would never finish annealing.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, pct_start=0.05, anneal_strategy='linear', final_div_factor=10,
                        max_lr = 5e-4, total_steps = batch_number * num_epochs + 1)

stats = []        # per-epoch lists of [train loss, train acc, valid acc, test acc]
result = []
best_val = 0      # best average validation accuracy seen so far
training_step = 0

def randint():
    """Return a random integer in [0, 2**31 - 1) drawn from NumPy's global RNG.

    Callers pass the result to sampling jobs as their seed.
    """
    upper_bound = 2**31 - 1
    return np.random.randint(0, upper_bound)

def ogbn_sample(seed, samp_nodes):
    '''
        Sample one sub-graph around the given paper nodes and convert it to tensors.

        seed:       value used to seed NumPy's global RNG before sampling, so a
                    job's sample is determined by its seed.
        samp_nodes: array of paper node ids used as the seed nodes of the sample.

        Returns (node_feature, node_type, edge_time, edge_index, edge_type,
        (train_mask, valid_mask, test_mask), ylabel), where the masks and
        ylabel are numpy arrays indexed by the sampled paper ids.
    '''
    np.random.seed(seed)
    # One row per seed paper: [paper id, year].
    seed_papers = np.concatenate([samp_nodes, graph.years[samp_nodes]]).reshape(2, -1).transpose()
    feature, times, edge_list, indxs, _ = sample_subgraph(graph, \
                inp = {'paper': seed_papers}, \
                sampled_depth = sample_depth, sampled_number = sample_width, \
                    feature_extractor = feature_MAG)
    # The last two values returned by to_torch are not needed here.
    node_feature, node_type, edge_time, edge_index, edge_type, _, _ = \
            to_torch(feature, times, edge_list, graph)
    # Split membership and labels of the papers that ended up in the sample.
    sampled_papers = indxs['paper']
    train_mask = graph.train_mask[sampled_papers]
    valid_mask = graph.valid_mask[sampled_papers]
    test_mask  = graph.test_mask[sampled_papers]
    ylabel     = graph.y[sampled_papers]
    return node_feature, node_type, edge_time, edge_index, edge_type, (train_mask, valid_mask, test_mask), ylabel

def prepare_data(pool, task_type = 'train', s_idx = 0, n_batch = batch_number, batch_size = batch_size):
    '''
        Submit sub-graph sampling jobs to a multiprocessing pool.

        pool:       pool whose apply_async is used to run ogbn_sample.
        task_type:  'train'           -> n_batch jobs, each on batch_size papers
                                         drawn at random from target_nodes;
                    'sequential'      -> n_batch jobs over consecutive
                                         batch_size slices of graph.test_paper,
                                         starting at slice s_idx;
                    'variance_reduce' -> n_batch jobs that all sample around the
                                         same slice s_idx of graph.test_paper.
                    Any other value submits nothing.
        Returns the list of pending async results, in submission order.
    '''
    jobs = []
    if task_type == 'train':
        for batch_id in np.arange(n_batch):
            print(f'starting preprocessing batch: {batch_id}')
            job_seed = randint()
            sampled_papers = np.random.choice(target_nodes, batch_size, replace = False)
            jobs.append(pool.apply_async(ogbn_sample, args=[job_seed, sampled_papers]))
            print(f'finished preprocessing batch: {batch_id}')
    elif task_type == 'sequential':
        for i in np.arange(n_batch):
            first = (s_idx + i) * batch_size
            target_papers = graph.test_paper[first : first + batch_size]
            jobs.append(pool.apply_async(ogbn_sample, args=[randint(), target_papers]))
    elif task_type == 'variance_reduce':
        first = s_idx * batch_size
        target_papers = graph.test_paper[first : first + batch_size]
        for _ in np.arange(n_batch):
            jobs.append(pool.apply_async(ogbn_sample, args=[randint(), target_papers]))
    print("Preprocessing complete ---------------------------------------------------")
    return jobs

# Worker pool that samples sub-graphs in the background.
# NOTE(review): under the "spawn" start method, workers must be able to import
# the function they run; a function defined in a notebook cell may not be
# reachable from the workers -- confirm on the target platform.
pool = mp.Pool(num_workers)

# start time
start_time = time.time()
# Submit the first round of sampling jobs (results are collected in the loop).
jobs = prepare_data(pool)

# Begin Training
print("Beginning Training")
print("")
train_step = 0      # counts optimiser steps across all epochs
epoch_tracker = 1
print("starting loop")

print(f'length of jobs is: {len(jobs)}')
# Report the type of the `jobs` object itself (not the builtin `type`).
print(f'type of jobs is: {type(jobs)}')

# `st` marks when the current round of sampling jobs was submitted. The first
# round was submitted right after `start_time` was taken, so start from there;
# without this the first 'Data Preparation' print reads an undefined name.
st = start_time

# torch.save needs a file path. If save_directory names an existing directory,
# store the checkpoint inside it; otherwise use the path exactly as given.
import os
checkpoint_path = save_directory
if os.path.isdir(save_directory):
    checkpoint_path = os.path.join(save_directory, 'hgt_ogbn_mag.pt')

for epoch in np.arange(num_epochs) + 1:
    print(f"epoch: {epoch}")
    # Prepare Training and Validation Data: wait for this epoch's sampling
    # jobs, then shut the finished pool down.
    datas = [job.get() for job in jobs]
    pool.close()
    pool.join()
    # After the data is collected, reopen a pool and start sampling the next
    # epoch's data in the background while this epoch trains. Nothing is
    # submitted on the final epoch, so no workers are left running afterwards.
    if epoch < num_epochs:
        pool = mp.Pool(num_workers)
        jobs = prepare_data(pool)
    et = time.time()
    print('Data Preparation: %.1fs' % (et - st))

    # Train
    print("Training")
    HGT_classifier.train()
    stat = []
    for node_feature, node_type, edge_time, edge_index, edge_type, (train_mask, valid_mask, test_mask), ylabel in datas:
        # Call the GNN directly: the nn.Sequential wrapper's forward takes a
        # single input and cannot receive these five tensors.
        node_rep = hgt_GNN(node_feature, node_type, \
                           edge_time, edge_index, edge_type)
        ylabel = torch.LongTensor(ylabel)
        # Only the first len(ylabel) rows are classified (presumably the
        # sampled paper nodes, which ylabel and the masks are indexed by).
        paper_rep = node_rep[:len(ylabel)]
        train_res  = classifier(paper_rep[train_mask])
        valid_res  = classifier(paper_rep[valid_mask])
        test_res   = classifier(paper_rep[test_mask])

        # Only the training papers contribute to the loss.
        train_loss = criterion(train_res, ylabel[train_mask])

        optimizer.zero_grad()
        train_loss.backward()

        torch.nn.utils.clip_grad_norm_(HGT_classifier.parameters(), clip)
        optimizer.step()

        train_step += 1
        # One scheduler step per optimiser step; the explicit step-index
        # argument is deprecated and the internal counter advances identically.
        scheduler.step()

        train_acc  = evaluator.eval({
                        'y_true': ylabel[train_mask].unsqueeze(-1),
                        'y_pred': train_res.argmax(dim=1).unsqueeze(-1)
                    })['acc']
        valid_acc  = evaluator.eval({
                        'y_true': ylabel[valid_mask].unsqueeze(-1),
                        'y_pred': valid_res.argmax(dim=1).unsqueeze(-1)
                    })['acc']
        test_acc   = evaluator.eval({
                        'y_true': ylabel[test_mask].unsqueeze(-1),
                        'y_pred': test_res.argmax(dim=1).unsqueeze(-1)
                    })['acc']
        stat += [[train_loss.item(), train_acc, valid_acc, test_acc]]
        del node_rep, paper_rep, train_loss, ylabel
    stats += [stat]
    # Column-wise means over this epoch: [loss, train acc, valid acc, test acc].
    avgs = np.average(stat, axis=0)
    # Keep the weights with the best average validation accuracy so far.
    if avgs[2] > best_val:
        best_val = avgs[2]
        torch.save(HGT_classifier.state_dict(), checkpoint_path)
        print('UPDATE!!!  ' + str(best_val))
    print('Epoch: %d LR: %.5f Train Loss: %.4f Train Acc: %.4f Valid Acc: %.4f Test Acc: %.4f' % \
         (epoch,  optimizer.param_groups[0]['lr'], avgs[0], avgs[1], avgs[2], avgs[3]))
    # The next round of sampling is already running; time it from here.
    st = time.time()
    if plot and epoch % 50 == 0:
        # The pyplot API (show) lives in matplotlib.pyplot, not in the
        # top-level matplotlib package.
        import matplotlib.pyplot as plt
        s = np.concatenate(stats)
        # One figure per tracked statistic, over the last 100 epochs of steps.
        for i in range(4):
            recent = s[-batch_number * 100:, i]
            # Named plot_data so the PyG graph object `data` is not overwritten.
            plot_data = np.stack((recent, np.arange(len(recent)) // batch_number), axis=0).transpose()
            sb.lineplot(data = pd.DataFrame(plot_data, columns = ['Value', 'Epoch']), x='Epoch', y='Value')
            plt.show()
            
# Final plots: one figure per tracked statistic (train loss, train acc,
# valid acc, test acc) over the most recent 100 epochs' worth of steps.
if plot:
    # The pyplot API (show) lives in matplotlib.pyplot, not in the top-level
    # matplotlib package.
    import matplotlib.pyplot as plt
    s = np.concatenate(stats)
    for i in range(4):
        # Negative start index = "the last batch_number * 100 steps", matching
        # the periodic plot inside the training loop; a positive start would
        # instead skip that many steps and leave nothing for short runs.
        recent = s[-batch_number * 100:, i]
        # Named plot_data so the PyG graph object `data` is not overwritten.
        plot_data = np.stack((recent, np.arange(len(recent)) // batch_number), axis=0).transpose()
        sb.lineplot(data = pd.DataFrame(plot_data, columns = ['Value', 'Epoch']), x='Epoch', y='Value')
        plt.show()