# Description

We train a simple GNN consisting of a user-specified number of GCN layers with a user-specified hidden dimension. The [ogbn-arxiv dataset](https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv) comes from the [Open Graph Benchmark](https://ogb.stanford.edu/).

The goal is not to reach the state-of-the-art performance of deep GNN models, but rather to illustrate how a GNN is trained using the [PyG](https://pytorch-geometric.readthedocs.io/en/latest/) package.

In [1]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-6a69925c-1d2b-7bf8-6d53-39cfaafd42af)


In [2]:
# Export the installed torch version as the TORCH environment variable so the
# shell commands below can substitute it into the wheel-index URLs.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
# torch-scatter and torch-sparse are looked up in the PyG wheel index that
# corresponds to the torch version exported above.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyG itself is installed from the GitHub repository (no version pin).
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
# Remaining helper packages (no version pins).
!pip install import_ipynb
!pip install ogb
!pip install GPUtil

1.13.1+cu116
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 KB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting import_ipynb
  Downloading import_ipynb-0.1.4-py3-none-any.whl (4.1 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.wh

In [1]:
from torch_geometric.data import Data
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import json
import numpy as np
import argparse
import torch
import sys
import importlib as ipb
import pickle
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, ChebConv
import torch.optim.lr_scheduler as lr_scheduler
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
import GPUtil
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
class Logger(object):
    """Collect per-epoch ``(train, valid, test)`` results for several runs.

    Results are stored as fractions and multiplied by 100 when printed.
    """

    def __init__(self, runs, info=None):
        """Create an empty result list for each of ``runs`` runs.

        Args:
            runs: Number of runs to allocate result lists for.
            info: Optional free-form metadata kept on the instance.
        """
        self.info = info
        # results[run] is the list of 3-element results added for that run.
        self.results = [[] for _ in range(runs)]

    def pickle(self, key_save):
        """Serialize this logger to the file path ``key_save``."""
        # The context manager closes the file even if pickle.dump raises.
        with open(key_save, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def unpickle(self, key_save):
        """Load and return the object pickled at ``key_save``.

        The loaded object is returned; ``self`` is not modified.
        """
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(key_save, 'rb') as f:
            return pickle.load(f)

    def add_result(self, run, result):
        """Append one 3-element ``result`` to the list of run ``run``.

        Raises:
            AssertionError: if ``result`` does not have length 3 or ``run``
                is outside ``[0, number of runs)``.
        """
        assert len(result) == 3
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print accuracy statistics for one run, or aggregated over all runs.

        "Final" values are those at the epoch with the highest validation
        value. With ``run=None`` the mean and standard deviation across runs
        are printed; this requires every run to have the same number of
        results, because they are stacked into a single tensor.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            # Row index of the best validation value for this run.
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f'Highest Test: {result[:, 2].max():.2f}')
            print(f'  Final Train: {result[argmax, 0]:.2f}')
            print(f'  Final Valid: {result[argmax, 1]:.2f}')
            print(f'   Final Test: {result[argmax, 2]:.2f}')
        else:
            result = 100 * torch.tensor(self.results)

            best_results = []
            for r in result:
                best_epoch = r[:, 1].argmax()
                train = r[:, 0].max().item()
                valid = r[:, 1].max().item()
                test = r[:, 2].max().item()
                train2 = r[best_epoch, 0].item()
                test2 = r[best_epoch, 2].item()
                best_results.append((train, valid, test, train2, test2))

            best_result = torch.tensor(best_results)

            print('All runs:')
            # (label, column of best_result). "Final Valid" reuses column 1:
            # at the best-validation epoch it equals the highest validation.
            summary_rows = (
                ('Highest Train', 0),
                ('Highest Valid', 1),
                ('Highest Test', 2),
                ('  Final Train', 3),
                ('  Final Valid', 1),
                ('   Final Test', 4),
            )
            for label, column in summary_rows:
                r = best_result[:, column]
                print(f'{label}: {r.mean():.2f} ± {r.std():.2f}')


def test(model, data_train, data, split_idx, evaluator):
    model.eval()
    with torch.no_grad():
        train_out=model(data_train.x, data_train.edge_index)
        y_pred_train=train_out.argmax(dim=-1, keepdim=True)
        out = model(data.x, data.adj_t)
        y_pred = out.argmax(dim=-1, keepdim=True)
        train_acc = evaluator.eval({
            'y_true': data_train.y,
            'y_pred': y_pred_train,
        })['acc']
        valid_acc = evaluator.eval({
            'y_true': data.y[split_idx['valid']],
            'y_pred': y_pred[split_idx['valid']],
        })['acc']
        test_acc = evaluator.eval({
            'y_true': data.y[split_idx['test']],
            'y_pred': y_pred[split_idx['test']],
        })['acc']
    return train_acc, valid_acc, test_acc

In [3]:
def slide_idx(data, indices):
    """Build the subgraph of ``data`` induced by the nodes in ``indices``.

    The adjacency ``data.adj_t`` is converted to SciPy CSR, restricted to the
    rows and columns in ``indices``, and turned back into a coalesced
    ``edge_index`` whose node ids are positions within ``indices``.

    Args:
        data: Graph object providing ``adj_t`` (with ``to_scipy()``), ``x``
            and ``y``.
        indices: Node indices to keep (tensor or array-like).

    Returns:
        A ``Data`` object with ``x``, ``y`` and ``edge_index`` of the
        subgraph, moved to ``device``.
    """
    # SciPy fancy indexing works on NumPy arrays; convert tensors explicitly
    # (also handles an index tensor that lives on the GPU).
    if torch.is_tensor(indices):
        np_indices = indices.detach().cpu().numpy()
    else:
        np_indices = np.asarray(indices)

    full_mat = data.adj_t.to_scipy().tocsr()
    # Original author's caveat: this is wrong, because it omits "papers" in
    # the future (only edges with both endpoints in `indices` are kept).
    coo = full_mat[np_indices][:, np_indices].tocoo()

    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
    # constructor; coalesce() sorts the indices and merges duplicates.
    edge_pairs = torch.from_numpy(np.vstack((coo.row, coo.col))).long()
    edge_values = torch.as_tensor(coo.data, dtype=torch.float)
    sub_idx = torch.sparse_coo_tensor(
        edge_pairs, edge_values, torch.Size(coo.shape)).coalesce().indices()

    sub_x = data.x[indices]
    sub_y = data.y[indices]
    return Data(x=sub_x, y=sub_y, edge_index=sub_idx).to(device)

def mem_report():
    """Print a short memory report for the active ``device``.

    On CUDA, prints free/total memory and memory utilization for every GPU
    reported by GPUtil. Otherwise prints the free physical RAM of the host,
    or "unavailable" when the platform does not expose it.
    """
    if device.type == 'cuda':
        for i, gpu in enumerate(GPUtil.getGPUs()):
            print('GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'.format(
                i, gpu.memoryFree, gpu.memoryTotal, gpu.memoryUtil*100))
        return

    # The previous CPU branch used `humanize` and `psutil`, which the import
    # cell never imports, so it raised NameError. Use the stdlib instead.
    import os  # local import: `os` is not part of the main import cell

    try:
        free_bytes = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
    except (AttributeError, ValueError, OSError):
        # os.sysconf or these names are missing on some platforms.
        print("CPU RAM Free: unavailable")
        return

    # Format with decimal (1000-based) units.
    size = float(free_bytes)
    unit = 'Bytes'
    for next_unit in ('kB', 'MB', 'GB', 'TB'):
        if size < 1000:
            break
        size /= 1000
        unit = next_unit
    print("CPU RAM Free: " + '{:.1f} {}'.format(size, unit))

In [4]:
def train(model, data, args):
    """Run one full-batch forward/backward pass and return the loss value.

    The model is switched to training mode, evaluated on the whole graph
    (``data.x``, ``data.edge_index``), and the NLL loss against ``data.y``
    is back-propagated. Gradients are neither zeroed nor applied here, so
    the caller owns ``optimizer.zero_grad()`` and ``optimizer.step()``.
    ``args`` is accepted but not used.
    """
    model.train()
    # Full-batch: every node of `data` goes through the model at once.
    log_probs = model(data.x, data.edge_index)
    # squeeze(1) drops dimension 1 of the labels before the loss.
    targets = data.y.squeeze(1)
    nll = F.nll_loss(log_probs, targets)
    nll.backward()
    return nll.item()

In [6]:
class GNN(torch.nn.Module):
    """Stack of GCNConv layers (or ``nn.Linear`` layers when ``FC`` is true).

    The stack is ``in -> hidden``, then ``num_layers - 2`` layers of
    ``hidden -> hidden``, then ``hidden -> out``. One ``BatchNorm1d`` over
    ``hidden_channels`` sits in front of every layer except the first.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout, FC=False):
        """Build the layer stack.

        Args:
            in_channels: Input feature size.
            hidden_channels: Width of every hidden layer.
            out_channels: Output size (number of classes).
            num_layers: Requested depth; the first and last layers are always
                created, so fewer than 2 still yields two layers.
            dropout: Dropout probability used in ``forward``.
            FC: If true, use ``nn.Linear`` layers instead of ``GCNConv``.
        """
        super().__init__()
        self.FC = FC
        self.dropout = dropout

        def build_layer(n_in, n_out):
            if self.FC:
                return nn.Linear(n_in, n_out)
            # NOTE(review): cached=True while the notebook calls the model on
            # both the training subgraph and the full graph -- confirm the
            # cache cannot be shared between the two.
            return GCNConv(n_in, n_out, cached=True)

        n_middle = max(num_layers - 2, 0)
        widths = ([in_channels, hidden_channels]
                  + [hidden_channels] * n_middle
                  + [out_channels])
        self.convs = torch.nn.ModuleList(
            [build_layer(n_in, n_out)
             for n_in, n_out in zip(widths[:-1], widths[1:])])
        # One batch norm per layer boundary, i.e. one fewer than the layers.
        self.bns = torch.nn.ModuleList(
            [torch.nn.BatchNorm1d(hidden_channels)
             for _ in range(len(self.convs) - 1)])

    def forward(self, x, edge_index):
        """Return per-node log-probabilities.

        Every layer after the first is preceded by its batch norm, and every
        layer (including the last) is followed by ReLU. Dropout is applied
        only after the second-to-last layer. ``edge_index`` is ignored when
        ``FC`` is true.
        """
        # NOTE(review): ReLU is also applied to the final layer's output
        # before log_softmax -- confirm this is intended.
        dropout_layer = len(self.convs) - 2
        for layer_no, layer in enumerate(self.convs):
            if layer_no > 0:
                x = self.bns[layer_no - 1](x)
            x = layer(x) if self.FC else layer(x, edge_index)
            x = F.relu(x)
            if layer_no == dropout_layer:
                x = F.dropout(x, p=self.dropout, training=self.training)
        return x.log_softmax(dim=-1)

In [7]:
# Small instance built only to display the module structure. Keyword
# arguments are used because the fifth positional parameter is `dropout`,
# not `FC`; the previous call passed `False` into the dropout slot.
example_mod = GNN(10, 50, 10, 3, dropout=0.0, FC=False)
example_mod

GNN(
  (convs): ModuleList(
    (0): GCNConv(10, 50)
    (1): GCNConv(50, 50)
    (2): GCNConv(50, 10)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [8]:
# Notebook-level settings. They are used as argparse defaults or copied onto
# `args` below, and `optim_name` / `use_SVI` also name the output files.
lr = 0.001
optim_name = 'Adam'
hidden_channels = 512
use_SVI = False
FC = False
change_ratio = False
ratio_mult = 0.25 if change_ratio else 1
if __name__ == "__main__":
    result_dict = {'SVI-SGD': [], 'SVI-Adam': [], 'SGD': [], 'Adam': []}
    parser = argparse.ArgumentParser(
        description='OGBN-Arxiv (GNN)')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=4)
    parser.add_argument('--dropout', type=float, default=0.25)
    parser.add_argument('--lr', type=float, default=lr)
    parser.add_argument('--momentum', type=float, default=0.95)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--batch', type=int, default=1)
    parser.add_argument('--runs', type=int, default=3)
    # `type=bool` would turn any non-empty string (even "False") into True,
    # so parse the text explicitly. The value is still overwritten by
    # `use_SVI` at the start of every run below.
    parser.add_argument(
        '--SVI',
        type=lambda text: str(text).strip().lower() in ('1', 'true', 'yes', 'y'),
        default=use_SVI)
    parser.add_argument(
        '--optimizer', type=str, default=optim_name)
    # Accepts the `-f <file>` argument seen in the printed Namespace when the
    # cell runs in a notebook kernel.
    parser.add_argument('-f')
    args = parser.parse_args()
    args.FC = FC  # If use fully-connected nets instead of GCN layers
    args.change_ratio = change_ratio  # If we change ratio of training and test (only used if "ratio_mult" < 1)
    args.ratio_mult = ratio_mult  # Only use X% of training data
    # lr_drop / dec_epoch are only read by the commented-out decay step below.
    args.lr_drop = 0.98
    args.dec_epoch = 100
    if 1e-3 < args.lr and args.lr <= 1e-2:
        args.lr_drop = 0.96
    # args.hidden_channels = 512 if args.num_layers >= 3 else 1000
    args.hidden_channels = hidden_channels
    print(args)

    # ---- Data ----
    dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                    transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)
    split_idx = dataset.get_idx_split()
    # Split train to train sub and valid
    # Always do because we do not want to "peek" into test features
    train_full = split_idx['train']
    N = len(train_full)
    np.random.seed(1103)
    # Keep a sorted random subset of the training nodes (all of them when
    # ratio_mult == 1).
    rand_idx = np.sort(np.random.choice(N, int(N*args.ratio_mult), replace=False))
    split_idx['train'] = split_idx['train'][rand_idx]
    data_train = slide_idx(data, split_idx['train'])

    # ---- Output naming (does not change across runs) ----
    key = f'SVI-{optim_name}' if use_SVI else optim_name
    fc_use = '-FC' if args.FC else ''
    c_ratio = '-change_ratio' if args.change_ratio else ''
    ratio = args.ratio_mult if args.ratio_mult < 1 else ''
    key_save = f'{key}-{args.num_layers}layers-{args.hidden_channels}nodes-{args.lr}LR{fc_use}{c_ratio}{ratio}_correct_split'

    logger = Logger(args.runs, args)
    results_over_runs = {}
    # Epoch after which SVI training is switched off (an earlier setting
    # was 50).
    epoch_stop_SVI = 1000
    for run in range(args.runs):
        accu_at_run = []
        # Reset for every run because SVI may have been switched off during
        # the previous one.
        args.SVI = use_SVI
        torch.manual_seed(1103 + run)
        model = GNN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers,
                    args.dropout, args.FC).to(device)
        evaluator = Evaluator(name='ogbn-arxiv')
        if args.optimizer == 'SGD':
            optimizer = torch.optim.SGD(
                model.parameters(), lr=args.lr, momentum=args.momentum, nesterov=True)
        else:
            optimizer = torch.optim.Adam(
                model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            if device.type == 'cuda':
                # Useful to avoid GPU allocation excess
                torch.cuda.empty_cache()
            print(f"LR is {optimizer.param_groups[0]['lr']}")
            optimizer.zero_grad()
            if epoch == epoch_stop_SVI + 1 and args.SVI:
                # Reinitialize optimizer to avoid gradient issue
                args.SVI = False
                sdict = model.state_dict()
                print(
                    '############ Pause SVI from now on ############')
                model = GNN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers,
                    args.dropout, args.FC).to(device)
                model.load_state_dict(sdict)
                model = model.to(device)
                # NOTE(review): Adam is used here regardless of
                # args.optimizer -- confirm this is intended.
                optimizer = torch.optim.Adam(
                    model.parameters(), lr=args.lr)
            if args.SVI:
                print(f'SVI-{args.optimizer} training')
                # NOTE(review): train_SVI is not defined in the visible part
                # of the notebook; it must exist before enabling SVI.
                loss = train_SVI(model, data_train, args)
            else:
                print(f'{args.optimizer} training')
                loss = train(model, data_train, args)
            optimizer.step()
            mem_report()
            # if epoch > args.dec_epoch:
            #     for p in optimizer.param_groups:
            #         p['lr'] *= args.lr_drop
            print('Testing')
            result = test(model, data_train, data, split_idx, evaluator)
            mem_report()
            logger.add_result(run, result)
            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                accu_at_run += [[train_acc, valid_acc, test_acc]]
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        # Running np.array(accu_at_run) would make it into Epoch-by-3 matrices, but doing so causes .json saving error so I just use the list version
        results_over_runs[f'lr={args.lr}@Run{run+1}'] = accu_at_run
        logger.print_statistics(run)
        # Save results after every run so finished runs are kept on disk.
        logger.pickle(key_save)
        # Assign rather than append: `results_over_runs` already accumulates
        # every run, and appending the same dict once per run wrote duplicate
        # copies of it into the JSON file.
        result_dict[key] = [results_over_runs]
        with open(f"{key_save}.json", "w") as outfile:
            json.dump(result_dict, outfile)
    logger.print_statistics()

Namespace(FC=False, SVI=False, batch=1, change_ratio=False, dec_epoch=100, dropout=0.25, epochs=5, f='/root/.local/share/jupyter/runtime/kernel-39b682c3-ad09-4f3a-a3c4-7fdfa7129cb9.json', hidden_channels=512, log_steps=1, lr=0.001, lr_drop=0.98, momentum=0.95, num_layers=4, optimizer='Adam', ratio_mult=1, runs=3)




LR is 0.001
Adam training
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Testing
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Run: 01, Epoch: 01, Loss: 4.0162, Train: 11.79%, Valid: 23.30% Test: 21.87%
LR is 0.001
Adam training
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Testing
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Run: 01, Epoch: 02, Loss: 2.8619, Train: 25.56%, Valid: 31.64% Test: 34.01%
LR is 0.001
Adam training
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Testing
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Run: 01, Epoch: 03, Loss: 2.2463, Train: 17.99%, Valid: 18.81% Test: 24.98%
LR is 0.001
Adam training
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Testing
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Run: 01, Epoch: 04, Loss: 1.9026, Train: 17.14%, Valid: 18.46% Test: 24.63%
LR is 0.001
Adam training
GPU 0 ... Mem Free: 8857MB / 15360MB | Utilization  41%
Testing
GPU 0 ... Mem Free: 88