# GNN Architectures

In [None]:
import os
# Set before any CUDA work is done. Per the CUDA documentation this makes kernel
# launches synchronous, so a device-side error is reported at the call that
# caused it (handy for debugging, at the cost of speed).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
import torch
# Export the installed torch version so the shell commands below can expand
# ${TORCH} and select the matching wheel index page.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# Install the PyG companion packages from the wheel index for this torch build,
# then PyTorch Geometric itself from the GitHub repository.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.1.0+cu118
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import random

from tqdm import tqdm
import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
import torch_geometric.nn as pyg_nn
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from functools import partial
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# import pytorch_lightning as pl
# from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
# from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import torch
import torch.nn as nn
import torch_geometric.nn as pyg_nn
import torch_geometric.transforms as T
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
import time
from math import ceil

from torch_geometric.data import DenseDataLoader

## Load a dataset

We use the `ENZYMES` dataset from the TUDataset collection, which contains 600 enzyme (protein) graphs, each labelled with one of 6 classes.

In [None]:
from torch_geometric.datasets import TUDataset

# Load the ENZYMES benchmark, stored under data/TUDataset.
dataset = TUDataset(root='data/TUDataset', name='ENZYMES')

# Print a short summary of the dataset.
print(f'Dataset: {dataset}:')
print('======================')
summary = (
    ('graphs', len(dataset)),
    ('features', dataset.num_features),
    ('classes', dataset.num_classes),
)
for label, count in summary:
    print(f'Number of {label}: {count}')

Dataset: ENZYMES(600):
Number of graphs: 600
Number of features: 3
Number of classes: 6




In [None]:
# Take the first graph as a sample; the dataset holds many graphs.
data = dataset[0]

print(f"Sample graph: {data}")
print('==============================================================')

# Average graph size over the whole dataset.
node_counts = [graph.num_nodes for graph in dataset]
edge_counts = [graph.num_edges for graph in dataset]
print(f'Number of avg. nodes: {np.mean(node_counts):.2f}')
print(f'Number of avg. edges: {np.mean(edge_counts):.2f}')

Sample graph: Data(edge_index=[2, 168], x=[37, 3], y=[1])
Number of avg. nodes: 32.63
Number of avg. edges: 124.27


# Mini-batching for graphs

We make use of different dataloaders implemented in PyTorch Geometric: https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html#torch_geometric.loader.DataLoader

In [None]:
# `torch_geometric.data.DataLoader` is a deprecated alias; the loader lives in
# `torch_geometric.loader`. Importing it from there also keeps the notebook-wide
# `DataLoader` name bound to the supported class for the later cells.
from torch_geometric.loader import DataLoader

# Create a PyTorch Geometric DataLoader object for easy graph mini-batching.
BATCH_SIZE = 16
graph_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)



In [None]:
# Get a batch from the dataloader (the loader shuffles, so the graphs drawn
# differ between runs unless the RNG is seeded).
sample_batch = next(iter(graph_dataloader))

Let's investigate the batch object we have received

In [None]:
# Bare expression: the notebook displays the batch's repr as the cell output.
sample_batch

DataBatch(edge_index=[2, 1884], x=[500, 3], y=[16], batch=[500], ptr=[17])

It contains a single `edge_index`, a single node feature matrix `x`,
a single target label vector `y` (one label per graph), and a batch indicator vector `batch` that maps every node to the index of the graph it belongs to.

The dataloader merged all graphs into a single set of disjoint graphs.
Standard message passing operators can natively run on this representation,
because no messages are passed between the disjoint set of graphs.

This allows for efficient mini-batching and parallel processing
of different graphs, without any memory overhead of e.g. additional padding.

In [None]:
from torch_geometric.nn import SAGEConv

# Create a single GraphSAGE convolution layer (not a full model) that maps the
# dataset's node features to 16 output channels.
conv = SAGEConv(dataset.num_features, 16)

# Run the convolution operator on the whole mini-batch at once; only node
# features and the merged edge_index are needed.
out = conv(sample_batch.x, sample_batch.edge_index)

# Check the size of the output: one 16-dimensional embedding per node.
print(f'Output size: {out.size()}')

Output size: torch.Size([500, 16])


Now we have node embeddings for all the graphs, but what if we want to
aggregate them into individual representations for each graph?

We need to make use of the `batch` indicator attribute!

In [None]:
# `batch` has one entry per node; its distinct values are the graph indices
# present in this mini-batch.
print(f"Number of nodes: {sample_batch.num_nodes}, batch indicator matrix: {sample_batch.batch.shape}")
print(f"Batch size: {BATCH_SIZE}, unique batch indicator values: {sample_batch.batch.unique()}")

Number of nodes: 500, batch indicator matrix: torch.Size([500])
Batch size: 16, unique batch indicator values: tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])


In [None]:
# this is one of the helper libraries recommended to install along pytorch geometric
import torch_scatter

# The `scatter` function supports a set of aggregations: https://pytorch-scatter.readthedocs.io/en/latest/functions/scatter.html
# Here node embeddings (rows of `out`) are grouped by their graph index in
# `sample_batch.batch` and averaged, giving one embedding per graph.
graph_embeddings  = torch_scatter.scatter(out, sample_batch.batch, dim=0, reduce="mean")
print(f"Graph embedding shape: {graph_embeddings.shape}")

Graph embedding shape: torch.Size([16, 16])


PyG now supports the same operation out of the box, in an even simpler form:

In [None]:
from torch_geometric.nn.pool import global_mean_pool

In [None]:
# Same per-graph mean as the scatter call above, via PyG's pooling helper.
graph_embeddings = global_mean_pool(out, sample_batch.batch)
print(f"Graph embedding shape: {graph_embeddings.shape}")

Graph embedding shape: torch.Size([16, 16])


3.1 Baseline GNN

We follow the paper *Design Space for Graph Neural Networks* (You et al., NeurIPS 2020), in which a basic model consists of the following blocks:

1. Pre-processing MLP Layers
2. Message Passing Layers (GNN + BN + Activation + Dropout)
3. (*) Skip-connection Layers / Residual Blocks
4. Pooling Layer(s).
5. Post-processing MLP Layers

In [None]:
class basicGNN(nn.Module):
    """Baseline GNN for graph-level classification.

    The layout follows "Design Space for Graph Neural Networks"
    (You et al., NeurIPS 2020):

    1. two pre-processing blocks (Linear -> BatchNorm -> PReLU),
    2. ``depth`` message-passing layers of type ``method``,
    3. optional residual or skip (concatenation) connections,
    4. global pooling over the nodes of every graph,
    5. three post-processing layers ending in ``output_dim`` log-probabilities.

    Args:
        depth: number of message-passing layers.
        method: one of "GAT", "GIN", "GraphSAGE" or "GCN".
        dropout: dropout probability used inside the per-layer MLPs.
        pool: "mean" or "max"; any other value leaves the node embeddings
            unpooled.
        input_dim: number of input node features.
        output_dim: number of classes.
        embed_dim: hidden width used throughout the network.
        connection: "residual" adds every layer's output to its input,
            "skip" concatenates the outputs of all layers; any other value
            stacks the layers without extra connections.

    Raises:
        ValueError: if ``method`` is not one of the supported operators.
    """

    _SUPPORTED_METHODS = ("GAT", "GIN", "GraphSAGE", "GCN")

    def __init__(self, depth, method, dropout, pool, input_dim, output_dim, embed_dim, connection):
        super(basicGNN, self).__init__()
        # Fail fast: an unknown method used to leave `self.convs` empty and only
        # surfaced as an IndexError inside `forward`.
        if method not in self._SUPPORTED_METHODS:
            raise ValueError(
                f"Unsupported method {method!r}; expected one of {self._SUPPORTED_METHODS}"
            )

        self.depth = depth
        self.method = method
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.embed_dim = embed_dim
        self.dropout = dropout
        self.connection = connection

        self.convs = nn.ModuleList()
        self.preprocess_mlp = nn.ModuleList()
        self.gnn_mlp = nn.ModuleList()
        self.postprocess_mlp = nn.ModuleList()
        self.pool = pool

        # 1. Pre-processing layers: input_dim -> embed_dim -> embed_dim.
        for in_dim in (self.input_dim, self.embed_dim):
            self.preprocess_mlp.append(self._mlp_block(in_dim, self.embed_dim))

        # 2. Post-processing layers. Only the "skip" connection concatenates the
        # outputs of all layers in `forward`, so only then is the pooled
        # embedding `depth` times wider. (Previously every non-residual
        # connection got the wide layer, which mismatched the unconcatenated
        # input whenever depth > 1.)
        if self.connection == "skip":
            post_in_dim = self.embed_dim * self.depth
        else:
            post_in_dim = self.embed_dim
        self.postprocess_mlp.append(self._mlp_block(post_in_dim, self.embed_dim))
        self.postprocess_mlp.append(self._mlp_block(self.embed_dim, self.embed_dim))
        # Final classifier; kept inside nn.Sequential so state_dict keys do not change.
        self.postprocess_mlp.append(nn.Sequential(
            nn.Linear(self.embed_dim, self.output_dim),
        ))

        # 3. One two-layer MLP per message-passing layer, applied as
        # ACT[DROPOUT[BN[Linear]]].
        # NOTE: an earlier comment said dropout should be removed following the
        # paper, but the layers have always been part of the block; pass
        # dropout=0 to disable them.
        for _ in range(self.depth):
            self.gnn_mlp.append(nn.Sequential(
                nn.Linear(self.embed_dim, self.embed_dim),
                nn.BatchNorm1d(self.embed_dim),
                nn.Dropout(self.dropout),
                nn.PReLU(),
                nn.Linear(self.embed_dim, self.embed_dim),
                nn.BatchNorm1d(self.embed_dim),
                nn.Dropout(self.dropout),
                nn.PReLU(),
            ))

        # 4. Graph encoders: GCN, GraphSAGE, GAT and GIN are offered as baselines.
        for i in range(self.depth):
            if method == "GAT":
                self.convs.append(pyg_nn.GATConv(self.embed_dim, self.embed_dim))
            elif method == "GIN":
                # GINConv wraps the i-th MLP instead of having it applied up front.
                self.convs.append(pyg_nn.GINConv(self.gnn_mlp[i]))
            elif method == "GraphSAGE":
                self.convs.append(pyg_nn.SAGEConv(self.embed_dim, self.embed_dim, normalize=True))
            else:  # "GCN" (validated above)
                self.convs.append(pyg_nn.GCNConv(self.embed_dim, self.embed_dim))

    @staticmethod
    def _mlp_block(in_dim, out_dim):
        """Return a Linear -> BatchNorm1d -> PReLU block."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.PReLU(),
        )

    def forward(self, x, edge_index, batch, mask = None):
        """Return per-graph log-probabilities.

        Args:
            x: node feature matrix of the mini-batch.
            edge_index: edge index of the merged (disjoint) batch graph.
            batch: graph index of every node, used by the pooling step.
            mask: unused; accepted for signature compatibility.
        """
        # 1. preprocess
        h = self.preprocess_mlp[0](x)
        h = self.preprocess_mlp[1](h)

        # 2. message passing with residual / skip connections
        use_residual = self.connection == "residual"
        # Outputs of the individual layers; kept as an attribute as before.
        self.save_results = []
        for i in range(self.depth):
            # GINConv already contains gnn_mlp[i]; every other operator gets
            # the MLP applied to its input first.
            conv_input = h if self.method == "GIN" else self.gnn_mlp[i](h)
            conv_output = self.convs[i](conv_input, edge_index)
            if use_residual:
                h = h + conv_output
            else:
                h = conv_output
                self.save_results.append(h)

        if self.connection == "skip":
            h = torch.cat(self.save_results, dim=1)

        # 3. pooling for graph classification
        if self.pool == "mean":
            h = pyg_nn.global_mean_pool(h, batch)
        elif self.pool == "max":
            h = pyg_nn.global_max_pool(h, batch)

        # 4. postprocess
        h = self.postprocess_mlp[0](h)
        h = self.postprocess_mlp[1](h)
        h = self.postprocess_mlp[2](h)  # class scores
        out = F.log_softmax(h, dim=1)

        return out



In [None]:
def shuffle(dataset, seed):
    """Seed torch's global RNG with ``seed`` and return ``dataset.shuffle()``.

    Seeding first makes the resulting permutation reproducible for a given seed.
    """
    torch.manual_seed(seed)
    shuffled = dataset.shuffle()
    return shuffled

def train_test_val_split(num_test, batch_size, dataset):
    """Split ``dataset`` by position and wrap each part in a ``DataLoader``.

    The first ``num_test`` graphs form the test set, the next ``num_test`` the
    validation set and everything after that the training set. No shuffle
    argument is passed to any of the loaders.

    Returns:
        ``(test_loader, val_loader, train_loader)`` in that order.
    """
    val_end = 2 * num_test
    subsets = (dataset[:num_test], dataset[num_test:val_end], dataset[val_end:])
    test_loader, val_loader, train_loader = (
        DataLoader(subset, batch_size=batch_size) for subset in subsets
    )
    return test_loader, val_loader, train_loader

def train(epoch, model, train_loader, device, optimizer):
    """Run one training epoch and return the mean NLL loss per graph.

    Args:
        epoch: current epoch number (not used by the function).
        model: network called as ``model(x, edge_index, batch)`` and returning
            log-probabilities.
        train_loader: iterable of mini-batches exposing a ``dataset`` attribute.
        device: device every mini-batch is moved to.
        optimizer: optimizer updating ``model``'s parameters.
    """
    model.train()
    weighted_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        log_probs = model(batch.x, batch.edge_index, batch.batch)
        targets = batch.y.view(-1)
        batch_loss = F.nll_loss(log_probs, targets)
        batch_loss.backward()
        optimizer.step()
        # Weight the batch mean by the batch size so that the final division
        # gives a per-graph average even when the last batch is smaller.
        weighted_loss += batch_loss.item() * len(batch.y)
    return weighted_loss / len(train_loader.dataset)

@torch.no_grad()
def test(loader, model, device):
    """Return the classification accuracy of ``model`` on ``loader``.

    Args:
        loader: iterable of mini-batches exposing a ``dataset`` attribute.
        model: network called as ``model(x, edge_index, batch)`` and returning
            one row of class scores per graph.
        device: device every mini-batch is moved to.

    Returns:
        Number of correct predictions divided by ``len(loader.dataset)``.
    """
    model.eval()
    correct = 0
    for data in loader:
        data = data.to(device)
        pred = model(data.x, data.edge_index, data.batch).max(dim=1)[1]
        # Flatten the labels exactly as `train` does: comparing against a
        # [B, 1] label tensor would broadcast to [B, B] and inflate the count.
        correct += pred.eq(data.y.view(-1)).sum().item()
    return correct / len(loader.dataset)


In [None]:
# Three base seeds, each repeated for five runs (15 runs in total).
seed = [base_seed for base_seed in (0, 42, 418004) for _ in range(5)]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# ------------ hyperparameters --------------- #
modelname = "GraphSAGE"
depth = 2
dropout = 0.15
aggr = "mean"  # passed to basicGNN as its `pool` argument
embed_dim = 128
connection = "skip"
lr = 0.002
weight_decay = 5e-4
batch_size = 64
epochs = 801
num_test = 100  # graphs in the test split (and in the validation split)
test_acc_ = []
# ------------- begin training session  -----------------#
# Every run shuffles the dataset with its seed, re-seeds torch before building
# the model, trains, and records the test accuracy measured at the epoch with
# the best validation accuracy. Mean and std over all runs are printed last.
for i, run_seed in enumerate(seed):
    dataset = shuffle(TUDataset(name='ENZYMES', root='data/TUDataset'), run_seed)
    torch.manual_seed(seed=run_seed)
    model = basicGNN(
        depth, modelname, dropout, aggr,
        dataset.num_features, dataset.num_classes,
        embed_dim, connection,
    ).to(device)
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=lr, weight_decay=weight_decay,
    )
    test_loader, val_loader, train_loader = train_test_val_split(
        num_test, batch_size, dataset,
    )
    best_val_acc = test_acc = 0
    times = []  # never filled; kept so the notebook's globals stay the same
    for epoch in range(epochs):
        train_loss = train(epoch, model, train_loader, device, optimizer)
        val_acc = test(val_loader, model, device)
        # Model selection on validation accuracy: the test set is only
        # evaluated when the validation accuracy improves.
        if val_acc > best_val_acc:
            test_acc = test(test_loader, model, device)
            best_val_acc = val_acc
    test_acc_.append(test_acc)
    print(f"for seed i: {run_seed}, model {modelname} has the best test accuracy: {test_acc}")

print(f"average accuracy {np.mean(test_acc_)} with {np.std(test_acc_)} for method {modelname}, with seeds {seed}")




for seed i: 0, model GraphSAGE has the best test accuracy: 0.63
for seed i: 0, model GraphSAGE has the best test accuracy: 0.58
for seed i: 0, model GraphSAGE has the best test accuracy: 0.62
for seed i: 0, model GraphSAGE has the best test accuracy: 0.6
for seed i: 0, model GraphSAGE has the best test accuracy: 0.61
for seed i: 42, model GraphSAGE has the best test accuracy: 0.48
for seed i: 42, model GraphSAGE has the best test accuracy: 0.54
for seed i: 42, model GraphSAGE has the best test accuracy: 0.56
for seed i: 42, model GraphSAGE has the best test accuracy: 0.56
for seed i: 42, model GraphSAGE has the best test accuracy: 0.5
for seed i: 418004, model GraphSAGE has the best test accuracy: 0.5
for seed i: 418004, model GraphSAGE has the best test accuracy: 0.51
for seed i: 418004, model GraphSAGE has the best test accuracy: 0.52
for seed i: 418004, model GraphSAGE has the best test accuracy: 0.53
for seed i: 418004, model GraphSAGE has the best test accuracy: 0.55
average accur

3.2 Improvement on pooling

DiffPool (Ying et al., NeurIPS 2018) reports that stacking hierarchical pooling layers on top of GraphSAGE leads to better graph-classification performance.

Training Graph Network with Hierarchical DiffPool

In [None]:
class GNN(torch.nn.Module):
    """Three stacked ``DenseSAGEConv`` layers whose outputs are concatenated.

    Args:
        in_channels: feature width of the input nodes.
        hidden_channels: output width of the first two convolutions.
        out_channels: output width of the third convolution (and of the
            optional final linear layer).
        normalize: forwarded to every ``DenseSAGEConv``.
        lin: if ``True``, a linear layer maps the concatenated
            ``2 * hidden_channels + out_channels`` features to ``out_channels``.
        batch_norm: if true, a ``BatchNorm1d`` layer is applied after every
            convolution. Defaults to ``False``, which matches the previous
            behaviour (no normalisation layers).
    """

    def __init__(self, in_channels, hidden_channels, out_channels,
                 normalize=False, lin=True, batch_norm=False):
        super().__init__()
        self.conv1 = pyg_nn.DenseSAGEConv(in_channels, hidden_channels, normalize)
        self.conv2 = pyg_nn.DenseSAGEConv(hidden_channels, hidden_channels, normalize)
        self.conv3 = pyg_nn.DenseSAGEConv(hidden_channels, out_channels, normalize)
        if batch_norm:
            # Layers looked up by `bn`. They used to be missing altogether, so
            # any call to `bn` raised AttributeError.
            self.bn1 = torch.nn.BatchNorm1d(hidden_channels)
            self.bn2 = torch.nn.BatchNorm1d(hidden_channels)
            self.bn3 = torch.nn.BatchNorm1d(out_channels)
        self.lin = torch.nn.Linear(2 * hidden_channels + out_channels,
                                       out_channels) if lin is True else None

    def bn(self, i, x):
        """Apply the ``i``-th batch-norm layer to a 3-D tensor ``x``.

        The first two dimensions are flattened so that ``BatchNorm1d``
        normalises over all of them, then the original shape is restored.
        If no ``bn{i}`` layer is registered, ``x`` is returned unchanged.
        """
        layer = getattr(self, f'bn{i}', None)
        if layer is None:
            return x
        batch_size, num_nodes, num_channels = x.size()
        x = layer(x.view(-1, num_channels))
        return x.view(batch_size, num_nodes, num_channels)

    def forward(self, x, adj, mask=None):
        """Return the (optionally linearly projected) concatenation of all three layer outputs."""
        x1 = self.bn(1, self.conv1(x, adj, mask).relu())
        x2 = self.bn(2, self.conv2(x1, adj, mask).relu())
        x3 = self.bn(3, self.conv3(x2, adj, mask).relu())
        x = torch.cat([x1, x2, x3], dim=-1)
        if self.lin is not None:
            x = self.lin(x).relu()
        return x

class DiffPoolGNN(torch.nn.Module):
    """Graph classifier with two hierarchical ``dense_diff_pool`` stages.

    Args:
        assign_ratio: factor applied to the node count at every pooling stage.
        embed_dim: hidden width of every embedding GNN.
        output_dim: number of classes.
        max_nodes: node count of the (padded) dense input graphs.
        input_dim: number of input node features. When ``None`` (default) it
            is read from the notebook-level ``dataset.num_features``, which is
            what the class always did before this parameter existed.
    """

    def __init__(self, assign_ratio, embed_dim, output_dim, max_nodes, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible fallback on the global dataset.
            input_dim = dataset.num_features

        # Stage 1: assign the input nodes to ceil(assign_ratio * max_nodes) clusters.
        num_nodes = ceil(assign_ratio * max_nodes)
        self.gnn1_pool = GNN(input_dim, embed_dim, num_nodes)
        self.gnn1_embed = GNN(input_dim, embed_dim, embed_dim, lin=False)

        # Stage 2: shrink the cluster count again by the same ratio. With
        # lin=False an embedding GNN outputs 3 * embed_dim features.
        num_nodes = ceil(assign_ratio * num_nodes)
        self.gnn2_pool = GNN(3 * embed_dim, embed_dim, num_nodes)
        self.gnn2_embed = GNN(3 * embed_dim, embed_dim, embed_dim, lin=False)

        # NOTE: `gnn3_pool` and `gnn4_embed` are never called in `forward`.
        # They are kept so existing checkpoints and the seeded parameter
        # initialisation of the other layers are not affected.
        self.gnn3_pool = GNN(3 * embed_dim, embed_dim, num_nodes)
        self.gnn3_embed = GNN(3 * embed_dim, embed_dim, embed_dim, lin=False)
        self.gnn4_embed = GNN(3 * embed_dim, embed_dim, embed_dim, lin=False)

        self.lin1 = torch.nn.Linear(3 * embed_dim, embed_dim)
        self.lin2 = torch.nn.Linear(embed_dim, output_dim)

    def forward(self, x, adj, mask=None):
        """Return ``(log_probs, l1 + l2, e1 + e2)``.

        ``l*`` and ``e*`` are the third and fourth values returned by the two
        ``dense_diff_pool`` calls (its auxiliary losses; see the PyG docs).
        """
        # Stage 1: the mask is only needed here, where the input is still padded.
        s = self.gnn1_pool(x, adj, mask)
        x = self.gnn1_embed(x, adj, mask)
        x, adj, l1, e1 = pyg_nn.dense_diff_pool(x, adj, s, mask)

        # Stage 2 operates on the coarsened graph.
        s = self.gnn2_pool(x, adj)
        x = self.gnn2_embed(x, adj)
        x, adj, l2, e2 = pyg_nn.dense_diff_pool(x, adj, s)

        # Final embedding of the coarsest graph, averaged over its clusters.
        x = self.gnn3_embed(x, adj)
        x = x.mean(dim=1)
        x = self.lin1(x).relu()
        x = self.lin2(x)
        return F.log_softmax(x, dim=-1), l1 + l2, e1 + e2

In [None]:
def train_test_val_split(num_test, batch_size, dataset):
    """Split ``dataset`` by position and wrap each part in a ``DenseDataLoader``.

    The first ``num_test`` graphs form the test set, the graphs up to index
    ``int(2.5 * num_test)`` the validation set, and the rest the training set.
    No shuffle argument is passed to any of the loaders.

    Returns:
        ``(test_loader, val_loader, train_loader)`` in that order.
    """
    val_end = int(2.5 * num_test)
    subsets = (dataset[:num_test], dataset[num_test:val_end], dataset[val_end:])
    test_loader, val_loader, train_loader = (
        DenseDataLoader(subset, batch_size=batch_size) for subset in subsets
    )
    return test_loader, val_loader, train_loader

def train(epoch, model, train_loader, device, optimizer):
    """Run one training epoch on dense batches and return the mean NLL loss per graph.

    Args:
        epoch: current epoch number (not used by the function).
        model: network called as ``model(x, adj, mask)`` and returning a
            3-tuple whose first element holds the log-probabilities; the other
            two elements are ignored here.
        train_loader: iterable of mini-batches exposing a ``dataset`` attribute.
        device: device every mini-batch is moved to.
        optimizer: optimizer updating ``model``'s parameters.
    """
    model.train()
    weighted_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad(set_to_none=True)
        log_probs, _aux_a, _aux_b = model(batch.x, batch.adj, batch.mask)
        batch_loss = F.nll_loss(log_probs, batch.y.view(-1))
        batch_loss.backward()
        optimizer.step()
        # Weight the batch mean by the batch size so that the final division
        # gives a per-graph average.
        weighted_loss += batch.y.size(0) * float(batch_loss)
    return weighted_loss / len(train_loader.dataset)

@torch.no_grad()
def test(loader, model, device):
    """Return the accuracy of ``model`` on the dense batches of ``loader``.

    The model is called as ``model(x, adj, mask)``; the first element of its
    output is taken as the class scores. The result is the number of correct
    predictions divided by ``len(loader.dataset)``.
    """
    model.eval()
    num_correct = 0
    for batch in loader:
        batch = batch.to(device)
        log_probs = model(batch.x, batch.adj, batch.mask)[0]
        _, predicted = log_probs.max(dim=1)
        labels = batch.y.view(-1)
        num_correct += int(predicted.eq(labels).sum())
    return num_correct / len(loader.dataset)


In [None]:

from math import ceil

# Dense batching: ToDense pads every graph to `max_nodes` nodes.
max_nodes = 300
dataset = TUDataset(
    name='ENZYMES',
    root='data/TUDataset',
    transform=T.ToDense(max_nodes),
    pre_filter=lambda data: data.num_nodes <= max_nodes,
)

# ------------ hyperparameters --------------- #
num_test = 100
assign_ratio = 0.25
embed_dim = 64
output_dim = dataset.num_classes
batch_size = 25
seed = [1, 42, 418004]  # one full training run per seed
lr = 0.001
epochs = 450
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
test_acc_ = []

for i in seed:
    # Seeding fixes both the shuffle below and the parameter initialisation.
    torch.manual_seed(i)
    dataset_ = dataset.shuffle()
    test_loader, val_loader, train_loader = train_test_val_split(num_test, batch_size, dataset_)
    model = DiffPoolGNN(assign_ratio, embed_dim, output_dim, max_nodes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val_acc = test_acc = 0
    # Epochs are numbered 1..epochs inclusive. `range(1, epochs)` stopped one
    # epoch short of the configured count.
    for epoch in range(1, epochs + 1):
        train_loss = train(epoch, model, train_loader, device, optimizer)
        val_acc = test(val_loader, model, device)
        # Model selection on validation accuracy: the test set is only
        # evaluated when the validation accuracy improves.
        if val_acc > best_val_acc:
            test_acc = test(test_loader, model, device)
            best_val_acc = val_acc
        if epoch % 50 == 0:
            print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, '
                  f'Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {test_acc:.4f}')
    test_acc_.append(test_acc)
    print(f"for seed i: {i}, model has the best test accuracy: {test_acc}")

print(f"average accuracy {np.mean(test_acc_)} with {np.std(test_acc_)} for diffpool, with seeds {seed}")




cuda
Epoch: 050, Train Loss: 1.0482, Best Val Acc: 0.4600, Best Test Acc: 0.4200
Epoch: 100, Train Loss: 0.4037, Best Val Acc: 0.4800, Best Test Acc: 0.4800
Epoch: 150, Train Loss: 0.2096, Best Val Acc: 0.4867, Best Test Acc: 0.5500
Epoch: 200, Train Loss: 0.0956, Best Val Acc: 0.5133, Best Test Acc: 0.5400
Epoch: 250, Train Loss: 0.0248, Best Val Acc: 0.5200, Best Test Acc: 0.6000
Epoch: 300, Train Loss: 0.0147, Best Val Acc: 0.5267, Best Test Acc: 0.6100
Epoch: 350, Train Loss: 0.0124, Best Val Acc: 0.5267, Best Test Acc: 0.6100
Epoch: 400, Train Loss: 0.0183, Best Val Acc: 0.5267, Best Test Acc: 0.6100
for seed i: 1, model has the best test accuracy: 0.61
Epoch: 050, Train Loss: 1.2283, Best Val Acc: 0.4200, Best Test Acc: 0.4800
Epoch: 100, Train Loss: 0.9783, Best Val Acc: 0.5000, Best Test Acc: 0.5100
Epoch: 150, Train Loss: 0.3180, Best Val Acc: 0.5467, Best Test Acc: 0.6000
Epoch: 200, Train Loss: 0.2299, Best Val Acc: 0.5467, Best Test Acc: 0.6000
Epoch: 250, Train Loss: 0.006