## **Pytorch-Geometric Benchmarks**
- More deeply implements about default benchmark graph datasets of the pytorch-geometric

In [1]:
import torch
import numpy as np
from torch_geometric.data import Data

## **TUDataset**
- A variety of graph kernel benchmark datasets, .e.g. **“IMDB-BINARY”**, **“REDDIT-BINARY”** or **“PROTEINS”**, collected from the TU Dortmund University. 
- This dataset wrapper provides cleaned dataset versions as motivated by the “Understanding Isomorphism Bias in Graph Data Sets” paper, **containing only non-isomorphic graphs**.
- In this project, we will use the **PROTEINS** dataset for ***graph classification***

In [2]:
from torch_geometric.datasets import TUDataset

In [3]:
dataset_PROTEINS_full = TUDataset(root='/tmp/PROTEINS_full', name='PROTEINS_full', use_node_attr=True, use_edge_attr=True)
print(dataset_PROTEINS_full)
print(dataset_PROTEINS_full[0])

PROTEINS_full(1113)
Data(edge_index=[2, 162], x=[42, 32], y=[1])


#### *Original Source Code of the Dataset*

In [4]:
import os
import os.path as osp
import shutil
import torch
from torch_geometric.data import InMemoryDataset, download_url, extract_zip, DataLoader
from torch_geometric.io import read_tu_data

class TUDataset(InMemoryDataset):
    r"""A variety of graph kernel benchmark datasets, *.e.g.* "IMDB-BINARY",
    "REDDIT-BINARY" or "PROTEINS", collected from the `TU Dortmund University
    <https://chrsmrrs.github.io/datasets>`_.
    In addition, this dataset wrapper provides `cleaned dataset versions
    <https://github.com/nd7141/graph_datasets>`_ as motivated by the
    `"Understanding Isomorphism Bias in Graph Data Sets"
    <https://arxiv.org/abs/1910.12091>`_ paper, containing only non-isomorphic
    graphs.

    .. note::
        Some datasets may not come with any node labels.
        You can then either make use of the argument :obj:`use_node_attr`
        to load additional continuous node attributes (if present) or provide
        synthetic node features using transforms such as
        like :class:`torch_geometric.transforms.Constant` or
        :class:`torch_geometric.transforms.OneHotDegree`.

    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): The `name
            <https://chrsmrrs.github.io/datasets/docs/datasets/>`_ of the
            dataset.
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
        pre_filter (callable, optional): A function that takes in an
            :obj:`torch_geometric.data.Data` object and returns a boolean
            value, indicating whether the data object should be included in the
            final dataset. (default: :obj:`None`)
        use_node_attr (bool, optional): If :obj:`True`, the dataset will
            contain additional continuous node attributes (if present).
            (default: :obj:`False`)
        use_edge_attr (bool, optional): If :obj:`True`, the dataset will
            contain additional continuous edge attributes (if present).
            (default: :obj:`False`)
        cleaned: (bool, optional): If :obj:`True`, the dataset will
            contain only non-isomorphic graphs. (default: :obj:`False`)
    """

    url = ('http://ls11-www.cs.tu-dortmund.de/people/morris/'
           'graphkerneldatasets')
    cleaned_url = ('https://raw.githubusercontent.com/nd7141/'
                   'graph_datasets/master/datasets')

    def __init__(self, root, name, transform=None, pre_transform=None,
                 pre_filter=None, use_node_attr=False, use_edge_attr=False,
                 cleaned=False):
        self.name = name
        self.cleaned = cleaned
        super(TUDataset, self).__init__(root, transform, pre_transform,
                                        pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])
        if self.data.x is not None and not use_node_attr:
            num_node_attributes = self.num_node_attributes
            self.data.x = self.data.x[:, num_node_attributes:]
        if self.data.edge_attr is not None and not use_edge_attr:
            num_edge_attributes = self.num_edge_attributes
            self.data.edge_attr = self.data.edge_attr[:, num_edge_attributes:]

    @property
    def raw_dir(self):
        name = 'raw{}'.format('_cleaned' if self.cleaned else '')
        return osp.join(self.root, self.name, name)

    @property
    def processed_dir(self):
        name = 'processed{}'.format('_cleaned' if self.cleaned else '')
        return osp.join(self.root, self.name, name)

    @property
    def num_node_labels(self):
        if self.data.x is None:
            return 0
        for i in range(self.data.x.size(1)):
            x = self.data.x[:, i:]
            if ((x == 0) | (x == 1)).all() and (x.sum(dim=1) == 1).all():
                return self.data.x.size(1) - i
        return 0

    @property
    def num_node_attributes(self):
        if self.data.x is None:
            return 0
        return self.data.x.size(1) - self.num_node_labels

    @property
    def num_edge_labels(self):
        if self.data.edge_attr is None:
            return 0
        for i in range(self.data.edge_attr.size(1)):
            if self.data.edge_attr[:, i:].sum() == self.data.edge_attr.size(0):
                return self.data.edge_attr.size(1) - i
        return 0

    @property
    def num_edge_attributes(self):
        if self.data.edge_attr is None:
            return 0
        return self.data.edge_attr.size(1) - self.num_edge_labels

    @property
    def raw_file_names(self):
        names = ['A', 'graph_indicator']
        return ['{}_{}.txt'.format(self.name, name) for name in names]

    @property
    def processed_file_names(self):
        return 'data.pt'

    def download(self):
        url = self.cleaned_url if self.cleaned else self.url
        folder = osp.join(self.root, self.name)
        path = download_url('{}/{}.zip'.format(url, self.name), folder)
        extract_zip(path, folder)
        os.unlink(path)
        shutil.rmtree(self.raw_dir)
        os.rename(osp.join(folder, self.name), self.raw_dir)

    def process(self):
        self.data, self.slices = read_tu_data(self.raw_dir, self.name)

        if self.pre_filter is not None:
            data_list = [self.get(idx) for idx in range(len(self))]
            data_list = [data for data in data_list if self.pre_filter(data)]
            self.data, self.slices = self.collate(data_list)

        if self.pre_transform is not None:
            data_list = [self.get(idx) for idx in range(len(self))]
            data_list = [self.pre_transform(data) for data in data_list]
            self.data, self.slices = self.collate(data_list)

        torch.save((self.data, self.slices), self.processed_paths[0])

    def __repr__(self):
        return '{}({})'.format(self.name, len(self))

In [5]:
dataset_PROTEINS_full = TUDataset(root='/tmp/PROTEINS_full', name='PROTEINS_full', use_node_attr=True, use_edge_attr=True)
print(dataset_PROTEINS_full)

PROTEINS_full(1113)


In [6]:
print(dataset_PROTEINS_full.num_node_features)

32


In [7]:
def init_train_test_mask(N, ratio=0.2):
    mask_idx = np.arange(N)
    num_train = int(N*ratio)
    num_test  = N - num_train
    np.random.shuffle(mask_idx)
    train_idx = mask_idx[:num_train]
    test_idx  = mask_idx[num_train:]
    train_mask, test_mask = np.zeros(N), np.zeros(N)
    train_mask[train_idx] = 1
    test_mask[test_idx] = 1
    train_mask = torch.tensor(train_mask, dtype=torch.bool)
    test_mask = torch.tensor(test_mask, dtype=torch.bool)
    print("- number of trains : {} / number of tests : {}".format(num_train, num_test))
    return train_mask, test_mask

train_mask, test_mask = init_train_test_mask(N=len(dataset_PROTEINS_full), ratio=0.7)

- number of trains : 779 / number of tests : 334


In [8]:
train_dataset = dataset_PROTEINS_full[train_mask]
test_dataset  = dataset_PROTEINS_full[test_mask]

In [9]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

#### *GNN model initialization - GCN* 

In [10]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_max_pool

class Net(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(Net, self).__init__()
        self.hidden_dims = 256
        self.conv1 = GCNConv(num_node_features, self.hidden_dims)
        self.conv2 = GCNConv(self.hidden_dims, self.hidden_dims//2)
        self.fc1 = torch.nn.Linear(self.hidden_dims//2, self.hidden_dims//2)
        self.fc2 = torch.nn.Linear(self.hidden_dims//2, num_classes)
        
    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = global_max_pool(x, batch)
        x = self.fc1(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        
        return F.log_softmax(x, dim=1)
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(num_node_features = dataset_PROTEINS_full.num_node_features, 
            num_classes       = dataset_PROTEINS_full.num_classes).to(device)
print(model)

Net(
  (conv1): GCNConv(32, 256)
  (conv2): GCNConv(256, 128)
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)


#### *Prepare the model training*

In [11]:
optimizer  = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
print(optimizer)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0.0005
)


#### *Model training & evaluation*

In [12]:
EPOCHS = 100
for epoch in range(EPOCHS):
    train_losses, test_losses = [], []
    model.train()
    for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        out = model(data.to(device))
        loss = F.nll_loss(out, data.y)
        train_losses.append(loss)
        loss.backward()
        optimizer.step()
    
    num_corrects = 0
    num_test_graphs = 0
    for i, data in enumerate(test_loader):
        _, pred = model(data.to(device)).max(dim=1)
        correct = float(pred.eq(data.y).sum().item())
        num_test_graphs += len(data.y)
        num_corrects += correct
    test_acc = num_corrects / num_test_graphs
    
    print("- Epoch {} / train loss : {} / Test acc : {}".format(epoch, sum(train_losses), test_acc))
    
    

- Epoch 0 / train loss : 194.36953735351562 / Test acc : 0.6059050064184852
- Epoch 1 / train loss : 51.86308670043945 / Test acc : 0.6033376123234917
- Epoch 2 / train loss : 22.777057647705078 / Test acc : 0.6623876765083441
- Epoch 3 / train loss : 16.84926986694336 / Test acc : 0.631578947368421
- Epoch 4 / train loss : 16.403932571411133 / Test acc : 0.675224646983312
- Epoch 5 / train loss : 16.57914924621582 / Test acc : 0.668806161745828
- Epoch 6 / train loss : 16.370304107666016 / Test acc : 0.6713735558408216
- Epoch 7 / train loss : 15.296363830566406 / Test acc : 0.6854942233632862
- Epoch 8 / train loss : 16.09405517578125 / Test acc : 0.686777920410783
- Epoch 9 / train loss : 15.506176948547363 / Test acc : 0.693196405648267
- Epoch 10 / train loss : 15.445551872253418 / Test acc : 0.6944801026957638
- Epoch 11 / train loss : 15.12290096282959 / Test acc : 0.7021822849807445
- Epoch 12 / train loss : 14.959329605102539 / Test acc : 0.7073170731707317
- Epoch 13 / train 