## Graph Classification 

In this tutorial, we have multiple graphs and each graph has a label. 

This is different from the previous cases where we only have one graph on which we classify nodes or predict edges. 

In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F


  _dtype_to_storage = {data_type(0).dtype: data_type for data_type in _storages}
Using backend: pytorch


In [2]:
import dgl.data

# Generate a synthetic dataset with 10000 graphs, ranging from 10 to 500 nodes.
dataset = dgl.data.GINDataset('PROTEINS', self_loop=True)

In [3]:
print('Node feature dimensionality:', dataset.dim_nfeats)
print('Number of graph categories:', dataset.gclasses)


Node feature dimensionality: 3
Number of graph categories: 2


In [4]:
print(dataset.graphs[:10])
print(len(dataset.graphs))

[Graph(num_nodes=42, num_edges=204,
      ndata_schemes={'attr': Scheme(shape=(3,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={}), Graph(num_nodes=27, num_edges=119,
      ndata_schemes={'attr': Scheme(shape=(3,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={}), Graph(num_nodes=10, num_edges=44,
      ndata_schemes={'attr': Scheme(shape=(3,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={}), Graph(num_nodes=24, num_edges=116,
      ndata_schemes={'attr': Scheme(shape=(3,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={}), Graph(num_nodes=11, num_edges=53,
      ndata_schemes={'attr': Scheme(shape=(3,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={}), Graph(num_nodes=336, num_edges=1968,
      ndata_schemes={'attr': Scheme(shape=(3,), dtype=torch.float32), 'label': Schem

In [5]:
print(dataset.N, dataset.n, dataset.m)

1113 43471 205559


In [6]:
from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler

num_examples = len(dataset)
num_train = int(num_examples * 0.8)

train_sampler = SubsetRandomSampler(torch.arange(num_train))
test_sampler = SubsetRandomSampler(torch.arange(num_train, num_examples))

train_dataloader = GraphDataLoader(
    dataset, sampler=train_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    dataset, sampler=test_sampler, batch_size=5, drop_last=False)

## A Batched Graph in DGL 

Note that the batched graph is also a graph.

> The single bigger batched graph merges all original graphs as separately connected components, with the node and edge features concatenated. 

In [7]:
it = iter(train_dataloader)
batch = next(it)
print(batch)

[Graph(num_nodes=207, num_edges=993,
      ndata_schemes={'attr': Scheme(shape=(3,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={}), tensor([0, 0, 1, 0, 0])]


In [8]:
batched_graph, labels = batch

In [9]:
print('Number of nodes for each graph element in the batch:', batched_graph.batch_num_nodes())
print('Number of edges for each graph element in the batch:', batched_graph.batch_num_edges())


Number of nodes for each graph element in the batch: tensor([31, 59, 29, 30, 58])
Number of edges for each graph element in the batch: tensor([145, 299, 119, 140, 290])


In [10]:
type(batched_graph)

dgl.heterograph.DGLHeteroGraph

In [11]:
batched_graph.num_nodes()

207

In [12]:
batched_graph.num_edges()

993

In [13]:
batched_graph.to('cuda')

Graph(num_nodes=207, num_edges=993,
      ndata_schemes={'attr': Scheme(shape=(3,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})

In [15]:
from dgl.nn import GraphConv

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)
    
    def forward(self, g, in_feats):
        h = self.conv1(g, in_feats)
        h = F.relu(h)
        h = self.conv2(g, h)
        g.ndata['h'] = h
        # readout functions
        return dgl.mean_nodes(g, 'h')

## Use CPU  

In [22]:
import time 

In [38]:
model = GCN(dataset.dim_nfeats, 16, dataset.gclasses)

In [39]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [40]:
total_time = 0
for e in range(20):
    loss_sum = 0
    cnt = 0
    start = time.time()
    for batched_graph, labels in train_dataloader:
        pred = model(batched_graph, batched_graph.ndata['attr'].float())
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        cnt += 1
    end = time.time()
    total_time += end - start
    print('In epoch {}, avg loss: {}, time: {}'.format(e, loss_sum / cnt, end - start))
print("total time: {}".format(total_time))

In epoch 0, avg loss: 0.5712791522232334, time: 3.6506245136260986
In epoch 1, avg loss: 0.5572090971838223, time: 3.6523826122283936
In epoch 2, avg loss: 0.5570591521564494, time: 3.1778838634490967
In epoch 3, avg loss: 0.5502027241366633, time: 3.182821273803711
In epoch 4, avg loss: 0.5513400771142392, time: 3.1739282608032227
In epoch 5, avg loss: 0.5550210161992674, time: 3.1523001194000244
In epoch 6, avg loss: 0.5476839827854981, time: 3.1757209300994873
In epoch 7, avg loss: 0.5400520978348978, time: 3.1840100288391113
In epoch 8, avg loss: 0.5380910696943154, time: 3.239217519760132
In epoch 9, avg loss: 0.5349907669960783, time: 3.2607035636901855
In epoch 10, avg loss: 0.537876540104325, time: 3.1945688724517822
In epoch 11, avg loss: 0.5291643884409679, time: 3.244192361831665
In epoch 12, avg loss: 0.5311357128235061, time: 3.20363450050354
In epoch 13, avg loss: 0.5289210235134939, time: 3.2367265224456787
In epoch 14, avg loss: 0.5294047581094704, time: 3.1476233005523

In [42]:
num_correct = 0
num_tests = 0
for batched_graph, labels in test_dataloader:
    pred = model(batched_graph, batched_graph.ndata['attr'].float())
    num_correct += (pred.argmax(1) == labels).sum().item()
    num_tests += len(labels)

print('Test accuracy:', num_correct / num_tests)

Test accuracy: 0.13901345291479822


## Use GPU

In [43]:
model = GCN(dataset.dim_nfeats, 16, dataset.gclasses)
model.to('cuda')

GCN(
  (conv1): GraphConv(in=3, out=16, normalization=both, activation=None)
  (conv2): GraphConv(in=16, out=2, normalization=both, activation=None)
)

In [44]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [45]:
total_time = 0
for e in range(20):
    loss_sum = 0
    cnt = 0
    start = time.time()
    for batched_graph, labels in train_dataloader:
        batched_graph = batched_graph.to('cuda')
        labels = labels.to('cuda')
        pred = model(batched_graph, batched_graph.ndata['attr'].float())
        loss = F.cross_entropy(pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        cnt += 1
    end = time.time()
    total_time += end - start
    print('In epoch {}, avg loss: {}, time: {}'.format(e, loss_sum / cnt, end - start))
print("total time: {}".format(total_time))

In epoch 0, avg loss: 0.5778853478056661, time: 5.4741246700286865
In epoch 1, avg loss: 0.5752584276239524, time: 5.692477226257324
In epoch 2, avg loss: 0.5611262393466542, time: 5.706172466278076
In epoch 3, avg loss: 0.5546920242604245, time: 5.2403364181518555
In epoch 4, avg loss: 0.5554142447837284, time: 5.119494676589966
In epoch 5, avg loss: 0.5510280751780178, time: 4.993757963180542
In epoch 6, avg loss: 0.5516773439692647, time: 4.952453851699829
In epoch 7, avg loss: 0.5455471010188039, time: 4.946757793426514
In epoch 8, avg loss: 0.5464826703406451, time: 4.957819938659668
In epoch 9, avg loss: 0.5492165591777041, time: 4.9912919998168945
In epoch 10, avg loss: 0.5424639292982187, time: 4.966830253601074
In epoch 11, avg loss: 0.5435060687949148, time: 4.977870464324951
In epoch 12, avg loss: 0.5363271608948708, time: 4.969276189804077
In epoch 13, avg loss: 0.5377261888110236, time: 5.058688640594482
In epoch 14, avg loss: 0.53321791850449, time: 5.205821990966797
In e

In [46]:
num_correct = 0
num_tests = 0
for batched_graph, labels in test_dataloader:
    batched_graph = batched_graph.to('cuda')
    labels = labels.to('cuda')
    pred = model(batched_graph, batched_graph.ndata['attr'].float())
    num_correct += (pred.argmax(1) == labels).sum().item()
    num_tests += len(labels)

print('Test accuracy:', num_correct / num_tests)

Test accuracy: 0.29596412556053814
