# Graph Neural Network Basic 

A Graph Neural Network (GNN) is a generalisation of the Deep Neural Network (DNN) to graph-structured data. In this session, we will explain common graph neural networks such as GCN and GAT, and how to use them to solve node classification and graph classification problems.

## 1. GCN

### 1.1 GCN in Matrix form 

Please recall what you have learned about each layer of the GCN. Given a graph with a (normalized) adjacency matrix $A$ and a node feature matrix $X$, if we use $W$ to denote the parameters of the GCN layer and $H$ to denote the features of the hidden layer, then the operation of one GCN layer can be expressed as $H=\text{ReLU}(AXW)$.

In [1]:
# please import toolkits 
import torch.nn as nn
import torch.nn.functional as F
import math
import torch
import torch.optim as optim
from torch.nn.parameter import Parameter
from sklearn.metrics import f1_score
import scipy.sparse as sp
import numpy as np

In [77]:
class GraphConvolution(nn.Module):
    """Single graph-convolution layer computing ``adj @ x @ weight (+ bias)``.

    Refer to https://github.com/tkipf/pygcn

    Params
    ----------
    in_features : dim of input feature
    out_features : dim of output feature (the same role as out_channel in CNN)
    with_bias: if True, a learnable bias of size ``out_features`` is added to
        the output; otherwise the ``bias`` parameter is registered as None
    """

    def __init__(self, in_features, out_features, with_bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Allocated uninitialized in shape (in_features, out_features);
        # reset_parameters() below fills in the actual values.
        self.weight = Parameter(
            torch.empty(in_features, out_features, dtype=torch.float32))
        if with_bias:
            self.bias = Parameter(torch.empty(out_features, dtype=torch.float32))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weight (and bias, if present) from U(-stdv, stdv),
        where stdv = 1 / sqrt(out_features)."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        # In-place initialisation must not be tracked by autograd.
        with torch.no_grad():
            self.weight.uniform_(-stdv, stdv)
            if self.bias is not None:
                self.bias.uniform_(-stdv, stdv)

    @staticmethod
    def _matmul(lhs, rhs):
        """Multiply ``lhs @ rhs`` where ``lhs`` may be sparse or dense."""
        if lhs.is_sparse:
            return torch.spmm(lhs, rhs)
        return torch.mm(lhs, rhs)

    def forward(self, x, adj):
        """Return ``adj @ x @ weight`` plus the bias when one is present.

        Both ``x`` and ``adj`` may be given as dense or sparse tensors.
        """
        support = self._matmul(x, self.weight)  # XW
        output = self._matmul(adj, support)  # AXW
        if self.bias is not None:
            output = output + self.bias  # AXW + b
        return output

In [78]:
class GCN(nn.Module):
    """Two-layer GCN built from two ``GraphConvolution`` layers.

    Params
    ----------
    nfeat : input dim of feature
    nhid : number of hidden units
    nclass : number of classes
    dropout : dropout probability applied to the hidden activations
    with_bias: whether both graph-convolution layers use a bias term
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.5, with_bias=True):
        super().__init__()
        self.nfeat, self.nclass = nfeat, nclass
        self.hidden_sizes = [nhid]
        self.gc1 = GraphConvolution(nfeat, nhid, with_bias=with_bias)
        self.gc2 = GraphConvolution(nhid, nclass, with_bias=with_bias)
        self.dropout = dropout

    def forward(self, x, adj):
        """Return per-node log-probabilities.

        Pipeline: input -> gc1 -> ReLU -> dropout -> gc2 -> log_softmax.
        """
        hidden = F.dropout(
            F.relu(self.gc1(x, adj)),
            self.dropout,
            training=self.training,  # dropout is a no-op in eval mode
        )
        logits = self.gc2(hidden, adj)
        return F.log_softmax(logits, dim=1)

    def initialize(self):
        """Re-initialise the parameters of both graph-convolution layers."""
        for layer in (self.gc1, self.gc2):
            layer.reset_parameters()

## Node Classification using GCN

In [95]:
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_scipy_sparse_matrix
# Load the Cora dataset into ./data and take its first graph.
dataset = Planetoid(root='./data', name='Cora') 
data = dataset[0]
# Convert the edge list into a scipy sparse adjacency matrix.
adj = to_scipy_sparse_matrix(data.edge_index)
features = data.x
labels = data.y   
# Bare expressions for interactive inspection; in a notebook cell only the
# last one (adj.data) is displayed.
labels.max()
adj.row
adj.col
adj.data

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

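Normalize (add self-loops, then apply symmetric degree normalization, where $D$ is the degree matrix of $A + I$):

$A \leftarrow A + I$

$\hat{A} = D^{-1/2} A D^{-1/2}$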

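Laplacian Matrix (symmetrically normalized):
$L = D^{-\frac12}(D-A)D^{-\frac12} = I - D^{-\frac12}AD^{-\frac12}$

For the unnormalized Laplacian $L = D - A$, applying $L$ to a node signal $f$ gives:

$h=Lf=(D-A)f=Df-Af$

$h[i]=\sum_{v_j \in \mathcal{N}(v_i)}(f[i]-f[j])$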

In [72]:
def normalize_adj(mx):
    """Symmetrically normalize an adjacency matrix after adding self-loops.

    Computes D^-1/2 * (A + I) * D^-1/2, where D holds the row sums of A + I.
    Rows whose inverse square-root degree is infinite are scaled by 0.
    """
    with_loops = mx + sp.eye(mx.shape[0])
    degrees = np.array(with_loops.sum(1))
    inv_sqrt_deg = np.power(degrees, -1/2).flatten()
    # A zero row sum gives inf; replace it so the row is zeroed out instead.
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    scaling = sp.diags(inv_sqrt_deg)
    # Left-multiply scales the rows, right-multiply scales the columns.
    return scaling.dot(with_loops).dot(scaling)

In [73]:
# Add self-loops to the Cora adjacency matrix and normalize it symmetrically.
adj_norm = normalize_adj(adj)
# Bare expression: shows the matrix shape when run as a notebook cell.
adj_norm.shape 

(2708, 2708)

In [74]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy.sparse matrix into a float32 torch sparse COO tensor.

    Params
    ----------
    sparse_mx : scipy sparse matrix in any format (it is converted to COO)

    Returns a torch sparse tensor with the same shape and non-zero entries.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # torch expects int64 indices laid out as a (2, nnz) array: [rows; cols].
    indices = torch.from_numpy(
        np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
    # constructor.
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

In [75]:
# Rebind adj_norm from the scipy matrix to its torch sparse-tensor form.
adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)

Test your implementation with one example, good luck!

In [80]:
# Number of classes is taken as the largest label id + 1.
nclass = labels.max().item()+1
model = GCN(nfeat=features.shape[1], nhid=16, nclass=nclass)
# Single forward pass with freshly initialised weights. model.eval() has not
# been called, so the model's dropout is applied with training=self.training.
output = model(features, adj_norm)
print(output.shape)
print(output)

torch.Size([2708, 7])
tensor([[-1.6163, -2.1519, -2.2419,  ..., -1.7424, -1.9223, -2.1807],
        [-1.3507, -2.2947, -2.0696,  ..., -1.7330, -1.8976, -2.1724],
        [-1.4260, -2.2138, -2.4253,  ..., -1.6594, -1.8275, -2.3899],
        ...,
        [-1.5953, -1.5008, -2.6109,  ..., -2.5341, -1.6702, -2.3943],
        [-1.7057, -2.1982, -2.1837,  ..., -1.9552, -1.8260, -2.2074],
        [-1.7169, -2.2505, -2.1469,  ..., -1.9051, -1.8991, -2.0919]],
       grad_fn=<LogSoftmaxBackward0>)


### 1.2 GCN in PyTorch Geometric 
Compared with the matrix-form GCN, the adjacency matrix `adj` corresponds to `edge_index` (and, optionally, `edge_weight`), while the node feature matrix is called `x`.


In [81]:
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer GCN built on PyTorch Geometric's ``GCNConv``.

    Params
    ----------
    nfeat : dimension of input
    nhid : number of hidden units
    nclass : number of classes
    dropout : dropout probability applied to the hidden activations
    with_bias: whether both convolution layers learn a bias term
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.5, with_bias=True):
        super().__init__()
        # GCNConv has no `activation` argument, so none is passed here;
        # the ReLU non-linearity is applied explicitly in forward().
        self.conv1 = GCNConv(nfeat, nhid, bias=with_bias)
        self.conv2 = GCNConv(nhid, nclass, bias=with_bias)
        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise the parameters of both convolution layers."""
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

    def forward(self, data, features=None):
        """Return per-node log-probabilities for the graph in ``data``.

        Params
        ----------
        data : object providing node features ``x`` and ``edge_index``
        features : unused; kept so existing callers keep working

        Pipeline: conv1 -> ReLU -> dropout -> conv2 -> log_softmax.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [82]:
# Number of classes is taken as the largest label id + 1.
nclass = labels.max().item()+1
model = GCN(nfeat=features.shape[1], nhid=16, nclass=nclass)
# This GCN takes the whole data object rather than separate feature and
# adjacency arguments.
output = model(data)
print(output)
print(output.shape)

tensor([[-1.8455, -1.9122, -2.0024,  ..., -1.9261, -1.8196, -2.0608],
        [-1.9529, -1.8418, -2.0281,  ..., -1.9211, -1.8361, -2.0128],
        [-1.9458, -1.9027, -2.0004,  ..., -1.9212, -1.8071, -2.0033],
        ...,
        [-1.9190, -1.9139, -2.0323,  ..., -1.8697, -1.8866, -2.0042],
        [-2.0191, -1.9901, -2.0172,  ..., -1.7597, -1.9056, -1.9327],
        [-1.9957, -1.9738, -2.0045,  ..., -1.7931, -1.9316, -1.9390]],
       grad_fn=<LogSoftmaxBackward0>)
torch.Size([2708, 7])


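## GCN Variant: Graph Attention Network (GAT)

GAT first computes an unnormalized attention score $e_{ij}$ for each pair of neighbouring nodes from their linearly transformed features: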

\begin{equation}
e_{ij} = a(\mathbf{W_l}\overrightarrow{h_i}, \mathbf{W_r} \overrightarrow{h_j})
\end{equation}

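Masked attention: the scores are only computed for neighbours $j \in \mathcal{N}_i$ and are normalized over that neighbourhood with the softmax function: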
\begin{equation}
\alpha_{ij} = \text{softmax}_j(e_{ij}) = \frac{\exp(e_{ij})}{\sum_{k \in \mathcal{N}_i} \exp(e_{ik})}
\end{equation}

\begin{equation}
\alpha_{ij} = \frac{\exp\Big(\text{LeakyReLU}\Big(\overrightarrow{a_l}^T \mathbf{W_l} \overrightarrow{h_i} + \overrightarrow{a_r}^T\mathbf{W_r}\overrightarrow{h_j}\Big)\Big)}{\sum_{k\in \mathcal{N}_i} \exp\Big(\text{LeakyReLU}\Big(\overrightarrow{a_l}^T \mathbf{W_l} \overrightarrow{h_i} + \overrightarrow{a_r}^T\mathbf{W_r}\overrightarrow{h_k}\Big)\Big)}
\end{equation}

Now, we use the normalized attention coefficients to compute a linear combination of the features corresponding to them. These aggregated features will serve as the final output features for every node.

\begin{equation}
h_i' = \sum_{j \in \mathcal{N}_i} \alpha_{ij} \mathbf{W_r} \overrightarrow{h_j}.
\end{equation}

In [83]:
from torch_geometric.nn import GATConv
class GAT(nn.Module):
    """Two-layer graph attention network built on ``GATConv``.

    Params
    ----------
    nfeat : dim of input features
    nhid : hidden units per attention head in the first layer
    nclass : number of output classes
    heads: number of attention heads in the first layer
    output_heads: number of attention heads in the second layer
    dropout : dropout probability, used for both layer inputs and passed to
        each GATConv
    with_bias: with or without bias
    """

    def __init__(self, nfeat, nhid, nclass, heads=8, output_heads=1, dropout=0.5, with_bias=True):
        super().__init__()

        # Options common to both attention layers.
        shared = dict(dropout=dropout, bias=with_bias)

        self.conv1 = GATConv(nfeat, nhid, heads=heads, **shared)
        # Input width is nhid * heads; the output heads are not concatenated.
        self.conv2 = GATConv(
            nhid * heads, nclass, heads=output_heads, concat=False, **shared)

        self.dropout = dropout

    def forward(self, data):
        """Return per-node log-probabilities for the graph in ``data``.

        Pipeline: dropout -> conv1 -> ELU -> dropout -> conv2 -> log_softmax.
        """
        edge_index = data.edge_index
        h = F.dropout(data.x, p=self.dropout, training=self.training)
        h = F.elu(self.conv1(h, edge_index))
        h = F.dropout(h, p=self.dropout, training=self.training)
        return F.log_softmax(self.conv2(h, edge_index), dim=1)

    def initialize(self):
        """Re-initialise the parameters of both attention layers."""
        for layer in (self.conv1, self.conv2):
            layer.reset_parameters()

In [37]:
# Build a GAT with 8 heads of 8 hidden units and run one forward pass.
# `nclass` is whatever value the session currently holds.
# NOTE(review): the recorded output of this cell has 6 columns, so it was
# presumably run while `nclass` held the ENZYMES class count — confirm.
gat = GAT(nfeat=features.shape[1],
      nhid=8, heads=8,
      nclass=nclass)
output = gat(data)
print(output)
print(output.shape)

tensor([[-1.8640, -1.7439, -1.7616, -1.7624, -1.8578, -1.7678],
        [-1.8386, -1.7945, -1.6146, -1.9597, -1.6796, -1.9076],
        [-1.5514, -2.1524, -1.6537, -1.8335, -2.0381, -1.6588],
        ...,
        [-1.8890, -1.2048, -2.4622, -2.2169, -1.7755, -1.6851],
        [-1.9284, -1.6651, -1.8490, -1.8591, -1.7979, -1.6788],
        [-2.1232, -1.3792, -1.9038, -1.9034, -1.7307, -1.8750]],
       grad_fn=<LogSoftmaxBackward0>)
torch.Size([2708, 6])


In [38]:
def train(model, data, lr=0.01, weight_decay=5e-4, epochs=200):
    """Train a node-classification model full-batch with Adam.

    Params
    ----------
    model : module called as ``model(data)`` that returns per-node
        log-probabilities
    data : object providing labels ``y`` and a boolean ``train_mask``
    lr : Adam learning rate
    weight_decay : Adam weight decay
    epochs : number of full-batch optimisation steps

    The training loss is printed every 10 epochs; nothing is returned.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    train_mask = data.train_mask
    # The training labels do not change between epochs, so select them once.
    train_labels = data.y[train_mask]

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(data)
        # only training nodes will be used to calculate loss
        loss = F.nll_loss(output[train_mask], train_labels)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print('Epoch {}, training loss: {}'.format(epoch, loss.item()))

@torch.no_grad()
def test(model, data):
    """Evaluate GAT performance on test set.

    """
    model.eval()
    test_mask = data.test_mask
    labels = data.y 
    output = model(data) 
    loss_test = F.nll_loss(output[test_mask], labels[test_mask])
    preds = output[test_mask].argmax(1) 
    acc_test = preds.eq(labels[test_mask]).cpu().numpy().mean() 
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test))
    return preds, output, acc_test.item()

In [17]:
# Fresh GCN instance for Cora node classification.
model = GCN(nfeat=features.shape[1], nhid=16, nclass=nclass)
device = 'cpu' # set to 'cuda' to run on a GPU instead
# Model and data must live on the same device.
model = model.to(device)
data = data.to(device)
train(model, data, epochs=100)

Epoch 0, training loss: 1.9424370527267456
Epoch 10, training loss: 0.7486507296562195
Epoch 20, training loss: 0.25204676389694214
Epoch 30, training loss: 0.10805542021989822
Epoch 40, training loss: 0.06834225356578827
Epoch 50, training loss: 0.044421691447496414
Epoch 60, training loss: 0.048820436000823975
Epoch 70, training loss: 0.0476372055709362
Epoch 80, training loss: 0.03384850546717644
Epoch 90, training loss: 0.029627133160829544


In [18]:
# Evaluate the trained model on the nodes selected by data.test_mask.
preds, output, acc = test(model, data)

Test set results: loss= 0.6074 accuracy= 0.8080


# Graph Classification using GCN
In graph classification, each label belongs to a whole graph rather than to an individual node. We therefore pool the node embeddings into a single graph-level embedding and apply a linear layer to predict the label.

## Dataset [Enzymes](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.TUDataset.html#torch_geometric.datasets.TUDataset)
In the ENZYMES dataset, there are 6 ground-truth classes. With `use_node_attr=True`, each node has 18 continuous attributes plus a one-hot encoding of 3 node types, giving a 21-dimensional node feature vector.

In [19]:
# PyG >= 2.0 provides DataLoader in `torch_geometric.loader`; the old
# `torch_geometric.data.DataLoader` location is deprecated, so it is only
# used as a fallback for older installations.
try:
    from torch_geometric.loader import DataLoader
except ImportError:
    from torch_geometric.data import DataLoader
from torch_geometric.datasets import TUDataset
# use_node_attr=True includes the continuous node attributes in data.x.
dataset = TUDataset(root='./data/ENZYMES', name='ENZYMES', use_node_attr=True)
# Shuffle before splitting so both subsets are random samples.
dataset = dataset.shuffle()
train_ratio = 0.8
test_ratio = 1 - train_ratio
# Split at a single index so the two subsets can neither overlap nor leave
# graphs unused, whatever ratio is chosen.
num_train = int(train_ratio * len(dataset))
train_dataset = dataset[:num_train]
test_dataset = dataset[num_train:]

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

Downloading https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip
Extracting data/ENZYMES/ENZYMES/ENZYMES.zip
Processing...
Done!


In [20]:
# Bare expression: displays the first graph of the shuffled dataset when run
# as a notebook cell.
dataset[0]

Data(edge_index=[2, 50], x=[14, 21], y=[1])

In [21]:
from torch_geometric.nn import global_mean_pool

class GCN(torch.nn.Module):
    """Graph-level classifier: three GCNConv layers, mean pooling and one
    linear layer.

    Params
    ----------
    nfeat : dimension of the input node features
    nhid : number of hidden units in every convolution layer
    nclass : number of graph classes
    dropout : dropout probability applied after the first two layers
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.2):
        super().__init__()

        self.gc1 = GCNConv(nfeat, nhid)
        self.gc2 = GCNConv(nhid, nhid)
        self.gc3 = GCNConv(nhid, nhid)
        # Maps the pooled graph embedding to class scores.
        self.lin = nn.Linear(nhid, nclass)
        self.dropout = dropout

    def forward(self, data):
        """Return per-graph log-probabilities for the batch in ``data``."""
        h, edge_index = data.x, data.edge_index
        # The first two layers share the same conv -> ReLU -> dropout pattern.
        for conv in (self.gc1, self.gc2):
            h = F.relu(conv(h, edge_index))
            h = F.dropout(h, p=self.dropout, training=self.training)
        h = self.gc3(h, edge_index)
        # Pool node embeddings according to the data.batch assignment.
        pooled = global_mean_pool(h, batch=data.batch)
        return F.log_softmax(self.lin(pooled), dim=1)

In [22]:
# Model dimensions are read from the dataset; the hidden width is fixed at 64.
nfeat = dataset.num_node_features
nclass = dataset.num_classes
nhid = 64
# `device` is also read as a global by the train/test loops below.
device = 'cpu'
model = GCN(nfeat, nhid, nclass).to(device)

In [23]:
def train(model, lr=0.001, epochs=1000):
    """Train ``model`` with Adam on the batches of the global ``train_loader``.

    Batches are moved to the global ``device``. Every 100 epochs the mean
    per-graph training loss of that epoch is printed. Nothing is returned.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            batch_loss = F.nll_loss(model(batch), batch.y)
            batch_loss.backward()
            optimizer.step()
            # Weight the batch mean by its graph count so the epoch figure
            # is a per-graph average.
            running_loss += batch_loss.item() * batch.num_graphs
        loss_train = running_loss / len(train_loader.dataset)
        if epoch % 100 != 0:
            continue
        print(f'Epoch: {epoch:03d}, Loss: {loss_train:.7f}')

@torch.no_grad()
def test(model, loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    Puts the model in eval mode, moves each batch to the global ``device``
    and divides the number of correct predictions by ``len(loader.dataset)``.
    """
    model.eval()
    hits = 0
    for batch in loader:
        batch = batch.to(device)
        # Index of the largest log-probability is the predicted class.
        pred = model(batch).max(dim=1).indices
        hits += float((pred == batch.y).sum().item())
    return hits / len(loader.dataset)

In [24]:
# Train the graph classifier; batches come from the global train_loader.
train(model, epochs=1000)

Epoch: 000, Loss: 2.0274307
Epoch: 100, Loss: 1.5137738
Epoch: 200, Loss: 1.2263177
Epoch: 300, Loss: 0.9777098
Epoch: 400, Loss: 0.7704745
Epoch: 500, Loss: 0.6072058
Epoch: 600, Loss: 0.4968923
Epoch: 700, Loss: 0.3625646
Epoch: 800, Loss: 0.3085780
Epoch: 900, Loss: 0.2164070


In [25]:
# Accuracy on the held-out graphs (displayed as the cell output).
test(model, test_loader)

0.6333333333333333