## Graph Sample and Aggregate (GraphSAGE) 

Paper:  [Inductive Representation Learning on Large Graphs](https://www-cs-faculty.stanford.edu/people/jure/pubs/graphsage-nips17.pdf) (NIPS 2017)

**Message Passing Perspective**

|Notion | Meaning | 
|---|---|
|$\mathcal{G}$ = $(V, E)$ | Input graph |
|$x_v$ | Node features for node $v\in V$|
|$h_v$ | Node embedding for node $v\in V$ |
|$\mathcal{N}(v)$ | Neighbours of node $v\in V$|

Initial:
$$h^{(0)}_v = x_v , \forall v \in V .$$

Aggregate:
$$h^{(l)}_{\mathcal{N}(v)} \leftarrow \text{AGG}\{h^{(l-1)}_u, \forall u\in \mathcal{N}(v)\} .$$

Update: 
$$h^{(l)}_v \leftarrow \text{ReLU}\left(W^{(l)} \cdot \text{CONCAT}\left(h^{(l-1)}_v, h^{(l)}_{\mathcal{N}(v)}\right)\right) .$$


In [1]:
import time

import torch 
import torch.nn as nn
import torch.nn.functional as F
import dgl 
import dgl.function as fn
from dgl.data import RedditDataset, PPIDataset

Using backend: pytorch


In [2]:
class GraphSAGELayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GraphSAGELayer, self).__init__()
        self.linear = nn.Linear(in_feats * 2, out_feats)
    
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.update_all(message_func=fn.copy_u('h', 'm'), reduce_func=fn.mean('m', 'h_N'))
            h_N = g.ndata['h_N']
            h_total = torch.cat([h, h_N], dim=1)
            return self.linear(h_total)
        

In [3]:
# a two-layer GraphSAGE as described in the paper
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GraphSAGE, self).__init__()
        self.conv1 = GraphSAGELayer(in_feats, h_feats)
        self.conv2 = GraphSAGELayer(h_feats, num_classes)

    def forward(self, g, h):
        h = self.conv1(g, h)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [4]:
# we use the same configurations as the paper's
hidden_size = 256
lr = 1e-2
epochs = 10

In [5]:
def evaluate(model, g, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(g, features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

In [6]:
def main(dataset, device='cuda'):
    g = dataset[0]
    features = g.ndata['feat'].to(device)
    labels = g.ndata['label'].to(device)
    train_mask = g.ndata['train_mask'].to(device)
    val_mask = g.ndata['val_mask'].to(device)
    test_mask = g.ndata['test_mask'].to(device)
    in_feats = features.shape[1]
    n_classes = dataset.num_classes
    n_edges = g.number_of_edges()
    g = g.int().to(device)
    
    print(f"#nodes: {g.number_of_nodes()}, #edges: {g.number_of_edges()}")
    
    model = GraphSAGE(in_feats, hidden_size, n_classes)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    
    for epoch in range(epochs):
        start = time.time()
        model.train()
        # forward
        logits = model(g, features)
        loss = loss_fn(logits[train_mask], labels[train_mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        end = time.time()
        logits = model(g, features)
        val_loss = loss_fn(logits[val_mask], labels[val_mask])
        
        print("Epoch {:03d} | Time(s) {:.4f} | Train Loss {:.4f} | Val Loss {:.4f} | ".format(epoch, end - start, loss.item(), val_loss.item()))
    
    acc = evaluate(model, g, features, labels, test_mask)
    print("Test Accuracy {:.4f}".format(acc))

In [7]:
dataset = RedditDataset()
main(dataset)

#nodes: 232965, #edges: 114615892
Epoch 000 | Time(s) 2.3333 | Train Loss 3.7539 | Val Loss 2.6798 | 


RuntimeError: CUDA out of memory. Tried to allocate 456.00 MiB (GPU 0; 11.91 GiB total capacity; 4.00 GiB already allocated; 341.25 MiB free; 4.73 GiB reserved in total by PyTorch)