# We will use the [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/) package to help us implement a relational GCN for heterogeneous graphs

## Setup

In [1]:
pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch-geometric

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install networkx matplotlib

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install rdflib

Note: you may need to restart the kernel to use updated packages.


## Import Necessary Libraries
PyTorch Geometric has two modules for storing and/or transforming graphs into tensor format. One is `torch_geometric.datasets`, which contains a variety of common graph datasets. The other is `torch_geometric.data`, which provides the data handling of graphs as PyTorch tensors.

In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv
from torch_geometric.datasets import Entities

## Load the Dataset
The AIFB dataset describes the AIFB research institute in terms of its staff, research groups, and publications. In Bloehdorn et al. (2007), the dataset was first used to predict the affiliation (i.e., research group) of the people in the dataset. 
In its standard setting, the dataset includes:
  1. A multi-relational graph structure
  2. 176 labeled Person nodes (140 for training, 36 for testing) across 4 research group classes (used for supervised classification)

In [3]:
# Load AIFB, a knowledge graph with multiple relation types; its files are kept
# under the given root directory.
dataset = Entities(name='AIFB', root='/tmp/AIFB')
# Take the first (index 0) graph object from the dataset.
data = dataset[0]

Processing...
Done!


## The number of classes and number of relations in the AIFB dataset

In [4]:
def basic_info(dataset, data):
    """Print summary statistics of a PyG dataset and return its class/relation counts.

    Args:
        dataset: Dataset object exposing ``num_classes`` and ``num_relations``.
        data: Graph object exposing ``num_nodes`` and ``num_edges``.

    Returns:
        tuple: ``(num_class, num_relations)`` as read from ``dataset``.
    """
    num_nodes = data.num_nodes
    num_edges = data.num_edges
    ############# Your code here ############
    ## (~2 lines of code)
    num_class = dataset.num_classes
    num_relations = dataset.num_relations
    #########################################
    print("AIFB dataset has {} nodes".format(num_nodes))
    print("AIFB dataset has {} edges".format(num_edges))
    print("AIFB dataset has {} classes".format(num_class))
    print("AIFB dataset has {} relations".format(num_relations))
    print('Dataset attribute information:', data)
    # Hand the counts back so callers can reuse them instead of re-reading the dataset.
    return num_class, num_relations
    
# Print the node, edge, class and relation counts of the loaded AIFB graph.
basic_info(dataset, data)

AIFB dataset has 8285 nodes
AIFB dataset has 58086 edges
AIFB dataset has 4 classes
AIFB dataset has 90 relations
Dataset attribute information: Data(edge_index=[2, 58086], edge_type=[58086], train_idx=[140], train_y=[140], test_idx=[36], test_y=[36], num_nodes=8285)


## Get familiar with the dataset

In [6]:
# From the above exercise, we know that the number of relations is 90.
# Now print the edge types to check that they range from 0 to 89 (90 different relation types).
print(f'edge_type attribute: {data.edge_type}') # a tensor of size 58086 (number of edges), each element is the edge type for that edge. 
print(f'edge_type attribute: {data.edge_type.shape}')

# Use the tensor's own reductions: Python's built-in min()/max() would walk the
# tensor one element at a time. The formatted output is unchanged.
print(f'edge_type attribute (min): {data.edge_type.min()}')
print(f'edge_type attribute (max): {data.edge_type.max()}')

edge_type attribute: tensor([21,  9, 13,  ..., 13, 27, 59])
edge_type attribute: torch.Size([58086])
edge_type attribute (min): 0
edge_type attribute (max): 89


In [9]:
# data.train_idx lists the nodes that may be used for training;
# data.train_y holds the matching labels (4 classes, 0-3).
print("Training node indices:")
print(data.train_idx)
print("Corresponding training labels (4 classes, 0-3):")
print(data.train_y)
print("The size of training set:")
print(data.train_y.shape)
print("=========================================")

# Row 0 of edge_index is compared against train_idx for out-edges and row 1 for
# in-edges, which selects every edge that touches a training node.
train_in_edge_mask = torch.isin(data.edge_index[1], data.train_idx)
train_out_edge_mask = torch.isin(data.edge_index[0], data.train_idx)
print(f"{train_in_edge_mask.sum()} in-edges to training nodes")
print(f"{train_out_edge_mask.sum()} out-edges to training nodes")

# Boolean-mask the columns: each remaining column is one edge.
print("The corresponding in-edges to training nodes:")
print(data.edge_index[:, train_in_edge_mask])
print("=========================================")

Training node indices:
tensor([1338, 6081, 7902, 7564, 4002, 7309, 4963, 6520, 6492,  429, 4838, 7762,
         414, 4422, 8217, 3371, 1088, 8110, 4314, 2531, 2417, 1982, 5844, 6982,
        5432, 8258, 2020, 1725, 4250, 5489, 2083, 8141,  317, 2651, 8067, 5394,
         372,   37, 6159, 7654, 7927, 7327, 5574,  286, 4101, 6645, 5061, 2194,
        5719,  494, 2778, 7813, 4745, 3127, 5329, 5290, 6885, 5473, 4195, 7086,
        3132,  240, 5284, 7648, 7590, 6133, 3300, 2597, 3523, 2189, 6400, 7887,
        4854,   31, 1920, 6244, 1769,  138, 2984, 6639, 4204, 5184, 1566, 6550,
        7819, 6971, 6619, 7274, 1413,  140, 2644, 6415, 5086,   98, 3634, 8181,
        3893, 6193, 5274,    8, 6047, 6553, 3704, 5024, 8215, 3251, 1303, 1102,
        3613, 4707, 7907,  872, 7142, 4016, 2774, 7180, 7455, 5095, 5320, 7219,
        3220, 7600, 5056, 4242, 2876, 6241, 4870, 6375,  510, 5878, 5493, 7264,
        1086, 2095, 5695, 5325, 3783, 4361, 3737, 2523])
Corresponding training labels (4 classes

## Play with the dataset with test_idx

In [10]:
print("Testing node indices:")
#############################
# TODO: print the test set indices
# ~ 1 line of code
print(data.test_idx)
#############################

print("Corresponding testing labels (4 classes, 0-3):")
#############################
# TODO: print the corresponding test set labels
# ~ 1 line of code
print(data.test_y)
#############################

# Label fixed: this section reports the size of the *test* set; the original
# message was copied from the training cell and said "training dataset".
print("The size of test set:")
#############################
# TODO: print the size of the test set
# ~ 1 line of code
print(data.test_y.shape)
#############################
print("=========================================")


print("The corresponding edges:")
#############################

# We can also use data.test_idx to get the corresponding edges.
# This finds all edges connected to nodes in data.test_idx by checking whether
# the source (row 0) or target (row 1) node of an edge is part of the test node set.
test_in_edge_mask = torch.isin(data.edge_index[1, :], data.test_idx)
print(f"{test_in_edge_mask.sum()} in-edges to test nodes")

test_out_edge_mask = torch.isin(data.edge_index[0, :], data.test_idx)
print(f"{test_out_edge_mask.sum()} out-edges to test nodes")

# Each column represents one edge
print("The corresponding in-edges to test nodes:")
print(data.edge_index[:, test_in_edge_mask])
print("=========================================")

Testing node indices:
tensor([3780, 6442, 2057, 5698, 6294,  355, 1045, 5175, 3813, 7809, 6510, 7574,
          64, 3710, 1235,  888, 5799, 5817, 2750, 7477,  830, 4571,  139,  581,
        1340, 6902, 6087, 3745,  903,  759, 3635, 1999, 7563, 6556, 5013, 7899])
Corresponding testing labels (4 classes, 0-3):
tensor([1, 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 3, 0, 2, 2, 1, 2, 2,
        3, 3, 2, 2, 1, 2, 1, 3, 0, 1, 3, 0])
The size of training dataset:
torch.Size([36])
The corresponding edges:
1550 in-edges to test nodes
1550 out-edges to test nodes
The corresponding in-edges to test nodes:
tensor([[  27,   53,   53,  ..., 8269, 8281, 8281],
        [6087,  581,  581,  ...,  139, 6087, 6087]])


## Build RGCN model
We will implement the **Relational GCN** covered in week 8.

As taught in the lecture, in an RGCN layer, each node gathers information from its neighbors—but crucially, this aggregation is performed separately for each relation type. Each relation type has its own learnable weight matrix, allowing the network to capture how different kinds of connections contribute uniquely to a node's representation. After processing messages from all relation types, these relation-specific contributions are combined (typically via summation) along with the node’s own transformed features (often using a self-loop). By stacking multiple RGCN layers, the network can aggregate multi-relational information from increasingly larger neighborhoods, enabling it to model complex interactions in heterogeneous graphs.


PyG implements this layer via [`RGCNConv`](https://pytorch-geometric.readthedocs.io/en/stable/generated/torch_geometric.nn.conv.RGCNConv.html), which can be executed by passing in the node feature representation `x`, the COO graph connectivity representation `edge_index`, and the one-dimensional relation type/index `edge_type` for each edge in `edge_index`. By passing values to `num_bases`, this layer will use the basis-decomposition regularization scheme where num_bases denotes the number of bases to use.

In [7]:
# AIFB typically ships without node features (data.x is None). In that case the
# model has to learn one embedding per node, looked up by node id.
use_embedding = data.x is None
if use_embedding:
    print("No node features detected; using a learnable node embedding.")

No node features detected; using a learnable node embedding.


In [9]:
# Define the RGCN model
class RGCN(torch.nn.Module):
    """Two-layer relational GCN.

    Args:
        num_nodes: Number of nodes; size of the embedding table when
            ``use_embedding`` is True.
        in_channels: Input feature size (embedding width when
            ``use_embedding`` is True).
        hidden_channels: Output size of the first RGCN layer.
        out_channels: Output size of the second RGCN layer.
        num_relations: Number of relation types passed to ``RGCNConv``.
        num_bases: Number of bases passed to ``RGCNConv``.
        use_embedding: If True, ``forward`` treats ``x`` as node indices and
            maps them through a learnable ``torch.nn.Embedding``.
        dropout: Dropout probability applied after the first layer. Defaults
            to 0.2, the value that used to be hard-coded in ``forward``.
    """

    def __init__(self, num_nodes, in_channels, hidden_channels, out_channels, num_relations, num_bases=10, use_embedding=False, dropout=0.2):
        super().__init__()

        self.use_embedding = use_embedding
        self.dropout = dropout
        # Learnable node embedding that maps node ids to in_channels features.
        if use_embedding:
            self.embedding = torch.nn.Embedding(num_nodes, in_channels)
        ######################
        # Two RGCN layers; num_bases enables basis learning for regularization.
        self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations, num_bases=num_bases)
        self.conv2 = RGCNConv(hidden_channels, out_channels, num_relations, num_bases=num_bases)
        ######################

    def forward(self, x, edge_index, edge_type):
        """Return the output of the second RGCN layer for every node.

        Args:
            x: Node indices when ``use_embedding`` is True, otherwise node features.
            edge_index: Graph connectivity passed to both RGCN layers.
            edge_type: Relation index of each edge in ``edge_index``.
        """
        ######################
        if self.use_embedding:
            x = self.embedding(x)
        x = self.conv1(x, edge_index, edge_type)
        x = F.relu(x)
        # Dropout is only active in training mode (self.training).
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index, edge_type)
        ######################
        return x

## Exercise 4: Skip connection in RGCN model
In this exercise, we will implement skip connections in **Relational GCN**.

In [10]:
class RGCN2(torch.nn.Module):
    """Three-layer relational GCN with a skip connection around the first layer.

    Args:
        num_nodes: Number of nodes; size of the embedding table when
            ``use_embedding`` is True.
        in_channels: Input feature size. The first layer maps
            ``in_channels -> in_channels`` so its output can be added to its input.
        hidden_channels: Output size of the second RGCN layer.
        out_channels: Output size of the third RGCN layer.
        num_relations: Number of relation types passed to ``RGCNConv``.
        num_bases: Number of bases passed to ``RGCNConv``.
        use_embedding: If True, ``forward`` treats ``x`` as node indices and
            maps them through a learnable ``torch.nn.Embedding``.
        dropout: Dropout probability applied after the skip connection.
            Defaults to 0.2, the value that used to be hard-coded in ``forward``.
    """

    def __init__(self, num_nodes, in_channels, hidden_channels, out_channels, num_relations, num_bases=10, use_embedding=False, dropout=0.2):
        super().__init__()

        self.use_embedding = use_embedding
        self.dropout = dropout
        if use_embedding:
            self.embedding = torch.nn.Embedding(num_nodes, in_channels)

        ######################
        # Three RGCN layers; num_bases enables basis learning for regularization.
        self.conv1 = RGCNConv(in_channels, in_channels, num_relations, num_bases=num_bases)
        self.conv2 = RGCNConv(in_channels, hidden_channels, num_relations, num_bases=num_bases)
        self.conv3 = RGCNConv(hidden_channels, out_channels, num_relations, num_bases=num_bases)
        ######################

    def forward(self, x, edge_index, edge_type):
        """Return the output of the third RGCN layer for every node.

        Args:
            x: Node indices when ``use_embedding`` is True, otherwise node features.
            edge_index: Graph connectivity passed to all RGCN layers.
            edge_type: Relation index of each edge in ``edge_index``.
        """
        if self.use_embedding:
            x = self.embedding(x)

        ######################
        x_input = x  # Save original input for skip connection
        x = self.conv1(x, edge_index, edge_type)
        x = F.relu(x)
        x = x + x_input  # shapes match because conv1 maps in_channels -> in_channels
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index, edge_type)
        # Non-linearity between conv2 and conv3; the original applied the two
        # layers back to back with no activation in between.
        x = F.relu(x)
        x = self.conv3(x, edge_index, edge_type)
        #######################

        return x

## Train the model 

In [11]:
# Fix the random seed so that parameter initialisation and dropout are
# repeatable from one run of the notebook to the next.
SEED = 42
torch.manual_seed(SEED)

# Set up device and prepare input features.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if use_embedding:
    # When using embeddings, use node indices as input.
    input_x = torch.arange(data.num_nodes, device=device)
else:
    input_x = data.x.to(device)

# Create the model.
# You can use either RGCN and RGCN2 built above for training and evaluation.
model = RGCN(num_nodes=data.num_nodes,
             in_channels=32,           # Adjust based on your needs.
             hidden_channels=64,
             out_channels=dataset.num_classes,
             num_relations=dataset.num_relations,
             num_bases=10,
             use_embedding=use_embedding).to(device)

# Move the graph tensors to the same device as the model.
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

def train():
    """Run one optimisation step on the whole graph and return the loss as a float."""
    model.train()
    optimizer.zero_grad()
    logits = model(input_x, data.edge_index, data.edge_type)
    # Only the nodes listed in data.train_idx contribute to the cross-entropy loss.
    train_logits = logits[data.train_idx]
    train_labels = data.train_y.squeeze()
    loss = F.cross_entropy(train_logits, train_labels)
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test():
    """Evaluate the model and return ``(train_acc, test_acc)`` as fractions of correct predictions."""
    model.eval()
    pred = model(input_x, data.edge_index, data.edge_type).argmax(dim=1)

    def _accuracy(idx, labels):
        # Count matching predictions, then divide by the number of evaluated nodes.
        hits = (pred[idx] == labels.squeeze()).sum().item()
        return hits / idx.size(0)

    train_acc = _accuracy(data.train_idx, data.train_y)
    test_acc = _accuracy(data.test_idx, data.test_y)
    return train_acc, test_acc

# Training loop: one optimisation step and one evaluation per epoch,
# with metrics reported on every 10th epoch.
for epoch in range(100):
    loss = train()
    train_acc, test_acc = test()
    if epoch % 10 != 0:
        continue
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 000, Loss: 1.9114, Train Acc: 0.4714, Test Acc: 0.3889
Epoch: 010, Loss: 0.6358, Train Acc: 0.8214, Test Acc: 0.6667
Epoch: 020, Loss: 0.3286, Train Acc: 0.9500, Test Acc: 0.8056
Epoch: 030, Loss: 0.1364, Train Acc: 0.9857, Test Acc: 0.7778
Epoch: 040, Loss: 0.0695, Train Acc: 0.9857, Test Acc: 0.8333
Epoch: 050, Loss: 0.0491, Train Acc: 1.0000, Test Acc: 0.8333
Epoch: 060, Loss: 0.0350, Train Acc: 1.0000, Test Acc: 0.8333
Epoch: 070, Loss: 0.0223, Train Acc: 1.0000, Test Acc: 0.8333
Epoch: 080, Loss: 0.0227, Train Acc: 1.0000, Test Acc: 0.8333
Epoch: 090, Loss: 0.0154, Train Acc: 1.0000, Test Acc: 0.8333
