# **Hands On Graph Neural Networks Using Python  -  CHAPTER 5**

- This code demonstrates the use of Multilayer Perceptron (MLP) and Vanilla Graph Neural Network (GNN) models for node classification on graph datasets using PyTorch and PyTorch Geometric. It starts by installing necessary libraries and setting seeds for reproducibility.

- It then loads and analyzes two datasets, "Cora" and "FacebookPagePage", and provides methods to examine their properties. The code defines an MLP and a basic GNN model, each with methods for training and testing. Finally, it trains and evaluates both models on the datasets, printing performance metrics to compare their effectiveness.






In [2]:
!pip install -q torch-scatter~=2.1.0 torch-sparse~=0.6.16 torch-cluster~=1.6.0 torch-spline-conv~=1.2.1 torch-geometric==2.2.0 -f https://data.pyg.org/whl/torch-{torch.__version__}.html

import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.datasets import Planetoid, FacebookPagePage
from torch_geometric.utils import to_dense_adj
import pandas as pd

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m947.1/947.1 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25h

### **SeedInitializer Class**

- The SeedInitializer class provides a static method to initialize random seeds for ensuring reproducible results.

- This method sets the seed for various PyTorch components to avoid variations in output across different runs.

In [20]:
# Initialize random seeds for reproducibility
class SeedInitializer:
    @staticmethod
    def initialize_seed(seed=0):
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Initialize seed
SeedInitializer.initialize_seed()


### **DatasetLoader Class**

- The DatasetLoader class is responsible for loading and analyzing graph datasets using PyTorch Geometric.

- It supports loading the "Cora" and "FacebookPagePage" datasets, and provides methods to print detailed information about the datasets and their graph structures.

In [21]:
# Load and analyze datasets
class DatasetLoader:
    def __init__(self, dataset_name="Cora", root="."):
        if dataset_name == "Cora":
            self.dataset = Planetoid(root=root, name=dataset_name)
        elif dataset_name == "FacebookPagePage":
            self.dataset = FacebookPagePage(root=root)
        self.data = self.dataset[0]

    def print_dataset_info(self):
        print(f'Dataset: {self.dataset}')
        print('---------------')
        print(f'Number of graphs: {len(self.dataset)}')
        print(f'Number of nodes: {self.data.x.shape[0]}')
        print(f'Number of features: {self.dataset.num_features}')
        print(f'Number of classes: {self.dataset.num_classes}')

    def print_graph_info(self):
        print(f'\nGraph:')
        print('------')
        print(f'Edges are directed: {self.data.is_directed()}')
        print(f'Graph has isolated nodes: {self.data.has_isolated_nodes()}')
        print(f'Graph has loops: {self.data.has_self_loops()}')


### **MLP Class**

- The MLP (Multilayer Perceptron) class defines a simple neural network model for node classification in graphs.

- It includes methods for training (fit), testing (test), and calculating accuracy. The model uses two fully connected layers and applies ReLU activation.

In [22]:
# Define MLP model
class MLP(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.linear1 = Linear(dim_in, dim_h)
        self.linear2 = Linear(dim_h, dim_out)

    def forward(self, x):
        x = self.linear1(x)
        x = torch.relu(x)
        x = self.linear2(x)
        return F.log_softmax(x, dim=1)

    def fit(self, data, epochs, learning_rate=0.01, weight_decay=5e-4):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate, weight_decay=weight_decay)

        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            out = self(data.x)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = self.accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            if epoch % 20 == 0:
                val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                val_acc = self.accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc:'
                      f' {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | '
                      f'Val Acc: {val_acc*100:.2f}%')

    @torch.no_grad()
    def test(self, data):
        self.eval()
        out = self(data.x)
        acc = self.accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc

    @staticmethod
    def accuracy(y_pred, y_true):
        return torch.sum(y_pred == y_true).item() / len(y_true)


VanillaGNNLayer Class

- The VanillaGNNLayer class defines a single layer of a basic Graph Neural Network (GNN).

- It applies a linear transformation to the input features and then aggregates information from neighboring nodes using the adjacency matrix.

In [23]:
# Define Vanilla GNN layer and model
class VanillaGNNLayer(torch.nn.Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.linear = Linear(dim_in, dim_out, bias=False)

    def forward(self, x, adjacency):
        x = self.linear(x)
        x = torch.sparse.mm(adjacency, x)
        return x


### **VanillaGNN Class**

- The VanillaGNN class defines a basic two-layer Graph Neural Network (GNN) model for node classification.

- It stacks two VanillaGNNLayer layers, applying ReLU activation between them. The model includes methods for training (fit), testing (test), and forwarding data through the network.

In [24]:
class VanillaGNN(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.gnn1 = VanillaGNNLayer(dim_in, dim_h)
        self.gnn2 = VanillaGNNLayer(dim_h, dim_out)

    def forward(self, x, adjacency):
        h = self.gnn1(x, adjacency)
        h = torch.relu(h)
        h = self.gnn2(h, adjacency)
        return F.log_softmax(h, dim=1)

    def fit(self, data, adjacency, epochs, learning_rate=0.01, weight_decay=5e-4):
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate, weight_decay=weight_decay)

        self.train()
        for epoch in range(epochs+1):
            optimizer.zero_grad()
            out = self(data.x, adjacency)
            loss = criterion(out[data.train_mask], data.y[data.train_mask])
            acc = MLP.accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
            loss.backward()
            optimizer.step()

            if epoch % 20 == 0:
                val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
                val_acc = MLP.accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])
                print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc:'
                      f' {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | '
                      f'Val Acc: {val_acc*100:.2f}%')

    @torch.no_grad()
    def test(self, data, adjacency):
        self.eval()
        out = self(data.x, adjacency)
        acc = MLP.accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
        return acc

### **GraphUtils Class**

- The GraphUtils class provides utility functions for graph operations. Specifically, it includes a method to create an adjacency matrix from edge indices in the graph, adding self-loops to ensure each node is connected to itself.


In [25]:
# Utility function for creating adjacency matrix
class GraphUtils:
    @staticmethod
    def create_adjacency_matrix(data):
        adjacency = to_dense_adj(data.edge_index)[0]
        adjacency += torch.eye(len(adjacency))
        return adjacency


### **Load and analyze Cora dataset**

In [26]:
cora_loader = DatasetLoader("Cora")
cora_loader.print_dataset_info()
cora_loader.print_graph_info()

Dataset: Cora()
---------------
Number of graphs: 1
Number of nodes: 2708
Number of features: 1433
Number of classes: 7

Graph:
------
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


### **Convert data to DataFrame**

In [27]:
df_x = pd.DataFrame(cora_loader.data.x.numpy())
df_x['label'] = pd.DataFrame(cora_loader.data.y)
print(df_x.head())

     0    1    2    3    4    5    6    7    8    9  ...  1424  1425  1426  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   1427  1428  1429  1430  1431  1432  label  
0   0.0   0.0   0.0   0.0   0.0   0.0      3  
1   0.0   0.0   0.0   0.0   0.0   0.0      4  
2   0.0   0.0   0.0   0.0   0.0   0.0      4  
3   0.0   0.0   0.0   0.0   0.0   0.0      0  
4   0.0   0.0   0.0   0.0   0.0   0.0      3  

[5 rows x 1434 columns]


### **MLP model**

In [28]:
mlp = MLP(cora_loader.dataset.num_features, 16, cora_loader.dataset.num_classes)
print(mlp)
mlp.fit(cora_loader.data, epochs=100)
acc = mlp.test(cora_loader.data)
print(f'\nMLP test accuracy: {acc*100:.2f}%')

MLP(
  (linear1): Linear(in_features=1433, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=7, bias=True)
)
Epoch   0 | Train Loss: 1.959 | Train Acc: 14.29% | Val Loss: 2.00 | Val Acc: 12.40%
Epoch  20 | Train Loss: 0.110 | Train Acc: 100.00% | Val Loss: 1.46 | Val Acc: 49.40%
Epoch  40 | Train Loss: 0.014 | Train Acc: 100.00% | Val Loss: 1.44 | Val Acc: 51.00%
Epoch  60 | Train Loss: 0.008 | Train Acc: 100.00% | Val Loss: 1.40 | Val Acc: 53.80%
Epoch  80 | Train Loss: 0.008 | Train Acc: 100.00% | Val Loss: 1.37 | Val Acc: 55.40%
Epoch 100 | Train Loss: 0.009 | Train Acc: 100.00% | Val Loss: 1.34 | Val Acc: 54.60%

MLP test accuracy: 53.40%


### **Vanilla GNN model**

In [29]:
adjacency = GraphUtils.create_adjacency_matrix(cora_loader.data)
gnn = VanillaGNN(cora_loader.dataset.num_features, 16, cora_loader.dataset.num_classes)
print(gnn)
gnn.fit(cora_loader.data, adjacency, epochs=100)
acc = gnn.test(cora_loader.data, adjacency)
print(f'\nGNN test accuracy: {acc*100:.2f}%')

VanillaGNN(
  (gnn1): VanillaGNNLayer(
    (linear): Linear(in_features=1433, out_features=16, bias=False)
  )
  (gnn2): VanillaGNNLayer(
    (linear): Linear(in_features=16, out_features=7, bias=False)
  )
)
Epoch   0 | Train Loss: 1.991 | Train Acc: 15.71% | Val Loss: 2.11 | Val Acc: 9.40%
Epoch  20 | Train Loss: 0.065 | Train Acc: 99.29% | Val Loss: 1.47 | Val Acc: 76.80%
Epoch  40 | Train Loss: 0.014 | Train Acc: 100.00% | Val Loss: 2.11 | Val Acc: 75.40%
Epoch  60 | Train Loss: 0.007 | Train Acc: 100.00% | Val Loss: 2.22 | Val Acc: 75.40%
Epoch  80 | Train Loss: 0.004 | Train Acc: 100.00% | Val Loss: 2.20 | Val Acc: 76.80%
Epoch 100 | Train Loss: 0.003 | Train Acc: 100.00% | Val Loss: 2.19 | Val Acc: 77.00%

GNN test accuracy: 76.60%


### **Load and analyze FacebookPagePage dataset**

In [30]:
facebook_loader = DatasetLoader("FacebookPagePage")
facebook_loader.print_dataset_info()
facebook_loader.print_graph_info()

Dataset: FacebookPagePage()
---------------
Number of graphs: 1
Number of nodes: 22470
Number of features: 128
Number of classes: 4

Graph:
------
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: True


### **Create masks for FacebookPagePage dataset**

In [31]:
facebook_loader.data.train_mask = range(18000)
facebook_loader.data.val_mask = range(18001, 20000)
facebook_loader.data.test_mask = range(20001, 22470)

In [32]:
# Adjacency matrix
adjacency = GraphUtils.create_adjacency_matrix(facebook_loader.data)

### **MLP on FacebookPagePage**

In [33]:
mlp = MLP(facebook_loader.dataset.num_features, 16, facebook_loader.dataset.num_classes)
print(mlp)
mlp.fit(facebook_loader.data, epochs=100)
acc = mlp.test(facebook_loader.data)
print(f'\nMLP test accuracy: {acc*100:.2f}%\n')

MLP(
  (linear1): Linear(in_features=128, out_features=16, bias=True)
  (linear2): Linear(in_features=16, out_features=4, bias=True)
)
Epoch   0 | Train Loss: 1.401 | Train Acc: 28.11% | Val Loss: 1.40 | Val Acc: 28.91%
Epoch  20 | Train Loss: 0.671 | Train Acc: 73.47% | Val Loss: 0.68 | Val Acc: 72.94%
Epoch  40 | Train Loss: 0.579 | Train Acc: 76.95% | Val Loss: 0.61 | Val Acc: 74.89%
Epoch  60 | Train Loss: 0.549 | Train Acc: 78.20% | Val Loss: 0.60 | Val Acc: 75.59%
Epoch  80 | Train Loss: 0.533 | Train Acc: 78.76% | Val Loss: 0.60 | Val Acc: 75.39%
Epoch 100 | Train Loss: 0.520 | Train Acc: 79.23% | Val Loss: 0.60 | Val Acc: 75.39%

MLP test accuracy: 75.33%



### **GCN on FacebookPagePage**

In [34]:
gnn = VanillaGNN(facebook_loader.dataset.num_features, 16, facebook_loader.dataset.num_classes)
print(gnn)
gnn.fit(facebook_loader.data, adjacency, epochs=100)
acc = gnn.test(facebook_loader.data, adjacency)
print(f'\nGNN test accuracy: {acc*100:.2f}%')

VanillaGNN(
  (gnn1): VanillaGNNLayer(
    (linear): Linear(in_features=128, out_features=16, bias=False)
  )
  (gnn2): VanillaGNNLayer(
    (linear): Linear(in_features=16, out_features=4, bias=False)
  )
)
Epoch   0 | Train Loss: 176.683 | Train Acc: 28.31% | Val Loss: 173.10 | Val Acc: 28.41%
Epoch  20 | Train Loss: 6.675 | Train Acc: 79.69% | Val Loss: 4.49 | Val Acc: 80.19%
Epoch  40 | Train Loss: 2.284 | Train Acc: 82.15% | Val Loss: 1.60 | Val Acc: 83.64%
Epoch  60 | Train Loss: 1.233 | Train Acc: 83.91% | Val Loss: 1.06 | Val Acc: 84.34%
Epoch  80 | Train Loss: 0.812 | Train Acc: 84.94% | Val Loss: 0.78 | Val Acc: 84.79%
Epoch 100 | Train Loss: 1.321 | Train Acc: 84.33% | Val Loss: 0.96 | Val Acc: 84.39%

GNN test accuracy: 82.67%
