In [113]:
import torch
from torch_geometric.data import Data

edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]], dtype=torch.long)
node_features = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=node_features, edge_index=edge_index, edge_attr=torch.randn(4, 10))

Data(edge_index=[2, 4], x=[3, 1])

Data(x=[2], edge_index=[2])

In [114]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv


from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

In [214]:
# Basically the same as the baseline except we pass edge features
from torch_geometric.nn import GATConv, GATv2Conv
from torch_geometric.nn import global_mean_pool


class GATModel(nn.Module):
    def __init__(self, node_embed_size=16, hidden_size=32, num_edge_features=10, max_nodes=100):
        super().__init__()
        self.hidden_size = hidden_size
        
        self.node_embedding = nn.Embedding(max_nodes, node_embed_size)
        # Initialize the embeddings with small random values
        nn.init.normal_(self.node_embedding.weight, std=0.1)
        self.convs = [
            GATv2Conv(node_embed_size, self.hidden_size, edge_dim=num_edge_features),
            GATv2Conv(self.hidden_size, self.hidden_size, edge_dim=num_edge_features),
        ]
        self.linear = nn.Linear(self.hidden_size, 2)

    def forward(self, data):
        edge_index, edge_attr = data.edge_index, data.edge_attr
        # want to get the node embeddings for the nodes in the batch
        # get the node numbers for each graph in the batch (i.e. the number of nodes in each graph)
        # data.batch is a tensor of size [num_nodes] that maps each node to its graph
        
        x = self.node_embedding.weight[:data.num_nodes,:]
        for conv in self.convs[:-1]:
            x = conv(x, edge_index, edge_attr=edge_attr)  # adding edge features here!
            x = F.relu(x)
            x = F.dropout(x, training=self.training)

        x = self.convs[-1](
            x, edge_index, edge_attr=edge_attr
        )  # edge features here as well
        x = global_mean_pool(x, data.batch)  # [batch_size, hidden_channels]
        x = self.linear(x)
        return x

In [215]:
model = GATModel(hidden_size=32, num_edge_features=10, max_nodes=400)
from torch_geometric.data import DataLoader

dataloader = DataLoader([data, data], batch_size=2)
for batch in dataloader:
    print(model(batch))



RuntimeError: index 401 is out of bounds for dimension 0 with size 400

In [216]:
# Generate some fake test graph data
from typing import Callable, Optional
from torch_geometric.data import InMemoryDataset

class FakeDataset(torch.utils.data.Dataset):
    def __init__(self, size: int = 100, mean_sep: float = 3.0):
        self.size = size
        self.data_list = [
            self.generate_random_graph(mean_edge=0.0) for _ in range(size)
        ]
        self.data_list += [
            self.generate_random_graph(mean_edge=mean_sep) for _ in range(size)
        ]

    def __len__(self):
        return self.size * 2

    def __getitem__(self, idx):
        return self.data_list[idx]

    def generate_random_graph(self, mean_edge: float = 0.0):
        edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)

        y_label = (
            torch.tensor([0], dtype=torch.long)
            if mean_edge <= 0.50
            else torch.tensor([1], dtype=torch.long)
        )
        edges_attributes = mean_edge + torch.randn(4, 10)
        data_obj = Data(
            edge_index=edge_index,
            edge_attr=edges_attributes,
            y=y_label,
        )
        data_obj.num_nodes = 3
        return data_obj

In [217]:
datalist = FakeDataset(size=100)

In [218]:
print(datalist[98])

Data(edge_index=[2, 4], edge_attr=[4, 10], y=[1], num_nodes=3)


In [219]:
fdataloader = DataLoader(FakeDataset(size=256), batch_size=64, shuffle=True)
model = GATModel(node_embed_size=16, hidden_size=64, num_edge_features=10, max_nodes=10000)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(1,200):
    for batch in fdataloader:
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch.y)
        loss.backward()
        optimizer.step()

    if (epoch) % 10 == 0:
        print(f"Epoch {epoch} loss: {loss.item()}")
# define a training loop

Epoch 10 loss: 0.7044002413749695
Epoch 20 loss: 0.6833053231239319
Epoch 30 loss: 0.713189423084259
Epoch 40 loss: 0.28837788105010986
Epoch 50 loss: 0.025703471153974533
Epoch 60 loss: 0.0084714749827981
Epoch 70 loss: 0.0133052384480834
Epoch 80 loss: 0.003203689819201827
Epoch 90 loss: 0.0004551921156235039
Epoch 100 loss: 0.0020007677376270294
Epoch 110 loss: 0.0007524141692556441
Epoch 120 loss: 0.002096351934596896
Epoch 130 loss: 0.0013564835535362363
Epoch 140 loss: 0.00028539408231154084
Epoch 150 loss: 0.0010591543978080153
Epoch 160 loss: 0.0009831078350543976
Epoch 170 loss: 0.00022370784427039325
Epoch 180 loss: 0.00019172992324456573
Epoch 190 loss: 0.00025751636712811887


# Test Model of Molecule Graph Dataset



In [208]:
import torch
from torch_geometric.datasets import TUDataset

dataset = TUDataset(root='data/TUDataset', name='MUTAG')

print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of edge features: {data.num_edge_features}')
print(f'Number of node features: {data.num_node_features}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: MUTAG(188):
Number of graphs: 188
Number of features: 7
Number of classes: 2

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
Number of nodes: 17
Number of edges: 38
Average node degree: 2.24
Number of edge features: 4
Number of node features: 7
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [209]:
torch.manual_seed(12345)
dataset = dataset.shuffle()

train_dataset = dataset[:150]
test_dataset = dataset[150:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 150
Number of test graphs: 38


In [210]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2636], x=[1188, 7], edge_attr=[2636, 4], y=[64], batch=[1188], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2506], x=[1139, 7], edge_attr=[2506, 4], y=[64], batch=[1139], ptr=[65])

Step 3:
Number of graphs in the current batch: 22
DataBatch(edge_index=[2, 852], x=[387, 7], edge_attr=[852, 4], y=[22], batch=[387], ptr=[23])



In [213]:

model = GATModel(hidden_size=128, num_edge_features=4, max_nodes=10000)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-1)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 500):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 010, Train Acc: 0.6467, Test Acc: 0.7632
Epoch: 020, Train Acc: 0.4800, Test Acc: 0.4474
Epoch: 030, Train Acc: 0.5933, Test Acc: 0.7105
Epoch: 040, Train Acc: 0.3800, Test Acc: 0.3947
Epoch: 050, Train Acc: 0.6267, Test Acc: 0.7368
Epoch: 060, Train Acc: 0.6000, Test Acc: 0.6842
Epoch: 070, Train Acc: 0.6733, Test Acc: 0.7105
Epoch: 080, Train Acc: 0.6400, Test Acc: 0.5789
Epoch: 090, Train Acc: 0.6467, Test Acc: 0.6579
Epoch: 100, Train Acc: 0.6600, Test Acc: 0.7632
Epoch: 110, Train Acc: 0.6600, Test Acc: 0.7105
Epoch: 120, Train Acc: 0.6333, Test Acc: 0.7368
Epoch: 130, Train Acc: 0.4867, Test Acc: 0.6316
Epoch: 140, Train Acc: 0.6067, Test Acc: 0.6053
Epoch: 150, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 160, Train Acc: 0.6333, Test Acc: 0.7368
Epoch: 170, Train Acc: 0.6467, Test Acc: 0.6842
Epoch: 180, Train Acc: 0.6533, Test Acc: 0.7368
Epoch: 190, Train Acc: 0.6067, Test Acc: 0.6579
Epoch: 200, Train Acc: 0.6133, Test Acc: 0.7105
Epoch: 210, Train Acc: 0.4733, Test Acc:

In [85]:
import torchmetrics

In [None]:
from torch_geometric.nn import NNConv 

class NNConvModel(nn.Module):
    
    def __init__(self, num_edge_features: int = 10, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.conv1 = NNConv(in_channels=dataset.num_node_features, out_channels=32, nn=Linear(num_edge_features, 32))
        self.conv2 = NNConv(32, 64, nn=Linear(10, 32))
        