This notebook attempts to gather and organize all our code in one place such that whoever is grading our team finds it easy to consult.

In [None]:
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.datasets import Planetoid, PPI
from torch_geometric.nn import GAT
from datetime import datetime
import sklearn
import numpy as np
import scipy


Our first important step was to figure out how to load the data and run it through the GAT model. We discovered that both the data and an implementation of the model are available in the Pytorch Geometric library. We ran it as follows:

In [None]:

# Train and evaluate PyTorch Geometric's built-in GAT on each Planetoid dataset.
n_epochs = 200
for dataset_name in ['citeseer', 'cora', 'pubmed']:
    start = datetime.now()
    # Load the data. The name string and the loaded dataset get separate
    # variables so the loop variable is never rebound to a different type.
    dataset = Planetoid(root=f'../data/{dataset_name}', name=dataset_name)
    model = GAT(
        in_channels=dataset.num_features,
        out_channels=dataset.num_classes,
        hidden_channels=8,
        num_layers=2,
        heads=8,
        dropout=0.6,
        act='elu',
        act_first=True
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

    # Train the model; the loss is computed on the training nodes only.
    model.train()
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        out = model(dataset.x, dataset.edge_index)
        loss = F.cross_entropy(out[dataset.train_mask], dataset.y[dataset.train_mask])
        loss.backward()
        optimizer.step()

    # Test the model. No gradients are needed here, so skip building the graph.
    model.eval()
    with torch.no_grad():
        out = model(dataset.x, dataset.edge_index)
    pred = out.argmax(dim=1)
    acc = pred[dataset.test_mask].eq(dataset.y[dataset.test_mask]).sum().item() / int(dataset.test_mask.sum())
    print('\n\n*****************************************************************************************************\n')
    print(f'                                         {dataset} ')
    print(f'                                         Total Epochs: {n_epochs}')
    print(f'                                         Test Accuracy: {acc:.4f}')
    print(f'                                         Time Taken: {datetime.now() - start}')
    print('\n*****************************************************************************************************\n\n')

We reused variants of the above code several times.

Note that we should not have run the above model on the Pubmed dataset, because the model used on the Pubmed data in the paper is slightly different than the model used on the Cora and Citeseer datasets. We addressed this mistake later.

We also ran the model on the PPI dataset as follows. We don't recommend actually running this because it is very slow and requires a lot of memory.

In [None]:
# `import sklearn` alone does not guarantee that the `metrics` submodule is loaded.
import sklearn.metrics

ppi_train = PPI('../data/ppi/')

model = GAT(
    in_channels=ppi_train.num_features,
    out_channels=ppi_train.num_classes,
    hidden_channels=256,
    num_layers=3,
    heads=4,
    dropout=0.6,
    act='elu',
    act_first=True
)

# weight_decay applies L2 regularization on the model's parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

# Kept tiny because a single pass is already slow; the summary below reports
# this value instead of a hard-coded number.
n_epochs = 2

start = datetime.now()
# Train model
for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()
    out = model(ppi_train.x, ppi_train.edge_index)
    # The targets are thresholded element-wise and scored with micro-F1 below,
    # i.e. every output column is an independent binary label. That calls for a
    # per-element sigmoid loss rather than a softmax cross-entropy.
    # NOTE(review): assumes `ppi_train.y` is a float {0, 1} matrix with the same
    # shape as `out` -- confirm.
    loss = F.binary_cross_entropy_with_logits(out, ppi_train.y)
    loss.backward()
    optimizer.step()

    # Evaluate model. `out` holds raw logits, and sigmoid(z) > 0.5 <=> z > 0.
    model.eval()
    with torch.no_grad():
        pred = model(ppi_train.x, ppi_train.edge_index) > 0
    f1 = sklearn.metrics.f1_score(ppi_train.y.numpy(), pred.numpy(), average='micro')
    print(f'Epoch {epoch + 1:03d}, Loss: {loss.item():.4f}, F1: {f1:.4f}')


# Test the model
ppi_test = PPI('../data/ppi/', 'test')
model.eval()
with torch.no_grad():
    out = model(ppi_test.x, ppi_test.edge_index) > 0
f1 = sklearn.metrics.f1_score(ppi_test.y.numpy(), out.numpy(), average='micro')
print('\n\n*****************************************************************************************************\n')
print(f'                                         PPI Dataset ')
print(f'                                         Total Epochs: {n_epochs}')
print(f'                                         F1 Score: {f1:.4f}')
print(f'                                         Time Taken: {datetime.now() - start}')
print('\n*****************************************************************************************************\n\n')

At this point we realized that Pytorch Geometric's `GAT` class does not allow us to properly replicate the paper's methodology. Specifically, this implementation does not allow different layers within the model to have different numbers of attention heads or different activation functions.

The next thing we did was make our own GAT implementation using Pytorch Geometric's `GATConv` class, which implements a single GAT layer. The following adheres to the architecture used in the paper on the Cora and Citeseer datasets.

In [None]:
class GATCora(torch.nn.Module):
    """Two-layer GAT built from `GATConv` layers for the Cora and Citeseer runs.

    Layer 1 uses 8 attention heads with 8 output channels each, followed by
    ELU. Layer 2 uses a single head with `n_classes` outputs, followed by a
    softmax over the class dimension.

    Args:
        in_channels: Number of input features per node.
        n_classes: Number of output classes.

    NOTE(review): `forward` returns softmax probabilities, not logits. Loss
    functions that expect logits (e.g. `F.cross_entropy`) would apply a second
    softmax on top of this output.
    """

    def __init__(self, in_channels, n_classes):
        super().__init__()
        self.conv1 = torch_geometric.nn.GATConv(in_channels=in_channels, out_channels=8, heads=8, dropout=.6)
        self.act1 = torch.nn.ELU()
        # in_channels=64 matches the 8 heads x 8 channels produced by conv1.
        # The keyword was previously misspelled `dopout`, so the intended
        # attention dropout never reached this layer.
        self.conv2 = torch_geometric.nn.GATConv(in_channels=64, out_channels=n_classes, heads=1, dropout=.6)
        self.act2 = torch.nn.Softmax(dim=1)

    def forward(self, x, edge_index):
        """Return per-node class probabilities for features `x` and edges `edge_index`."""
        x = self.act1(self.conv1(x, edge_index))
        x = self.act2(self.conv2(x, edge_index))
        return x
    
# Train and evaluate GATCora on Citeseer and Cora.
n_epochs = 200
for dataset_name in ['citeseer', 'cora']:
    start = datetime.now()
    # Separate names for the dataset string and the loaded dataset object.
    dataset = Planetoid(root=f'../data/{dataset_name}', name=dataset_name)
    model = GATCora(dataset.num_features, dataset.num_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

    model.train()
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        out = model(dataset.x, dataset.edge_index)
        # GATCora ends in a softmax, so `out` already holds probabilities.
        # F.cross_entropy expects raw logits and would apply a second softmax,
        # flattening the gradients. Use NLL on the log-probabilities instead;
        # the clamp guards against log(0).
        log_probs = torch.log(out[dataset.train_mask].clamp_min(1e-12))
        loss = F.nll_loss(log_probs, dataset.y[dataset.train_mask])
        loss.backward()
        optimizer.step()

    # Evaluation needs no gradients.
    model.eval()
    with torch.no_grad():
        out = model(dataset.x, dataset.edge_index)
    pred = out.argmax(dim=1)
    acc = pred[dataset.test_mask].eq(dataset.y[dataset.test_mask]).sum().item() / int(dataset.test_mask.sum())
    print('\n\n*****************************************************************************************************\n')
    print(f'                                         {dataset} ')
    print(f'                                         Total Epochs: {n_epochs}')
    print(f'                                         Test Accuracy: {acc:.4f}')
    print(f'                                         Time Taken: {datetime.now() - start}')
    print('\n*****************************************************************************************************\n\n')

Similarly, the following is for the Pubmed dataset.

In [None]:
class GATPubmed(torch.nn.Module):
    """Two-layer GAT variant used for the Pubmed run.

    Both layers use 8 attention heads. The second layer is built with
    `concat=False` and is followed by a softmax over the class dimension.

    Args:
        in_channels: Number of input features per node.
        n_classes: Number of output classes.
    """

    def __init__(self, in_channels, n_classes):
        super().__init__()
        gat_conv = torch_geometric.nn.GATConv
        # Hidden layer: 8 heads with 8 channels each, then ELU.
        self.conv1 = gat_conv(in_channels, 8, heads=8, dropout=.6)
        self.act1 = torch.nn.ELU()
        # Output layer: 8 heads over the 64 hidden features.
        self.conv2 = gat_conv(64, n_classes, heads=8, concat=False, dropout=.6)
        self.act2 = torch.nn.Softmax(dim=1)

    def forward(self, x, edge_index):
        """Return the softmax-normalised output of the second layer."""
        hidden = self.conv1(x, edge_index)
        hidden = self.act1(hidden)
        scores = self.conv2(hidden, edge_index)
        return self.act2(scores)
    
# Train and evaluate GATPubmed on the Pubmed dataset.
n_epochs = 200
start = datetime.now()
dataset = Planetoid(root=f'../data/pubmed', name='pubmed')
model = GATPubmed(dataset.num_features, dataset.num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-3)

model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()
    out = model(dataset.x, dataset.edge_index)
    # GATPubmed ends in a softmax, so `out` already holds probabilities.
    # F.cross_entropy expects raw logits and would apply a second softmax.
    # Use NLL on the log-probabilities instead; the clamp guards against log(0).
    log_probs = torch.log(out[dataset.train_mask].clamp_min(1e-12))
    loss = F.nll_loss(log_probs, dataset.y[dataset.train_mask])
    loss.backward()
    optimizer.step()

# Evaluation needs no gradients.
model.eval()
with torch.no_grad():
    out = model(dataset.x, dataset.edge_index)
pred = out.argmax(dim=1)
acc = pred[dataset.test_mask].eq(dataset.y[dataset.test_mask]).sum().item() / int(dataset.test_mask.sum())
print('\n\n*****************************************************************************************************\n')
print(f'                                         Pubmed ')
print(f'                                         Total Epochs: {n_epochs}')
print(f'                                         Test Accuracy: {acc:.4f}')
print(f'                                         Time Taken: {datetime.now() - start}')
print('\n*****************************************************************************************************\n\n')

We didn't make a new model for the PPI data because it took a very long time to run the first time and we wanted to play around with other stuff. Unfortunately, we never got the time to come back to the PPI model, and we discarded the results from our first and only run.

We then realized the paper described an early stopping criterion, so we added it into our procedure as well. If we take Citeseer as an example, the training code would look as follows with the early stopping criterion:

In [None]:
# Set up `optimizer`, `model` and `dataset` before this, as usual.
best_epoch = 0
best_loss = float('inf')
patience = 100
best_acc = 0.0
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(dataset.x, dataset.edge_index)
    loss = F.cross_entropy(out[dataset.train_mask], dataset.y[dataset.train_mask])
    loss.backward()
    optimizer.step()

    # Early stopping has to watch held-out nodes: training loss and accuracy
    # keep improving while the model overfits, so they never signal a stop.
    # NOTE(review): assumes `dataset` exposes a `val_mask`, as the Planetoid
    # datasets do -- confirm for other datasets.
    model.eval()
    with torch.no_grad():
        val_out = model(dataset.x, dataset.edge_index)[dataset.val_mask]
        val_y = dataset.y[dataset.val_mask]
        # Same loss function as the training step, measured in eval mode.
        val_loss = F.cross_entropy(val_out, val_y).item()
        pred = val_out.argmax(dim=1)
    correct = int(pred.eq(val_y).sum().item())
    acc = correct / int(dataset.val_mask.sum())

    # An epoch counts as an improvement if either metric matches or beats its
    # best value so far; the patience window then restarts from this epoch.
    if (acc >= best_acc) or (val_loss <= best_loss):
        best_acc = max(acc, best_acc)
        best_loss = min(val_loss, best_loss)
        best_epoch = epoch

    if epoch - best_epoch > patience:
        break

We then decided to attempt to reimplement the GAT model using Pytorch instead of Pytorch Geometric. This would help us gain a deeper understanding of how the model works and would allow us to do more interesting kinds of ablation studies, since it would let us control in detail what happens within each layer.

Unfortunately, both our attempts to rewrite GAT from scratch failed. We appear to have made at least one mistake in each case, such that the models don't improve past roughly 45% to 50% accuracy on the training data from the Citeseer dataset, whereas we had seen our previous models reach accuracies well above 80% on training data.

The following two cells are our two attempts at implementing GAT from scratch. The first is based on the paper's implementation (https://github.com/PetarV-/GAT) and the second is based on GATLayerImp2 (https://github.com/gordicaleksa/pytorch-GAT/blob/main/models/definitions/GAT.py#L349).

In [None]:
class GATHead(torch.nn.Module):
    """Single dense attention head operating on a full (nodes x nodes) mask.

    Args:
        in_dim: Number of input features per node.
        out_dim: Number of output features per node.
        bias: Additive attention mask that broadcasts against (batch, nodes,
            nodes); large negative entries suppress attention between the
            corresponding pair of nodes.
        in_drop: Dropout probability applied to the input and to the projected
            features.
        coef_drop: Dropout probability applied to the attention coefficients.
        residual: If True, add a skip connection from the input to the output.

    `forward` assumes `data` is laid out as (batch, nodes, in_dim), which is
    what the `permute(0, 2, 1)` calls before the 1x1 convolutions require.
    """

    def __init__(self, in_dim, out_dim, bias, in_drop=0.0, coef_drop=0.0, residual=False):
        super().__init__()
        # The mask is fixed data, not a weight. As a buffer it still follows
        # the module across devices and into the state dict, but the optimizer
        # (and its weight decay) no longer updates it.
        self.register_buffer('bias1', torch.as_tensor(bias))
        self.bias2 = torch.nn.Parameter(torch.zeros(out_dim))
        self.in_drop = in_drop
        self.coef_drop = coef_drop
        self.residual = residual

        self.conv1 = torch.nn.Conv1d(in_dim, out_dim, 1, bias=False)
        self.conv2 = torch.nn.Conv1d(out_dim, 1, 1)
        self.conv3 = torch.nn.Conv1d(in_dim, out_dim, 1)
        # Separate scoring projection for the attended-to node. Sharing one
        # projection for both terms forces logits[i, j] == logits[j, i].
        self.conv2_neigh = torch.nn.Conv1d(out_dim, 1, 1)

    def forward(self, data):
        """Return the attended node features, shape (batch, nodes, out_dim)."""
        # torch's dropout takes the probability of DROPPING an element (not the
        # keep-probability) and must be told whether the module is training.
        data = torch.nn.functional.dropout(data, p=self.in_drop, training=self.training)

        feats = self.conv1(data.permute(0, 2, 1))  # (batch, out_dim, nodes)

        f_self = self.conv2(feats)         # (batch, 1, nodes)
        f_neigh = self.conv2_neigh(feats)  # (batch, 1, nodes)
        # logits[b, i, j] = f_self[b, i] + f_neigh[b, j]
        logits = f_self.permute(0, 2, 1) + f_neigh
        # Negative slope 0.2 as used by GAT; torch's default is 0.01.
        scores = torch.nn.functional.leaky_relu(logits, negative_slope=0.2)
        coefs = torch.nn.functional.softmax(scores + self.bias1, dim=-1)

        coefs = torch.nn.functional.dropout(coefs, p=self.coef_drop, training=self.training)
        feats = torch.nn.functional.dropout(feats, p=self.in_drop, training=self.training)

        # `.float()` undoes the promotion to float64 that a float64 mask causes.
        vals = torch.matmul(coefs.float(), feats.permute(0, 2, 1))
        ret = vals + self.bias2

        if self.residual:
            if data.shape[-1] != ret.shape[-1]:
                # Conv1d wants channels in dim 1, so permute in and back out.
                ret = ret + self.conv3(data.permute(0, 2, 1)).permute(0, 2, 1)
            else:
                ret = ret + data

        return ret

class GATLayer(torch.nn.Module):
    """Multi-head attention layer made of independent `GATHead` modules.

    Args:
        in_dim: Number of input features per node.
        out_dim: Number of output features per head.
        n_heads: Number of attention heads.
        bias: Additive attention mask handed to every head.
        in_drop: Input/feature dropout probability handed to every head.
        coef_drop: Attention-coefficient dropout probability handed to every head.
        residual: Whether every head adds a skip connection.
        concat: If True, concatenate the head outputs along the last
            dimension; otherwise average them.
    """

    def __init__(self, in_dim, out_dim, n_heads, bias, in_drop, coef_drop, residual, concat):
        super().__init__()
        # The heads are modules, so they belong in a ModuleList; ParameterList
        # is the container for bare Parameters.
        self.heads = torch.nn.ModuleList(
            [GATHead(in_dim, out_dim, bias, in_drop, coef_drop, residual) for _ in range(n_heads)]
        )
        self.concat = concat

    def forward(self, data):
        """Run every head on `data` and merge the results."""
        head_out = [head(data) for head in self.heads]
        if self.concat:
            return torch.cat(head_out, -1)
        return torch.sum(torch.stack(head_out, -1), dim=-1) / len(head_out)

class GAT(torch.nn.Module):
    """Two-layer from-scratch GAT assembled from `GATLayer`.

    The first layer has 8 concatenated heads with 8 features each; the second
    has a single head with `n_classes` outputs. Both use dropout 0.6 and no
    residual connection.

    NOTE(review): this class shares its name with the `GAT` imported from
    `torch_geometric.nn` at the top of the notebook and replaces it once this
    cell has run.

    Args:
        n_features: Number of input features per node.
        n_classes: Number of output classes.
        bias: Additive attention mask forwarded to both layers.
    """

    def __init__(self, n_features, n_classes, bias):
        super().__init__()
        self.conv1 = GATLayer(
            in_dim=n_features, out_dim=8, n_heads=8, bias=bias,
            in_drop=.6, coef_drop=.6, residual=False, concat=True,
        )
        self.conv2 = GATLayer(
            in_dim=64, out_dim=n_classes, n_heads=1, bias=bias,
            in_drop=.6, coef_drop=.6, residual=False, concat=False,
        )

    def forward(self, x):
        """Return softmax-normalised class scores for the node features `x`."""
        hidden = self.conv1(x)
        hidden = torch.nn.functional.elu(hidden)
        scores = self.conv2(hidden)
        return torch.nn.functional.softmax(scores, dim=-1)

# Adapted from https://github.com/PetarV-/GAT/blob/master/utils/process.py
def adj_to_bias(adj, sizes, nhood=1):
    """Turn dense adjacency matrices into additive attention masks.

    Args:
        adj: Array of shape (n_graphs, n_nodes, n_nodes).
        sizes: For each graph, the number of leading nodes whose reachability
            entries are binarised.
        nhood: Number of hops; nodes within `nhood` steps (self-loops
            included) count as connected.

    Returns:
        Array shaped like `adj` holding 0.0 where two nodes are connected and
        -1e9 where they are not.
    """
    nb_graphs = adj.shape[0]
    eye = np.eye(adj.shape[1])
    mt = np.empty(adj.shape)
    for g in range(nb_graphs):
        # (A + I)^nhood is positive exactly where a walk of <= nhood steps exists.
        reach = eye.copy()
        step = adj[g] + eye
        for _ in range(nhood):
            reach = np.matmul(reach, step)
        # Binarise the leading sizes[g] x sizes[g] block in a single vectorised
        # pass instead of a Python loop over every (i, j) pair.
        block = reach[:sizes[g], :sizes[g]]
        block[block > 0.0] = 1.0
        mt[g] = reach
    return -1e9 * (1.0 - mt)

def planetoid_adj_to_petarv_adj(adj, n_nodes=None):
    """Convert an edge-index array into a sparse (nodes x nodes) adjacency matrix.

    Args:
        adj: Edge index of shape (2, n_edges), where column k holds the two
            endpoints of edge k. A torch tensor or anything `np.asarray`
            accepts. An (n_edges, 2) array is also accepted and transposed;
            a 2 x 2 input is read as (2, n_edges).
        n_nodes: Side length of the result. Defaults to the largest node index
            plus one, which under-counts if the highest-numbered nodes have no
            edges; pass the true node count in that case.

    Returns:
        `scipy.sparse.csr_array` of shape (n_nodes, n_nodes) with 1.0 at every
        (row, col) pair listed in `adj`.

    Raises:
        ValueError: If `adj` cannot be read as a list of index pairs.
    """
    if hasattr(adj, 'detach'):
        # torch tensor -> numpy
        adj = adj.detach().cpu().numpy()
    edges = np.asarray(adj)
    if edges.ndim == 2 and edges.shape[0] != 2 and edges.shape[1] == 2:
        edges = edges.T
    if edges.ndim != 2 or edges.shape[0] != 2:
        raise ValueError(f'expected an edge index of shape (2, n_edges), got {edges.shape}')

    # Iterating over `adj` itself would walk its two ROWS, not its edges, and
    # leave almost the whole matrix empty. Take the endpoint rows explicitly.
    rows, cols = edges[0], edges[1]
    if n_nodes is None:
        n_nodes = int(edges.max()) + 1 if edges.size else 0

    petarv_adj = scipy.sparse.csr_array(
        (np.ones(rows.shape[0]), (rows, cols)), shape=(n_nodes, n_nodes)
    )
    # Duplicate edges are summed during construction; reset to a plain 0/1 matrix.
    petarv_adj.data[:] = 1.0
    return petarv_adj

# Train and evaluate the first from-scratch GAT on Citeseer and Cora.
n_epochs = 200
for dataset in ['citeseer', 'cora']:
    data = torch_geometric.datasets.Planetoid(root=f'../data/{dataset}', name=dataset)
    # data.edge_index is a (2, C) matrix, where C is the number of citations in the data.
    # Each column holds the indices of the two papers joined by one citation. This needs
    # to be converted into a (P, P) matrix, where P is the number of papers in the data,
    # and each entry indicates whether there is a citation between the corresponding papers.
    dense_adj = planetoid_adj_to_petarv_adj(data.edge_index).todense()[np.newaxis]
    conn = torch.tensor(adj_to_bias(dense_adj, [data.x.shape[0]], nhood=1), requires_grad=False)
    # This GAT takes the attention mask in its constructor; forward() receives
    # only the node features.
    model = GAT(data.num_features, data.num_classes, conn)
    optimizer = torch.optim.Adam(model.parameters(), lr=.005, weight_decay=5e-4)

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x[np.newaxis]).squeeze()
        # The model ends in a softmax, so `out` holds probabilities. cross_entropy
        # expects logits and would apply a second softmax, so use NLL on the
        # log-probabilities instead; the clamp guards against log(0).
        log_probs = torch.log(out[data.train_mask].clamp_min(1e-12))
        loss = torch.nn.functional.nll_loss(log_probs, data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            # Training accuracy of this epoch's forward pass.
            print(out.argmax(dim=1)[data.train_mask].eq(data.y[data.train_mask]).sum().item() / int(data.train_mask.sum()))

    model.eval()
    with torch.no_grad():
        out = model(data.x[np.newaxis]).squeeze()
    pred = out.argmax(dim=1)
    acc = pred[data.test_mask].eq(data.y[data.test_mask]).sum().item() / int(data.test_mask.sum())
    print('\n\n*****************************************************************************************************\n')
    print(f'                                         {dataset} ')
    print(f'                                         Total Epochs: {n_epochs}')
    print(f'                                         Test Accuracy: {acc:.4f}')
    print('\n*****************************************************************************************************\n\n')

The following cells are our 2nd attempt at implementing GAT from scratch. This is based on
GATLayerImp2 (https://github.com/gordicaleksa/pytorch-GAT/blob/main/models/definitions/GAT.py#L349).

In [None]:
class GATLayer(torch.nn.Module):
    """Multi-head attention layer computing all heads in one batched product.

    Args:
        in_dim: Number of input features per node.
        out_dim: Number of output features per head.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the input and to the projected
            features.
        concat: If True, concatenate the heads into n_heads * out_dim
            features; otherwise average over the heads.

    `forward` takes node features of shape (nodes, in_dim) and an additive
    mask `connectivity` that broadcasts against (n_heads, nodes, nodes).
    """

    def __init__(self, in_dim, out_dim, n_heads, dropout, concat=True):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.n_heads = n_heads
        self.lin = torch.nn.Linear(in_dim, n_heads*out_dim)
        self.src_proj = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty(1, n_heads, out_dim)))
        self.trg_proj = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty(1, n_heads, out_dim)))
        self.dropout = torch.nn.Dropout(dropout)
        # GAT scores attention with a LeakyReLU of negative slope 0.2; torch's
        # default of 0.01 is close to a plain ReLU.
        self.leakyRelu = torch.nn.LeakyReLU(0.2)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.concat = concat

    def forward(self, data, connectivity):
        """Return attended features: (nodes, n_heads*out_dim) or (nodes, out_dim)."""
        # Project and split into heads: (nodes, n_heads, out_dim).
        data1 = self.lin(self.dropout(data)).view(-1, self.n_heads, self.out_dim)
        data2 = self.dropout(data1)
        # One scalar score per node and head, arranged so the sum below
        # broadcasts to (n_heads, nodes, nodes).
        src_scores = (data2 * self.src_proj).sum(dim=-1, keepdim=True).permute(1, 0, 2)
        trg_scores = (data2 * self.trg_proj).sum(dim=-1, keepdim=True).permute(1, 2, 0)
        scores = self.softmax(self.leakyRelu(src_scores + trg_scores) + connectivity)
        # `.float()` undoes the promotion caused by a float64 mask.
        data3 = torch.bmm(scores.float(), data2.permute(1, 0, 2).float()).permute(1, 0, 2)

        if self.concat:
            data4 = data3.reshape(-1, self.n_heads*self.out_dim)
        else:
            data4 = data3.mean(dim=1)

        return data4

class GATCora(torch.nn.Module):
    """Two-layer from-scratch GAT for Cora/Citeseer built on the batched `GATLayer`.

    The first layer has 8 concatenated heads with 8 features each and is
    followed by ELU; the second has one head with `n_classes` outputs and is
    followed by a softmax. Both layers use dropout 0.6.

    Args:
        n_features: Number of input features per node.
        n_classes: Number of output classes.
    """

    def __init__(self, n_features, n_classes):
        super().__init__()
        self.layer1 = GATLayer(in_dim=n_features, out_dim=8, n_heads=8, dropout=.6, concat=True)
        self.elu = torch.nn.ELU()
        self.layer2 = GATLayer(in_dim=64, out_dim=n_classes, n_heads=1, dropout=.6, concat=False)
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x, connectivity):
        """Return softmax-normalised class scores; `connectivity` goes to both layers."""
        hidden = self.layer1(x, connectivity)
        hidden = self.elu(hidden)
        scores = self.layer2(hidden, connectivity)
        return self.softmax(scores)

# Train and evaluate the second from-scratch GAT on Citeseer and Cora.
n_epochs = 200
for dataset in ['citeseer', 'cora']:
    data = torch_geometric.datasets.Planetoid(root=f'../data/{dataset}', name=dataset)
    # Dense (P, P) additive attention mask built from the (2, C) edge index.
    dense_adj = planetoid_adj_to_petarv_adj(data.edge_index).todense()[np.newaxis]
    conn = torch.tensor(adj_to_bias(dense_adj, [data.x.shape[0]], nhood=1), requires_grad=False).squeeze()
    model = GATCora(data.num_features, data.num_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=.005, weight_decay=5e-4)

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, conn).squeeze()
        # The model ends in a softmax, so `out` holds probabilities. cross_entropy
        # expects logits and would apply a second softmax, so use NLL on the
        # log-probabilities instead; the clamp guards against log(0).
        log_probs = torch.log(out[data.train_mask].clamp_min(1e-12))
        loss = torch.nn.functional.nll_loss(log_probs, data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            # Training accuracy of this epoch's forward pass.
            print(out.argmax(dim=1)[data.train_mask].eq(data.y[data.train_mask]).sum().item() / int(data.train_mask.sum()))

    model.eval()
    with torch.no_grad():
        # forward() needs the mask at evaluation time as well.
        out = model(data.x, conn).squeeze()
    pred = out.argmax(dim=1)
    acc = pred[data.test_mask].eq(data.y[data.test_mask]).sum().item() / int(data.test_mask.sum())
    print('\n\n*****************************************************************************************************\n')
    print(f'                                         {dataset} ')
    print(f'                                         Total Epochs: {n_epochs}')
    print(f'                                         Test Accuracy: {acc:.4f}')
    print('\n*****************************************************************************************************\n\n')

Lastly, we did a couple of ablation studies. Specifically, we ran the Cora, Citeseer and Pubmed datasets through two single-layer GATs and a three-layer GAT.

In [None]:
class SingleLayerGAT1(torch.nn.Module):
    """Ablation model: one 8-head `GATConv` layer followed directly by a softmax.

    Args:
        n_features: Number of input features per node.
        n_classes: Number of output classes.
    """

    def __init__(self, n_features, n_classes):
        super().__init__()
        # This layer is also the output layer, so its heads must be averaged.
        # With GATConv's default concat=True the output would have
        # 8 * n_classes columns, and argmax could pick a class that does not
        # exist. GATPubmed's multi-head output layer uses concat=False too.
        self.conv = torch_geometric.nn.GATConv(
            heads=8, out_channels=n_classes, in_channels=n_features, dropout=.6, concat=False
        )
        self.act = torch.nn.Softmax(dim=1)

    def forward(self, x, edge_index):
        """Return per-node class probabilities."""
        return self.act(self.conv(x, edge_index))
    


In [None]:
class SingleLayerGAT2(torch.nn.Module):
    """Ablation model: one 8-head `GATConv` layer, then ELU, then a softmax.

    Args:
        n_features: Number of input features per node.
        n_classes: Number of output classes.
    """

    def __init__(self, n_features, n_classes):
        super().__init__()
        # This layer is also the output layer, so its heads must be averaged.
        # With GATConv's default concat=True the output would have
        # 8 * n_classes columns, and argmax could pick a class that does not
        # exist. GATPubmed's multi-head output layer uses concat=False too.
        self.conv = torch_geometric.nn.GATConv(
            heads=8, out_channels=n_classes, in_channels=n_features, dropout=.6, concat=False
        )
        self.act1 = torch.nn.ELU()
        self.act2 = torch.nn.Softmax(dim=1)

    def forward(self, x, edge_index):
        """Return per-node class probabilities."""
        return self.act2(self.act1(self.conv(x, edge_index)))


In [None]:

class ThreeLayerGAT(torch.nn.Module):
    """Ablation model: three stacked `GATConv` layers.

    The two hidden layers each use 8 heads with 8 channels and are followed by
    ELU. The output layer uses a single head with `n_classes` channels and is
    followed by a softmax over the class dimension.

    Args:
        n_features: Number of input features per node.
        n_classes: Number of output classes.
    """

    def __init__(self, n_features, n_classes):
        super().__init__()
        gat_conv = torch_geometric.nn.GATConv
        self.conv1 = gat_conv(n_features, 8, heads=8, dropout=.6)
        self.act1 = torch.nn.ELU()
        self.conv2 = gat_conv(64, 8, heads=8, dropout=.6)
        self.act2 = torch.nn.ELU()
        self.conv3 = gat_conv(64, n_classes, heads=1, dropout=.6)
        self.act3 = torch.nn.Softmax(dim=1)

    def forward(self, x, edge_index):
        """Pass `x` through the three layers, reusing `edge_index` for each."""
        hidden = self.act1(self.conv1(x, edge_index))
        hidden = self.act2(self.conv2(hidden, edge_index))
        return self.act3(self.conv3(hidden, edge_index))

The code to train these models is the same as for our other Pytorch Geometric-based implementations.