# Tutorial3: GAT implementation

## Outline

- Implementation of GAT

Official resources:
* [Code](https://dsgiitr.com/blogs/gat/)

In [1]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.0.0


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

## Structure

여기 나와 있는 구조를 채우는 방식으로 진행할거고, 여기 나와있는건 그냥 실제로 어떻게 구성되는지를 보여주는거고 더빠르게 잘돌리기 위한것들은 pytorch_geometric에 이미 implementation되어있으니 그걸 쓰면 된다.

In [3]:
class GATLayer(nn.Module):
    """
    Simple PyTorch Implementation of the Graph Attention layer.
    """
    def __init__(self):
        super(GATLayer, self).__init__()
      
    def forward(self, input, aj):
        print("")

## Let's start from the forward method

### Linear Transformation

$$
\bar{h'}_i = \textbf{W}\cdot \bar{h}_i
$$
with $\textbf{W}\in\mathbb R^{F'\times F}$ and $\bar{h}_i\in\mathbb R^{F}$.

$$
\bar{h'}_i \in \mathbb{R}^{F'}
$$

다시말해서 이렇게 새롭게 정의된 녀석을 만들겠다. 노드 피쳐들은 차원이 달라질수도 있다~

In [4]:
in_features = 5
out_features = 2
# 각각의 노드로부터 5개의 feature를 받아다가 output으로는 2개의 feature를 내보낼거다.

nb_nodes = 3

W = nn.Parameter(torch.zeros(size=(in_features, out_features))) #xavier paramiter inizializator
nn.init.xavier_uniform_(W.data, gain=1.414) # 초기화하는 방법입니다.

input = torch.rand(nb_nodes,in_features) # 노드의 feature를 랜덤하게 만들어줍니다. (노드 갯수, 피쳐 갯수)

# linear transformation
h = torch.mm(input, W) # matrix multiplication
N = h.size()[0]

print(h.shape)

torch.Size([3, 2])


### Attention Mechanism
앞쪽에 matrix multiplication 쪽은 했고
이제 attention mechanism을 구현해보자

![title](https://github.com/AntonioLonga/PytorchGeometricTutorial/blob/main/Tutorial3/AttentionMechanism.png?raw=1)

In [19]:
a = nn.Parameter(torch.zeros(size=(2*out_features, 1))) #xavier paramiter inizializator
# 사이즈가 2F'이니까 2를 곱해준거
nn.init.xavier_uniform_(a.data, gain=1.414)
# 그래서 a.shape을 찍어보면 4가 나온다. out_features가 2니까
print(a.shape)

leakyrelu = nn.LeakyReLU(0.2)  # LeakyReLU

torch.Size([4, 1])


In [20]:
print(N)
print(h) # torch.Size([3, 2])
print(h.repeat(1,N)) # 이런식으로 1행 N열로, 옆으로 붙이는 방법이 repeat.
print(h.repeat(1,N).view(N*N,-1)) # 이런식으로 view를 이용해서 reshape을 통해서 위아래로 붙일 수 있다.

3
tensor([[-0.4248,  1.0535],
        [-0.9987,  1.5516],
        [-0.6570,  0.9492]], grad_fn=<MmBackward0>)
tensor([[-0.4248,  1.0535, -0.4248,  1.0535, -0.4248,  1.0535],
        [-0.9987,  1.5516, -0.9987,  1.5516, -0.9987,  1.5516],
        [-0.6570,  0.9492, -0.6570,  0.9492, -0.6570,  0.9492]],
       grad_fn=<RepeatBackward0>)
tensor([[-0.4248,  1.0535],
        [-0.4248,  1.0535],
        [-0.4248,  1.0535],
        [-0.9987,  1.5516],
        [-0.9987,  1.5516],
        [-0.9987,  1.5516],
        [-0.6570,  0.9492],
        [-0.6570,  0.9492],
        [-0.6570,  0.9492]], grad_fn=<ViewBackward0>)


In [21]:
torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1) # N행 1열로 갖다 붙인녀석과 concat

tensor([[-0.4248,  1.0535, -0.4248,  1.0535],
        [-0.4248,  1.0535, -0.9987,  1.5516],
        [-0.4248,  1.0535, -0.6570,  0.9492],
        [-0.9987,  1.5516, -0.4248,  1.0535],
        [-0.9987,  1.5516, -0.9987,  1.5516],
        [-0.9987,  1.5516, -0.6570,  0.9492],
        [-0.6570,  0.9492, -0.4248,  1.0535],
        [-0.6570,  0.9492, -0.9987,  1.5516],
        [-0.6570,  0.9492, -0.6570,  0.9492]], grad_fn=<CatBackward0>)

In [22]:
a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * out_features) # 여기 마지막은 각각의 노드별로 나눠서 저장하는 거.

위에서 내부적으로 어떤 일이 일어나고있는지 보면 이 그림이 더 잘 이해가 가겠죠?
이런방식으로 모든 노드들끼리의 연결을 찾아줄 수 있다.

![title](https://github.com/AntonioLonga/PytorchGeometricTutorial/blob/main/Tutorial3/a_input.png?raw=1)

In [23]:
a_input[0] # Node 1 에 대한 녀석만 들어있는거 확인 가능.

tensor([[-0.4248,  1.0535, -0.4248,  1.0535],
        [-0.4248,  1.0535, -0.9987,  1.5516],
        [-0.4248,  1.0535, -0.6570,  0.9492]], grad_fn=<SelectBackward0>)

In [29]:
# print size of a_input and a with its name
print("a_input size: ", a_input.size())
print("a size: ", a.size())
print(torch.matmul(a_input, a))
print(torch.matmul(a_input, a).size())
print(torch.matmul(a_input, a).squeeze(2).size())#  attention 연산 (안쪽의 mm) 을 하면 scalar값이 나오기 때문에 이 차원을 지워줘여한다. 그래서 squeeze(2)를 해준다.

a_input size:  torch.Size([3, 3, 4])
a size:  torch.Size([4, 1])
tensor([[[-1.8801],
         [-1.6921],
         [-1.7692]],

        [[-3.2911],
         [-3.1030],
         [-3.1802]],

        [[-2.0505],
         [-1.8624],
         [-1.9396]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([3, 3, 1])
torch.Size([3, 3])


In [30]:
e = leakyrelu(torch.matmul(a_input, a).squeeze(2))

In [33]:
print(a_input.shape,a.shape)
print("")
print(torch.matmul(a_input,a).shape)
print("")
print(torch.matmul(a_input,a).squeeze(2).shape)
print("")
print(torch.matmul(a_input,a).squeeze(2))
print("")
print(e)

torch.Size([3, 3, 4]) torch.Size([4, 1])

torch.Size([3, 3, 1])

torch.Size([3, 3])

tensor([[-1.8801, -1.6921, -1.7692],
        [-3.2911, -3.1030, -3.1802],
        [-2.0505, -1.8624, -1.9396]], grad_fn=<SqueezeBackward1>)

tensor([[-0.3760, -0.3384, -0.3538],
        [-0.6582, -0.6206, -0.6360],
        [-0.4101, -0.3725, -0.3879]], grad_fn=<LeakyReluBackward0>)


### Masked Attention

위에서는 전체의 노드들에 대해서 하는 방법에 대해서 보여준거지만, 실제로는 masked를 써야지만 우리가 neighboring 노드에서만 할 수 있다.
그래서 masked를 써서 할거다

In [37]:
# Masked Attention
adj = torch.randint(2, (3, 3)) # 실제로 우리가 업데이트하고자하는 adjacency matrix... 인데 여기서는 그냥 단순하게 랜덤으로

zero_vec  = -9e-15*torch.ones_like(e) # e랑 동일한 사이즈를 갖되 1 로 가득찬 텐서를 만드는게 torch.ones_like(e)
# 그리고 -9e15를 곱해줘서 0에 가깝게 만든다.
print(zero_vec.shape)

torch.Size([3, 3])


In [38]:
zero_vec

tensor([[-9.0000e-15, -9.0000e-15, -9.0000e-15],
        [-9.0000e-15, -9.0000e-15, -9.0000e-15],
        [-9.0000e-15, -9.0000e-15, -9.0000e-15]])

In [40]:
attention = torch.where(adj > 0, e, zero_vec) # 각각의 element에 대해서 adj가 0보다 크면 e를, 아니면 zero_vec를 넣어준다.

print(adj,"\n",e,"\n",zero_vec)
print("attention:")
attention

tensor([[1, 1, 0],
        [0, 0, 1],
        [1, 0, 1]]) 
 tensor([[-0.3760, -0.3384, -0.3538],
        [-0.6582, -0.6206, -0.6360],
        [-0.4101, -0.3725, -0.3879]], grad_fn=<LeakyReluBackward0>) 
 tensor([[-9.0000e-15, -9.0000e-15, -9.0000e-15],
        [-9.0000e-15, -9.0000e-15, -9.0000e-15],
        [-9.0000e-15, -9.0000e-15, -9.0000e-15]])
attention:


tensor([[-3.7602e-01, -3.3841e-01, -9.0000e-15],
        [-9.0000e-15, -9.0000e-15, -6.3604e-01],
        [-4.1010e-01, -9.0000e-15, -3.8791e-01]], grad_fn=<WhereBackward0>)

In [43]:
attention = F.softmax(attention, dim=1)
h_prime   = torch.matmul(attention, h) # mm써도 동일

In [44]:
print(h_prime)

tensor([[-0.6921,  1.1580],
        [-0.7003,  1.2286],
        [-0.7371,  1.2360]], grad_fn=<MmBackward0>)


In [45]:
torch.mm(attention,h)

tensor([[-0.6921,  1.1580],
        [-0.7003,  1.2286],
        [-0.7371,  1.2360]], grad_fn=<MmBackward0>)

In [46]:
attention

tensor([[0.2861, 0.2971, 0.4168],
        [0.3954, 0.3954, 0.2093],
        [0.2833, 0.4270, 0.2897]], grad_fn=<SoftmaxBackward0>)

In [47]:
h_prime

tensor([[-0.6921,  1.1580],
        [-0.7003,  1.2286],
        [-0.7371,  1.2360]], grad_fn=<MmBackward0>)

#### h_prime vs h

In [48]:
print(h_prime,"\n",h)

tensor([[-0.6921,  1.1580],
        [-0.7003,  1.2286],
        [-0.7371,  1.2360]], grad_fn=<MmBackward0>) 
 tensor([[-0.4248,  1.0535],
        [-0.9987,  1.5516],
        [-0.6570,  0.9492]], grad_fn=<MmBackward0>)


# Build the layer

In [54]:
class GATLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout, alpha):
        super(GATLayer, self).__init__()
        self.dropout       = dropout        # drop prob = 0.6
        self.in_features   = in_features    # 
        self.out_features  = out_features   # 
        self.alpha         = alpha          # LeakyReLU with negative input slope, alpha = 0.2

        
        # Xavier Initialization of Weights
        # Alternatively use weights_init to apply weights of choice 
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        
        self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)
        
        # LeakyReLU
        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, input, adj):
        # Linear Transformation
        h = torch.mm(input, self.W) # matrix multiplication
        N = h.size()[0]
        print(N)

        # Attention Mechanism
        a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features)
        e       = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))

        # Masked Attention
        zero_vec  = -9e-15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime   = torch.matmul(attention, h)

        return h_prime

# Use it

In [55]:
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

import matplotlib.pyplot as plt

name_data = 'Cora'
dataset = Planetoid(root= '/tmp/' + name_data, name = name_data)
dataset.transform = T.NormalizeFeatures()

print(f"Number of Classes in {name_data}:", dataset.num_classes)
print(f"Number of Node Features in {name_data}:", dataset.num_node_features)

Number of Classes in Cora: 7
Number of Node Features in Cora: 1433


In [56]:
class GAT(torch.nn.Module):
    def __init__(self):
        super(GAT, self).__init__()
        self.hid = 8
        self.in_head = 8
        self.out_head = 1
        
        
        self.conv1 = GATConv(dataset.num_features, self.hid, heads=self.in_head, dropout=0.6)
        self.conv2 = GATConv(self.hid*self.in_head, dataset.num_classes, concat=False,
                             heads=self.out_head, dropout=0.6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
                
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x) # exponential linear unit
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)
    
    
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = "cpu"

model = GAT().to(device)
data = dataset[0].to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

model.train()
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    
    if epoch%200 == 0:
        print(loss)
    
    loss.backward()
    optimizer.step()
    


tensor(1.9474, grad_fn=<NllLossBackward0>)
tensor(0.6568, grad_fn=<NllLossBackward0>)
tensor(0.6009, grad_fn=<NllLossBackward0>)
tensor(0.5582, grad_fn=<NllLossBackward0>)
tensor(0.5205, grad_fn=<NllLossBackward0>)


In [52]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.8210
