## Layer

In [5]:
import torch
import torch.nn as nn

class GATLayerWithIRCRWR(nn.Module):
    # These may change in the inductive setting - leaving it like this for now (not future proof)
    nodes_dim = 0      # node dimension (axis is maybe a more familiar term nodes_dim is the position of "N" in tensor)
    head_dim = 1       # attention head dim

    def __init__(self, num_in_features, num_out_features, num_of_heads, gamma=0.1, beta=1, concat=True, activation=nn.ELU(),
                 dropout_prob=0.6, random_walk_with_restart=True, add_residual_connection=True, bias=True):
        super().__init__()

        self.num_of_heads = num_of_heads
        self.num_out_features = num_out_features
        self.concat = concat  # whether we should concatenate or average the attention heads
        self.residual_connection = add_residual_connection
        self.random_walk_with_restart = random_walk_with_restart
        self.gamma = gamma # RWR의 재시작 확률
        self.beta = beta # residual connection의 가중치

        #
        # Trainable weights: linear projection matrix (denoted as "W" in the paper), attention target/source
        # (denoted as "a" in the paper) and bias (not mentioned in the paper but present in the official GAT repo)
        #

        # You can treat this one matrix as num_of_heads independent W matrices
        self.linear_proj = nn.Linear(num_in_features, num_of_heads * num_out_features, bias=False)

        # After we concatenate target node (node i) and source node (node j) we apply the "additive" scoring function
        # which gives us un-normalized score "e". Here we split the "a" vector - but the semantics remain the same.
        # Basically instead of doing [x, y] (concatenation, x/y are node feature vectors) and dot product with "a"
        # we instead do a dot product between x and "a_left" and y and "a_right" and we sum them up
        # 즉 논문에서의 e_ij = LeakyReLU(a_T[ Wh_i ∣∣ Wh_j ])를 연산량을 줄이기 위해
        # e_ij​ = LeakyReLU((Wh_i​)⋅a_left​ + (Wh_j)⋅a_right​)로 바꿔 수행한다.
        self.scoring_fn_target = nn.Parameter(torch.Tensor(1, num_of_heads, num_out_features))
        self.scoring_fn_source = nn.Parameter(torch.Tensor(1, num_of_heads, num_out_features))

        # Bias is definitely not crucial to GAT - feel free to experiment (I pinged the main author, Petar, on this one)
        if bias and concat:
            self.bias = nn.Parameter(torch.Tensor(num_of_heads * num_out_features))
        elif bias and not concat:
            self.bias = nn.Parameter(torch.Tensor(num_out_features))
        else:
            self.register_parameter('bias', None)

        if self.residual_connection or self.random_walk_with_restart:
            self.W_residual = nn.Linear(num_in_features, num_of_heads * num_out_features, bias=False)
        else:
            self.register_parameter('W_residual', None)

        #
        # End of trainable weights
        #

        self.leakyReLU = nn.LeakyReLU(0.2)  # using 0.2 as in the paper, no need to expose every setting
        self.activation = activation
        # Probably not the nicest design but I use the same module in 3 locations, before/after features projection
        # and for attention coefficients. Functionality-wise it's the same as using independent modules.
        self.dropout = nn.Dropout(p=dropout_prob)

        self.reset_parameter()
        
    def forward(self, data):
        #
        # Step 1: Linear Projection + regularization
        #

        in_nodes_features, edge_index, initial_features = data  # unpack data
        if initial_features is None and (self.residual_connection or self.random_walk_with_restart):
            initial_features = in_nodes_features
            initial_features = self.W_residual(in_nodes_features).reshape(-1, self.num_of_heads, self.num_out_features)
        num_of_nodes = in_nodes_features.shape[self.nodes_dim]
        assert edge_index.shape[0] == 2, f'Expected edge index with shape=(2,E) got {edge_index.shape}'

        # shape = (노드 수, 입력 특징 수) : 각 노드의 입력 특징을 나타낸다.
        # 논문에서와 같이 모든 입력 노드 특징에 dropout을 적용한다.
        # Note: for Cora features are already super sparse so it's questionable how much this actually helps
        in_nodes_features = self.dropout(in_nodes_features) # 공식 GAT 구현에서도 dropout을 사용.

        # shape = (노드 수, 입력 특징 수) * (입력 특징 수, 헤드 수 * 출력 특징 수) -> (노드 수, 헤드 수, 출력 특징 수)
        # We project the input node features into NH independent output features (one for each attention head)
        # 즉 각 노드의 특징을 각 헤드마다 다른 특징으로 변환한다.
        nodes_features_proj = self.linear_proj(in_nodes_features).reshape(-1, self.num_of_heads, self.num_out_features)

        nodes_features_proj = self.dropout(nodes_features_proj) # 공식 GAT 구현에서도 dropout을 사용.

        #
        # Step 2: Edge attention calculation
        #

        # Apply the scoring function (* represents element-wise (a.k.a. Hadamard) product)
        # shape = (노드 수, 헤드 수, 출력 특징 수) * (1, 헤드 수, 출력 특징 수) -> (노드 수, 헤드 수, 1) -> (노드 수, 헤드 수)
        # sum은 마지막 차원을 기준으로 sum하므로 (N, NH, FOUT) -> (N, NH) 즉, 노드 수 x 헤드 수
        # 여기서 학습 가능한 파라미터인 scoring_fn_source, scoring_fn_target을 사용하여 각 노드의 특징을 계산한다.
        # [GAT에선 a 벡터를 사용하여 두 노드의 특징을 결합한 후, scoring function을 통해 attention score를 계산한다.]
        scores_source = (nodes_features_proj * self.scoring_fn_source).sum(dim=-1) # 마지막 차원을 기준으로 sum 즉, (N, NH, FOUT) -> (N, NH) 즉, 노드 수 x 헤드 수
        scores_target = (nodes_features_proj * self.scoring_fn_target).sum(dim=-1) # (N, NH) 즉, 노드 수 x 헤드 수
        
        # We simply copy (lift) the scores for source/target nodes based on the edge index. Instead of preparing all
        # the possible combinations of scores we just prepare those that will actually be used and those are defined
        # by the edge index.
        # scores shape = (E, NH), nodes_features_proj_lifted shape = (E, NH, FOUT), E - number of edges in the graph
        # 각 엣지에 대한 source, target의 attention score를 계산한다.
        scores_source_lifted, scores_target_lifted, nodes_features_proj_lifted = self.lift(scores_source, scores_target, nodes_features_proj, edge_index)
        scores_per_edge = self.leakyReLU(scores_source_lifted + scores_target_lifted)

        # shape = (엣지 수, 헤드 수, 1) -> (엣지 수, 헤드 수, 1) (unsqueeze를 통해 차원을 추가한다. 그래야 element-wise 곱을 할 수 있다.)
        # 이제 softmax를 통해 attention coefficient를 계산한다.
        attentions_per_edge = self.neighborhood_aware_softmax(scores_per_edge, edge_index[1], num_of_nodes)
        # Add stochasticity to neighborhood aggregation
        attentions_per_edge = self.dropout(attentions_per_edge)

        #
        # Step 3: Neighborhood aggregation
        #

        # Element-wise (aka Hadamard) product. Operator * does the same thing as torch.mul
        # shape = (엣지 수, 헤드 수, 출력 특징 수) * (엣지 수, 헤드 수, 1) -> (엣지 수, 헤드 수, 출력 특징 수) 1이 FOUT으로 브로드캐스팅된다.
        # FOUT은 출력 특징 수이다. 즉, 각 엣지의 attention score를 이용하여 각 노드의 특징을 가중합하여 계산한다.
        nodes_features_proj_lifted_weighted = nodes_features_proj_lifted * attentions_per_edge

        # 이 부분은 각 노드의 이웃 노드의 특징을 가중합하여 계산한다.
        # shape = (노드 수, 헤드 수, 출력 특징 수)
        out_nodes_features = self.aggregate_neighbors(initial_features, nodes_features_proj_lifted_weighted, edge_index, in_nodes_features, num_of_nodes)

        #
        # Step 4: Residual/skip connections, concat and bias
        #

        out_nodes_features = self.skip_concat_bias(out_nodes_features, initial_features)

        return (out_nodes_features, edge_index, initial_features)
    
    def reset_parameter(self):
        """
        원래 논문에서 GAT을 구현한 코드가 TensorFlow로 되어있고, 그 코드에서는 기본 초기화 방법으로 사용했기 때문에
        Glorot (Xavier uniform) initialization을 사용한다.
        
        Tensorflow의 기본 초기화 방법은 Glorot (Xavier uniform) initialization이다.
        https://stackoverflow.com/questions/37350131/what-is-the-default-variable-initializer-in-tensorflow

        """
        nn.init.xavier_uniform_(self.linear_proj.weight)
        nn.init.xavier_uniform_(self.scoring_fn_target)
        nn.init.xavier_uniform_(self.scoring_fn_source)
        if self.residual_connection or self.random_walk_with_restart:
            nn.init.xavier_uniform_(self.W_residual.weight)

        if self.bias is not None:
            torch.nn.init.zeros_(self.bias)

    def lift(self, scores_source, scores_target, nodes_features_matrix_proj, edge_index):
        """
        Lifts i.e. duplicates certain vectors depending on the edge index.
        One of the tensor dims goes from N -> E (that's where the "lift" comes from).
        즉 lifts는 edge index에 따라 특정 벡터를 엣지 수만큼 복제한다.
        텐서의 차원 중 하나가 N -> E로 변한다.
        여기서 N은 노드 수, E는 엣지 수이다.

        """
        # src_nodes_index :  tensor([   0,    0,    0,  ..., 2707, 2707, 2707])
        # trg_nodes_index :  tensor([ 633, 1862, 2582,  ...,  598, 1473, 2706])
        src_nodes_index = edge_index[0]
        trg_nodes_index = edge_index[1]

        # scores_source shape before :  torch.Size([2708, 8])
        scores_source = scores_source.index_select(self.nodes_dim, src_nodes_index)
        # scores_source shape after :  torch.Size([10556, 8])
        scores_target = scores_target.index_select(self.nodes_dim, trg_nodes_index)
        # nodes_features_proj shape torch.Size([2708, 8, 8])
        nodes_features_matrix_proj_lifted = nodes_features_matrix_proj.index_select(self.nodes_dim, src_nodes_index)
        # nodes_features_matrix_proj_lifted shape :  torch.Size([10556, 8, 8])

        return scores_source, scores_target, nodes_features_matrix_proj_lifted

    def neighborhood_aware_softmax(self, scores_per_edge, trg_index, num_of_nodes):
        """
        이웃 노드들의 attention score를 softmax를 통해 계산한다.
        """

        # Calculate the numerator. Make logits <= 0 so that e^logit <= 1 (this will improve the numerical stability)
        # https://stats.stackexchange.com/questions/338285/how-does-the-subtraction-of-the-logit-maximum-improve-learning
        scores_per_edge = scores_per_edge - scores_per_edge.max()
        exp_scores_per_edge = scores_per_edge.exp()

        # Calculate the denominator. shape = (E, NH)
        neigborhood_aware_denominator = self.sum_edge_scores_neighborhood_aware(exp_scores_per_edge, trg_index, num_of_nodes)

        # 1e-16은 이론적으로 필요하지 않지만 수치적 안정성을 위해 (0으로 나누는 것을 피하기 위해) 추가했다.
        attentions_per_edge = exp_scores_per_edge / (neigborhood_aware_denominator + 1e-16)
        # shape = (E, NH) -> (E, NH, 1)로 만들어서 projected node features와 element-wise 곱을 할 수 있게 한다.

        return attentions_per_edge.unsqueeze(-1)
    
    def sum_edge_scores_neighborhood_aware(self, exp_scores_per_edge, trg_index, num_of_nodes):
        # attention head 개수만큼 브로드캐스팅한다. 여기서 브로드캐스팅이란 차원을 늘려서 연산을 수행하는 것을 의미한다.
        # E -> (E, NH)
        trg_index_broadcasted = self.explicit_broadcast(trg_index, exp_scores_per_edge)

        # shape = (N, NH)
        size = list(exp_scores_per_edge.shape)  # convert to list otherwise assignment is not possible
        size[self.nodes_dim] = num_of_nodes
        neighborhood_sums = torch.zeros(size, dtype=exp_scores_per_edge.dtype, device=exp_scores_per_edge.device)

        # exp_scores_per_edge를 trg_index_broadcasted의 값을 index로 사용하여 neighborhood_sums에 더한다.
        # 그렇게 되면 각 노드의 이웃 노드들의 attention score의 합을 계산할 수 있다.
        neighborhood_sums.scatter_add_(self.nodes_dim, trg_index_broadcasted, exp_scores_per_edge)

        # edge_index의 차원에 맞게 브로드캐스팅한다.
        # 모든 location의 값이 i번째 노드의 attention score의 합으로 브로드캐스팅되는 것이다.
        # shape = (N, NH) -> (E, NH)
        return neighborhood_sums.index_select(self.nodes_dim, trg_index)

    def aggregate_neighbors(self, initial_features, nodes_features_proj_lifted_weighted, edge_index, in_nodes_features, num_of_nodes):
        # shape = (E, NH, FOUT) -> (N, NH, FOUT)
        out_nodes_features = torch.zeros(num_of_nodes, *nodes_features_proj_lifted_weighted.shape[1:], dtype=in_nodes_features.dtype, device=in_nodes_features.device)

        # shape = (E) -> (E, NH, FOUT)
        trg_index_broadcasted = self.explicit_broadcast(edge_index[1], nodes_features_proj_lifted_weighted)
        # aggregation step - we accumulate projected, weighted node features for all the attention heads
        # shape = (E, NH, FOUT) -> (N, NH, FOUT)
        out_nodes_features.scatter_add_(self.nodes_dim, trg_index_broadcasted, nodes_features_proj_lifted_weighted)

        # Random walk with Restart
        if self.random_walk_with_restart:
            out_nodes_features = (1 - self.gamma) * out_nodes_features + self.gamma * initial_features

        return out_nodes_features

    def explicit_broadcast(self, this, other):
        # 차원이 같아질 때까지 singleton 차원을 추가한다.
        for _ in range(this.dim(), other.dim()):
            this = this.unsqueeze(-1) # 가장 마지막 차원을 추가한다.

        # other 텐서와 같은 모양으로 확장하는데, 이 때 실제로 데이터를 복사하지는 않고, 필요에 따라 가상적으로 차원을 확장한다.
        return this.expand_as(other)
    
    def skip_concat_bias(self, out_nodes_features, initial_features):
        if self.residual_connection:  # add residual connection
            if out_nodes_features.shape[-1] == initial_features.shape[-1]:  # if FIN == FOUT
                out_nodes_features += initial_features
        if self.concat:
            # shape = (N, NH, FOUT) -> (N, NH*FOUT)
            out_nodes_features = out_nodes_features.reshape(-1, self.num_of_heads * self.num_out_features)
        else:
            # shape = (N, NH, FOUT) -> (N, FOUT)
            out_nodes_features = out_nodes_features.mean(dim=self.head_dim)

        if self.bias is not None:
            out_nodes_features += self.bias

        return out_nodes_features if self.activation is None else self.activation(out_nodes_features)

## Network

In [6]:
"""
Reference: GRAPH ATTENTION NETWORKS (2018).

https://github.com/PetarV-/GAT
https://github.com/gordicaleksa/pytorch-GAT
"""

import torch.nn as nn

class GATWithIRCRWR(nn.Module):
    def __init__(self, num_of_additional_layer, num_in_features, num_classes, random_walk_with_restart=True, add_residual_connection=True, bias=True, dropout=0.6):
        super().__init__()

        additional_layers = []

        for _ in range(num_of_additional_layer):
            additional_layers.append(
                    GATLayerWithIRCRWR(
                    num_in_features=8*8,  # consequence of concatenation
                    num_out_features=8,
                    num_of_heads=8,
                    concat=True,
                    activation=nn.ELU(),
                    dropout_prob=dropout,
                    random_walk_with_restart=random_walk_with_restart,
                    add_residual_connection=add_residual_connection,
                    bias=bias
                ),
            )

        self.gat_net = nn.Sequential(
            GATLayerWithIRCRWR(
                num_in_features=num_in_features,  # consequence of concatenation
                num_out_features=8,
                num_of_heads=8,
                concat=True,
                activation=nn.ELU(),
                dropout_prob=dropout,
                random_walk_with_restart=random_walk_with_restart,
                add_residual_connection=add_residual_connection,
                bias=bias
            ),
            *additional_layers,
            GATLayerWithIRCRWR(
                num_in_features=8 * 8,  # consequence of concatenation
                num_out_features=num_classes,
                num_of_heads=1,
                concat=False,  # last GAT layer does mean avg, the others do concat
                activation=None,  # last layer just outputs raw scores
                dropout_prob=dropout,
                random_walk_with_restart=False,
                add_residual_connection=False,
                bias=bias
            )
        )

    # data is just a (in_nodes_features, edge_index) tuple, I had to do it like this because of the nn.Sequential:
    # https://discuss.pytorch.org/t/forward-takes-2-positional-arguments-but-3-were-given-for-nn-sqeuential-with-linear-layers/65698
    def forward(self, data):
        data = data + (None,)
        return self.gat_net(data)

## Training

In [9]:
import enum
import argparse

import torch
import torch.nn as nn
from torch.optim import Adam

# 3 different model training/eval phases used in train.py
class LoopPhase(enum.Enum):
    TRAIN = 0,
    VAL = 1,
    TEST = 2

# Global vars used for early stopping. After some number of epochs (as defined by the patience_period var) without any
# improvement on the validation dataset (measured via accuracy metric), we'll break out from the training loop.
BEST_VAL_ACC = 0
BEST_VAL_LOSS = 0


def get_training_args(time_start, dataset, train_range, val_range, test_range, num_input_features, num_classes, random_walk_with_restart, add_residual_connection, num_of_additional_layer=0):
    parser = argparse.ArgumentParser()

    # Training related
    parser.add_argument("--num_of_epochs", type=int, help="number of training epochs", default=10000)
    parser.add_argument("--patience_period", type=int, help="number of epochs with no improvement on val before terminating", default=1000)
    parser.add_argument("--lr", type=float, help="model learning rate", default=5e-3)
    parser.add_argument("--weight_decay", type=float, help="L2 regularization on model weights", default=5e-4)
    parser.add_argument("--should_test", type=bool, help='should test the model on the test dataset?', default=True)

    # Dataset related
    # parser.add_argument("--dataset_name", choices=[el.name for el in DatasetType], help='dataset to use for training', default=DatasetType.CORA.name)
    parser.add_argument("--should_visualize", type=bool, help='should visualize the dataset?', default=False)

    # Logging/debugging/checkpoint related (helps a lot with experimentation)
    parser.add_argument("--enable_tensorboard", type=bool, help="enable tensorboard logging", default=False)
    parser.add_argument("--console_log_freq", type=int, help="log to output console (epoch) freq (None for no logging)", default=100)
    parser.add_argument("--checkpoint_freq", type=int, help="checkpoint model saving (epoch) freq (None for no logging)", default=1000)
    args = parser.parse_args("")

    # Model architecture related - this is the architecture as defined in the official paper (for Cora classification)
    gat_config = {
        "num_of_layers": 2,  # GNNs, contrary to CNNs, are often shallow (it ultimately depends on the graph properties)
        "num_heads_per_layer": [8, 1],
        "num_features_per_layer": [num_input_features, 8, num_classes],
        "add_skip_connection": False,  # hurts perf on Cora
        "bias": True,  # result is not so sensitive to bias
        "dropout": 0.6,  # result is sensitive to dropout
        "dataset": dataset,
        "time_start": time_start,
        "train_range": train_range,
        "val_range": val_range,
        "test_range": test_range,
        "random_walk_with_restart": random_walk_with_restart,
        "add_residual_connection": add_residual_connection,
        "num_of_additional_layer": num_of_additional_layer
    }

    # Wrapping training configuration into a dictionary
    training_config = dict()
    for arg in vars(args):
        training_config[arg] = getattr(args, arg)

    # Add additional config information
    training_config.update(gat_config)

    return training_config

import time


def train_gat(config, save_to_file, filename):
    global BEST_VAL_ACC, BEST_VAL_LOSS

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU, I hope so!

    # Step 1: load the graph data
    # node_features, node_labels, edge_index, train_indices, val_indices, test_indices = load_graph_data(config, device)
    # Step 1: load the graph data
    node_features = config["dataset"][0].x.to(device)
    node_labels = config["dataset"][0].y.to(device)
    edge_index = config["dataset"][0].edge_index.to(device)

    # Indices that help us extract nodes that belong to the train/val and test splits
    train_indices = torch.arange(config["train_range"][0], config["train_range"][1], dtype=torch.long, device=device)
    val_indices = torch.arange(config["val_range"][0], config["val_range"][1], dtype=torch.long, device=device)
    test_indices = torch.arange(config["test_range"][0], config["test_range"][1], dtype=torch.long, device=device)

    # Step 2: prepare the model
    gat = GATWithIRCRWR(config["num_of_additional_layer"], config["num_features_per_layer"][0], config["num_features_per_layer"][-1], random_walk_with_restart=config["random_walk_with_restart"], add_residual_connection=config["add_residual_connection"], bias=True, dropout=0.6).to(device)
    # gat = GAT(1433, 7, add_skip_connection=False, bias=True, dropout=0.6).to(device)

    # Step 3: Prepare other training related utilities (loss & optimizer and decorator function)
    loss_fn = nn.CrossEntropyLoss(reduction='mean')
    optimizer = Adam(gat.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    # THIS IS THE CORE OF THE TRAINING (we'll define it in a minute)
    # The decorator function makes things cleaner since there is a lot of redundancy between the train and val loops
    main_loop = get_main_loop(
        config,
        gat,
        loss_fn,
        optimizer,
        node_features,
        node_labels,
        edge_index,
        train_indices,
        val_indices,
        test_indices,
        config['patience_period'],
        time.time())

    BEST_VAL_ACC, BEST_VAL_LOSS, PATIENCE_CNT = [0, 0, 0]  # reset vars used for early stopping

    # Step 4: Start the training procedure
    for epoch in range(config['num_of_epochs']):
        # Training loop
        main_loop(phase=LoopPhase.TRAIN, epoch=epoch, save_to_file=save_to_file, filename=filename)

        # Validation loop
        with torch.no_grad():
            try:
                main_loop(phase=LoopPhase.VAL, epoch=epoch, save_to_file=save_to_file, filename=filename)
            except Exception as e:  # "patience has run out" exception :O
                print(str(e))
                break  # break out from the training loop

    # Step 5: Potentially test your model
    # Don't overfit to the test dataset - only when you've fine-tuned your model on the validation dataset should you
    # report your final loss and accuracy on the test dataset. Friends don't let friends overfit to the test data. <3
    if config['should_test']:
        test_acc = main_loop(phase=LoopPhase.TEST)
        config['test_acc'] = test_acc
        save_to_file(filename, f'Test accuracy = {test_acc}\n')
        print(f'Test accuracy = {test_acc}')
    else:
        config['test_acc'] = -1

    # Save the latest GAT in the binaries directory
    # torch.save(get_training_state(config, gat), os.path.join(BINARIES_PATH, get_available_binary_name()))

# Simple decorator function so that I don't have to pass arguments that don't change from epoch to epoch
def get_main_loop(config, gat, cross_entropy_loss, optimizer, node_features, node_labels, edge_index, train_indices, val_indices, test_indices, patience_period, time_start):

    node_dim = 0  # this will likely change as soon as I add an inductive example (Cora is transductive)

    train_labels = node_labels.index_select(node_dim, train_indices)
    val_labels = node_labels.index_select(node_dim, val_indices)
    test_labels = node_labels.index_select(node_dim, test_indices)

    # node_features shape = (N, FIN), edge_index shape = (2, E)
    graph_data = (node_features, edge_index)  # I pack data into tuples because GAT uses nn.Sequential which requires it

    def get_node_indices(phase):
        if phase == LoopPhase.TRAIN:
            return train_indices
        elif phase == LoopPhase.VAL:
            return val_indices
        else:
            return test_indices

    def get_node_labels(phase):
        if phase == LoopPhase.TRAIN:
            return train_labels
        elif phase == LoopPhase.VAL:
            return val_labels
        else:
            return test_labels

    def main_loop(phase, epoch=0, save_to_file=None, filename=None):
        global BEST_VAL_ACC, BEST_VAL_LOSS, PATIENCE_CNT, writer

        # Certain modules behave differently depending on whether we're training the model or not.
        # e.g. nn.Dropout - we only want to drop model weights during the training.
        if phase == LoopPhase.TRAIN:
            gat.train()
        else:
            gat.eval()

        node_indices = get_node_indices(phase)
        gt_node_labels = get_node_labels(phase)  # gt stands for ground truth

        # Do a forwards pass and extract only the relevant node scores (train/val or test ones)
        # Note: [0] just extracts the node_features part of the data (index 1 contains the edge_index)
        # shape = (N, C) where N is the number of nodes in the split (train/val/test) and C is the number of classes
        nodes_unnormalized_scores = gat(graph_data)[0].index_select(node_dim, node_indices)

        # Example: let's take an output for a single node on Cora - it's a vector of size 7 and it contains unnormalized
        # scores like: V = [-1.393,  3.0765, -2.4445,  9.6219,  2.1658, -5.5243, -4.6247]
        # What PyTorch's cross entropy loss does is for every such vector it first applies a softmax, and so we'll
        # have the V transformed into: [1.6421e-05, 1.4338e-03, 5.7378e-06, 0.99797, 5.7673e-04, 2.6376e-07, 6.4848e-07]
        # secondly, whatever the correct class is (say it's 3), it will then take the element at position 3,
        # 0.99797 in this case, and the loss will be -log(0.99797). It does this for every node and applies a mean.
        # You can see that as the probability of the correct class for most nodes approaches 1 we get to 0 loss! <3
        loss = cross_entropy_loss(nodes_unnormalized_scores, gt_node_labels)

        if phase == LoopPhase.TRAIN:
            optimizer.zero_grad()  # clean the trainable weights gradients in the computational graph (.grad fields)
            loss.backward()  # compute the gradients for every trainable weight in the computational graph
            optimizer.step()  # apply the gradients to weights

        # Finds the index of maximum (unnormalized) score for every node and that's the class prediction for that node.
        # Compare those to true (ground truth) labels and find the fraction of correct predictions -> accuracy metric.
        class_predictions = torch.argmax(nodes_unnormalized_scores, dim=-1)
        accuracy = torch.sum(torch.eq(class_predictions, gt_node_labels).long()).item() / len(gt_node_labels)

        #
        # Logging
        #

        if phase == LoopPhase.TRAIN:
            # Log metrics
            if config['enable_tensorboard']:
                writer.add_scalar('training_loss', loss.item(), epoch)
                writer.add_scalar('training_acc', accuracy, epoch)

            # Save model checkpoint
            if config['checkpoint_freq'] is not None and (epoch + 1) % config['checkpoint_freq'] == 0:
                ckpt_model_name = f"gat_ckpt_epoch_{epoch + 1}.pth"
                config['test_acc'] = -1
                # torch.save(get_training_state(config, gat), os.path.join(CHECKPOINTS_PATH, ckpt_model_name))

        elif phase == LoopPhase.VAL:
            # Log metrics
            if config['enable_tensorboard']:
                writer.add_scalar('val_loss', loss.item(), epoch)
                writer.add_scalar('val_acc', accuracy, epoch)

            # Log to console
            if config['console_log_freq'] is not None and epoch % config['console_log_freq'] == 0:
                save_to_file(filename, f'GAT training: time elapsed= {(time.time() - time_start):.2f} [s] | epoch={epoch + 1} | val acc={accuracy}\n')
                print(f'GAT training: time elapsed= {(time.time() - time_start):.2f} [s] | epoch={epoch + 1} | val acc={accuracy}')

            # The "patience" logic - should we break out from the training loop? If either validation acc keeps going up
            # or the val loss keeps going down we won't stop
            if accuracy > BEST_VAL_ACC or loss.item() < BEST_VAL_LOSS:
                BEST_VAL_ACC = max(accuracy, BEST_VAL_ACC)  # keep track of the best validation accuracy so far
                BEST_VAL_LOSS = min(loss.item(), BEST_VAL_LOSS)
                PATIENCE_CNT = 0  # reset the counter every time we encounter new best accuracy
            else:
                PATIENCE_CNT += 1  # otherwise keep counting

            if PATIENCE_CNT >= patience_period:
                raise Exception('Stopping the training, the universe has no more patience for this training.')

        else:
            return accuracy  # in the case of test phase we just report back the test accuracy

    return main_loop  # return the decorated function

### Cora

In [55]:
import time
from torch_geometric.datasets import Planetoid

#
# Cora specific constants
#

# Thomas Kipf et al. first used this split in GCN paper and later Petar Veličković et al. in GAT paper
CORA_TRAIN_RANGE = [0, 140]  # we're using the first 140 nodes as the training nodes
CORA_VAL_RANGE = [140, 140+500]
CORA_TEST_RANGE = [1708, 1708+1000]
CORA_NUM_INPUT_FEATURES = 1433
CORA_NUM_CLASSES = 7

if __name__ == '__main__':
    print('Cora')
    dataset = Planetoid(root='./data/Cora', name='Cora')
    for i in range(4):
        cases = [[False, False, "GAT"], [True, False, "GAT with Random walk with restart"], [False, True, "GAT with Initial residual connection"], [True, True, "GAT with Random walk with restart and Initial residual connection"]]
        for case in cases:
            print(i+2, " layers ", case[-1])
            time_start = time.time()
            train_gat(get_training_args(time_start, dataset, CORA_TRAIN_RANGE, CORA_VAL_RANGE, CORA_TEST_RANGE, CORA_NUM_INPUT_FEATURES, CORA_NUM_CLASSES, random_walk_with_restart=case[0], add_residual_connection=case[1], num_of_additional_layer=i))
            # train_gat(time_start, dataset, CORA_TRAIN_RANGE, CORA_VAL_RANGE, CORA_TEST_RANGE, CORA_NUM_INPUT_FEATURES, CORA_NUM_CLASSES)
            print(f'Total training time: {(time.time() - time_start):.2f} [s]')

Cora
2  layers  GAT
GAT training: time elapsed= 0.01 [s] | epoch=1 | val acc=0.29
GAT training: time elapsed= 0.90 [s] | epoch=101 | val acc=0.802
GAT training: time elapsed= 1.81 [s] | epoch=201 | val acc=0.776
GAT training: time elapsed= 2.72 [s] | epoch=301 | val acc=0.8
GAT training: time elapsed= 3.63 [s] | epoch=401 | val acc=0.786
GAT training: time elapsed= 4.56 [s] | epoch=501 | val acc=0.782
GAT training: time elapsed= 5.46 [s] | epoch=601 | val acc=0.774
GAT training: time elapsed= 6.34 [s] | epoch=701 | val acc=0.788
GAT training: time elapsed= 7.23 [s] | epoch=801 | val acc=0.786
GAT training: time elapsed= 8.14 [s] | epoch=901 | val acc=0.796
GAT training: time elapsed= 9.04 [s] | epoch=1001 | val acc=0.784
GAT training: time elapsed= 9.95 [s] | epoch=1101 | val acc=0.786
Stopping the training, the universe has no more patience for this training.
Test accuracy = 0.793
Total training time: 9.99 [s]
2  layers  GAT with Random walk with restart
GAT training: time elapsed= 0.

KeyboardInterrupt: 

In [10]:
import time
from datetime import datetime
from torch_geometric.datasets import Planetoid

#
# Cora specific constants
#

# Thomas Kipf et al. first used this split in GCN paper and later Petar Veličković et al. in GAT paper
CORA_TRAIN_RANGE = [0, 140]  # we're using the first 140 nodes as the training nodes
CORA_VAL_RANGE = [140, 140+500]
CORA_TEST_RANGE = [1708, 1708+1000]
CORA_NUM_INPUT_FEATURES = 1433
CORA_NUM_CLASSES = 7

def save_to_file(filename, content):
    with open(filename, 'a') as f:
        f.write(content)

if __name__ == '__main__':
    filename = f'{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}_cora_test.txt'
    save_to_file(filename, 'Cora\n')
    dataset = Planetoid(root='./data/Cora', name='Cora')
    time_start = time.time()
    for i in range(4):
        cases = [[False, False, "GAT"], [True, False, "GAT with Random walk with restart"], [False, True, "GAT with Initial residual connection"], [True, True, "GAT with Random walk with restart and Initial residual connection"]]
        for case in cases:
            content = f"{i+2} layers {case[-1]}\n"
            save_to_file(filename, content)
            time_start = time.time()
            content = train_gat(get_training_args(time_start, dataset, CORA_TRAIN_RANGE, CORA_VAL_RANGE, CORA_TEST_RANGE, CORA_NUM_INPUT_FEATURES, CORA_NUM_CLASSES, random_walk_with_restart=case[0], add_residual_connection=case[1], num_of_additional_layer=i), save_to_file, filename)
            save_to_file(filename, f'Total training time: {(time.time() - time_start):.2f} [s]\n')

    save_to_file(filename, f'\n\nTotal training time for Full Process: {(time.time() - time_start):.2f} [s]\n')

GAT training: time elapsed= 0.01 [s] | epoch=1 | val acc=0.132
GAT training: time elapsed= 0.93 [s] | epoch=101 | val acc=0.784
GAT training: time elapsed= 1.90 [s] | epoch=201 | val acc=0.782
GAT training: time elapsed= 2.76 [s] | epoch=301 | val acc=0.796
GAT training: time elapsed= 3.72 [s] | epoch=401 | val acc=0.778
GAT training: time elapsed= 4.60 [s] | epoch=501 | val acc=0.784
GAT training: time elapsed= 5.54 [s] | epoch=601 | val acc=0.782
GAT training: time elapsed= 6.42 [s] | epoch=701 | val acc=0.784
GAT training: time elapsed= 7.33 [s] | epoch=801 | val acc=0.778
GAT training: time elapsed= 8.24 [s] | epoch=901 | val acc=0.794
GAT training: time elapsed= 9.20 [s] | epoch=1001 | val acc=0.782
GAT training: time elapsed= 10.14 [s] | epoch=1101 | val acc=0.79
GAT training: time elapsed= 11.05 [s] | epoch=1201 | val acc=0.774
GAT training: time elapsed= 11.98 [s] | epoch=1301 | val acc=0.774
GAT training: time elapsed= 12.86 [s] | epoch=1401 | val acc=0.776
GAT training: time 

### Citeseer

### Pubmed

In [34]:
import time
from torch_geometric.datasets import Planetoid

#
# Pubmed specific constants
#

# Thomas Kipf et al. first used this split in GCN paper and later Petar Veličković et al. in GAT paper
PUBMED_TRAIN_RANGE = [0, 60]  # we're using the first 140 nodes as the training nodes
PUBMED_VAL_RANGE = [60, 60+500]
PUBMED_TEST_RANGE = [18717, 18717+1000]
PUBMED_NUM_INPUT_FEATURES = 500
PUBMED_NUM_CLASSES = 3

if __name__ == '__main__':
    print('Pubmed')
    dataset = Planetoid(root='./data/Pubmed', name='Pubmed')
    time_start = time.time()
    train_gat(get_training_args(time_start, dataset, PUBMED_TRAIN_RANGE, PUBMED_VAL_RANGE, PUBMED_TEST_RANGE, PUBMED_NUM_INPUT_FEATURES, PUBMED_NUM_CLASSES))
    # train_gat(time_start, dataset, PUBMED_TRAIN_RANGE, PUBMED_VAL_RANGE, PUBMED_TEST_RANGE, PUBMED_NUM_INPUT_FEATURES, PUBMED_NUM_CLASSES)
    print(f'Total training time: {(time.time() - time_start):.2f} [s]')

Pubmed
is here?
is here?
is here?
GAT training: time elapsed= 0.03 [s] | epoch=1 | val acc=0.27
GAT training: time elapsed= 1.81 [s] | epoch=101 | val acc=0.766
GAT training: time elapsed= 3.66 [s] | epoch=201 | val acc=0.76
GAT training: time elapsed= 5.46 [s] | epoch=301 | val acc=0.726
GAT training: time elapsed= 7.19 [s] | epoch=401 | val acc=0.786
GAT training: time elapsed= 9.04 [s] | epoch=501 | val acc=0.774
GAT training: time elapsed= 10.76 [s] | epoch=601 | val acc=0.766
GAT training: time elapsed= 12.66 [s] | epoch=701 | val acc=0.752
GAT training: time elapsed= 14.53 [s] | epoch=801 | val acc=0.756
GAT training: time elapsed= 16.42 [s] | epoch=901 | val acc=0.732
GAT training: time elapsed= 18.27 [s] | epoch=1001 | val acc=0.766
GAT training: time elapsed= 20.13 [s] | epoch=1101 | val acc=0.762
Stopping the training, the universe has no more patience for this training.
Test accuracy = 0.76
Total training time: 20.75 [s]
