# ogbg-code2

## Set Up

In [18]:
import torch
import os
import pandas as pd
import torch.nn.functional as F
# Report the installed PyTorch build before anything else runs.
print("PyTorch has version {}".format(torch.__version__))

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device: {}'.format(device))

# The PyG built-in GCNConv
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader

from ogb.graphproppred import PygGraphPropPredDataset
from ogb.graphproppred import Evaluator

from tqdm.notebook import tqdm

PyTorch has version 2.0.1+cu118
Device: cuda


## Args

In [19]:
# Experiment settings gathered in one dict; the bare `args` on the last line
# makes the notebook display the resulting mapping.
args = dict(
    device=device,
    num_layers=5,
    hidden_dim=256,
    dropout=0.5,
    lr=0.001,
    epochs=30,
)
args

{'device': 'cuda',
 'num_layers': 5,
 'hidden_dim': 256,
 'dropout': 0.5,
 'lr': 0.001,
 'epochs': 30}

## Data loader

In [15]:
from ogb.graphproppred import PygGraphPropPredDataset
from torch_geometric.loader import DataLoader

# Single place to tune the mini-batch size shared by all three loaders.
BATCH_SIZE = 32

dataset = PygGraphPropPredDataset(name="ogbg-code2")

# Split indices supplied by the dataset, looked up below by "train" / "valid" / "test".
split_idx = dataset.get_idx_split()

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(dataset[split_idx["train"]], batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=BATCH_SIZE, shuffle=False)
print('Task type: {}'.format(dataset.task_type))

Task type: subtoken prediction


## Performance Evaluator

In [16]:
from ogb.graphproppred import Evaluator

evaluator = Evaluator(name="ogbg-code2")
# Print the input and output formats the evaluator advertises, one after the other.
print(evaluator.expected_input_format, evaluator.expected_output_format, sep="\n")
# For ogbg-code2 the evaluator compares sub-token sequences, so the input is
# input_dict = {"seq_ref": seq_ref, "seq_pred": seq_pred}
# result_dict = evaluator.eval(input_dict)  # -> {"F1": F1}

==== Expected input format of Evaluator for ogbg-code2
{'seq_ref': seq_ref, 'seq_pred': seq_pred}
- seq_ref: a list of lists of strings
- seq_pred: a list of lists of strings
where seq_ref stores the reference sequences of sub-tokens, and
seq_pred stores the predicted sequences of sub-tokens.

==== Expected output format of Evaluator for ogbg-code2
{'F1': F1}
- F1 (float): F1 score averaged over samples.



## GCN Model

In [17]:
class GCN(torch.nn.Module):
    """Stack of ``GCNConv`` layers with batch norm, ReLU and dropout in between.

    Every convolution except the last is followed by
    ``BatchNorm1d`` -> ReLU -> dropout. The output of the last convolution is
    returned unchanged when ``return_embeds`` is truthy, and passed through
    ``LogSoftmax(dim=1)`` otherwise.

    Args:
        input_dim: Feature width fed to the first convolution.
        hidden_dim: Output width of every convolution except the last.
        output_dim: Output width of the last convolution (also used when
            ``num_layers == 1``).
        num_layers: Number of convolutions to stack.
        dropout: Probability handed to ``F.dropout`` after each non-final layer.
        return_embeds: When truthy, ``forward`` skips the log-softmax and
            returns the last convolution's output directly.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout, return_embeds=True):
        super(GCN, self).__init__()

        # A list of GCNConv layers
        self.convs = torch.nn.ModuleList()
        # A list of 1D batch normalization layers (one per non-final conv)
        self.bns = torch.nn.ModuleList()
        for i in range(num_layers):
            is_last = i == num_layers - 1
            # Input and output widths are decided independently so that a
            # single-layer model maps input_dim -> output_dim instead of
            # silently ignoring output_dim.
            in_dim = input_dim if i == 0 else hidden_dim
            out_dim = output_dim if is_last else hidden_dim
            self.convs.append(GCNConv(in_dim, out_dim))
            if not is_last:
                self.bns.append(torch.nn.BatchNorm1d(hidden_dim))

        # Log-softmax over dim=1 of the final layer output
        self.softmax = torch.nn.LogSoftmax(dim=1)
        # Probability of an element getting zeroed
        self.dropout = dropout
        # Skip classification layer and return node embeddings
        self.return_embeds = return_embeds

    def reset_parameters(self):
        """Re-initialise every convolution and batch-norm layer."""
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, edge_index):
        """Run ``x`` through the layer stack.

        Args:
            x: Input passed to the first convolution.
            edge_index: Forwarded unchanged to every convolution.

        Returns:
            The last convolution's output, either raw (``return_embeds``
            truthy) or after ``LogSoftmax(dim=1)``.
        """
        last = len(self.convs) - 1
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            # The final layer gets no batch norm / activation / dropout.
            if i < last:
                x = self.bns[i](x)
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)
        if self.return_embeds:
            return x
        return self.softmax(x)

## GCN_Graph Model

In [None]:
class GCN_Graph(torch.nn.Module):
    def __init__(self, hidden_dim, output_dim, num_layers, dropout):
        super(GCN_Graph, self).__init__()
        

