Chapter 4 Example

Chapter 4 - Convolutional GNNs

This notebook applies GCN and GraphSAGE to the Amazon Products dataset (ogbn-products). The GNN library used is PyTorch Geometric (PyG).

This code is also in the repository: https://github.com/keitabroadwater/gnns_in_action. The repository code will be updated periodically.

I. Install packages, Load Data and Import Packages

II. Light EDA

III. Model Setup

IV. Training

(Note: because this dataset is 1.3GB, it may tax the memory of some machines. If that happens, I recommend running Parts II, III, and IV in separate sessions. Another technique is to free large variables that are no longer in use with the `del` statement followed by `gc.collect()`.)

Acknowledgements

-------------------------------------------------------------------

# Part I. Install Packages, Load Data and Import Packages

In [None]:
# Find the CUDA version PyTorch was installed with
!python -c "import torch; print(torch.version.cuda)"

In [None]:
# PyTorch version
!python -c "import torch; print(torch.__version__)"

In [None]:
%%capture
!pip install ogb pyg-lib torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.13.0+cu116.html
!pip install torch-geometric

In [None]:
import torch
import torch.nn.functional as F
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.data import NeighborSampler
from torch_geometric.nn import SAGEConv, GCNConv
from torch_geometric import utils, loader


# importing the OGB dataset
from ogb.nodeproppred import PygNodePropPredDataset
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

from pandas.core.common import flatten
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={'figure.figsize':(16.7,8.27)})
sns.set_theme(style="ticks")

import collections
import os.path as osp
import pandas as pd
import numpy as np
import collections
from pandas.core.common import flatten
from scipy.special import softmax
import gc
from tqdm import tqdm

Download and load the Amazon products dataset. To save processing time and space, we will use the Sparse Tensor form when processing the data. 

For more information about the use of sparse tensors in PyG, see: https://pytorch-geometric.readthedocs.io/en/latest/notes/sparse_tensor.html

In [None]:
# Download and load the OGB ogbn-products dataset.
# NOTE(review): `root` is computed here but is not passed to the constructor
# on the following line, so that call relies on the library's default dataset
# location - confirm this is intended.
root = osp.join(osp.dirname(osp.realpath('./')), 'data', 'products')
# ToSparseTensor is supplied as the dataset's transform (see the sparse-tensor note above)
dataset = PygNodePropPredDataset( name='ogbn-products', transform=T.ToSparseTensor())

In [None]:
# Accuracy evaluator that OGB provides for the ogbn-products benchmark
evaluator = Evaluator(name='ogbn-products')

# Train on the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report the selected device ('cuda' is printed when a GPU is in use)
print('Device: {}'.format(device))


# Part II. Light EDA (exploratory data analysis)

(For the EDA, I am using a Tensor form of the data, rather than the SparseTensor form used in the model training.)

For the exploratory data analysis, we will:


1.   Get basic stats for nodes, edges, and features
2.   Examine the category labels, and their distribution in the dataset

---------------------------------------------------------------


1. Get Basic Stats

In [None]:
# Load the plain-Tensor form of the graph (no ToSparseTensor transform) for exploration.
# No custom root is passed, so this uses the same default dataset location as
# the SparseTensor load in Part I; the 1.3GB archive is therefore not
# downloaded and processed a second time into a different directory.
dataset_for_eda = PygNodePropPredDataset(name='ogbn-products')
data_for_eda = dataset_for_eda[0]
# Display the length of the graph data object
len(data_for_eda)

In [None]:
# Headline statistics of the ogbn-products graph, printed one per line
graph_stats = (
    ("Number of nodes in the graph:", data_for_eda.num_nodes),
    ("Number of edges in the graph:", data_for_eda.num_edges),
    ("Node feature matrix with shape:", data_for_eda.x.shape),  # [num_nodes, num_node_features]
    ("Target:", data_for_eda.y.shape),
    ("Node feature length:", dataset_for_eda.num_features),
)
for caption, stat in graph_stats:
    print(caption, stat)

2. Examine labels and their distribution

In [None]:
# List the distinct label values present in the target tensor.
# There are 47 unique product categories.
data_for_eda.y.unique()

In [None]:
# Load the integer-label -> product-category table shipped inside the dataset.
# The path is built from the dataset's own root directory rather than a
# hard-coded Colab location ('/content/...'), so the cell keeps working when
# the notebook runs elsewhere or the data lives under a different root.
labels_df = pd.read_csv(
    osp.join(dataset_for_eda.root, 'mapping', 'labelidx2productcategory.csv.gz')
)

In [None]:
# Preview the first ten rows of the label-index -> product-category table
labels_df[:10]

In [None]:
# Build a lookup from integer label (first column) to product category (second column)
label_idx = labels_df.iloc[:, 0].values
prod_cat = labels_df.iloc[:, 1].values
label_mapping = dict(zip(label_idx, prod_cat))


In [None]:
# Count the number of nodes in each category.
# The nested list returned by `tolist()` is unpacked with a plain
# comprehension instead of pandas' private `pandas.core.common.flatten`
# helper, which is not part of the public pandas API.
y = [label for row in data_for_eda.y.tolist() for label in row]
count_y = collections.Counter(y)
print(count_y)

In [None]:
# Convert the Counter of label frequencies into a plain dict and display it
dict_labels = dict(count_y)
dict_labels

In [None]:
# Lookup from label index to product-category name, keyed by the CSV's named columns
index_product_dict = {
    idx: category
    for idx, category in zip(labels_df['label idx'], labels_df['product category'])
}
index_product_dict

In [None]:
# Translate label indices into category names, then chart node counts per
# category as a horizontal bar plot ordered by count.
products_hist = {
    index_product_dict[label]: n_nodes for label, n_nodes in count_y.items()
}
category_df = (
    pd.DataFrame(products_hist.items(), columns=['Category', 'Count'])
    .set_index('Category')
    .sort_values('Count')
)
category_df.plot(kind='barh')


In [None]:
# Mean and median number of nodes per product category
category_df['Count'].mean(), category_df['Count'].median()

In [None]:
# category_df was sorted by 'Count' above; show its first 20 rows
category_df.head(20)

# Part III. Model Setup

For the model setup, we will:


1.   Setup GCN
2.   Setup GraphSage
3.   Setup the training routine



In [None]:
# First (index 0) graph object of the SparseTensor dataset loaded in Part I.
# NOTE(review): the training cells below operate on `data_cpu`, not on `data`.
data = dataset[0]

1. Setup GCN



In [None]:
class GCN(torch.nn.Module):
    """Three-layer GCN that returns per-node log-probabilities.

    Every ``GCNConv`` is built with ``normalize=False``, so the layers apply
    the adjacency they are given as-is.

    Args:
        in_dim: Size of the input node features.
        hidden_dim: Width of the two hidden layers.
        out_dim: Number of output classes.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.2):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(in_dim, hidden_dim, normalize=False)
        self.conv2 = GCNConv(hidden_dim, hidden_dim, normalize=False)
        self.conv3 = GCNConv(hidden_dim, out_dim, normalize=False)

    def forward(self, x, adj_t):
        """Return log-softmax class scores for node features ``x`` over ``adj_t``."""
        # Hidden layers: convolution -> ReLU -> dropout (dropout only while training)
        for hidden_conv in (self.conv1, self.conv2):
            x = hidden_conv(x, adj_t)
            x = F.dropout(F.relu(x), p=self.dropout, training=self.training)

        # Output layer: no activation or dropout before the log-softmax
        logits = self.conv3(x, adj_t)
        return torch.log_softmax(logits, dim=-1)

2. Setup GraphSage


In [None]:
class GraphSAGE(torch.nn.Module):
    """Three-layer GraphSAGE network that returns per-node log-probabilities.

    Args:
        in_dim: Size of the input node features.
        hidden_dim: Width of the two hidden layers.
        out_dim: Number of output classes.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.2):
        super().__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.conv3 = SAGEConv(hidden_dim, out_dim)

    def forward(self, x, adj_t):
        """Return log-softmax class scores for node features ``x`` over ``adj_t``."""
        x = self.conv1(x, adj_t)
        x = F.elu(x)
        # F.dropout defaults to training=True, so the module's mode must be
        # passed explicitly; otherwise dropout stays active in model.eval()
        # and randomly perturbs the evaluation results.
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, adj_t)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # Output layer: no activation or dropout before the log-softmax
        x = self.conv3(x, adj_t)

        return torch.log_softmax(x, dim=-1)

3. Setup training routine

In [None]:
#
def train(model, data, train_idx, optimizer):
    """Run one full-graph optimisation step and return the training loss.

    Args:
        model: Callable as ``model(x, adj_t)`` returning per-node log-probabilities.
        data: Graph object exposing ``x``, ``adj_t`` and ``y``.
        train_idx: Indices of the nodes that contribute to the loss.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The negative log-likelihood loss on the training nodes as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    # Forward pass over every node; only the training nodes enter the loss
    log_probs = model(data.x, data.adj_t)
    targets = data.y.squeeze(1)
    loss = F.nll_loss(log_probs[train_idx], targets[train_idx])

    loss.backward()
    optimizer.step()
    return loss.item()


@torch.no_grad()
def test(model, data, split_idx, evaluator):
    model.eval()

    out = model(data.x, data.adj_t)
    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': data.y[split_idx['train']],
        'y_pred': y_pred[split_idx['train']],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': data.y[split_idx['valid']],
        'y_pred': y_pred[split_idx['valid']],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': data.y[split_idx['test']],
        'y_pred': y_pred[split_idx['test']],
    })['acc']

    return train_acc, valid_acc, test_acc

In [None]:
# Graph object used for training, plus the dataset's train/valid/test node split
data_cpu = dataset[0]
split_idx = dataset.get_idx_split()
train_idx = split_idx['train']

# Report how many nodes fall into each split
for caption, split_name in (
    ('Number of training nodes:', 'train'),
    ('Number of validation nodes:', 'valid'),
    ('Number of test nodes:', 'test'),
):
    print(caption, split_idx[split_name].size(0))

# Part IV. Model Training

For the model training, we will:


1.   Train GraphSage
2.   Train GCN


1. GraphSage

In [None]:
# Hyperparameters for the GraphSAGE run
lr = .01
epochs = 100
hidden_dim = 90 #256 too high
evaluator = Evaluator(name='ogbn-products')

model = GraphSAGE(in_dim=data_cpu.num_node_features, 
                 hidden_dim=hidden_dim, 
                 out_dim=dataset.num_classes).to(device)

# Move the graph to the training device (the variable keeps the name `data_cpu`)
data_cpu = data_cpu.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# One row per reporting epoch: [epoch, train_acc, valid_acc, test_acc]
result_list = []
for epoch in range(1, 1 + epochs):
    loss = train(model, data_cpu, train_idx, optimizer)

    # The evaluation result is only printed and recorded on every 10th epoch,
    # so the full-graph evaluation is run only on those epochs instead of
    # being computed and discarded on the other nine.
    if epoch % 10 == 0:
        result = test(model, data_cpu, split_idx, evaluator)
        train_acc, valid_acc, test_acc = result
        print(f'Epoch: {epoch}/{epochs}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')
        
        result_list.append([epoch,train_acc, valid_acc, test_acc])

In [None]:
# Split the logged rows into the epoch axis (x) and the accuracy values (y)
x = [row[0] for row in result_list]
y = [row[1:] for row in result_list]

In [None]:
# Number of accuracy values stored per logged epoch (train, valid, test)
len(y[0])

In [None]:
import matplotlib.pyplot as plt

# Each logged row is [epoch, train_acc, valid_acc, test_acc]
x = [row[0] for row in result_list]
y = [row[1:] for row in result_list]
plt.xlabel("Epoch")
plt.ylabel("Training/Val/Test Curve")
plt.title("Learning Curves")

# zip(*y) transposes the per-epoch triples into one series per split. Each
# curve is labelled by its split name (instead of an opaque 'id 0/1/2') so the
# legend is readable, and an empty log simply draws nothing rather than
# failing on y[0].
for split_name, curve in zip(("Train", "Valid", "Test"), zip(*y)):
    plt.plot(x, curve, label=split_name)

plt.legend()
plt.show()

In [None]:
from torch_geometric.loader import DataLoader as DL

2. GCN

In [None]:
# torch.cuda.empty_cache()

# Hyperparameters for the GCN run
lr = .01 #1e-4 
epochs = 300 
hidden_dim = 75
evaluator = Evaluator(name='ogbn-products')

model = GCN(in_dim=data_cpu.num_node_features, 
                 hidden_dim=hidden_dim, 
                 out_dim=dataset.num_classes).to(device)

# Pre-compute GCN normalization (the GCN layers are built with normalize=False):
# add self-loops, then scale the adjacency by D^-1/2 on both sides.
# NOTE: the result overwrites data_cpu.adj_t, so re-running this cell would
# normalize an already-normalized matrix.
adj_t = data_cpu.adj_t.set_diag()
deg = adj_t.sum(dim=1).to(torch.float)
deg_inv_sqrt = deg.pow(-0.5)
# Zero-degree rows give inf after pow(-0.5); zero them out
deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1)
data_cpu.adj_t = adj_t

data_cpu = data_cpu.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(1, 1 + epochs):
    loss = train(model, data_cpu, train_idx, optimizer)

    # The evaluation result is only printed on every 10th epoch, so the
    # full-graph evaluation is run only on those epochs instead of being
    # computed and discarded on the other nine.
    if epoch % 10 == 0:
        result = test(model, data_cpu, split_idx, evaluator)
        train_acc, valid_acc, test_acc = result
        print(f'Epoch: {epoch}/{epochs}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')