In [1]:
import torch
import os

In [2]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-2.0.1+cu118.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-2.0.1+cu118.html
!pip install torch-geometric
!pip install ogb

Looking in links: https://pytorch-geometric.com/whl/torch-2.0.1+cu118.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_scatter-2.1.2%2Bpt20cu118-cp310-cp310-linux_x86_64.whl (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt20cu118
Looking in links: https://pytorch-geometric.com/whl/torch-2.0.1+cu118.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_sparse-0.6.18%2Bpt20cu118-cp310-cp310-linux_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt20cu118
Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [9

In [3]:
from torch_geometric.datasets import TUDataset


# Where the dataset is stored on disk, and which TU benchmark to load.
root = "./enzymes"
name = "ENZYMES"

# Build the dataset (the first run downloads and processes it), then echo its summary.
pyg_dataset = TUDataset(root=root, name=name)
print(pyg_dataset)

Downloading https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip
Processing...


ENZYMES(600)


Done!


In [4]:
# Collect the headline statistics of the dataset in a single assignment.
num_classes, num_features, num_node_features, num_edge_features = (
    pyg_dataset.num_classes,
    pyg_dataset.num_features,
    pyg_dataset.num_node_features,
    pyg_dataset.num_edge_features,
)

# Display them as the cell output.
(num_classes, num_features, num_node_features, num_edge_features)

(6, 3, 3, 0)

In [5]:
# Look at the first sample: one graph stored as a `Data` object, with its class label in `y`.
# NOTE(review): each index appears to return one whole graph out of the 600 in the
# dataset, not a piece sampled from a larger graph; confirm against the dataset docs.
first_graph = pyg_dataset[0]
label = first_graph.y.item()
first_graph, label

(Data(edge_index=[2, 168], x=[37, 3], y=[1]), 5)

In [None]:
# Mini-batching and shuffling of graphs with torch_geometric.loader.DataLoader
from torch_geometric.loader import DataLoader

loader = DataLoader(
    pyg_dataset,
    batch_size=32,
    shuffle=True,
)


## OGB Benchmark Datasets
* All graphs are heterogeneous
* All kinds of tasks can be performed using this (Node, Edge and Graph Level)

##### **NB:** For edge prediction, we'll have to define a function for the transductive or inductive split and for appending the negative edges, because homogeneous link prediction is a binary task. The heterogeneous case is similar, except that it is a multiclass prediction.

##### For graph prediction, some graphs come with labels or features which could be used.

In [22]:
import torch_geometric.transforms as T
from ogb.nodeproppred import PygNodePropPredDataset

# Dataset properties and the task type are described in the online OGB documentation.
dataset_name = "ogbn-arxiv"

# Load the node-property-prediction dataset with the ToSparseTensor transform attached.
dataset = PygNodePropPredDataset(
    name=dataset_name,
    transform=T.ToSparseTensor(),
)

# Train / valid / test node indices as provided by the dataset itself.
split_idx = dataset.get_idx_split()

split_idx



Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip


Downloaded 0.08 GB: 100%|██████████| 81/81 [00:05<00:00, 14.59it/s]


Extracting dataset/arxiv.zip


Processing...


Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 6864.65it/s]


Converting graphs into PyG objects...


100%|██████████| 1/1 [00:00<00:00, 870.01it/s]

Saving...



Done!


{'train': tensor([     0,      1,      2,  ..., 169145, 169148, 169251]),
 'valid': tensor([   349,    357,    366,  ..., 169185, 169261, 169296]),
 'test': tensor([   346,    398,    451,  ..., 169340, 169341, 169342])}

In [26]:
# Show the metadata attached to the dataset (task type, eval metric, split scheme, ...).
dataset.meta_info

(<bound method Dataset.get_summary of PygNodePropPredDataset()>,
 num tasks                                                                1
 num classes                                                             40
 eval metric                                                            acc
 task type                                        multiclass classification
 download_name                                                        arxiv
 version                                                                  1
 url                      http://snap.stanford.edu/ogb/data/nodeproppred...
 add_inverse_edge                                                     False
 has_node_attr                                                         True
 has_edge_attr                                                        False
 split                                                                 time
 additional node files                                            node_year
 additional edge files 

In [30]:
# Pull one index tensor per partition out of the split dictionary.
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]

In [31]:
# Take the first (index 0) graph of the dataset and list the attributes stored on it.
data = dataset[0]
data.keys()

['node_year', 'y', 'x', 'adj_t', 'num_nodes']

In [36]:
# Label shapes per split. The train slice additionally selects column 0, so it comes
# out 1-D; the valid and test slices keep the trailing dimension of size 1 of `y`.
data.y[train_idx,0].shape, data.y[valid_idx].shape, data.y[test_idx].shape

(torch.Size([90941]), torch.Size([29799, 1]), torch.Size([48603, 1]))

In [None]:
# Batching and shuffling for a node-level task on a single large graph.
from torch_geometric.loader import DataLoader, NeighborLoader

# DataLoader collates whole graphs into a batch. This dataset holds exactly one graph,
# so every batch it yields is that entire graph: batch_size=32 does NOT pick 32 nodes.
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# For node-level mini-batches, use neighbourhood sampling instead: each batch starts
# from `batch_size` seed nodes of the training split and adds a sampled subset of
# their neighbours, so the seeds keep (part of) the context they need.
neighbor_loader = NeighborLoader(
    data,
    num_neighbors=[10, 10],  # illustrative fan-out for hop 1 and hop 2; tune per model depth
    batch_size=32,
    input_nodes=train_idx,
    shuffle=True,
)

In [None]:
#creating your own dataset
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Map-style dataset that turns stored graph records into PyG ``Data`` objects.

    Args:
        data: Indexable, sized collection of records. Each record is looked up with
            the keys ``'node_features'``, ``'edge_indices'`` and ``'batch'``.
        num_nodes: Kept on the instance as ``self.num_nodes``; not used by the class itself.
        num_edges: Kept on the instance as ``self.num_edges``; not used by the class itself.
    """

    def __init__(self, data, num_nodes, num_edges):
        self.data = data
        self.num_nodes = num_nodes
        self.num_edges = num_edges

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the record at ``idx`` packaged as a ``torch_geometric.data.Data``.

        Raises whatever the underlying collection raises for an invalid ``idx``, and
        ``KeyError`` if the record lacks one of the expected keys.
        """
        # Imported locally: the notebook only uses `from torch_geometric.x import ...`
        # and `import torch_geometric.transforms as T`, neither of which binds the bare
        # name `torch_geometric`, so `torch_geometric.data.Data` raised a NameError.
        from torch_geometric.data import Data

        # Fetch the record once, then pull out its three fields.
        record = self.data[idx]
        return Data(
            x=record['node_features'],
            edge_index=record['edge_indices'],
            batch=record['batch'],
        )