# Introduction By Example

This notebook works through the official PyTorch Geometric tutorial: https://pytorch-geometric.readthedocs.io/en/latest/get_started/introduction.html


## Data Handling of Graphs


In [1]:
import torch
from torch_geometric.data import Data

In [2]:
# Toy graph with three nodes connected as 0 - 1 - 2. Each connection is listed
# in both directions, so edge_index has 4 columns (shape [2, 4]).
edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]], dtype=torch.long)
# One scalar feature per node -> shape [3, 1].
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index)
data

Data(x=[3, 1], edge_index=[2, 4])

In [3]:
# Same graph as before, but written as a list of node-index pairs (shape [4, 2]).
edge_index = torch.tensor([[0, 1], [1, 0], [1, 2], [2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

# Transpose the pair list into the [2, num_edges] layout before passing it to Data.
data = Data(x=x, edge_index=edge_index.t().contiguous())
data

Data(x=[3, 1], edge_index=[2, 4])

In [4]:
edge_index.T

tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])

In [5]:
edge_index.T.contiguous()

tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])

In [6]:
data.validate(raise_on_error=True)

True

In [7]:
# Names of the attributes stored on this Data object.
print(data.keys())

# Attributes can also be read with dict-style indexing.
print(data["x"])

# Iterating over a Data object yields (key, value) pairs; only the key is printed.
for key, item in data:
    print(f"{key} found in data")

['x', 'edge_index']
tensor([[-1.],
        [ 0.],
        [ 1.]])
x found in data
edge_index found in data


In [8]:
"edge_attr" in data

False

In [9]:
data.num_nodes

3

In [10]:
data.num_edges

4

In [11]:
data.num_node_features

1

In [12]:
data.has_isolated_nodes()

False

In [13]:
data.is_directed()

False

## Common Benchmark Datasets

In [14]:
from torch_geometric.datasets import TUDataset

# Load the ENZYMES dataset, using /tmp/ENZYMES as its storage root.
dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
# Number of entries in the dataset.
len(dataset)

600

In [15]:
dataset.num_classes

6

In [16]:
dataset.num_node_features

3

In [17]:
# Take the first ENZYMES entry. This rebinds `data`, which until now held the
# hand-built 3-node toy graph.
data = dataset[0]
data

Data(edge_index=[2, 168], x=[37, 3], y=[1])

In [18]:
dataset[0].num_nodes, data.y

(37, tensor([5]))

In [19]:
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root="/tmp/Cora", name="Cora")

In [20]:
len(dataset)

1

In [21]:
dataset.num_classes

7

In [22]:
dataset.num_node_features

1433

In [23]:
# The Cora dataset holds a single graph (len(dataset) == 1), so index 0 is all of it.
data = dataset[0]

# Count of True entries in the train / validation / test masks.
data.train_mask.sum().item(), data.val_mask.sum().item(), data.test_mask.sum().item()

(140, 500, 1000)

## Mini-Batches

In [24]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader

# With use_node_attr=True the batches printed below carry 21 feature columns,
# versus the 3 reported by the earlier ENZYMES load without this flag.
dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES", use_node_attr=True)
# NOTE(review): shuffle=True and no seed is set anywhere in this notebook, so the
# per-batch node/edge counts printed below will differ from run to run.
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# After the loop finishes, `batch` still refers to the LAST mini-batch; the
# following cells inspect that leftover variable.
for batch in loader:
    print(batch)

    print(batch.num_graphs)

DataBatch(edge_index=[2, 4474], x=[1215, 21], y=[32], batch=[1215], ptr=[33])
32
DataBatch(edge_index=[2, 4024], x=[987, 21], y=[32], batch=[987], ptr=[33])
32
DataBatch(edge_index=[2, 3876], x=[1019, 21], y=[32], batch=[1019], ptr=[33])
32
DataBatch(edge_index=[2, 3996], x=[1006, 21], y=[32], batch=[1006], ptr=[33])
32
DataBatch(edge_index=[2, 3432], x=[879, 21], y=[32], batch=[879], ptr=[33])
32
DataBatch(edge_index=[2, 3848], x=[988, 21], y=[32], batch=[988], ptr=[33])
32
DataBatch(edge_index=[2, 3920], x=[997, 21], y=[32], batch=[997], ptr=[33])
32
DataBatch(edge_index=[2, 4490], x=[1213, 21], y=[32], batch=[1213], ptr=[33])
32
DataBatch(edge_index=[2, 4002], x=[1041, 21], y=[32], batch=[1041], ptr=[33])
32
DataBatch(edge_index=[2, 4100], x=[1124, 21], y=[32], batch=[1124], ptr=[33])
32
DataBatch(edge_index=[2, 3866], x=[985, 21], y=[32], batch=[985], ptr=[33])
32
DataBatch(edge_index=[2, 3714], x=[976, 21], y=[32], batch=[976], ptr=[33])
32
DataBatch(edge_index=[2, 4000], x=[1058,

In [25]:
# Every entry of a mini-batch is one (sub-)graph. All batches are full except
# possibly the last, and `batch` is the last mini-batch left over from the loop
# above, so this sum is the total number of graphs served by the loader.
# Sizes are read from the loader/batch instead of being hard-coded (32 and 24).

(len(loader) - 1) * loader.batch_size + batch.num_graphs

600

In [26]:
batch.x.shape, batch.y.shape

(torch.Size([803, 21]), torch.Size([24]))

In [27]:
batch.num_nodes

803

In [28]:
batch.x.shape

torch.Size([803, 21])

In [29]:
batch.batch

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5, 

In [30]:
import torch_scatter

# Mean-pool node features per graph: rows of batch.x are grouped by the graph id
# stored in batch.batch, giving one row per graph (see the shapes shown below).
# NOTE: this rebinds `x`, which earlier held the toy graph's node features.
x = torch_scatter.scatter(batch.x, batch.batch, dim=0, reduce="mean")
batch.x.shape, batch.batch.shape, x.shape

(torch.Size([803, 21]), torch.Size([803]), torch.Size([24, 21]))

In [31]:
batch.batch.unique().shape

torch.Size([24])

## Data Transforms

In [32]:
from torch_geometric.datasets import ShapeNet

dataset = ShapeNet(root="/tmp/ShapeNet", categories=["Airplane"])

dataset[0]

Downloading https://shapenet.cs.stanford.edu/media/shapenetcore_partanno_segmentation_benchmark_v0_normal.zip
Extracting /tmp/ShapeNet/shapenetcore_partanno_segmentation_benchmark_v0_normal.zip
Processing...
Done!


Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])

In [33]:
import torch_geometric.transforms as T
from torch_geometric.datasets import ShapeNet

# NOTE(review): the recorded output of this cell contains no edge_index even
# though pre_transform=T.KNNGraph(k=6) is passed. Presumably the processed files
# already written under /tmp/ShapeNet by an earlier load are being reused, so the
# pre_transform never ran — confirm, and clear the processed cache (or point
# `root` at a fresh directory) if the k-NN edges are actually wanted.
dataset = ShapeNet(
    root="/tmp/ShapeNet", categories=["Airplane"], pre_transform=T.KNNGraph(k=6)
)

dataset[0]



Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])

In [34]:
import torch_geometric.transforms as T
from torch_geometric.datasets import ShapeNet

# Adds a per-access `transform` (RandomJitter) on top of the `pre_transform`.
# NOTE(review): the recorded output of this cell shows no edge_index, which
# suggests the KNNGraph pre_transform was not applied to the data on disk under
# /tmp/ShapeNet — verify before relying on graph edges here.
dataset = ShapeNet(
    root="/tmp/ShapeNet",
    categories=["Airplane"],
    pre_transform=T.KNNGraph(k=6),
    transform=T.RandomJitter(0.01),
)

dataset[0]

Data(x=[2518, 3], y=[2518], pos=[2518, 3], category=[1])

## Learning Methods on Graphs

In [35]:
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root="/tmp/Cora", name="Cora")

In [37]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing per-node class scores.

    Args:
        in_channels: Size of each input node feature vector. When omitted,
            falls back to ``dataset.num_node_features`` of the notebook-level
            ``dataset`` (the original behaviour).
        hidden_channels: Width of the hidden layer; 16 as in the original.
        out_channels: Number of output classes. When omitted, falls back to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # `dataset` is rebound several times in this notebook. Only consult the
        # global when the caller did not give explicit sizes, so `GCN()` keeps
        # working while explicit sizes stay correct after `dataset` changes.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return log-softmax class scores for every node of ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor of log-probabilities, normalised over dim 1.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [38]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GCN().to(device)
# First entry of the current `dataset`, moved to the same device as the model.
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training mode enables the dropout inside GCN.forward.
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    # The whole graph goes through the model every epoch ...
    out = model(data)
    # ... but the loss only uses the nodes selected by train_mask. The model
    # returns log_softmax output, which is the input nll_loss expects.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [39]:
# Evaluation mode switches off the dropout in GCN.forward (it follows model.training).
model.eval()
# Inference needs no gradients; disabling autograd avoids building the graph
# and keeps `pred` detached.
with torch.no_grad():
    pred = model(data).argmax(dim=1)
# Accuracy is measured only on the nodes selected by test_mask.
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.8070


In [45]:
from torch_geometric.datasets import TUDataset

dataset = TUDataset(root="/tmp/IMDB", name="IMDB-BINARY")

Downloading https://www.chrsmrrs.com/graphkerneldatasets/IMDB-BINARY.zip
Processing...
Done!


In [46]:
len(dataset)

1000

In [53]:
import numpy as np

# Shuffle the indices with a fixed seed so the split is reproducible.
rng = np.random.default_rng(1026)
idx = rng.permutation(len(dataset))

# 80 / 10 / 10 split. The test split takes whatever remains after train and
# validation, so no entry is silently dropped when len(dataset) is not a
# multiple of 10 (three independent int() truncations could lose a few).
num_graphs = len(dataset)
train_size = int(0.8 * num_graphs)
val_size = int(0.1 * num_graphs)
test_size = num_graphs - train_size - val_size
train_idx, val_idx, test_idx = (
    idx[:train_size],
    idx[train_size : train_size + val_size],
    idx[train_size + val_size :],
)
train_dataset, val_dataset, test_dataset = (
    dataset[train_idx],
    dataset[val_idx],
    dataset[test_idx],
)

# The three splits must partition the dataset exactly.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == num_graphs
print(len(train_dataset), len(val_dataset), len(test_dataset))

800 100 100


IMDB-BINARY(800)