In [5]:
import sys
sys.executable # 'C:\\Users\\janet\\anaconda3\\envs\\GNN_SCHZ\\python.exe'

'C:\\Users\\janet\\anaconda3\\envs\\GNN_SCHZ\\python.exe'

In [7]:
import torch
print(torch.__version__)

2.1.0


## single graph Handling
torch_geometric.data.Data class

- data.x(num_nodes, num_node_features) : node feature matrix
- data.edge_index(2, 2*num_edges) : adjacency matrix를 만들 수 있음

data.edge_attr(num_edges, num_edge_features) : edge feature matrix

data.y : ground truth, target matrix

data.pos(num_nodes, num_dimensions) : 각 node의 위치

In [3]:
import torch

In [4]:
from torch_geometric.datasets import KarateClub

dataset= KarateClub()

print('============= Dataset =============')
print(f'Dataset : {dataset}')
print(f'graphs # : {len(dataset)}')
print(f'classes # : {dataset.num_classes}')
print('============= Graph 0 =============')
data = dataset[0]
print(data)
print('''
      x(node# , feature#) : feature matrix
      adjacency matrix는 edge_index를 통해 얻을 수 있음''')
print(f'nodes # : {data.num_nodes}')
print(f'edges # : {data.num_edges}')
print(f'features # per node : {dataset.num_node_features}')
print(f'features # per edge : {data.num_edge_features}')
print(f'Is undirected: {data.is_undirected()}')
print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
print(f'node pair : {data.edge_index.t()}')

Dataset : KarateClub()
graphs # : 1
classes # : 4
Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34])

      x(node# , feature#) : feature matrix
      adjacency matrix는 edge_index를 통해 얻을 수 있음
nodes # : 34
edges # : 156
features # per node : 34
features # per edge : 0
Is undirected: True
Contains isolated nodes: False
node pair : tensor([[ 0,  1],
        [ 0,  2],
        [ 0,  3],
        [ 0,  4],
        [ 0,  5],
        [ 0,  6],
        [ 0,  7],
        [ 0,  8],
        [ 0, 10],
        [ 0, 11],
        [ 0, 12],
        [ 0, 13],
        [ 0, 17],
        [ 0, 19],
        [ 0, 21],
        [ 0, 31],
        [ 1,  0],
        [ 1,  2],
        [ 1,  3],
        [ 1,  7],
        [ 1, 13],
        [ 1, 17],
        [ 1, 19],
        [ 1, 21],
        [ 1, 30],
        [ 2,  0],
        [ 2,  1],
        [ 2,  3],
        [ 2,  7],
        [ 2,  8],
        [ 2,  9],
        [ 2, 13],
        [ 2, 27],
        [ 2, 28],
        [ 2, 32],
        [ 3,  0],
        [

## Graph Classification

graph의 structural properties를 활용해 분류

### 1. Dataset Preprocessing

1. dataset 확인
2. train_test split
3. batch(grouping)

In [None]:
import torch
from torch_geometric.datasets import TUDataset

dataset = TUDataset(root='data/TUDataset', name='MUTAG')

1. dataset 확인

In [10]:
# dataset 확인
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of classes: {dataset.num_classes}')
print(f'Number of features: {dataset.num_features}')

print()
data = dataset[0]  # Get the first graph object.
print(data)
print('=============================================================')
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Dataset: MUTAG(188):
Number of graphs: 188
Number of classes: 2
Number of features: 7

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
Number of nodes: 17
Number of edges: 38
Average node degree: 2.24
Has isolated nodes: False
Has self-loops: False
Is undirected: True


2. train_test split

In [14]:
torch.manual_seed(0)
dataset = dataset.shuffle()

train_dataset = dataset[:150]
test_dataset = dataset[150:]

print(f'test ratio: {30/180:.2f}')
print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

test ratio: 0.17
Number of training graphs: 150
Number of test graphs: 38


3. batch(grouping)

하나의 batch에 일정한 개수(batch_size)의 그래프 데이터들 합쳐서 저장

하나의 그래프 데이터에 속하는 것 : feature matrix, target matrix, adjacency matrix

batch object에서의 batch attribute : 하나의 batch에 있는 node들이 각각 무슨 graph에 속하는지 저장한 벡터

In [15]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2628], x=[1190, 7], edge_attr=[2628, 4], y=[64], batch=[1190], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2628], x=[1181, 7], edge_attr=[2628, 4], y=[64], batch=[1181], ptr=[65])

Step 3:
Number of graphs in the current batch: 22
DataBatch(edge_index=[2, 800], x=[364, 7], edge_attr=[800, 4], y=[22], batch=[364], ptr=[23])



### 2. Training GNN
1. node embedding
2. graph embedding(readout layer)
3. train classifier on the graph embedding

$$ E = mc^2 $$

1. node embedding

message passing을 통해 node를 저차원으로 embedding 

2. graph embedding (readout layer)

하나의 graph에 속하는 node들의 embedding 값을 합쳐서 graph embedding

합치는 방법은 다양하나, 가장 많이 쓰이는 건 node embedding 평균

$$E = mc^2$$

$$ 
\mathbf{x}_{\mathcal{G}} = \frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \mathcal{x}^{(L)}_ $$$$

$$ E = mc^2 $$

3. classifier training