<a href="https://colab.research.google.com/github/hadwin-357/GCN/blob/main/Scaling_GNNs_working_with_large_graphes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Environment setup: install PyTorch Geometric and its compiled extensions.
import os
import torch
# Export the installed torch version so the shell commands below can
# interpolate it into the wheel index URL via ${TORCH}.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter and torch-sparse are resolved against the PyG wheel index
# page that matches this torch version.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# NOTE(review): PyG is installed from the unpinned default branch, so the
# installed code can differ between runs; consider pinning a release or commit.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.1.0+cu121
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [3]:
# For a large graph, we have to partition it into many clusters
# (the approach used by Cluster-GCN).
# Example: the PubMed citation graph.

import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load PubMed through the Planetoid dataset class with the NormalizeFeatures transform.
# NOTE(review): 'Planeoid' in the root path looks like a typo for 'Planetoid';
# it is only a cache directory name, and renaming it would move the download location.
dataset = Planetoid(root='data/Planeoid', name='PubMed', transform=NormalizeFeatures())

# Dataset-level summary.
print(f'dataset: {dataset}')
print(f'num of graphes: {len(dataset)}')
print(f'num of node features:{dataset.num_features}')
print(f'num of classes:{dataset.num_classes}')

# The first (index 0) graph of the dataset.
data = dataset[0]

# Graph-level summary; the average degree is computed as num_edges / num_nodes.
print(f'data: {data}')
print(f'Num of Nodes:{data.num_nodes}')
print(f'Num of edges:{data.num_edges}')
print(f'Average node degree: {data.num_edges/ data.num_nodes: .2f}')
print(f'has self_loops: {data.has_self_loops()}')
print(f'has isolated nodes: {data.has_isolated_nodes()}')
print(f'is directional: {data.is_directed()}')



Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...


dataset: PubMed()
num of graphes: 1
num of node features:500
num of classes:3
data: Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])
Num of Nodes:19717
Num of edges:88648
Average node degree:  4.50
has self_loops: False
has isolated nodes: False
is directional: False


Done!


In [4]:
# Use ClusterData and ClusterLoader to partition the graph and batch the partitions.
# Source code for ClusterData: https://pytorch-geometric.readthedocs.io/en/1.4.3/_modules/torch_geometric/data/cluster.html
import builtins

from torch_geometric.loader import ClusterData, ClusterLoader

torch.manual_seed(42)
# Step 1: partition the full graph into 128 subgraphs.
cluster_data = ClusterData(data, num_parts=128)
# Step 2: every batch combines 32 partitions (drawn in shuffled order) into one
# larger subgraph, so one pass over the loader yields 128 / 32 = 4 batches.
train_loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True)

# Fetch one batch for inspection. The call goes through `builtins.next`
# because this cell rebinds the global name `next` below; a bare `next(...)`
# would raise TypeError when the cell is executed a second time.
first_batch = builtins.next(iter(train_loader))
# NOTE(review): this shadows the builtin `next`. The binding is kept only
# because a later cell reads the batch under this name; prefer `first_batch`.
next = first_batch

print(first_batch)






Computing METIS partitioning...


Data(x=[4909, 500], y=[4909], train_mask=[4909], val_mask=[4909], test_mask=[4909], edge_index=[2, 15304])


Done!


In [11]:
# Element-wise comparison of the test and validation masks of the batch that an
# earlier cell stored under the name `next`.
# NOTE(review): True only says the two masks hold the same value for that node;
# presumably most entries are True because both masks are False there — confirm
# before reading this as "the masks are identical".
next.test_mask == next.val_mask

tensor([True, True, True,  ..., True, True, True])

In [5]:
#build model_0
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.nn import Linear

class GCN(torch.nn.Module):
  """Two graph-convolution layers followed by a linear classification head.

  The input feature size and the number of output classes are read from the
  notebook-level `dataset` object (`num_features` and `num_classes`).

  Args:
    hidden_channels: Output width of both GCNConv layers.
  """

  def __init__(self, hidden_channels):
    super().__init__()
    num_inputs = dataset.num_features
    num_outputs = dataset.num_classes
    # Layers are created in the order conv1, conv2, lin1.
    self.conv1 = GCNConv(num_inputs, hidden_channels)
    self.conv2 = GCNConv(hidden_channels, hidden_channels)
    self.lin1 = Linear(hidden_channels, num_outputs)

  def forward(self, x, edge_index):
    """Return the per-node outputs of the linear head.

    Pipeline: conv1 -> ReLU -> dropout (p=0.5, active only in training mode)
    -> conv2 -> lin1. No activation is applied between conv2 and lin1.
    """
    hidden = F.relu(self.conv1(x, edge_index))
    hidden = F.dropout(hidden, p=0.5, training=self.training)
    hidden = self.conv2(hidden, edge_index)
    return self.lin1(hidden)


model_0 = GCN(hidden_channels=16)
model_0




GCN(
  (conv1): GCNConv(500, 16)
  (conv2): GCNConv(16, 16)
  (lin1): Linear(in_features=16, out_features=3, bias=True)
)

In [10]:
# Ask the notebook front end to cap this cell's output height (maxHeight: 300).
# NOTE(review): the script calls the `google.colab` JavaScript object, so it is
# presumably a no-op (or a browser-console error) outside Colab — confirm.
from IPython.display import Javascript
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))
# Needed for the `torch_geometric.data.*` annotations on train() and test() below.
import torch_geometric

# Adam over model_0's parameters (lr=0.01, weight_decay=5e-4) with a cross-entropy criterion.
optimizer = torch.optim.Adam(model_0.parameters(), lr=0.01, weight_decay=5e-4)
loss_fn = torch.nn.CrossEntropyLoss()

def train(model:torch.nn.Module, dataloader: torch_geometric.data.DataLoader, loss_fn:torch.nn.Module, optimizer: torch.optim.Optimizer):
  """Run one training epoch over the mini-batches and return the mean batch loss.

  Args:
    model: Network called as `model(x, edge_index)`.
    dataloader: Iterable of batches exposing `x`, `edge_index`, `y` and `train_mask`.
    loss_fn: Criterion applied to the outputs and labels selected by `train_mask`.
    optimizer: Optimizer stepped once per processed batch.

  Returns:
    The mean of the per-batch loss values as a Python float, or 0.0 when no
    batch contained a training node.
  """
  model.train()
  total_loss, num_batches = 0.0, 0
  for subdata in dataloader:
    # A batch without any training node has nothing to learn from; with a
    # mean-reduced criterion the loss over zero samples would be NaN.
    if not bool(subdata.train_mask.any()):
      continue
    optimizer.zero_grad()
    out = model(subdata.x, subdata.edge_index)
    loss = loss_fn(out[subdata.train_mask], subdata.y[subdata.train_mask])
    loss.backward()
    optimizer.step()
    # .item() yields a plain float, so the running sum does not keep every
    # batch's autograd graph alive.
    total_loss += loss.item()
    num_batches += 1
  return total_loss / num_batches if num_batches else 0.0

# For evaluation there is no need to partition the graph.

def test(model:torch.nn.Module, data: torch_geometric.data.Data, loss_fn:torch.nn.Module ):
  """Compute the model's accuracy on the train, validation and test masks of `data`.

  Args:
    model: Network called as `model(x, edge_index)`; switched to eval mode here.
    data: Graph exposing `x`, `edge_index`, `y`, `train_mask`, `val_mask` and `test_mask`.
    loss_fn: Accepted for signature symmetry with `train`; not used.

  Returns:
    A list `[train_acc, val_acc, test_acc]` of floats.
  """
  model.eval()
  with torch.inference_mode():
    predicted = model(data.x, data.edge_index).argmax(dim=1)
    hits = predicted == data.y
    splits = (data.train_mask, data.val_mask, data.test_mask)
    # Accuracy per split: correct predictions inside the mask / mask size.
    accs = [hits[split].sum().item() / int(split.sum()) for split in splits]

  return accs

# Number of passes over the clustered training loader.
NUM_EPOCHS = 100

for epoch in range(1, NUM_EPOCHS + 1):
  loss = train(model_0, train_loader, loss_fn=loss_fn, optimizer=optimizer)
  # Evaluation runs on the full, unpartitioned graph.
  train_acc, val_acc, test_acc = test(model_0, data=data, loss_fn=loss_fn)
  # Every accuracy is printed with three decimals: `.3f` sets the precision,
  # whereas a bare `3f` would only set a minimum field width.
  print(f'Epoch: {epoch} train: {train_acc:.3f} val :{val_acc:.3f} test: {test_acc:.3f}')



<IPython.core.display.Javascript object>

Epoch: 1 train:  0.333333 val :0.196 test: 0.180000
Epoch: 2 train:  0.533333 val :0.414 test: 0.397000
Epoch: 3 train:  0.650000 val :0.536 test: 0.527000
Epoch: 4 train:  0.700000 val :0.554 test: 0.556000
Epoch: 5 train:  0.716667 val :0.576 test: 0.578000
Epoch: 6 train:  0.866667 val :0.700 test: 0.690000
Epoch: 7 train:  0.916667 val :0.754 test: 0.738000
Epoch: 8 train:  0.900000 val :0.704 test: 0.691000
Epoch: 9 train:  0.950000 val :0.744 test: 0.729000
Epoch: 10 train:  0.850000 val :0.634 test: 0.610000
Epoch: 11 train:  0.983333 val :0.708 test: 0.683000
Epoch: 12 train:  0.950000 val :0.742 test: 0.719000
Epoch: 13 train:  0.983333 val :0.762 test: 0.742000
Epoch: 14 train:  1.000000 val :0.782 test: 0.736000
Epoch: 15 train:  1.000000 val :0.778 test: 0.747000
Epoch: 16 train:  0.983333 val :0.762 test: 0.751000
Epoch: 17 train:  1.000000 val :0.780 test: 0.732000
Epoch: 18 train:  1.000000 val :0.778 test: 0.722000
Epoch: 19 train:  0.983333 val :0.764 test: 0.745000
Ep