<a href="https://colab.research.google.com/github/kiasar/gnn_CiteSeer_classifier/blob/main/A3_E2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# torch is imported first because the wheel index URLs below interpolate
# {torch.__version__} to select builds matching the installed PyTorch.
import torch

# Remove any preinstalled PyG packages before reinstalling them.
# NOTE(review): `--y` is presumably an abbreviation of `--yes` (skip the confirmation prompt) — confirm.
!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
# Compiled PyG extensions, fetched from the wheel index for this torch version.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
# PyTorch Geometric itself is installed from the GitHub repository (unpinned).
!pip install git+https://github.com/pyg-team/pytorch_geometric.git
# Extra packages; presumably needed by the torch_geometric.graphgym import in the next cell — verify.
!pip install yacs
!pip install pytorch_lightning

In [None]:
import torch
import torch.nn.functional as F

import torchvision
from torch_geometric import nn
from torch_geometric.graphgym import optim
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GINConv
from torch_geometric.nn import MLP

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import numpy as np
from tqdm import tqdm

# Record the library versions this notebook was run with (one per line).
print(torch.__version__, matplotlib.__version__, np.__version__, sep="\n")

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import torch
def visualize(emb: torch.Tensor, node_type: torch.Tensor):
    """Project node embeddings to 2-D with t-SNE and show a scatter plot.

    Args:
        emb: Node embeddings, one row per node: (nNodes, hidden_dim).
            May live on any device and may require grad; it is detached
            and copied to the CPU before being handed to t-SNE.
        node_type: (nNodes,). Entries are torch.int64 ranged from 0 to
            num_class - 1; used only to colour the points.
    """
    points = TSNE(n_components=2).fit_transform(emb.detach().cpu().numpy())

    # Matplotlib converts `c` through NumPy, which cannot read CUDA tensors,
    # so the labels are moved to host memory first (a no-op on CPU).
    if torch.is_tensor(node_type):
        colors = node_type.detach().cpu().numpy()
    else:
        colors = node_type

    plt.figure(figsize=(10, 10))
    plt.scatter(points[:, 0], points[:, 1], s=70, c=colors, cmap="Set2")
    plt.show()

In [4]:
# Hyperparameters shared by the models in this notebook:
# hidden width, number of layers, and the Adam learning rate.
hidden_dim, num_layers, learning_rate = 64, 2, 0.01

# PART 1

In [5]:
# CiteSeer (Planetoid collection), with NormalizeFeatures applied as the transform.
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

dataset = Planetoid(
    root="data/Planetoid",
    name='CiteSeer',
    transform=NormalizeFeatures(),
)

# Print a short summary of the dataset.
print(f'Dataset: {dataset}:')
print('======================')
for _what, _count in (
    ('graphs', len(dataset)),
    ('features', dataset.num_features),
    ('classes', dataset.num_classes),
):
    print(f'Number of {_what}: {_count}')

# The dataset holds a single graph; keep a handle to it and show its layout.
data = dataset[0]
print(data)
## Expected output:
# Dataset: CiteSeer():
# ======================
# Number of graphs: 1
# Number of features: 3703
# Number of classes: 6
# Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...


Dataset: CiteSeer():
Number of graphs: 1
Number of features: 3703
Number of classes: 6
Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])


Done!


In [6]:
class GraphClassifier(torch.nn.Module):
    """Two-layer GCN that predicts a class for every node of a graph.

    Args:
        dataset: Object exposing `num_features` (input width) and
            `num_classes` (output width).
        hidden_dim: Width of the hidden representation after the first layer.
    """

    def __init__(self, dataset, hidden_dim):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, dataset.num_classes)

    def forward(self, data, do_visualize=False):
        """Return per-node log-probabilities of shape (nNodes, num_classes).

        Args:
            data: Graph object providing `x`, `edge_index` and, when
                `do_visualize` is set, the node labels `y`.
            do_visualize: When True, plot a t-SNE projection of the
                second-layer output coloured by `data.y`.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        # The output layer is left linear: a ReLU here would clamp every
        # class score to >= 0 before log_softmax and block the gradient of
        # any score that went negative.
        x = self.conv2(x, edge_index)
        if do_visualize:
            # Labels are moved to the CPU because matplotlib cannot read
            # CUDA tensors when colouring the scatter plot.
            visualize(x, data.y.detach().cpu())
        return F.log_softmax(x, dim=1)

In [None]:
# Embeddings of an untrained model, as a baseline for the post-training plot.
# The model must sit on the same device as the data, otherwise the forward
# pass raises a device-mismatch error whenever a GPU is available.
model = GraphClassifier(dataset, hidden_dim).to(device)
print(model)

# Only the plot is wanted here, so run without autograd; placing the call
# inside the `with` block also keeps the raw output tensor out of the cell output.
with torch.no_grad():
    model(dataset[0].to(device), do_visualize=True)

In [9]:
def train(model, optimizer, data):
    """Run one full-batch optimisation step on the training nodes.

    Args:
        model: Module whose forward takes `data` and returns per-node
            log-probabilities.
        optimizer: Optimiser bound to `model`'s parameters.
        data: Graph object providing the labels `y` and the boolean
            `train_mask` selecting the nodes that contribute to the loss.

    Returns:
        The training loss of this step as a Python float (computed before
        the parameter update).
    """
    model.train()
    optimizer.zero_grad()
    out = model(data)

    # Only the training nodes contribute, so the ground-truth labels can be
    # indexed directly; no need to start from the model's own predictions.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])

    # The graph is rebuilt on every call, so it does not need to be retained.
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
def evaluate(model, data):
    """Compute loss and accuracy on the train, val and test node splits.

    Args:
        model: Module whose forward takes `data` and returns per-node
            log-probabilities.
        data: Graph object with labels `y` and boolean masks reachable as
            `data['train_mask']`, `data['val_mask']`, `data['test_mask']`.

    Returns:
        Dict with the keys '<split> loss' and '<split> acc' for each of
        the three splits, all plain Python numbers.
    """
    model.eval()
    with torch.no_grad():
        log_probs = model(data)

    metrics = {}
    for split in ('train', 'val', 'test'):
        split_mask = data[f'{split}_mask']
        split_out = log_probs[split_mask]
        targets = data.y[split_mask]

        # Second element of max(1) is the index of the best class per node.
        _, predicted = split_out.max(1)
        n_correct = predicted.eq(targets).sum().item()

        metrics[f'{split} loss'] = F.nll_loss(split_out, targets).item()
        metrics[f'{split} acc'] = n_correct / split_mask.sum().item()

    return metrics

In [None]:
def run(dataset, model, epochs, optimizer, lossF=None):
    """Train `model` on the first graph of `dataset` and record metrics per epoch.

    Args:
        dataset: Indexable dataset; only `dataset[0]` is used and it is
            moved to the notebook-level `device`.
        model: Model to optimise; expected to be on `device` already.
        epochs: Number of full-batch training steps to perform.
        optimizer: Optimiser bound to `model`'s parameters.
        lossF: Unused. Kept so existing calls keep working; the loss is
            computed with `F.nll_loss` inside `train` and `evaluate`.

    Returns:
        Tuple `(test_acc, train_acc, training_loss)`: three lists with one
        entry per epoch, measured after that epoch's update.
    """
    data = dataset[0].to(device)

    training_loss, test_acc, train_acc = [], [], []
    for _ in range(epochs):
        train(model, optimizer, data)
        metrics = evaluate(model, data)

        training_loss.append(metrics["train loss"])
        train_acc.append(metrics["train acc"])
        test_acc.append(metrics["test acc"])

    return test_acc, train_acc, training_loss

In [None]:
# Fresh GCN on the target device, trained with Adam for 200 epochs.
model = GraphClassifier(dataset, hidden_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
lossF = torch.nn.CrossEntropyLoss()

# Per-epoch histories used by the plotting cells below.
test_acc, train_acc, traning_loss = run(dataset, model, 200, optimizer, lossF)

In [None]:
# Turn per-epoch accuracies (fractions) into error percentages.
classification_test_error, classification_train_error = (
    [100 - 100 * acc for acc in history] for history in (test_acc, train_acc)
)

In [None]:
# Plot the training loss per epoch.
# The x-axis follows the number of recorded epochs instead of a hard-coded 200,
# so the plot stays valid when the epoch count passed to run() changes.
x_axis = list(range(1, len(traning_loss) + 1))
plt.xlabel("Epoch")
plt.ylabel("loss")
plt.title("training loss")
plt.plot(x_axis, traning_loss, label="training loss")
plt.legend()
plt.show()

In [None]:
# Plot the test-set classification error (percent) per epoch.
# The x-axis is derived from the data so it cannot drift from the epoch count.
x_axis = list(range(1, len(classification_test_error) + 1))
plt.xlabel("Epoch")
plt.ylabel("Test error percentage")
plt.title("classification error test")
plt.plot(x_axis, classification_test_error, label="Error percentage")
plt.legend()
plt.show()

In [None]:
# Plot the training-set classification error (percent) per epoch.
# The x-axis is derived from the data so it cannot drift from the epoch count.
x_axis = list(range(1, len(classification_train_error) + 1))
plt.xlabel("Epoch")
plt.ylabel("Train error percentage")
plt.title("classification error train")
plt.plot(x_axis, classification_train_error, label="Error percentage")
plt.legend()
plt.show()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import torch
# NOTE(review): this re-declares `visualize` from the earlier cell; it is kept
# so this cell runs on its own, but the two definitions should be merged.
def visualize(emb: torch.Tensor, node_type: torch.Tensor):
    """Project node embeddings to 2-D with t-SNE and show a scatter plot.

    Args:
        emb: Node embeddings, one row per node: (nNodes, hidden_dim).
        node_type: (nNodes,). Entries are torch.int64 ranged from 0 to
            num_class - 1; used only to colour the points.
    """
    points = TSNE(n_components=2).fit_transform(emb.detach().cpu().numpy())

    # Matplotlib converts `c` through NumPy, which cannot read CUDA tensors,
    # so the labels are moved to host memory first (a no-op on CPU).
    if torch.is_tensor(node_type):
        colors = node_type.detach().cpu().numpy()
    else:
        colors = node_type

    plt.figure(figsize=(10, 10))
    plt.scatter(points[:, 0], points[:, 1], s=70, c=colors, cmap="Set2")
    plt.show()


# Embeddings of the trained GCN. `visualize` needs the embeddings and the
# labels, so it is reached through the model's `do_visualize` hook rather
# than being called with no arguments (which raised a TypeError).
model.eval()
with torch.no_grad():
    model(dataset[0].to(device), do_visualize=True)

# PART 2 — GIN on CiteSeer

In [None]:
class GraphClassifier2(torch.nn.Module):
    """Two-layer GIN that predicts a class for every node of a graph.

    Each GINConv wraps an `MLP` built from a two-entry channel list
    (input width, output width).

    Args:
        dataset: Object exposing `num_features` (input width) and
            `num_classes` (output width).
        hidden_dim: Width of the hidden representation after the first layer.
    """

    def __init__(self, dataset, hidden_dim):
        super().__init__()
        self.conv1 = GINConv(MLP([dataset.num_features, hidden_dim]))
        self.conv2 = GINConv(MLP([hidden_dim, dataset.num_classes]))

    def forward(self, data):
        """Return per-node log-probabilities of shape (nNodes, num_classes)."""
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        # The output layer is left linear: a ReLU here would clamp every
        # class score to >= 0 before log_softmax and block the gradient of
        # any score that went negative.
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [None]:
# Fresh GIN on the target device, trained with Adam for 200 epochs.
model = GraphClassifier2(dataset, hidden_dim).to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
lossF = torch.nn.CrossEntropyLoss()

# Per-epoch histories used by the plotting cells below.
test_acc, train_acc, traning_loss = run(dataset, model, 200, optimizer, lossF)

In [None]:
# Plot the training loss per epoch for the run above.
# The x-axis follows the number of recorded epochs instead of a hard-coded 200,
# so the plot stays valid when the epoch count passed to run() changes.
x_axis = list(range(1, len(traning_loss) + 1))
plt.xlabel("Epoch")
plt.ylabel("loss")
plt.title("training loss")
plt.plot(x_axis, traning_loss, label="training loss")
plt.legend()
plt.show()

In [None]:
# Recompute the error curve from the most recent run: the list built in
# Part 1 still holds the previous model's results and would otherwise be
# plotted here unchanged.
classification_train_error = [100 - 100 * acc for acc in train_acc]

x_axis = list(range(1, len(classification_train_error) + 1))
plt.xlabel("Epoch")
plt.ylabel("Train error percentage")
plt.title("classification error train")
plt.plot(x_axis, classification_train_error, label="Error percentage")
plt.legend()
plt.show()

In [None]:
# Recompute the error curve from the most recent run: the list built in
# Part 1 still holds the previous model's results and would otherwise be
# plotted here unchanged.
classification_test_error = [100 - 100 * acc for acc in test_acc]

x_axis = list(range(1, len(classification_test_error) + 1))
plt.xlabel("Epoch")
plt.ylabel("Test error percentage")
plt.title("classification error test")
plt.plot(x_axis, classification_test_error, label="Error percentage")
plt.legend()
plt.show()

# PART 3 — GIN on MUTAG

In [None]:
# MUTAG (TUDataset collection), with NormalizeFeatures applied as the transform.
from torch_geometric.datasets import TUDataset
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.loader import DataLoader

dataset = TUDataset(
    root='data/TUDataset',
    name='MUTAG',
    transform=NormalizeFeatures(),
)

# Print a short summary of the dataset and its first graph.
print(f'Dataset: {dataset}:')
print('======================')
for _what, _count in (
    ('graphs', len(dataset)),
    ('features', dataset.num_features),
    ('classes', dataset.num_classes),
):
    print(f'Number of {_what}: {_count}')
print(dataset[0])

# Split in stored order (no shuffling): first 80% for training, rest for testing.
_split_at = int(len(dataset) * 0.8)
train_dataset = dataset[:_split_at]
test_dataset = dataset[_split_at:]

print('==== train_dataset =====')
print(train_dataset)
print('==== test_dataset =====')
print(test_dataset)
## Expected output (the line printed for dataset[0] is omitted):
# Dataset: MUTAG(188):
# ======================
# Number of graphs: 188
# Number of features: 7
# Number of classes: 2
# ==== train_dataset =====
# MUTAG(150)
# ==== test_dataset =====
# MUTAG(38)

In [None]:
class GraphClassifier3(torch.nn.Module):
    def __init__(self, dataset, hidden_dim):
        super(GraphClassifier3, self).__init__()
        mlp1 = MLP([dataset.num_features, hidden_dim])
        self.conv1 = GINConv(mlp1)
        mlp2 = MLP([hidden_dim, dataset.num_classes])
        self.conv2 = GINConv(mlp2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        # x = F.dropout(x, p=0.5, training=self.training)
        return F.log_softmax(x, dim=1)

In [None]:
model = GraphClassifier3(dataset, hidden_dim)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
lossF = torch.nn.CrossEntropyLoss()

test_acc, train_acc, traning_loss = run(dataset, model,  200,  optimizer, lossF)