# Node Classification on Cora with GCN

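This notebook implements a simple two-layer Graph Convolutional Network (GCN) to classify nodes in the Cora dataset using PyTorch Geometric. It also computes Jaccard-distance edge weights, uses them to prune redundant edges from the graph, and compares test accuracy on the full graph against the sparsified graph.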


In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv

# Reaching this line means every import above resolved without error.
print("Libraries imported.")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported.


In [2]:
# Load the Cora graph from the Planetoid collection. The NormalizeFeatures
# transform is applied each time a graph is read from the dataset.
feature_transform = T.NormalizeFeatures()
dataset = Planetoid(root='/tmp/Cora', name='Cora', transform=feature_transform)
data = dataset[0]  # first graph stored in the dataset

# Short textual summary of the dataset, followed by the graph object itself.
dataset_summary = (
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
)
for summary_line in dataset_summary:
    print(summary_line)

print(f'Data object: {data}')

Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7
Data object: Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [3]:
def compute_jaccard_weights(data):
    """Attach a Jaccard-distance weight to every edge of ``data``.

    For an edge ``(u, v)`` the weight is
    ``1 - |N(u) ∩ N(v)| / |N(u) ∪ N(v)|``, where ``N(x)`` is the set of nodes
    that ``x`` has an edge to (row ``x`` of the adjacency matrix). On an
    undirected graph, stored as both ``(u, v)`` and ``(v, u)``, this is the
    usual neighbourhood Jaccard distance.

    The computation builds dense ``num_nodes x num_nodes`` matrices, so memory
    grows quadratically with the number of nodes.

    Args:
        data: Graph object exposing ``edge_index`` (a ``2 x E`` integer
            tensor of source/target node ids) and ``num_nodes``.

    Returns:
        The same ``data`` object. It is modified in place: ``edge_weight`` is
        set to a length-``E`` tensor aligned with the columns of
        ``edge_index``.
    """
    row, col = data.edge_index

    # Binary dense adjacency. Index assignment (not accumulation) keeps every
    # entry at 0/1 even if edge_index lists the same edge more than once, so
    # duplicate edges cannot inflate the neighbour counts below.
    adj = torch.zeros(
        (data.num_nodes, data.num_nodes), device=data.edge_index.device
    )
    adj[row, col] = 1.0

    # (A @ A^T)[u, v] = sum_k A[u, k] * A[v, k] = |N(u) ∩ N(v)|.
    # A @ A would instead count two-step paths u -> k -> v, which only matches
    # the shared-neighbour count when A is symmetric.
    intersection = adj @ adj.t()
    edge_intersection = intersection[row, col]

    # |N(u) ∪ N(v)| = |N(u)| + |N(v)| - |N(u) ∩ N(v)|
    deg = adj.sum(dim=1)
    edge_union = deg[row] + deg[col] - edge_intersection

    # The small epsilon guards against a zero-sized union.
    jaccard_dist = 1 - edge_intersection / (edge_union + 1e-7)

    data.edge_weight = jaccard_dist
    return data

# Annotate the graph with per-edge Jaccard distances, then report their range.
data = compute_jaccard_weights(data)
print("Jaccard weights computed.")

edge_weights = data.edge_weight
w_min = edge_weights.min()
w_max = edge_weights.max()
w_mean = edge_weights.mean()
print(f"Edge weights statistics: Min={w_min:.4f}, Max={w_max:.4f}, Mean={w_mean:.4f}")

Jaccard weights computed.
Edge weights statistics: Min=0.3333, Max=1.0000, Mean=0.9190


In [4]:
def sparsify_graph(data, batch_size=100, tol=1e-6):
    """Drop edges that are no shorter than some two-hop detour.

    An edge ``(u, v)`` with weight ``w`` is treated as redundant, and removed,
    when there is an intermediate node ``k`` such that
    ``weight(u, k) + weight(k, v) <= w + tol``. Only detours through a single
    intermediate node are considered. Self-loops are never used as part of a
    detour.

    Note: if the weights obey the triangle inequality (as a metric distance
    does), a detour can never be strictly shorter than the direct edge, so
    only ties within ``tol`` are pruned.

    The computation builds a dense ``num_nodes x num_nodes`` weight matrix.

    Args:
        data: Graph object exposing ``edge_index`` (``2 x E`` integer tensor),
            ``edge_weight`` (length-``E`` tensor) and ``num_nodes``.
        batch_size: Number of edges checked per vectorised step; bounds the
            size of the temporary ``batch_size x num_nodes`` matrix.
        tol: Slack added to the direct weight when comparing to the detour.

    Returns:
        The same ``data`` object. It is modified in place: ``edge_index`` and
        ``edge_weight`` are replaced by their filtered versions.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``data`` carries no
            ``edge_weight``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    edge_weight = getattr(data, 'edge_weight', None)
    if edge_weight is None:
        raise ValueError("data.edge_weight must be set before sparsifying")

    num_edges = data.edge_index.shape[1]
    print(f"Original edges: {num_edges}")

    device = data.edge_index.device
    N = data.num_nodes

    # Dense weight matrix; inf marks "no edge" so it can never win a min().
    W = torch.full((N, N), float('inf'), device=device)
    u, v = data.edge_index
    W[u, v] = edge_weight
    # Exclude self-loops from detours (k == u or k == v contributes inf).
    W.fill_diagonal_(float('inf'))

    # Row k of W_in holds the weights of edges arriving at node k, i.e.
    # W_in[v, k] == W[k, v]. Using W[v, k] here would silently assume the
    # graph is symmetric.
    W_in = W.t()

    mask = torch.ones(num_edges, dtype=torch.bool, device=device)

    for start in range(0, num_edges, batch_size):
        end = min(start + batch_size, num_edges)
        batch_u = u[start:end]
        batch_v = v[start:end]
        batch_w = edge_weight[start:end]

        # dists[e, k] = weight(u_e -> k) + weight(k -> v_e)
        dists = W[batch_u] + W_in[batch_v]
        min_indirect, _ = dists.min(dim=1)

        redundant = min_indirect <= (batch_w + tol)
        mask[start:end] = ~redundant

    data.edge_index = data.edge_index[:, mask]
    data.edge_weight = edge_weight[mask]

    kept = data.edge_index.shape[1]
    # Guard the percentage against an edge-less input graph.
    removed_pct = 100 - kept / num_edges * 100 if num_edges else 0.0
    print(f"Final edges: {kept}")
    print(f"Removed {num_edges - kept} edges ({removed_pct:.1f}%)")

    return data

# sparsify_graph overwrites edge_index and edge_weight on the object it is
# given, so from this cell onward `data` refers to the sparsified graph; the
# original edge list is no longer reachable through `data`.
data = sparsify_graph(data)

Original edges: 10556
Final edges: 10556
Removed 0 edges (0.0%)
Final edges: 10556
Removed 0 edges (0.0%)


In [5]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Size of each input node feature vector. Defaults to
            ``dataset.num_features`` of the notebook-level ``dataset``.
        hidden_channels: Width of the hidden layer.
        out_channels: Number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.

    Calling ``GCN()`` with no arguments builds the same network as before;
    the arguments only exist so the model can be sized without relying on
    the global ``dataset``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        Only ``data.x`` and ``data.edge_index`` are read; any ``edge_weight``
        stored on ``data`` is not passed to the convolutions.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is active only while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Seed PyTorch's random number generators before the model is built so that
# weight initialisation and dropout masks repeat across "Restart & Run All"
# (some GPU kernels may still introduce small run-to-run differences).
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

print(model)

GCN(
  (conv1): GCNConv(1433, 16)
  (conv2): GCNConv(16, 7)
)


In [6]:
def train():
    """Run one full-batch optimisation step and return the training loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``data``. The loss is
    the negative log-likelihood over the nodes selected by
    ``data.train_mask``.

    Returns:
        The loss for this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data)
    train_nodes = data.train_mask
    step_loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

@torch.no_grad()
def test():
    """Evaluate the notebook-level ``model`` on the test nodes of ``data``.

    Runs with gradient tracking disabled: evaluation never calls
    ``backward()``, so building an autograd graph would only cost memory
    and time.

    Returns:
        Fraction of nodes selected by ``data.test_mask`` whose predicted
        class (argmax of the model output) equals ``data.y``.
    """
    model.eval()
    out = model(data)
    pred = out.argmax(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return test_acc

In [7]:
print("Starting training on full graph...")

# By this point `data` has already been overwritten by sparsify_graph, so
# training on it would not give a full-graph baseline. train() and test() read
# the notebook-level `data`, so temporarily rebind it to a fresh, unsparsified
# copy of the graph and restore the sparsified one afterwards.
sparsified_data = data
data = dataset[0].to(device)
try:
    for epoch in range(1, 201):
        loss = train()
        if epoch % 20 == 0:
            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

    test_acc = test()
finally:
    # Later cells expect `data` to be the sparsified graph.
    data = sparsified_data

print(f'Test Accuracy: {test_acc:.4f}')

Starting training on full graph...
Epoch: 020, Loss: 1.7034
Epoch: 020, Loss: 1.7034
Epoch: 040, Loss: 1.2872
Epoch: 040, Loss: 1.2872
Epoch: 060, Loss: 0.8776
Epoch: 060, Loss: 0.8776
Epoch: 080, Loss: 0.6693
Epoch: 080, Loss: 0.6693
Epoch: 100, Loss: 0.5251
Epoch: 100, Loss: 0.5251
Epoch: 120, Loss: 0.4513
Epoch: 120, Loss: 0.4513
Epoch: 140, Loss: 0.3872
Epoch: 140, Loss: 0.3872
Epoch: 160, Loss: 0.3690
Epoch: 160, Loss: 0.3690
Epoch: 180, Loss: 0.3241
Epoch: 180, Loss: 0.3241
Epoch: 200, Loss: 0.3018
Test Accuracy: 0.7990
Epoch: 200, Loss: 0.3018
Test Accuracy: 0.7990


In [8]:
print("\nTraining on SPARSIFIED graph...")

# Re-seed before re-initialising so this run's weight initialisation and
# dropout masks are repeatable across notebook executions (some GPU kernels
# may still introduce small run-to-run differences).
torch.manual_seed(42)

# Start from a freshly initialised model and optimizer so that nothing learned
# in the previous run carries over.
model = GCN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

for epoch in range(1, 201):
    loss = train()
    if epoch % 20 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

final_acc = test()
print(f'Final Test Accuracy on Sparsified Graph: {final_acc:.4f}')


Training on SPARSIFIED graph...
Epoch: 020, Loss: 1.7023
Epoch: 020, Loss: 1.7023
Epoch: 040, Loss: 1.3296
Epoch: 040, Loss: 1.3296
Epoch: 060, Loss: 0.9234
Epoch: 060, Loss: 0.9234
Epoch: 080, Loss: 0.7091
Epoch: 080, Loss: 0.7091
Epoch: 100, Loss: 0.5201
Epoch: 100, Loss: 0.5201
Epoch: 120, Loss: 0.4820
Epoch: 120, Loss: 0.4820
Epoch: 140, Loss: 0.4077
Epoch: 140, Loss: 0.4077
Epoch: 160, Loss: 0.4016
Epoch: 160, Loss: 0.4016
Epoch: 180, Loss: 0.3524
Epoch: 180, Loss: 0.3524
Epoch: 200, Loss: 0.3044
Final Test Accuracy on Sparsified Graph: 0.7970
Epoch: 200, Loss: 0.3044
Final Test Accuracy on Sparsified Graph: 0.7970
