## GCN + K-means

In [None]:
# Install necessary packages (uncomment if not installed)
# !pip install torch torchvision torchaudio
# !pip install torch-geometric
# !pip install scikit-learn
# !pip install matplotlib
# !pip install seaborn
# !pip install umap-learn

import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv, VGAE
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import seaborn as sns
import umap

import numpy as np

# 1. Load the Cora dataset (with the NormalizeFeatures transform applied)
dataset = Planetoid(
    root='data/Planetoid',
    name='Cora',
    transform=T.NormalizeFeatures(),
)
data = dataset[0]  # first graph object in the dataset

# Print a short summary, one statistic per line.
print(
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    sep='\n',
)

# 2. Define the GCN Encoder
class GCNEncoder(nn.Module):
    """GCN encoder with a shared first layer and two parallel output heads.

    ``forward`` applies ``conv1`` followed by ReLU, then feeds the result to
    both ``conv_mu`` and ``conv_logvar`` and returns their outputs as a tuple.

    NOTE(review): the second head is named ``conv_logvar``, but PyG's VGAE
    appears to interpret the second encoder output as a log standard
    deviation -- confirm against the torch_geometric documentation.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of the shared first layer.
        out_channels: Output size of each of the two heads.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv_mu = GCNConv(hidden_channels, out_channels)
        self.conv_logvar = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the ``(conv_mu, conv_logvar)`` outputs for the given graph."""
        hidden = torch.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(hidden, edge_index)
        logvar = self.conv_logvar(hidden, edge_index)
        return mu, logvar

# 3. Initialize the model and optimizer
# Seed the random number generators before any weights are created so that
# initialization and training are repeatable, matching the fixed
# random_state=42 already used for KMeans, t-SNE and UMAP below.
# Python's and NumPy's generators are seeded as well because, depending on the
# installed version, PyG's sampling utilities may draw from them rather than
# from torch. NOTE: some GPU kernels may still be non-deterministic.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

input_dim = dataset.num_features
hidden_dim = 128
latent_dim = 64

encoder = GCNEncoder(input_dim, hidden_dim, latent_dim)
model = VGAE(encoder)

# Run on the GPU when one is available; model and data must share a device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
data = data.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01)

# 4. Train the model
num_epochs = 200
log_every = 10
# Weight applied to the KL term: one over the number of nodes.
kl_weight = 1 / data.num_nodes

model.train()
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()

    # Total loss = reconstruction loss + weighted KL loss.
    z = model.encode(data.x, data.edge_index)
    recon_term = model.recon_loss(z, data.edge_index)
    kl_term = kl_weight * model.kl_loss()
    loss = recon_term + kl_term

    loss.backward()
    optimizer.step()

    # Report on the first epoch and then on every tenth one.
    first_epoch = epoch == 1
    if first_epoch or epoch % log_every == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss.item():.4f}')

# 5. Extract embeddings
model.eval()  # switch to evaluation mode before encoding
with torch.no_grad():  # no gradients are needed for inference
    z = model.encode(data.x, data.edge_index)

# Copy the latent codes to the CPU as a NumPy array for scikit-learn.
embeddings = z.cpu().numpy()
print(f'Embeddings shape: {embeddings.shape}')

# 6. Apply KMeans clustering
# Use as many clusters as there are ground-truth classes.
num_clusters = dataset.num_classes

# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto'),
# so leaving it unset yields version-dependent clusterings and a FutureWarning
# on some releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

print(f'Cluster labels shape: {cluster_labels.shape}')

# 7. Evaluation
true_labels = data.y.cpu().numpy()

# ARI scores the clustering against the true labels; the silhouette score is
# computed from the embeddings and the assigned clusters only.
ari = adjusted_rand_score(true_labels, cluster_labels)
silhouette = silhouette_score(embeddings, cluster_labels)

print(
    f'Adjusted Rand Index (ARI): {ari:.4f}',
    f'Silhouette Score: {silhouette:.4f}',
    sep='\n',
)

def _plot_label_comparison(coords, method_name, true_labels, cluster_labels):
    """Plot true labels and KMeans clusters side by side on 2-D coordinates.

    Shared by the t-SNE and UMAP sections so both figures are built by the
    same code instead of two copies.

    Args:
        coords: Array with one row per node; columns 0 and 1 are the plot
            coordinates.
        method_name: Name of the projection, used as the title prefix.
        true_labels: Per-node ground-truth labels (left panel colors).
        cluster_labels: Per-node KMeans assignments (right panel colors).
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    panels = (
        ('True Labels', true_labels),
        ('KMeans Clusters', cluster_labels),
    )
    for ax, (panel_title, labels) in zip(axes, panels):
        # Draw on an explicit axis rather than relying on the current-axes state.
        sns.scatterplot(
            x=coords[:, 0],
            y=coords[:, 1],
            hue=labels,
            palette='tab10',
            s=10,
            ax=ax,
        )
        ax.set_title(f'{method_name} - {panel_title}')
        # Place the legend outside the plotting area so it does not hide points.
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    fig.tight_layout()
    plt.show()


# 8. Visualization with t-SNE
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)
_plot_label_comparison(embeddings_2d, 't-SNE', true_labels, cluster_labels)

# 9. Visualization with UMAP
reducer = umap.UMAP(random_state=42)
embeddings_umap = reducer.fit_transform(embeddings)
_plot_label_comparison(embeddings_umap, 'UMAP', true_labels, cluster_labels)


In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, VGAE
from torch_geometric.data import Data

# Define the encoder as a simple GCN
class Encoder(torch.nn.Module):
    """GCN encoder: one shared layer followed by two parallel output layers.

    NOTE(review): the second output layer is named ``conv2_logvar``; PyG's
    VGAE appears to treat the second encoder output as a log standard
    deviation -- confirm against the torch_geometric documentation.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of the shared first layer.
        latent_dim: Output size of each of the two output layers.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2_mu = GCNConv(hidden_dim, latent_dim)
        self.conv2_logvar = GCNConv(hidden_dim, latent_dim)

    def forward(self, x, edge_index):
        """Return the ``(conv2_mu, conv2_logvar)`` outputs for the given graph."""
        shared = self.conv1(x, edge_index).relu()
        mu_out = self.conv2_mu(shared, edge_index)
        logvar_out = self.conv2_logvar(shared, edge_index)
        return mu_out, logvar_out

# Create the VAE model
class GraphVAE(VGAE):
    """Thin VGAE subclass that passes ``encoder`` straight to the parent class."""

    def __init__(self, encoder):
        super().__init__(encoder)

# Load your event graphs as Data objects (from NetworkX or directly)
# Example: a two-node graph with an edge in each direction.
# NOTE(review): PyG's convention is an edge_index of shape [2, num_edges]
# (row 0 = source nodes, row 1 = target nodes) -- confirm in the Data docs.
edge_index = torch.tensor(
    [
        [0, 1],
        [1, 0],
    ],
    dtype=torch.long,
)
x = torch.eye(2)  # example feature matrix (identity)

# Create the graph data object
data = Data(x=x, edge_index=edge_index)

# Define encoder and VAE model; the input size is read from the feature matrix.
feature_dim = x.size(1)
encoder = Encoder(input_dim=feature_dim, hidden_dim=4, latent_dim=2)  # Modify dimensions as needed
model = GraphVAE(encoder)

# Forward pass through the VAE
z = model.encode(data.x, data.edge_index)

# You can now train the model and extract the embeddings


In [4]:
data

Data(x=[2, 2], edge_index=[2, 2])

In [5]:
data.x

tensor([[1., 0.],
        [0., 1.]])

In [6]:
data.edge_index

tensor([[0, 1],
        [1, 0]])