In [1]:
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder 
from sklearn.cluster import KMeans
from torch_geometric.datasets import Planetoid
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = Planetoid(root='./dataset/', name='Cora')

## 使用标准的GCN训练Cora数据集

In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        y = self.conv2(x, edge_index)

        return x, F.log_softmax(y, dim=1)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    _, out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

model.eval()
_, pred = model(data)
pred = pred.argmax(dim=1)
correct = (pred == data.y).sum()
acc = int(correct) / int(len(data.y))
print(f'Accuracy: {acc:.4f}')
embed, _ = model(data)

Accuracy: 0.8124


## 使用自建的模型训练Cora数据集

In [5]:
from models import my_loss, my_model
from sklearn.decomposition import PCA
import numpy as np

components = 800
out_features = 16
seed = 2022
cluster = 7
lr = 1e-3
epochs = 100

pca = PCA(n_components=components).fit(data.x.detach().numpy()).transform(data.x.detach().numpy())
data.x = torch.FloatTensor(pca)

torch.manual_seed(seed)
labels = np.array([int(x) for x in data.y])
model_1 = my_model(components, out_features).to(device)
loss_func = my_loss(cluster, seed, device).to(device)
optim = torch.optim.Adam(model_1.parameters(), lr=lr, weight_decay=5e-4)

model_1.train()
for _ in tqdm(range(epochs)):
    optim.zero_grad()
    output = model_1(data)
    loss = loss_func(output, labels)
    loss.backward()
    optim.step()

model_1.eval()
with torch.no_grad():
    output = model_1(data)

100%|██████████| 100/100 [00:51<00:00,  1.95it/s]


## 比较两种方法学习得到的embedding的聚类效果

In [6]:
k_means = KMeans(n_clusters=cluster).fit(output.detach().numpy()).predict(output.detach().numpy())
pred = LabelEncoder().fit_transform(k_means)
k_means_embed = KMeans(n_clusters=cluster).fit(embed.detach().numpy()).predict(embed.detach().numpy())
pred_embed = LabelEncoder().fit_transform(k_means_embed)
print('pred:', metrics.adjusted_rand_score(labels, pred))
print('pred_embed:', metrics.adjusted_rand_score(labels, pred_embed))

pred: 0.6007630968264628
pred_embed: 0.543857211164327
