# GNN工具库
当下GNN大火, 有两个库是最热门的: Deep Graph Library (DGL) 和 PyTorch Geometric (PyG). 
这两个库都很好用, 差别也不特别大 (DGL官网是有中文教程的)；但是PyG相对来说更基础一些, 教程与支持也更多一些

In [3]:
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx
from torch_geometric.datasets import KarateClub
%matplotlib inline

In [4]:
dataset = KarateClub()
G = to_networkx(dataset[0], to_undirected=True)

In [6]:
dataset[0]

Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34])

In [7]:
print(dataset.num_features)
print(dataset.num_classes)

34
4


In [8]:
import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        torch.manual_seed(1234)
        self.conv1 = GCNConv(dataset.num_features, 4)
        self.conv2 = GCNConv(4, 4)
        self.conv3 = GCNConv(4, 2)
        self.classifier = Linear(2, dataset.num_classes)
    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index)
        h = h.tanh()
        h = self.conv2(h, edge_index)
        h = h.tanh()
        h = self.conv3(h, edge_index)
        h = h.tanh()
        out = self.classifier(h)
        return out, h

In [9]:
model = GCN()
print(model)

GCN(
  (conv1): GCNConv(34, 4)
  (conv2): GCNConv(4, 4)
  (conv3): GCNConv(4, 2)
  (classifier): Linear(in_features=2, out_features=4, bias=True)
)


In [10]:
out, h = model(dataset.x, dataset.edge_index)
out

tensor([[-0.1800,  0.6862,  0.1598,  0.1413],
        [-0.1946,  0.6587,  0.1327,  0.1069],
        [-0.1860,  0.6638,  0.1398,  0.1144],
        [-0.1890,  0.6638,  0.1388,  0.1139],
        [-0.1999,  0.6750,  0.1442,  0.1249],
        [-0.1905,  0.6825,  0.1533,  0.1351],
        [-0.1895,  0.6814,  0.1528,  0.1341],
        [-0.1931,  0.6610,  0.1351,  0.1099],
        [-0.1940,  0.6523,  0.1278,  0.0997],
        [-0.1952,  0.6484,  0.1243,  0.0950],
        [-0.1975,  0.6764,  0.1460,  0.1269],
        [-0.1836,  0.6835,  0.1564,  0.1375],
        [-0.1935,  0.6638,  0.1372,  0.1130],
        [-0.1913,  0.6574,  0.1328,  0.1060],
        [-0.2014,  0.6454,  0.1198,  0.0904],
        [-0.1991,  0.6445,  0.1198,  0.0898],
        [-0.1890,  0.6801,  0.1518,  0.1326],
        [-0.2028,  0.6605,  0.1315,  0.1076],
        [-0.1974,  0.6422,  0.1186,  0.0874],
        [-0.1994,  0.6531,  0.1267,  0.0997],
        [-0.1955,  0.6516,  0.1268,  0.0986],
        [-0.2052,  0.6534,  0.1250

In [12]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train(data):
    optimizer.zero_grad()
    out, h = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss, h
for epoch in range(10):
    loss, h = train(dataset)
    print(f'loss:{loss}')
    print(f'h:{h}')

loss:1.432432770729065
h:tensor([[ 0.0962,  0.0115],
        [ 0.0235,  0.0129],
        [ 0.0432,  0.0201],
        [ 0.0401,  0.0162],
        [ 0.0522, -0.0054],
        [ 0.0775,  0.0011],
        [ 0.0763,  0.0031],
        [ 0.0300,  0.0131],
        [ 0.0107,  0.0181],
        [ 0.0014,  0.0195],
        [ 0.0575, -0.0035],
        [ 0.0867,  0.0090],
        [ 0.0353,  0.0106],
        [ 0.0242,  0.0179],
        [-0.0114,  0.0138],
        [-0.0109,  0.0173],
        [ 0.0740,  0.0046],
        [ 0.0188,  0.0012],
        [-0.0139,  0.0210],
        [ 0.0068,  0.0108],
        [ 0.0077,  0.0168],
        [ 0.0014,  0.0033],
        [-0.0065,  0.0175],
        [ 0.0657,  0.0242],
        [ 0.1390,  0.0069],
        [ 0.1296,  0.0110],
        [-0.0140,  0.0309],
        [ 0.0852,  0.0189],
        [ 0.0415,  0.0141],
        [ 0.0032,  0.0333],
        [-0.0036,  0.0152],
        [ 0.0995,  0.0135],
        [ 0.0102,  0.0337],
        [ 0.0181,  0.0381]], grad_fn=<TanhBackward0

# 任务一：节点分类

In [1]:
from torch_geometric.datasets import Planetoid

# 加载cora数据集
dataset = Planetoid(root='./dataset/Cora', name='Cora')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from collections import Counter

print(dataset.data)
print('包含的类别数:', dataset.num_classes)
print('边特征的维度:', dataset.num_edge_features)
print('节点特征的维度：', dataset.num_node_features)

print('边的数量:', dataset.data.edge_index.shape[1] / 2)
print('节点的数量:', dataset.data.x.shape[0])
print('节点属性特征:', dataset.data.x)
print("边:", dataset.data.edge_index)
print("节点类别数量：", sorted(Counter(dataset[0].y.tolist()).items()))

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
包含的类别数: 7
边特征的维度: 0
节点特征的维度： 1433
边的数量: 5278.0
节点的数量: 2708
节点属性特征: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
边: tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])
节点类别数量： [(0, 351), (1, 217), (2, 418), (3, 818), (4, 426), (5, 298), (6, 180)]




In [3]:
# 获取训练集、测试集、验证集数据量
print('训练集节点数量:', sum(dataset.data.train_mask))
print('验证集节点数量:', sum(dataset.data.val_mask))
print('测试集节点数量:', sum(dataset.data.test_mask))
# 检查数据集是否是无向图
print(dataset.data.is_undirected())

训练集节点数量: tensor(140)
验证集节点数量: tensor(500)
测试集节点数量: tensor(1000)
True


In [4]:
dataset.data.y

tensor([3, 4, 4,  ..., 3, 3, 3])

In [5]:
dataset.data.x
# dataset.data.y

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

## 1.1 使用lightgbm进行节点分类

In [6]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm as lgb

In [7]:
X = dataset.data.x
y = dataset.data.y
le = LabelEncoder()

X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size=0.8,stratify=y,random_state=123)



In [8]:
clf = lgb.LGBMClassifier(max_depth=3)
clf.fit(X_train, y_train)
print(metrics.roc_auc_score(y_valid, clf.predict_proba(X_valid), average='macro', multi_class='ovr'))
print(metrics.roc_auc_score(y_valid, clf.predict_proba(X_valid), average='macro', multi_class='ovo'))

0.9430189269234441
0.9404446743313917


In [9]:
y_valid

tensor([3, 2, 5, 4, 3, 1, 3, 5, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 6, 6, 1, 2, 5, 2,
        3, 5, 6, 0, 6, 5, 3, 2, 4, 5, 0, 4, 3, 2, 3, 4, 0, 3, 4, 3, 3, 0, 6, 2,
        3, 0, 6, 3, 3, 4, 5, 0, 3, 3, 6, 2, 2, 4, 2, 2, 3, 2, 4, 6, 2, 3, 2, 3,
        3, 0, 3, 1, 2, 5, 0, 2, 1, 3, 3, 4, 4, 3, 4, 3, 0, 5, 1, 3, 4, 2, 4, 5,
        0, 2, 2, 2, 1, 5, 2, 0, 6, 5, 5, 1, 0, 0, 3, 2, 0, 2, 0, 0, 3, 1, 4, 3,
        0, 2, 0, 3, 5, 3, 0, 5, 4, 2, 3, 3, 4, 2, 5, 2, 2, 3, 5, 0, 3, 2, 3, 0,
        3, 3, 1, 6, 0, 3, 2, 3, 0, 4, 4, 0, 3, 3, 0, 5, 5, 4, 4, 1, 3, 3, 4, 3,
        2, 4, 3, 3, 3, 6, 4, 3, 3, 4, 3, 0, 4, 5, 4, 0, 3, 3, 4, 3, 3, 3, 3, 3,
        0, 3, 3, 1, 3, 5, 4, 5, 3, 2, 4, 6, 2, 2, 3, 2, 5, 0, 2, 2, 5, 5, 3, 4,
        3, 0, 3, 3, 3, 0, 5, 0, 0, 2, 2, 2, 3, 6, 1, 1, 3, 3, 3, 0, 3, 2, 2, 3,
        2, 3, 5, 5, 4, 2, 3, 5, 4, 1, 5, 2, 2, 2, 4, 2, 3, 4, 3, 0, 0, 5, 3, 0,
        0, 3, 4, 2, 1, 2, 5, 3, 6, 2, 4, 3, 5, 4, 3, 3, 5, 4, 0, 3, 4, 3, 3, 5,
        0, 3, 4, 1, 3, 2, 3, 2, 6, 3, 1,

In [10]:
y_pred = clf.predict_proba(X_valid)
y_pred

array([[2.86107665e-01, 2.28544631e-02, 1.02919859e-01, ...,
        1.44850256e-01, 7.57919229e-02, 5.82738784e-02],
       [1.27110723e-02, 6.14518253e-02, 1.17196214e-01, ...,
        7.49725675e-02, 6.07052655e-02, 1.77596490e-02],
       [9.57913370e-02, 3.01656627e-02, 1.39774053e-01, ...,
        1.13701825e-01, 1.42378653e-01, 5.72503241e-02],
       ...,
       [1.41410829e-02, 6.06019201e-03, 1.49348031e-02, ...,
        2.03237378e-02, 4.90583700e-03, 6.65862439e-03],
       [3.97481008e-01, 4.99240999e-02, 2.28238399e-01, ...,
        2.85387279e-02, 5.98649840e-02, 6.65181161e-02],
       [3.25999807e-03, 5.07200630e-04, 9.82180698e-01, ...,
        3.71246518e-03, 1.75644501e-03, 1.20816185e-03]])

## 1.2 引入传统的图算法

In [11]:
X = pd.DataFrame(dataset.data.x, index=range(0,len(dataset.data.y))).reset_index()
X = X.rename(columns={'index':'node_id'})
X



Unnamed: 0,node_id,0,1,2,3,4,5,6,7,8,...,1423,1424,1425,1426,1427,1428,1429,1430,1431,1432
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,2703,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2704,2704,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2705,2705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2706,2706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
import scipy as sp 
import networkx as nx
from torch_geometric.utils.convert import to_networkx

In [13]:
G = to_networkx(dataset.data)



In [14]:
X['degree'] = X.node_id.map(nx.degree_centrality(G))  # 度中心度
X['pagerank'] = X.node_id.map(nx.pagerank(G)) # PageRank 中心度
X['betweenness'] = X.node_id.map(nx.betweenness_centrality(G)) # 介数中心度
# nx.closeness_centrality(G) # 接近中心度
X['harmonic'] = X.node_id.map(nx.harmonic_centrality(G)) # 计算节点的谐波中心度

In [15]:
X

Unnamed: 0,node_id,0,1,2,3,4,5,6,7,8,...,1427,1428,1429,1430,1431,1432,degree,pagerank,betweenness,harmonic
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002216,0.000336,9.766154e-07,417.066364
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002216,0.000385,1.080477e-03,436.741533
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003694,0.000515,4.050816e-03,531.907612
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000739,0.000369,0.000000e+00,1.000000
4,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003694,0.000396,5.511762e-04,458.192627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,2703,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000739,0.000369,0.000000e+00,1.000000
2704,2704,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000739,0.000369,0.000000e+00,1.000000
2705,2705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000739,0.000369,0.000000e+00,1.000000
2706,2706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002955,0.000380,3.963528e-05,398.100924


In [16]:
print(len(list(nx.connected_components(G.to_undirected())))) # 查看有多少联通分量
list(nx.connected_components(G.to_undirected()))

78


[{0,
  1,
  2,
  4,
  5,
  6,
  8,
  9,
  10,
  11,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  24,
  25,
  27,
  28,
  29,
  30,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  93,
  94,
  95,
  96,
  97,
  98,
  100,
  101,
  102,
  103,
  104,
  105,
  107,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  118,
  119,
  120,
  121,
  124,
  125,
  126,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  158,
  159,
  160,
  161,
  162,
  163,
  164,
  165,
  166,
  169,
  170,
  171,
  172,
  173,
  174,
  175,
  176,
  177,
  17

In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size=0.8,stratify=y,random_state=123)

clf = lgb.LGBMClassifier(max_depth=3)
clf.fit(X_train, y_train)
print(metrics.roc_auc_score(y_valid, clf.predict_proba(X_valid), average='macro', multi_class='ovr'))
print(metrics.roc_auc_score(y_valid, clf.predict_proba(X_valid), average='macro', multi_class='ovo'))

0.9492256704888787
0.947189233070174


## 1.3 使用图嵌入
使用node2vec，引入节点的拓扑信息的表征

In [18]:
# 方法1：
from node2vec import Node2Vec

In [19]:
# #设置node2vec参数
# node2vec_ = Node2Vec(G,dimensions=32, #嵌入向量维度
#                      p=1, #回家参数
#                      q=3, #外出参数
#                      walk_length=10, #随机游走最大长度
#                      num_walks=600, #每个节点作为起始点生成的随机游走路径数
#                      workers=4 #并行线程数
#                     )
# # p=1, q=0.5, n_clusters=6。DFS深度优先搜索，挖掘同质社群
# # p=1, q=2, n_clusters=3。BFS宽度优先搜索，挖掘节点的结构功能。#训练Node2vec,参考文档见 gensim.models.Word2Vec
# model = node2vec_.fit(window=3,  #skip-gram窗口大小
#                       min_count=1, #忽略出现次数低于次数的节点，设置阈值
#                       batch_words=4 #每个线程处理的数据量
#                      )

In [20]:
# n2v = model.wv.vectors
# print(n2v.shape) 
# #查看Embedding
# shape_ = model.wv.get_vector('1').shape
# print(shape_)
# #(32,)#查看某个节点的embedding
# embedding_ = model.wv.get_vector('1')
# print(embedding_)

In [21]:
# node_embedding = {}
# for node in G.nodes:
#     node_embedding[node] = model.wv.get_vector(node)

In [22]:
# embedding_features = X.node_id.map(node_embedding)
# embedding_features

In [23]:
# embedding_features = np.stack(embedding_features, axis=0)
# embedding_features.shape

In [24]:
# embedding_features = pd.DataFrame(embedding_features, columns=['emb_'+str(i) for i in range(embedding_features.shape[1])])
# embedding_features

In [25]:
# for col in embedding_features.columns:
#     X[col] = embedding_features[col].values

In [26]:
# df = pd.concat([X,embedding_features], axis=1)

In [27]:
# df.head(2)

In [28]:
# X_train, X_valid, y_train, y_valid = train_test_split(df,y,train_size=0.8,stratify=y,random_state=123)

# clf = lgb.LGBMClassifier(max_depth=3)
# clf.fit(X_train, y_train)
# print(metrics.roc_auc_score(y_valid, clf.predict_proba(X_valid), average='macro', multi_class='ovr'))
# print(metrics.roc_auc_score(y_valid, clf.predict_proba(X_valid), average='macro', multi_class='ovo'))

In [29]:
# 方法2:

In [30]:
from nodevectors import ProNE
n2v_ = ProNE(n_components=32)
n2v_.fit(G)

  return adjacency_matrix(G, nodelist, dtype, weight)


In [31]:
node_embedding = {}
for node in G.nodes:
    node_embedding[node] = n2v_.predict(node)
embedding_features = X.node_id.map(node_embedding)
embedding_features = np.stack(embedding_features, axis=0)
embedding_features.shape
embedding_features = pd.DataFrame(embedding_features, columns=['emb_'+str(i) for i in range(embedding_features.shape[1])])

df2 = pd.concat([X,embedding_features], axis=1)

In [32]:
df2

Unnamed: 0,node_id,0,1,2,3,4,5,6,7,8,...,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.025145,-0.000809,-0.005791,-0.022498,-0.030820,-0.014778,0.008814,0.032128,0.148016,0.587833
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.030883,-0.045450,0.206974,-0.125497,0.066530,-0.116691,0.163054,0.030550,0.099273,0.070968
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041783,-0.111247,0.020196,0.061493,0.019615,0.013267,0.492620,0.071931,0.101089,0.263065
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.131912,0.094369,0.055185,-0.283367,0.221676,0.234822,0.199051,0.071967,-0.262039,-0.386990
4,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.141980,0.120287,0.131507,-0.310852,-0.134201,-0.212942,0.260197,0.109889,-0.016846,-0.042834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,2703,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.067067,0.441672,-0.214737,-0.132747,0.149483,0.071430,0.034131,-0.037980,0.065397,-0.009703
2704,2704,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.078381,-0.151276,0.260473,-0.498793,0.130182,0.010768,-0.097518,0.032703,0.062985,-0.008407
2705,2705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.012143,-0.250367,0.056457,-0.042739,-0.064134,0.218479,-0.121135,-0.012935,0.033285,-0.119308
2706,2706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.035080,-0.023393,0.026374,0.121077,0.000434,0.143851,0.163036,-0.017640,-0.011307,0.026611


In [33]:
X_train, X_valid, y_train, y_valid = train_test_split(df2,y,train_size=0.8,stratify=y,random_state=123)

clf = lgb.LGBMClassifier(max_depth=3)
clf.fit(X_train, y_train)
print(metrics.roc_auc_score(y_valid, clf.predict_proba(X_valid), average='macro', multi_class='ovr'))
print(metrics.roc_auc_score(y_valid, clf.predict_proba(X_valid), average='macro', multi_class='ovo'))

0.9816269764113018
0.9821537997129604


## 1.4 使用图深度学习算法

In [34]:
train_nodes, test_nodes = train_test_split(dataset.data.y, train_size=0.8, random_state=123, stratify=dataset.data.y)



In [35]:
from sklearn import preprocessing, feature_extraction, model_selection

label_encoding = preprocessing.LabelBinarizer()
train_labels = label_encoding.fit_transform(train_nodes)
test_labels = label_encoding.fit_transform(test_nodes)
test_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

### 1.4.1分类基线模型，进行节点分类

该模型忽略节点连接(或图结构)，并试图仅使用词向量对节点标签进行分类。模型类如下,它有两个隐藏层(Linear)，带有ReLU激活，后面是一个输出层。

In [36]:
import torch
import torch.nn as nn
import torch_geometric.transforms as T

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 将节点分为train、valid和test(替换数据中的原始分割掩码，因为它的训练集太小了)
split = T.RandomNodeSplit(num_val=0.1, num_test=0.2)
data = split(dataset[0])     
    
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
        nn.Linear(dataset[0].num_node_features, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, dataset.num_classes))
    def forward(self, data):
        x = data.x
        output = self.layers(x)
        return output

In [37]:
def train(model, data, optimizer, criterion, n_epochs):
    model.train()
    for epoch in range(n_epochs):
        model.zero_grad()
        outputs = model(data)
        loss = criterion(outputs[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        
        if epoch % 10 == 0:
            pred = torch.max(outputs[data.train_mask].data, 1)[1].numpy()
            train_acc = metrics.accuracy_score(data.y[data.train_mask].numpy(), pred) # 训练集精度
            # 获取验证集上的精度
            dev_acc = evaluate(model, data, data.val_mask, criterion)
            print(f'Epoch: {epoch:d}, Train Loss: {loss:.3f}, Val Acc: {dev_acc:.3f}')
    return model

def evaluate(model, data, mask, criterion):
    model.eval()
    loss_total = 0
#     pred = model(data).argmax(dim=1)
#     correct = (pred[mask] == data.y[mask]).sum()
#     acc = int(correct) / int(mask.sum())
    y_all = np.array([], dtype=int)
    pred_all = np.array([], dtype=int)
    with torch.no_grad():
        outputs = model(data)
        loss = criterion(outputs[mask], data.y[mask])
        loss_total += loss
        y = data.y[mask].to(device).numpy()
        pred = torch.max(outputs[mask].data, 1)[1].to(device).numpy()
        y_all = np.append(y_all, y)
        pred_all = np.append(pred_all, pred)  
    acc = metrics.accuracy_score(y_all, pred_all)
    return acc

In [38]:
mlp = MLP().to(device)
optimizer_mlp = torch.optim.Adam(mlp.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()
mlp = train(mlp, data, optimizer_mlp, criterion, n_epochs=150)
test_acc = evaluate(mlp, data, data.test_mask, criterion)
print(f'Test Acc: {test_acc:.3f}')

Epoch: 0, Train Loss: 1.941, Val Acc: 0.343
Epoch: 10, Train Loss: 0.818, Val Acc: 0.690
Epoch: 20, Train Loss: 0.075, Val Acc: 0.738
Epoch: 30, Train Loss: 0.016, Val Acc: 0.734
Epoch: 40, Train Loss: 0.014, Val Acc: 0.738
Epoch: 50, Train Loss: 0.015, Val Acc: 0.712
Epoch: 60, Train Loss: 0.013, Val Acc: 0.720
Epoch: 70, Train Loss: 0.010, Val Acc: 0.705
Epoch: 80, Train Loss: 0.009, Val Acc: 0.686
Epoch: 90, Train Loss: 0.008, Val Acc: 0.690
Epoch: 100, Train Loss: 0.008, Val Acc: 0.694
Epoch: 110, Train Loss: 0.008, Val Acc: 0.705
Epoch: 120, Train Loss: 0.007, Val Acc: 0.708
Epoch: 130, Train Loss: 0.007, Val Acc: 0.712
Epoch: 140, Train Loss: 0.007, Val Acc: 0.716
Test Acc: 0.721


### 1.4.2 使用GCN进行节点分类

In [39]:
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16
                             , dataset.num_classes)
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        output = self.conv2(x, edge_index)
        return output

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gcn = GCN().to(device)
optimizer_gcn = torch.optim.Adam(gcn.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()
gcn = train(gcn, data, optimizer_mlp, criterion, n_epochs=120)
test_acc = evaluate(gcn, data, data.test_mask, criterion)
print(f'Test Acc: {test_acc:.3f}')

Epoch: 0, Train Loss: 1.935, Val Acc: 0.240
Epoch: 10, Train Loss: 1.935, Val Acc: 0.240
Epoch: 20, Train Loss: 1.935, Val Acc: 0.240
Epoch: 30, Train Loss: 1.935, Val Acc: 0.240
Epoch: 40, Train Loss: 1.935, Val Acc: 0.240
Epoch: 50, Train Loss: 1.935, Val Acc: 0.240
Epoch: 60, Train Loss: 1.935, Val Acc: 0.240
Epoch: 70, Train Loss: 1.935, Val Acc: 0.240
Epoch: 80, Train Loss: 1.935, Val Acc: 0.240
Epoch: 90, Train Loss: 1.935, Val Acc: 0.240
Epoch: 100, Train Loss: 1.935, Val Acc: 0.240
Epoch: 110, Train Loss: 1.935, Val Acc: 0.240
Test Acc: 0.210


## 任务二：链接预测
1. 编码器使用两个卷积层的图来创建节点嵌入。
2. 在原始图上随机添加负链接，模型任务变为对原始的正链接和新增的负链接进行二分类。
3. 解码器使用节点嵌入对所有边（包含负链接）进行链接预测（二分类）。从每条边上的一对节点计算节点嵌入的点积，然后聚合整个嵌入维度的值，并在每条边上创建一个表示边存在概率的值。


In [40]:
from sklearn.metrics import roc_auc_score
from torch_geometric.utils import negative_sampling

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        
    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)
    
    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]]*z[edge_label_index[1]]).sum(dim=-1)
    
    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj>0).nonzero(as_tuple=False).t()

In [41]:
def train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs):
    for epoch in range(1, n_epochs+1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)
        
        
        neg_edge_index = negative_sampling(
            edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
            num_neg_samples=train_data.edge_label_index.size(1), method='sparse'
        )
        
        edge_label_index = torch.cat([train_data.edge_label_index, neg_edge_index], dim=-1)
        
        edge_label = torch.cat([train_data.edge_label, train_data.edge_label.new_zeros(neg_edge_index.size(1))],dim=0)
        
        out = model.decode(z, edge_label_index).view(-1)
        loss = criterion(out, edge_label)
        loss.backward()
        val_auc = eval_link_predictor(model, val_data)
        if epoch % 10 == 0:
            print(f"Epoch: {epoch:d}, Train Loss: {loss:.3f}, Val AUC: {val_auc:.3f}")
    return model
        
@torch.no_grad()
def eval_link_predictor(model, data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())   

In [42]:
import torch_geometric.transforms as T

# 使用PyG中的RandomNodeSplit模块将节点分为train、valid和test(我替换数据中的原始分割掩码，因为它的训练集太小了
split = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=False,
    neg_sampling_ratio=1.0,
)
data = dataset[0]
train_data, val_data, test_data = split(data)

In [43]:
print("data:", data)
print("train_data:", train_data)
print("val_data:", val_data)
print("test_data:", test_data)

data: Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
train_data: Data(x=[2708, 1433], edge_index=[2, 8448], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[4224], edge_label_index=[2, 4224])
val_data: Data(x=[2708, 1433], edge_index=[2, 8448], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[1054], edge_label_index=[2, 1054])
test_data: Data(x=[2708, 1433], edge_index=[2, 9502], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[1054], edge_label_index=[2, 1054])


In [44]:
model = Net(dataset.num_features, 128, 64).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()
n_epochs = 150
model = train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs=150)
 
test_auc = eval_link_predictor(model, test_data)
print(f"Test: {test_auc:.3f}")

Epoch: 10, Train Loss: 0.665, Val AUC: 0.797
Epoch: 20, Train Loss: 0.664, Val AUC: 0.797
Epoch: 30, Train Loss: 0.664, Val AUC: 0.797
Epoch: 40, Train Loss: 0.664, Val AUC: 0.797
Epoch: 50, Train Loss: 0.664, Val AUC: 0.797
Epoch: 60, Train Loss: 0.664, Val AUC: 0.797
Epoch: 70, Train Loss: 0.664, Val AUC: 0.797
Epoch: 80, Train Loss: 0.664, Val AUC: 0.797
Epoch: 90, Train Loss: 0.664, Val AUC: 0.797
Epoch: 100, Train Loss: 0.664, Val AUC: 0.797
Epoch: 110, Train Loss: 0.664, Val AUC: 0.797
Epoch: 120, Train Loss: 0.664, Val AUC: 0.797
Epoch: 130, Train Loss: 0.664, Val AUC: 0.797
Epoch: 140, Train Loss: 0.664, Val AUC: 0.797
Epoch: 150, Train Loss: 0.664, Val AUC: 0.797
Test: 0.807


## 任务三：异常检测  
数据集有两种不同类型的异常值:  
* 1.结构异常  
    密集连接的节点，而不是稀疏连接的规则节点  
* 2.上下文的异常值  
    属性与相邻节点显著不同的节点

In [45]:
# 0:正常，1:仅上下文异常，2:结构异常，3:上下文和结构都异常
# PyGOD库,是建立在PyG之上的一个图异常值检测库。可以通过PyGOD模块加载已经进行了异常值注入的Cora数据集。
from pygod.utils import load_data
from collections import Counter

In [46]:
data = load_data('inj_cora','./dataset')
data

Data(x=[2708, 1433], edge_index=[2, 11060], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [47]:
Counter(data.y.tolist())

Counter({0: 2570, 1: 68, 2: 68, 3: 2})

In [48]:
import torch_geometric.transforms as T
 
split = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.2,
    is_undirected=True,
    add_negative_train_samples=False,
    neg_sampling_ratio=1.0,
)
train_data, val_data, test_data = split(data)

In [49]:
train_data

Data(x=[2708, 1433], edge_index=[2, 7750], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[3875], edge_label_index=[2, 3875])

In [50]:
from pygod.detector import DOMINANT
from sklearn.metrics import roc_auc_score, average_precision_score

def train_anormal_detector(model, data):
    return model.fit(data)

def eval_anomal_detector(model, data):
    score = model.decision_score_
    pred, score = model.predict(data, return_score=True)
    correct = (pred == data.y).sum()
    acc = int(correct)/int(data.y.shape[0])
    return acc

In [None]:
model = DOMINANT()
model = train_anormal_detector(model, data)

In [None]:
eval_anomal_detector(model, data)