# 实现Metapath2vec

通过**numpy和gensim**进行实现

用于处理异质网络的一种网络表示学习模型，采用：**元路径、随机游走、Skip-gram**

论文下载地址：[metapath2vec: Scalable Representation Learning for Heterogeneous Networks](http://hanj.cs.illinois.edu/cs512/survey_slides/4-5-metapath2vec-KDD17.pdf)

通过gensim库进行实现，包括两部分：**采样、Skip-gram处理**

## 1. 导入数据

使用同HAN一样的小型异质数据集，ACM论文子集。

In [1]:
import gensim
import numpy as np
import matplotlib.pyplot as plt
import pickle
import random
import gc
import os
import sklearn
from pathlib import Path

In [2]:
# Location of the pickled ACM dataset, relative to the notebook directory.
data_path = Path('../datasets/ACM/acm_data.pkl')
# Check that the file is present before attempting to load it.
data_path.exists()

True

In [5]:
# NOTE(review): unpickling can execute arbitrary code; only load dataset
# files that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with data_path.open('rb') as data_file:
    data = pickle.load(data_file)

In [6]:
# Display the loaded dictionary to inspect its keys and arrays.
data

{'features': array([[1., 1., 1., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [1., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.]], dtype=float32),
 'labels': array([0, 0, 0, ..., 2, 2, 2]),
 'PSP': array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 1., 0., 1.],
        ...,
        [0., 0., 1., ..., 1., 0., 1.],
        [0., 1., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 1., 0., 1.]], dtype=float32),
 'PAP': array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]], dtype=float32),
 'train_mask': array([ True,  True,  True, ..., False, False, False]),
 'val_mask': array([False, False, False, ..., False, False, False]),

In [7]:
# Bind every array stored in the dataset dictionary to its own name.
features, labels, PSP, PAP, train_mask, val_mask, test_mask = [
    data[key]
    for key in ('features', 'labels', 'PSP', 'PAP',
                'train_mask', 'val_mask', 'test_mask')
]

In [83]:
# Display the per-paper feature matrix.
features

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Cast both meta-path adjacency matrices to integers.
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24; it was
# only an alias of the builtin `int`, which selects the same dtype.
PSP = PSP.astype(int)
PAP = PAP.astype(int)

In [81]:
# Inspect the dtypes of the adjacency matrices, the training mask and the labels.
PSP.dtype, PAP.dtype, train_mask.dtype, labels.dtype

(dtype('float32'), dtype('float32'), dtype('bool'), dtype('int64'))

## 2. 采样

共有两个元路径：**PAP，PSP**

In [35]:
# Run a full garbage-collection pass before the memory-heavy sampling step;
# the value shown is the number of unreachable objects that were found.
gc.collect()

5581

In [91]:
# Check the dtype of the row indices returned by nonzero().
PAP.nonzero()[0].dtype

dtype('int64')

In [105]:
def matrix_to_adj(matrix):
    '''
    Convert a link matrix into an adjacency dictionary.

    Only nodes that have at least one non-zero entry in their row become keys.
    Node ids are stored as strings so that the walks built from this
    dictionary can be used directly as Word2Vec tokens.

    matrix: 2-D array-like exposing ``nonzero()``; entry (i, j) != 0 means
            there is a link from node i to node j
    returns: dict mapping ``str(i)`` to the list of ``str(j)`` neighbours,
             in the order ``nonzero()`` reports them
    '''
    adj = {}
    rows, cols = matrix.nonzero()
    # The `np.str` alias was removed in NumPy 1.24, so convert through Python
    # ints and the builtin `str` instead of `astype(np.str)`. This also yields
    # plain `str` ids rather than `numpy.str_` scalars.
    for row, col in zip(rows.tolist(), cols.tolist()):
        adj.setdefault(str(row), []).append(str(col))
    return adj

In [106]:
# Build one adjacency dictionary per meta-path.
PAP_adj = matrix_to_adj(PAP)
PSP_adj = matrix_to_adj(PSP)

In [110]:
# Number of nodes that have at least one PSP neighbour.
len(PSP_adj)

3025

In [111]:
def sampler(adj, walk_length, walks):
    '''
    Generate uniform random walks over an adjacency dictionary.

    adj: dict mapping each node to the list of its neighbours
    walk_length: maximum number of nodes recorded in each walk
    walks: number of independent walks started from every node in ``adj``
    returns: list of walks, each a list of nodes; ``walks`` rounds are made
             over all start nodes, so the result holds
             ``walks * len(adj)`` sequences

    A walk stops early when it reaches a node that has no outgoing
    neighbours (missing from ``adj`` or mapped to an empty list), so such a
    walk is shorter than ``walk_length`` instead of raising an error.
    '''
    walk_sequences = []
    for _ in range(walks):
        for start_node in adj:
            walk = []
            cur_node = start_node
            for step in range(walk_length):
                walk.append(cur_node)
                # No successor is needed once the last node is recorded.
                if step == walk_length - 1:
                    break
                neighbors = adj.get(cur_node)
                # Dead end: the node has no outgoing links to follow.
                if not neighbors:
                    break
                cur_node = random.choice(neighbors)
            walk_sequences.append(walk)
    return walk_sequences

In [112]:
# Hyperparameters for walk sampling and Skip-gram training.
walk_length = 100       # number of nodes recorded in each random walk
walks = 100             # walks started from every node
latent_dimension = 128  # embedding dimensionality for the Skip-gram model
neg_sampling = 5        # negative samples drawn per positive pair
min_count = 1           # minimum token frequency kept in the vocabulary
window = 5              # Skip-gram context window size

In [113]:
# Sample random walks separately on each meta-path graph.
PAP_sequences = sampler(PAP_adj, walk_length=walk_length, walks=walks)
PSP_sequences = sampler(PSP_adj, walk_length=walk_length, walks=walks)

## 3. 建模

上一步已经完成了对节点的随机游走采样，接下来将各元路径采样得到的游走序列合并为一个语料，然后输入Skip-gram模型进行训练。

In [114]:
# Merge the walk sequences of both meta-paths into a single corpus
sequences = PAP_sequences + PSP_sequences

In [115]:
# Total number of walks in the merged corpus.
len(sequences)

605000

In [116]:
# Number of CPUs reported by the OS; used as the Word2Vec worker count.
os.cpu_count()

32

In [118]:
# Train the Skip-gram model on the merged walks.
# The hyperparameters declared earlier in the notebook are passed through
# instead of repeating their values as literals, so editing them takes effect.
# gensim 4.0 renamed the `size` keyword to `vector_size`; pick whichever name
# the installed release accepts.
dimension_kwarg = (
    'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
)
model = gensim.models.word2vec.Word2Vec(
    sentences=sequences,
    window=window,
    min_count=min_count,
    workers=os.cpu_count() or 1,  # cpu_count() may return None
    sg=1,  # skip-gram
    hs=0,  # negative sampling instead of hierarchical softmax
    negative=neg_sampling,
    **{dimension_kwarg: latent_dimension},
)

In [137]:
# Persist the full model and, separately, only its keyed vectors.
model.save('metapath2vec.model')
model.wv.save('metapath2vec.wv')

In [164]:
word_vectors = model.wv
graph_features = model.wv.vectors
# Node ids in the same order as the rows of graph_features; this is the
# vocabulary order, not the numeric node order.
# gensim 4.0 renamed `index2word` to `index_to_key`, so use whichever exists.
if hasattr(model.wv, 'index_to_key'):
    graph_index = model.wv.index_to_key
else:
    graph_index = model.wv.index2word
graph_index = [int(ele) for ele in graph_index]

In [179]:
# Reorder the embedding rows so that row i holds the vector of node i.
re_index = np.argsort(graph_index)
# Index from the model's own vectors rather than from graph_features itself,
# so re-running this cell does not permute rows that are already sorted.
graph_features = model.wv.vectors[re_index]

In [187]:
# Node indices belonging to the train / validation / test splits.
# np.flatnonzero derives them from the boolean masks themselves, so the node
# count is no longer hard-coded as 3025.
train_index = np.flatnonzero(train_mask)
val_index = np.flatnonzero(val_mask)
test_index = np.flatnonzero(test_mask)

In [158]:
import xgboost

In [190]:
# Sanity-check the shape of the validation slice of the embeddings.
graph_features[val_index].shape

(300, 128)

In [198]:
# Train on the graph embeddings only
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import accuracy_score

xgb_clf = xgboost.XGBClassifier(max_depth=2, n_estimators=200)
xgb_clf.fit(graph_features[train_index], labels[train_index])
y_pred = xgb_clf.predict(graph_features[test_index])
# accuracy_score takes (y_true, y_pred)
accuracy_score(labels[test_index], y_pred)

0.3971764705882353

In [199]:
# Train on the papers' own features only
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import accuracy_score

xgb_clf = xgboost.XGBClassifier(max_depth=2, n_estimators=200)
xgb_clf.fit(features[train_index], labels[train_index])
y_pred = xgb_clf.predict(features[test_index])
# accuracy_score takes (y_true, y_pred)
accuracy_score(labels[test_index], y_pred)

0.7534117647058823

In [206]:
# Train on the paper features concatenated with the graph embeddings
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import accuracy_score

mixed_features = np.concatenate([features, graph_features], axis=1)
xgb_clf = xgboost.XGBClassifier(max_depth=2, n_estimators=200)
xgb_clf.fit(mixed_features[train_index], labels[train_index])
y_pred = xgb_clf.predict(mixed_features[test_index])
# accuracy_score takes (y_true, y_pred)
accuracy_score(labels[test_index], y_pred)

0.6672941176470588

In [211]:
# SVM baseline trained on the graph embeddings only
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svm_clf = SVC(gamma='scale')
svm_clf.fit(graph_features[train_index], labels[train_index])
y_pred = svm_clf.predict(graph_features[test_index])
# accuracy_score takes (y_true, y_pred)
accuracy_score(labels[test_index], y_pred)

0.34494117647058825

In [210]:
# SVM baseline trained on the papers' own features only
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svm_clf = SVC(gamma='scale')
svm_clf.fit(features[train_index], labels[train_index])
y_pred = svm_clf.predict(features[test_index])
# accuracy_score takes (y_true, y_pred)
accuracy_score(labels[test_index], y_pred)

0.7242352941176471

In [209]:
# SVM baseline trained on the combined (paper + graph) features
# Import the metric explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svm_clf = SVC(gamma='scale')
svm_clf.fit(mixed_features[train_index], labels[train_index])
y_pred = svm_clf.predict(mixed_features[test_index])
# accuracy_score takes (y_true, y_pred)
accuracy_score(labels[test_index], y_pred)

0.7063529411764706