From [aditya-grover/node2vec](https://github.com/aditya-grover/node2vec/)

论文笔记：[node2vec, Scalable Feature Learning for Networks Note | Yam](https://yam.gift/2020/03/30/Paper/2020-03-30-Node2Vec/)

In [1]:
import numpy as np
import networkx as nx
from gensim.models import Word2Vec
import random
import jieba
from collections import defaultdict

## 构造数据

这里用类似 TextRank 的方法构造词共现图：对文本分词后，统计滑动窗口内相邻词对的共现次数，并将其作为边的权重。

In [2]:
# Load at most the first 10,000 characters of the corpus.
# The encoding is passed explicitly so that decoding the Chinese text does not
# depend on the platform's default locale (assumes sample.txt is UTF-8 — confirm).
with open("./sample.txt", "r", encoding="utf-8") as f:
    text = f.read(10000)
# Tiny toy input for quick experiments:
# text = "我爱你，你爱我，她爱我，你爱北京天安门。"

In [3]:
# Segment the raw text into a flat sequence of word tokens with jieba.
text_list = jieba.lcut(text)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/y0/t47pd_bj3b9_3j9n_56m52gw0000gn/T/jieba.cache
Loading model cost 1.976 seconds.
Prefix dict has been built successfully.


In [4]:
# Co-occurrence counts keyed by (word, following_word); unseen pairs start at 0.
cm = defaultdict(int)
# Window size: each token is paired with the `win - 1` tokens that follow it.
win = 2
# Total token count, used to keep the window inside the text.
num_token = len(text_list)

In [5]:
# Count how often each token is followed by another token inside the window.
# The window's upper bound is clamped to the end of the text up front, so no
# explicit bounds check is needed inside the inner loop.
for i, w in enumerate(text_list):
    window_end = min(i + win, num_token)
    for j in range(i + 1, window_end):
        cm[(w, text_list[j])] += 1

In [6]:
# Flatten the co-occurrence counts into (source, target, weight) triples.
graph_data = [(first, second, count) for (first, second), count in cm.items()]

In [7]:
# Build the undirected, weighted co-occurrence graph.
# `cm` stores (a, b) and (b, a) as separate keys. On an undirected graph,
# adding the same edge a second time overwrites its weight, so the counts of
# both directions are summed explicitly instead of using a single bulk insert.
G = nx.Graph()
for u, v, weight in graph_data:
    if G.has_edge(u, v):
        G[u][v]['weight'] += weight
    else:
        G.add_edge(u, v, weight=weight)

In [8]:
# Sanity check: size of the graph as (node count, edge count).
G.number_of_nodes(), G.number_of_edges()

(1785, 4820)

## 预处理

In [9]:
def alias_setup(probs):
    """Build the lookup tables for alias sampling from a discrete distribution.

    Args:
        probs: Sequence of outcome probabilities (expected to sum to 1).

    Returns:
        A tuple ``(J, q)`` of NumPy arrays with ``len(probs)`` entries each.
        ``q[k]`` is the probability of keeping outcome ``k`` once column ``k``
        has been picked uniformly at random; ``J[k]`` is the alternative
        outcome returned otherwise. Pass both to ``alias_draw``.
    """
    K = len(probs)
    q = np.zeros(K)
    # Builtin ``int`` instead of the ``np.int`` alias, which was removed from
    # NumPy and raises AttributeError on current releases.
    J = np.zeros(K, dtype=int)

    # Scale every probability by K and split the outcomes into those holding
    # less than an even share (< 1) and those holding at least an even share.
    smaller = []
    larger = []
    for kk, prob in enumerate(probs):
        q[kk] = K*prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)

    # Repeatedly top up an under-full column with mass taken from an over-full
    # one, then re-file the donor according to what it has left.
    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()

        J[small] = large
        q[large] = q[large] + q[small] - 1.0
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)
    return J, q

def alias_draw(J, q):
    """Draw one outcome index using the alias tables built by ``alias_setup``.

    A column is chosen uniformly at random; a second uniform draw then decides
    between that column's own index and its alias ``J[column]``.
    """
    n_columns = len(J)
    column = int(np.floor(np.random.rand() * n_columns))
    keep_own = np.random.rand() < q[column]
    return column if keep_own else J[column]

In [10]:
# First-step transition tables: for every node, an alias table over its
# neighbours (in sorted order) with probability proportional to edge weight.
alias_nodes = {}
for node in G.nodes():
    nbr_weights = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
    weight_total = sum(nbr_weights)
    alias_nodes[node] = alias_setup(
        [float(weight) / weight_total for weight in nbr_weights]
    )

In [11]:
# Bias parameters of the walk: edge weights are divided by `p` when stepping
# back to the previous node and by `q` when moving to a node that is not
# adjacent to the previous node.
p, q = 1, 1
def get_alias_edge(src, dst):
    """Build the alias table for the step that follows traversing src -> dst.

    Every neighbour of ``dst`` (in sorted order) receives a biased weight:
    the edge weight divided by ``p`` if the neighbour is ``src`` itself, the
    plain edge weight if the neighbour is also adjacent to ``src``, and the
    edge weight divided by ``q`` otherwise. The weights are normalised and
    handed to ``alias_setup``, whose ``(J, q)`` tables are returned.
    """
    def biased_weight(nbr):
        weight = G[dst][nbr]['weight']
        if nbr == src:
            return weight / p
        if G.has_edge(nbr, src):
            return weight
        return weight / q

    biased = [biased_weight(nbr) for nbr in sorted(G.neighbors(dst))]
    total = sum(biased)
    return alias_setup([float(value) / total for value in biased])

In [12]:
# Second-order transition tables keyed by the directed pair
# (previous node, current node); each undirected edge yields two entries.
alias_edges = {}
# NOTE(review): `triads` is not used in any visible cell — confirm before removing.
triads = {}

for u, v in G.edges():
    alias_edges[(u, v)] = get_alias_edge(u, v)
    alias_edges[(v, u)] = get_alias_edge(v, u)

## 随机游走

In [13]:
# Random-walk configuration.
num_walks = 10     # number of passes over all nodes; each pass starts one walk per node
walk_length = 80   # maximum number of nodes in a single walk

# The walks draw from both `random` (node shuffling) and `np.random` (alias
# sampling); seed both so that re-running the notebook reproduces the walks.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [14]:
def simulate_walks(num_walks, walk_length):
    """Run ``num_walks`` rounds of random walks, one walk per node per round.

    The node order is reshuffled before every round, and progress is printed
    as ``<round> / <num_walks>``. Returns the list of all generated walks.
    """
    all_walks = []
    start_nodes = list(G.nodes())
    print('Walk iteration:')
    for round_no in range(1, num_walks + 1):
        print(str(round_no), '/', str(num_walks))
        random.shuffle(start_nodes)
        all_walks.extend(
            node2vec_walk(walk_length=walk_length, start_node=start)
            for start in start_nodes
        )
    return all_walks

In [15]:
def node2vec_walk(walk_length, start_node):
    """Simulate one biased random walk beginning at ``start_node``.

    The first step samples from the node-level table in ``alias_nodes``; every
    later step samples from the ``alias_edges`` table of the
    (previous, current) pair. The walk ends once it holds ``walk_length``
    nodes or reaches a node without neighbours.
    """
    path = [start_node]
    while len(path) < walk_length:
        here = path[-1]
        candidates = sorted(G.neighbors(here))
        if not candidates:
            # Dead end: nowhere left to go.
            break
        if len(path) == 1:
            alias_J, alias_q = alias_nodes[here]
        else:
            alias_J, alias_q = alias_edges[(path[-2], here)]
        path.append(candidates[alias_draw(alias_J, alias_q)])
    return path

In [16]:
# Generate the training corpus: num_walks walks started from every node.
walks = simulate_walks(num_walks, walk_length)

Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


## 训练

In [21]:
# Shape check of the walk corpus.
# NOTE(review): this is only a clean 2-D shape if every walk has the same length — confirm.
np.array(walks).shape

(17850, 80)

In [17]:
# Total number of walks generated.
len(walks)

17850

In [18]:
# Skip-gram hyperparameters.
window_size = 10   # context window, in walk positions
dimensions = 128   # embedding vector size

def learn_embeddings(walks):
    """Train a skip-gram Word2Vec model, treating each walk as a sentence.

    Every node is converted to ``str`` first so that it can serve as a
    vocabulary token. Returns the trained model.
    """
    # NOTE(review): `size=` and `iter=` are gensim 3.x keyword names; gensim 4
    # uses `vector_size=` and `epochs=` — confirm the pinned gensim version.
    sentences = [[str(node) for node in walk] for walk in walks]
    return Word2Vec(sentences, size=dimensions, window=window_size, min_count=0, sg=1, workers=8, iter=1)

In [19]:
# Train the node embeddings on the generated walks.
model = learn_embeddings(walks)

In [20]:
# Qualitative check: the tokens whose embeddings are closest to the query word.
model.wv.similar_by_word("伦理")

[('问题', 0.8225580453872681),
 ('原则', 0.7723050117492676),
 ('虽难', 0.7416102290153503),
 ('讨论', 0.718680739402771),
 ('）', 0.6941202878952026),
 ('吵吵闹闹', 0.6421607732772827),
 ('社会', 0.6192087531089783),
 ('荒谬', 0.6081787943840027),
 ('之一', 0.6002324223518372),
 ('两者', 0.592690110206604)]

In [22]:
# Vocabulary size of the trained model, for comparison with the graph's node count.
# NOTE(review): `wv.vocab` appears to be the gensim 3.x attribute — confirm the gensim version.
len(model.wv.vocab)

1785