# word2vec的实现
资料可参考 reference material。
主要参考知乎 https://zhuanlan.zhihu.com/p/26306795

 首先实现基于 skip-gram 的模型。
 skip-gram 最基本的方法是使用当前词（中心词）预测其上下文窗口内的词；反过来用上下文词预测中心词的做法称为 CBOW。
 其训练本质是一个分类问题：假设词表大小是 V，那么输入就是 V 维 one-hot 向量，经过一个维度为 N 的隐藏层，然后做一个 V 分类。
 可以看出，直接优化这个任务的代价很高：现实中 V 可能非常大，每一步都要在 V 个类别上计算 softmax，训练会很慢。word2vec 用了一些训练 trick，如 *hierarchical softmax*，这里先不考虑，只考虑最简单的。
 ![skip-gram 模型结构](image/skip-grim.png)

# 导入数据

In [1]:
import os
import json
import re
import jieba
from tqdm import tqdm
from langconv import *
# Directory holding the extracted corpus files, relative to the current working directory.
file_path = os.getcwd() + '/extracted/AA/'
# Names of the entries in that directory, in whatever order os.listdir returns them.
file_names = os.listdir(file_path)
def token(string):
    """Keep only the word characters of *string*, joined by single spaces.

    Maximal runs of word characters (letters, digits, underscore, CJK
    characters) are extracted; everything else acts as a separator.

    Args:
        string: Raw text.

    Returns:
        The extracted runs joined with one space, or ``''`` if there are none.
    """
    # Raw string avoids the invalid-escape warning. The previous class
    # [\w|\d] also matched a literal '|' (a pipe is not alternation inside
    # brackets), and \d is already covered by \w.
    return ' '.join(re.findall(r'\w+', string))
# Convert Traditional Chinese text to Simplified Chinese.
def cht_to_chs(line):
    """Return *line* converted with ``Converter('zh-hans')``.

    Args:
        line: Text to convert.

    Returns:
        The value returned by ``Converter('zh-hans').convert(line)``.
    """
    # The previous version also called line.encode('utf-8') and discarded the
    # result; that statement had no effect on the returned value and was removed.
    return Converter('zh-hans').convert(line)

all_articles = []
# The corpus is too large to use in full. The original note says two ~200 MB
# files (400 MB in total) are used; the slice below takes only the first entry.
for file_name in file_names[:1]:
    with open(file_path + file_name, encoding='utf-8') as fo:
        for article in tqdm(fo.readlines()):
            # Each line is a JSON object; its 'text' field is cleaned with
            # token() and then converted with cht_to_chs().
            body = json.loads(article)['text'].strip()
            cleaned = token(body)
            all_articles.append(cht_to_chs(cleaned))

def cut(str):
    """Segment *str* with ``jieba.cut`` and return the pieces as a list."""
    # The parameter name shadows the builtin `str`; it is kept so existing
    # callers are unaffected.
    return list(jieba.cut(str))

# Segment every cleaned article; each element of `sentences` is the token
# list produced by cut() for one article.
sentences = [cut(s) for s in tqdm(all_articles)]

100%|██████████| 9448/9448 [00:11<00:00, 801.85it/s]
  0%|          | 0/9448 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.611 seconds.
Prefix dict has been built succesfully.
100%|██████████| 9448/9448 [00:09<00:00, 895.60it/s]


# 实现

## one-hot encoding

In [2]:
from collections import Counter
from itertools import chain

# Words seen fewer than `min_count` times are left out of the vocabulary.
min_count = 5
# Stream over the sentences instead of `sum(sentences, [])`, which re-copied
# the growing list once per sentence (quadratic in corpus size).
cnt = Counter(chain.from_iterable(sentences))
print(len(cnt))
# Vocabulary: words meeting the frequency threshold, in first-seen order.
f = [word for word, freq in cnt.items() if freq >= min_count]
print(len(f))

# Every distinct token in the corpus (before frequency filtering); the keys
# of `cnt` are exactly that set.
tokens = set(cnt)
len(tokens)

97219
19767


97219

In [3]:
# Vocabulary lookups in both directions.
word2index = {w: i for i, w in enumerate(f)}
index2word = {i: w for w, i in word2index.items()}
X = []  # per sample: indices of the 2 * window surrounding words
Y = []  # per sample: index of the centre word
window = 2
for s in tqdm(sentences):
    # Only positions with a full window on both sides are visited.
    for i in range(window, len(s) - window):
        # Dict membership is O(1); the previous `s[i] not in f` scanned the
        # whole vocabulary list for every token.
        if s[i] not in word2index:
            continue
        # NOTE(review): out-of-vocabulary context words fall back to index 0,
        # which is also the index of the first vocabulary word -- confirm
        # that this aliasing is intended.
        context = s[i - window:i] + s[i + 1:i + window + 1]
        X.append([word2index.get(k, 0) for k in context])
        Y.append(word2index[s[i]])

100%|██████████| 9448/9448 [01:56<00:00, 81.44it/s]


In [302]:
# Number of training samples built above.
print(len(X))
# Vocabulary size after frequency filtering.
print(len(f))

952502
19767


## Define Dataset and Dataloader

In [39]:
from torch.utils.data import Dataset, DataLoader
import torch
class Sample(Dataset):
    """Dataset pairing each row of ``x`` with the matching entry of ``y``.

    Both inputs are converted to ``torch.LongTensor`` once, at construction.
    """

    def __init__(self, x, y):
        """Store ``x`` and ``y`` as long tensors on ``self.x`` and ``self.y``."""
        self.x, self.y = torch.LongTensor(x), torch.LongTensor(y)

    def __len__(self):
        """Return the number of entries in ``self.y``."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        features = self.x[index]
        label = self.y[index]
        return features, label
# Number of samples to use; currently all of them.
sn = len(X) 
dset = Sample(X[:sn],Y[:sn])
# NOTE(review): shuffle=False serves batches in corpus order on every epoch --
# confirm this is intended for training.
loader = DataLoader(dset,batch_size=5000,shuffle=False)
print(len(X))

952502


## Define Model

In [99]:
import torch
import torch.nn as nn
import torch.nn.functional as F
token_size = len(word2index)


class Word2Vec(nn.Module):
    """Averaged-embedding classifier over the vocabulary.

    Args:
        vsz: Number of rows in the embedding table.
        emd: Embedding dimension.
        cls_num: Number of output classes.
    """

    def __init__(self, vsz=token_size, emd=100, cls_num=token_size):
        super(Word2Vec, self).__init__()
        self.e = nn.Embedding(vsz, emd)
        # The output width follows `cls_num`; it was previously hard-coded to
        # 19767, which silently ignored the argument.
        self.clf = nn.Linear(emd, cls_num)
        self.e.weight.data.uniform_(-0.1, 0.1)

    def forward(self, x):
        """Embed the indices in ``x``, average over dim 1 and return logits."""
        x = self.e(x)
        x = torch.mean(x, 1)
        return self.clf(x)

class CBOW(nn.Module):
    """Feed-forward classifier over concatenated context embeddings.

    The embeddings of all context words are flattened into one vector per
    sample, passed through a ReLU hidden layer, and projected to one logit
    per vocabulary word.

    Args:
        vocab_size: Rows in the embedding table and width of the output.
        embd_size: Embedding dimension.
        context_size: The first linear layer expects
            ``2 * context_size`` embeddings per sample.
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, vocab_size, embd_size, context_size, hidden_size):
        super().__init__()
        flat_width = 2 * context_size * embd_size
        self.e = nn.Embedding(vocab_size, embd_size)
        self.linear1 = nn.Linear(flat_width, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        """Return the logits for a batch of context-index rows."""
        batch = inputs.shape[0]
        # One flat row per sample: all context embeddings side by side.
        flat = self.e(inputs).view((batch, -1))
        return self.linear2(F.relu(self.linear1(flat)))

## Define training process

In [127]:
import torch
import pickle as pkl
from torch.autograd import Variable
# Number of full passes over `loader` made by train().
epoch = 100 
# Arguments: vocab_size, embd_size=100, context_size=2, hidden_size=64.
# context_size=2 matches the `window` used when building X.
m = CBOW(len(word2index),100,2,64)
# Alternative model:
# m = Word2Vec()
# Moves the model to the GPU; a CUDA device is required from here on.
m = m.cuda()
loss_func = nn.CrossEntropyLoss()
optim = torch.optim.Adam(m.parameters(),lr=0.01)
# Mean training loss per epoch; train() appends to it.
l = []
print(m)
print('start training...')
def train():
    """Train the module-level model ``m`` on ``loader`` for ``epoch`` epochs.

    Reads the module-level ``m``, ``loader``, ``optim``, ``loss_func`` and
    ``epoch``. After every epoch the mean of that epoch's batch losses is
    appended to the module-level list ``l`` and shown in the progress-bar
    description. Each batch is moved to the GPU with ``.cuda()``.
    """
    # numpy is otherwise imported only further down the file, so a
    # top-to-bottom run failed with NameError on `np.mean`; import it here.
    import numpy as np

    pb = tqdm(range(epoch))
    for e in pb:
        bl = []  # batch losses of the current epoch
        for bx, by in loader:
            bx, by = bx.cuda(), by.cuda()
            optim.zero_grad()
            o = m(bx)
            loss = loss_func(o, by)
            loss.backward()
            optim.step()
            bl.append(loss.item())
        l.append(np.mean(bl))
        pb.set_description(f'Epoch:{e+1}: Loss = {l[-1]}')
train()
# Make sure the output directory exists before writing into it.
os.makedirs('./saved_files', exist_ok=True)
# Use a dedicated handle name: binding the file to `f` replaced the
# module-level vocabulary list `f` with a file object.
with open('./saved_files/e.pkl', 'wb') as emb_file:
    # Persist the trained embedding layer of the model.
    pkl.dump(m.e, emb_file)




  0%|          | 0/100 [00:00<?, ?it/s][A[A[A

CBOW(
  (e): Embedding(19767, 100)
  (linear1): Linear(in_features=400, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=19767, bias=True)
)
start training...





Epoch:1: Loss = 6.432796148729574:   0%|          | 0/100 [00:09<?, ?it/s][A[A[A


Epoch:1: Loss = 6.432796148729574:   1%|          | 1/100 [00:09<15:28,  9.38s/it][A[A[A


Epoch:2: Loss = 5.393951377319416:   1%|          | 1/100 [00:18<15:28,  9.38s/it][A[A[A


Epoch:2: Loss = 5.393951377319416:   2%|▏         | 2/100 [00:18<15:24,  9.43s/it][A[A[A


Epoch:3: Loss = 4.927613715226737:   2%|▏         | 2/100 [00:28<15:24,  9.43s/it][A[A[A


Epoch:3: Loss = 4.927613715226737:   3%|▎         | 3/100 [00:28<15:12,  9.40s/it][A[A[A


Epoch:4: Loss = 4.58902070784444:   3%|▎         | 3/100 [00:37<15:12,  9.40s/it] [A[A[A


Epoch:4: Loss = 4.58902070784444:   4%|▍         | 4/100 [00:37<15:00,  9.38s/it][A[A[A


Epoch:5: Loss = 4.319108587284987:   4%|▍         | 4/100 [00:47<15:00,  9.38s/it][A[A[A


Epoch:5: Loss = 4.319108587284987:   5%|▌         | 5/100 [00:47<14:55,  9.43s/it][A[A[A


Epoch:6: Loss = 4.102122093370448:   5%|▌         | 5/100 [00:56<1

In [135]:
import matplotlib.pyplot as plt
%matplotlib widget

# Plot the per-epoch mean training loss collected in `l`.
plt.plot(l)
plt.show()

# gensim 数据接口

In [66]:
from gensim.models import word2vec # word2vec 模型
# Train gensim's Word2Vec on the same tokenised sentences (100-dimensional
# vectors, words with fewer than 5 occurrences ignored).
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` -- confirm the
# installed version before re-running.
w2v_model = word2vec.Word2Vec(sentences,min_count=5,workers=50,size=100)

In [130]:
import numpy as np
from sklearn.preprocessing import normalize

def similar(v1, v2):
    """Return the cosine similarity of the vectors ``v1`` and ``v2``."""
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

e = None

# NOTE: unpickling can execute arbitrary code; only load files that this
# notebook wrote itself.
# A dedicated handle name keeps the module-level vocabulary list `f` intact
# (the previous `as f` replaced it with a file object).
with open('./saved_files/e.pkl', 'rb') as emb_file:
    e = pkl.load(emb_file)
    # Keep only the weight matrix, as a NumPy array on the CPU.
    e = e.weight.detach().cpu().numpy()
q = '表示'
print(e.shape)
# Look the query vector up once instead of on every iteration.
query_vec = e[word2index[q]]
s = {w: similar(e[i], query_vec) for w, i in word2index.items()}
# Keep the ten highest-scoring words.
s = sorted(s.items(), key=lambda x: x[1], reverse=True)[:10]
print(s)
# The gensim model's nearest neighbours for the same query, for comparison.
print(w2v_model.wv.most_similar(q))

(19767, 100)
[('表示', 1.0), ('学习', 0.39887816), ('事实上', 0.39036024), ('府兵制', 0.38728768), ('注目', 0.37904167), ('帮忙', 0.36899132), ('说', 0.36732107), ('降线', 0.36457083), ('动弹', 0.3491337), ('到来', 0.33478066)]
[('当', 0.9508322477340698), ('因此', 0.9364513158798218), ('因为', 0.9351018667221069), ('但是', 0.9335132837295532), ('不', 0.9290002584457397), ('更', 0.9285000562667847), ('不过', 0.9284607768058777), ('要', 0.9281373023986816), ('玩家', 0.9277800917625427), ('如果', 0.926964521408081)]


In [132]:
from pprint import pprint
def search(query, depth=3):
    """Pretty-print every vocabulary word ranked by similarity to *query*.

    Uses the module-level embedding matrix ``e``, the ``word2index`` mapping
    and ``similar()``. The output is a list of ``(word, score)`` pairs in
    descending score order.

    Args:
        query: Word to compare against; must be a key of ``word2index``.
        depth: Kept for backward compatibility. The previous version repeated
            the identical computation ``depth`` times; it now runs once. As
            before, a value below 1 prints an empty list.

    Raises:
        KeyError: If ``depth >= 1`` and *query* is not in ``word2index``.
    """
    ranked = []
    if depth > 0:
        # Look the query vector up once instead of once per vocabulary word.
        target = e[word2index[query]]
        scores = {w: similar(e[i], target) for w, i in word2index.items()}
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    pprint(ranked)
# Print the whole vocabulary ranked by similarity to '说'.
search('说') 

[('说', 1.0000001),
 ('width', 0.40772444),
 ('说明', 0.38674474),
 ('结婚', 0.36877435),
 ('表示', 0.36732107),
 ('平时', 0.347666),
 ('恶化', 0.3454516),
 ('这份', 0.33920467),
 ('罪行', 0.33910623),
 ('发财', 0.33235258),
 ('传说', 0.32815892),
 ('过多', 0.32585266),
 ('勉强', 0.32509974),
 ('赐死', 0.32340148),
 ('放出', 0.31981236),
 ('就算', 0.31840214),
 ('刘璐', 0.31594908),
 ('笑声', 0.3157181),
 ('签订', 0.31382456),
 ('五百年', 0.31307116),
 ('招待会', 0.3115308),
 ('回复', 0.31109095),
 ('一封', 0.31086218),
 ('右侧', 0.30819833),
 ('阿拜', 0.30544934),
 ('接纳', 0.30532485),
 ('麻烦', 0.30453026),
 ('保释', 0.30380973),
 ('据传', 0.30370122),
 ('随', 0.303036),
 ('以太', 0.30217257),
 ('提问', 0.3007648),
 ('一样', 0.30020744),
 ('曾孙', 0.29988503),
 ('着重', 0.29829335),
 ('离职', 0.29787835),
 ('指称', 0.2977484),
 ('平托', 0.29761901),
 ('水量计', 0.2966707),
 ('形容', 0.29573876),
 ('协调', 0.29558215),
 ('出来', 0.29532993),
 ('吕君昌', 0.29486752),
 ('帮助', 0.29108924),
 ('mL', 0.28893688),
 ('支撑', 0.28806764),
 ('渴望', 0.28754744),
 ('吗', 0.2866177),
