# word2vec的实现
资料可参考 reference material。
主要参考知乎 https://zhuanlan.zhihu.com/p/26306795

 首先实现基于 skip gram 的模型。
 skip-gram 最基本的方法是使用当前词预测下一个词。
 其训练本质是一个分类问题，假设词表大小是V,那么输入就是V维 one-hot 向量，经过一个隐藏层，维度为N,然后做一个V分类。
 可以看出如果直接优化这个任务，会非常难以训练。现实中 V 可能会很大，从而导致很难训练。word2vec 用了一些训练 trick，如*hierarchical softmax*,这里先不考虑，只考虑最简单的。
 ![abc](image/skip-grim.png)

# 导入数据

In [194]:
import os
import json
import re
import jieba
from tqdm import tqdm
from langconv import *
file_path = os.getcwd() + '/extracted/AA/'
file_names = os.listdir(file_path)
def token(string):
    return ' '.join(re.findall('[\w|\d]+', string))
# 转换繁体到简体
def cht_to_chs(line):
    line = Converter('zh-hans').convert(line)
    line.encode('utf-8')
    return line

all_articles = []
# 数据量过大 仅用2个200M一共400M数据来构建
for file_name in file_names[:1]:
    with open(file_path+file_name, encoding='utf-8') as fo:
        for article in tqdm(fo.readlines()):
            all_articles.append(cht_to_chs(token(json.loads(article)['text'].strip())))

def cut(str): 
    result = list(jieba.cut(str))
    return result

sentences = []
for s in tqdm(all_articles):
    sentences.append(cut(s))

100%|██████████| 9448/9448 [00:11<00:00, 653.36it/s]
100%|██████████| 9448/9448 [00:08<00:00, 1106.34it/s]


# 实现

## one-hot encoding

In [196]:
# 只取 5000 个句子
sentences = sentences
tokens = set()
for s in sentences:
    tokens = tokens | set(s)
len(tokens)

97219

In [197]:
word2index = {w:i for i,w in enumerate(tokens)}
index2word = {i:w for w,i in word2index.items()}

In [198]:
X = []
Y = []
window = 2
for s in tqdm(sentences):
    for i in range(window,len(s) - window):
        X.append([word2index[k] for k in s[i-window:i]] 
                 + [word2index[k] for k in s[i+1:i+window+1]])
        Y.append(word2index[s[i]])


100%|██████████| 9448/9448 [00:02<00:00, 3508.82it/s]


## Define Dataset and Dataloader

In [214]:
from torch.utils.data import Dataset, DataLoader
import torch
class Sample(Dataset):
    def __init__(self,x,y):
        self.x = torch.LongTensor(x)
        self.y = torch.LongTensor(y)
    
    def __getitem__(self,index):
        return self.x[index],self.y[index]
    
    def __len__(self):
        return len(self.y)

dset = Sample(X,Y)
loader = DataLoader(dset,batch_size=3000,shuffle=False)

## Define Model

In [211]:
import torch
import torch.nn as nn
import torch.nn.functional as F
token_size = len(word2index)
class Word2Vec(nn.Module):
    def __init__(self,vsz=token_size,emd=100,cls_num=token_size):
        super(Word2Vec,self).__init__()
        self.e = nn.Embedding(vsz,emd)
        self.hidden = nn.Linear(emd*4,emd)
        self.clf = nn.Linear(emd,cls_num)
        self.e.weight.data.uniform_(-0.1,0.1)
    
    def forward(self,x):
        x = self.e(x)
#         x = torch.sum(x,1)
        x = x.view((x.shape[0],-1))
        x = self.hidden(x)
        x = self.clf(x)
        return x 
    
class CBOW(nn.Module):
    def __init__(self, vocab_size, embd_size, context_size, hidden_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
        self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, inputs):
        embedded = self.embeddings(inputs).view((1, -1))
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        log_probs = F.log_softmax(out)
        return log_probs

## Define training process

In [None]:
import torch
import pickle as pkl
from torch.autograd import Variable
epoch = 50
# m = CBOW(len(word2index),100,2,64)
m = Word2Vec()
m = m.cuda()
loss_func = nn.CrossEntropyLoss()
optim = torch.optim.SGD(m.parameters(),lr=0.1,mo)
l = []
print(m)
print('start training...')
def train():
    for e in range(epoch):
        bl = 0 
        for b in loader:
            bx,by = b[0],b[1]
            bx,by = Variable(bx).cuda(),Variable(by).cuda()
            optim.zero_grad()
            o = m(bx)

            loss = loss_func(o,by)
            loss.backward()
            optim.step()
            bl += loss.item()
        l.append(bl)
        print(f'epoch {e+1} : {l[-1]}')
train()
with open('./saved_files/e.pkl','wb') as f:
    pkl.dump(m.e,f)

Word2Vec(
  (e): Embedding(97219, 100)
  (hidden): Linear(in_features=400, out_features=100, bias=True)
  (clf): Linear(in_features=100, out_features=97219, bias=True)
)
start training...
epoch 1 : 3528.148344039917
epoch 2 : 3167.2226452827454
epoch 3 : 3027.433641910553
epoch 4 : 2931.926839351654
epoch 5 : 2864.992835521698
epoch 6 : 2811.6909680366516
epoch 7 : 2766.0571932792664
epoch 8 : 2726.707187652588
epoch 9 : 2693.8013215065002
epoch 10 : 2666.2686433792114
epoch 11 : 2642.6601638793945
epoch 12 : 2622.0766978263855
epoch 13 : 2604.0904779434204
epoch 14 : 2588.291410923004
epoch 15 : 2574.218727350235
epoch 16 : 2561.470993757248
epoch 17 : 2549.7606110572815


In [50]:
def similar(v1,v2): 
    return np.dot(v1,v2) / np.sqrt(np.square(v1).sum()) * np.sqrt(np.square(v2).sum())
e = None
with open('./saved_files/e.pkl','rb') as f:
    e = pkl.load(f)
    e = e.weight.detach().cpu().numpy()
w = '说'
s = {}
for w,i in word2index.items():
    s[w] = similar(e[i],e[word2index[w]])
s = sorted(s.items(),key=lambda x:x[1],reverse=True)
print(s[:10])
    

[('1363', 0.45734346), ('卢思满', 0.45113978), ('越来越', 0.4503107), ('哈普沙', 0.44950783), ('A75', 0.44781357), ('配音', 0.4465693), ('贲张', 0.44448572), ('学位证书', 0.4442458), ('references', 0.4418396), ('枪枝', 0.44071117)]


# gensim 数据接口

In [61]:
from gensim.models import word2vec # word2vec 模型
w2v_model = word2vec.Word2Vec(sentences,min_count=5,workers=50,size=100)
w2v_model.wv.most_similar('村')
print(similar(w2v))

[('家村', 0.9634177684783936),
 ('山村', 0.9586061835289001),
 ('西村', 0.9522800445556641),
 ('周家', 0.9470921158790588),
 ('东村', 0.9461714029312134),
 ('塘村', 0.9391768574714661),
 ('花园村', 0.9317656755447388),
 ('枫树', 0.927894115447998),
 ('新村', 0.8992545008659363),
 ('湖村', 0.8913074135780334)]