In [1]:
%config ZMQInteractiveShell.ast_node_interactivity = "all"
%pprint

Pretty printing has been turned OFF


In [2]:
import os
# Set before torch is imported (torch is imported in a later cell), so the variable is already
# in the environment when CUDA is initialised.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (synchronous kernel launches);
# confirm it is still wanted for full training runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [40]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import numpy as np
from collections import Counter
sys.path.append("../d2l_func/")
from sqdm import sqdm

In [41]:
import random

def set_seed(seed):
    """
    function: seed every random number generator used in this notebook
    params seed: integer seed applied to `random`, numpy and torch (CPU and all CUDA devices)

    Note: PYTHONHASHSEED is read by the interpreter at start-up, so exporting it here
    only influences Python processes started afterwards, not the hashing of the running kernel.
    """
    random.seed(seed)
    np.random.seed(seed)
    # The key was previously misspelled ("PATHONHASHSEED"), so the variable Python reads was never set
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; this is a no-op without CUDA
    torch.cuda.manual_seed_all(seed)

## word2vec 

### 载入数据/建立索引

In [8]:
# Load the PTB training corpus.
# Opened with "r" instead of "r+": the file is only read, so write permission must not be required.
with open("../data/ptb/ptb.train.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
# str.split() without arguments splits on any run of whitespace, which also drops the trailing "\n"
corpus = [sentence.split() for sentence in lines]

In [12]:
# Count how often every token occurs in the corpus
token_counter = Counter(token for sentence in corpus for token in sentence)
# Keep only the tokens that occur at least 5 times
token_counter = {token: freq for token, freq in token_counter.items() if freq >= 5}
# Vocabulary (order follows first appearance in the corpus)
vocab_set = list(token_counter)
# Map token -> index
token_to_idx = dict(zip(vocab_set, range(len(vocab_set))))
# Convert the corpus to indices, dropping tokens that are not in the vocabulary
corpus_index = [
    [token_to_idx[token] for token in sentence if token in token_to_idx]
    for sentence in corpus
]
# Total number of tokens that remain
token_num = sum(map(len, corpus_index))
print(token_num)

887100


### 二次采样

文本数据中一般会出现一些高频词。通常来说，在一个背景窗口中，一个词和较低频的词同时出现，比和较高频的词同时出现，对训练词嵌入模型更有益
- 因此，在训练词嵌入模型的时候，可以对词进行二次采样
- 实际上，就是对每一个被索引的词$w_i$以一定的概率丢弃，其中$f(w_i)$是词$w_i$在数据集中出现的次数与总词数之比，$t$是超参数，通常取1e-4
$$P(w_i) = \max\left(1 - \sqrt{\frac{t}{f(w_i)}}, 0\right)$$
- 只有当$f(w_i) > t$的时候，才有可能在二次采样中丢弃词$w_i$，越是高频的词被丢弃的概率越大

In [14]:
def discard(index, theta=1e-4):
    """
    function: decide whether a token is dropped during subsampling (True means drop)
    params index: vocabulary index of the token
    params theta: threshold t in P(w) = max(1 - sqrt(t / f(w)), 0)
    """
    draw = np.random.uniform(0, 1)
    # theta / count * token_num == theta / f(w), with f(w) = count / token_num
    keep_ratio = np.sqrt(theta / token_counter[vocab_set[index]] * token_num)
    return draw < (1 - keep_ratio)

# Subsampling: every token of every sentence is kept unless discard() says to drop it
subsampling_corpus_index = [
    [token for token in sentence if not discard(token)]
    for sentence in corpus_index
]
subsampling_token_num = sum(map(len, subsampling_corpus_index))
print(subsampling_token_num)

375603


比较二次采样前后，高频词和低频词的数量变化

In [20]:
# Quick check of list.count, which compare_count uses to tally occurrences per sentence
[1, 2, 3].count(1)

1

In [21]:
def compare_count(token):
    """
    function: print how often a token occurs before and after subsampling
    params token: the token string, e.g. "the" / "join"
    """
    idx = token_to_idx[token]

    def occurrences(sentences):
        # total number of times idx appears across all sentences
        return sum(sentence.count(idx) for sentence in sentences)

    before = occurrences(corpus_index)
    after = occurrences(subsampling_corpus_index)
    print(f"{token}, before:{before}, after:{after}")

In [22]:
# Compare a very frequent token with a rare one
compare_count("the")
compare_count("join")

the, before:50770, after:2124
join, before:45, after:45


### 提取中心词和上下文词

提取与中心词距离不超过背景窗口大小的词作为上下文词

In [30]:
def get_center_and_context(corpus_index, max_window_size):
    """
    function: extract (center word, context words) pairs for skip-gram training
    params corpus_index: the indexed corpus, a list of sentences (each a list of token indices)
    params max_window_size: largest context window; for every center word a window size
                            is drawn uniformly from [1, max_window_size]
    return: (all_centers, all_contexts), where all_contexts[i] is the list of context
            tokens of all_centers[i]; the elements are the sentence's own items
            (no numpy conversion)
    """
    # Collected center words and their context lists
    all_centers, all_contexts = [], []

    for sentence in corpus_index:
        # A (center, context) pair needs at least two tokens
        if len(sentence) < 2:
            continue
        # Every token of the sentence serves as a center word
        all_centers += sentence
        length = len(sentence)
        for index in range(length):
            # Sample a window size in [1, max_window_size]
            window_size = np.random.randint(1, max_window_size + 1)
            left = max(0, index - window_size)
            right = min(length, index + window_size + 1)
            # Slice around the center word instead of converting the whole sentence to a
            # numpy array for every position, which was quadratic in sentence length
            all_contexts.append(list(sentence[left:index]) + list(sentence[index + 1:right]))
    return all_centers, all_contexts

In [33]:
# Sanity check
set_seed(2020)
# Build a small artificial dataset: two "sentences", 0..6 and 7..9
tiny_dataset = [list(range(7)), list(range(7, 10))]
print("dataset", tiny_dataset)
for center, context in zip(*get_center_and_context(tiny_dataset, 2)):
    print(f"center: {center}, has contexts: {context}")

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center: 0, has contexts: [1]
center: 1, has contexts: [0, 2]
center: 2, has contexts: [0, 1, 3, 4]
center: 3, has contexts: [2, 4]
center: 4, has contexts: [2, 3, 5, 6]
center: 5, has contexts: [3, 4, 6]
center: 6, has contexts: [4, 5]
center: 7, has contexts: [8, 9]
center: 8, has contexts: [7, 9]
center: 9, has contexts: [8]


提取最大窗口为5的中心词和背景词

In [34]:
# Extract pairs from the subsampled corpus with a maximum window of 5.
# Both lengths are displayed because ast_node_interactivity is set to "all" in the first cell.
all_centers, all_contexts = get_center_and_context(subsampling_corpus_index, 5)
len(all_centers)
len(all_contexts)

374633

374633

### 负采样

使用负采样来近似训练，对于一对中心词和背景词，我们随机采样k个噪声词
- 噪声词的采样频率$P(w)$设为w词频和总词频之比的0.75次方

In [37]:
def get_negative(all_contexts, sample_weight, k):
    """
    function: draw noise (negative) words for every context list
    params all_contexts: list of context-word lists, one per center word
    params sample_weight: sampling weight of every vocabulary index
    params k: number of noise words drawn per context word
    return: list of noise-word lists, aligned with all_contexts
    """
    # Candidate indices are 0 .. len(sample_weight) - 1
    vocab_indices = list(range(len(sample_weight)))
    # Candidates are drawn in bulk and consumed one at a time through `cursor`
    pool, cursor = [], 0
    result = []

    for context in all_contexts:
        picked = []
        while len(picked) < len(context) * k:
            # Refill the pool once it is exhausted
            if cursor == len(pool):
                pool = random.choices(vocab_indices, sample_weight, k=int(1e5))
                cursor = 0
            candidate = pool[cursor]
            cursor += 1
            # A noise word must not be one of the context words
            if candidate in context:
                continue
            picked.append(candidate)
        result.append(picked)
    return result

In [38]:
# Unnormalised noise weights: token count raised to the 0.75 power, in vocab_set order
sample_weight = [token_counter[w] ** 0.75 for w in vocab_set]
# k=5 noise words are drawn for every context word
all_negatives = get_negative(all_contexts, sample_weight, k=5)
len(all_negatives)

374633

### 定义数据类

In [45]:
class Dataset(Data.Dataset):
    """
    Map-style dataset yielding one (center, contexts, negatives) triple per index.
    The three input sequences must have the same length.
    """

    def __init__(self, all_centers, all_contexts, all_negatives):
        assert len(all_centers) == len(all_contexts) == len(all_negatives)
        self.centers, self.contexts, self.negatives = all_centers, all_contexts, all_negatives

    def __len__(self):
        # one sample per center word
        return len(self.centers)

    def __getitem__(self, index):
        center = self.centers[index]
        context = self.contexts[index]
        negative = self.negatives[index]
        return (center, context, negative)

因为语料是不定长的，所以在训练之前，需要用0填充到相同长度。此外，为了避免填充的部分在训练时对损失函数造成影响，需要进行mask
- 另外，通常把背景词和噪声词拼接起来，因此需要区分哪些是背景词，哪些是噪声词（用label来标记）

In [46]:
def batchify(data):
    """
    function: collate a list of (center, context, negative) samples into padded batch tensors
    params data: list of samples as produced by Dataset.__getitem__
    return: (centers, context_negatives, masks, labels)
            centers has shape (batch, 1); the other three have shape (batch, max_len).
            masks is 1 on real tokens and 0 on padding; labels is 1 on context words
            and 0 on noise words and padding.
    """
    # Longest context + noise sequence in this batch
    longest = max(len(ctx) + len(neg) for _, ctx, neg in data)
    center_col, padded_tokens, pad_masks, targets = [], [], [], []

    for center, context, negative in data:
        n_pos = len(context)
        n_real = n_pos + len(negative)
        n_pad = longest - n_real
        center_col.append(center)
        # context words first, then noise words, then zero padding
        padded_tokens.append(context + negative + [0] * n_pad)
        pad_masks.append([1] * n_real + [0] * n_pad)
        # only the context words are positives
        targets.append([1] * n_pos + [0] * (longest - n_pos))

    return (torch.tensor(center_col).view(-1, 1), torch.tensor(padded_tokens),
            torch.tensor(pad_masks), torch.tensor(targets))

In [47]:
dataset = Dataset(all_centers, all_contexts, all_negatives)
# Build the batch iterator; batchify pads every sample of a batch to a common length
train_iter = Data.DataLoader(dataset, batch_size=512, collate_fn=batchify, num_workers=4, shuffle=True)

# Sanity check: print the shape of each tensor in the first batch only
for batch in train_iter:
    for name, data in zip(["centers", "context_negatives", "masks", "labels"], batch):
        print(name, data.shape)
    break

centers torch.Size([512, 1])
context_negatives torch.Size([512, 60])
masks torch.Size([512, 60])
labels torch.Size([512, 60])


### skip_gram

skip_gram模型实际上就是用中心词来预测周围词

In [53]:
def skip_gram(centers, context_negatives, embed_u, embed_v):
    """
    function: skip-gram forward pass, scoring each center word against its context/noise words
    params centers: center word indices, shape-->(b, 1)
    params context_negatives: context + noise word indices, shape-->(b, max_len)
    params embed_u: embedding layer for center words, weight shape-->(len(vocab_set), d_model),
                    where d_model is the word-vector dimension
    params embed_v: embedding layer for context + noise words, weight shape-->(len(vocab_set), d_model)
    return: inner products of center and context/noise vectors, shape-->(b, 1, max_len)
    """
    # Look up the vectors of the given indices
    center_vecs = embed_u(centers)            # shape--> (b, 1, d_model)
    other_vecs = embed_v(context_negatives)   # shape--> (b, max_len, d_model)
    # Batched matrix product: (b, 1, d_model) x (b, d_model, max_len)
    return torch.bmm(center_vecs, other_vecs.permute(0, 2, 1))

### 训练

In [150]:
# Loss definition (binary cross entropy on logits, with optional padding mask)
class BinarySigmoidEntropyLoss(nn.Module):
    """
    Binary cross entropy computed from logits.

    When a mask is given, masked-out (0) positions contribute nothing and each row is
    averaged over its unmasked positions only; the row values are then averaged.
    """

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, label, mask=None):
        logits = y_pred.float()
        targets = label.float()
        if mask is None:
            # No padding information: plain mean over all elements
            return F.binary_cross_entropy_with_logits(
                logits, targets, weight=None, reduction="none"
            ).mean()
        # The mask is used as per-element weight, zeroing the padded positions
        weights = mask.float()
        per_elem = F.binary_cross_entropy_with_logits(
            logits, targets, weight=weights, reduction="none"
        )
        # mean(dim=1) divides by the padded length; rescale to the number of real positions
        per_row = per_elem.mean(dim=1) * weights.shape[1] / weights.sum(dim=1)
        return per_row.mean()

In [151]:
# Training loop
def trainer(epoch_num, batch_num, data_iter, model, embed, loss, lr, device):
    """
    function: train the two embedding tables with Adam
    params epoch_num: number of passes over data_iter
    params batch_num: number of batches per epoch, forwarded to the progress bar
    params data_iter: iterable yielding (centers, context_negatives, mask, label) tensors
    params model: callable model(centers, context_negatives, embed_u, embed_v)
    params embed: indexable module; embed[0] and embed[1] are passed to model
    params loss: callable loss(y_pred, label, mask) returning a scalar tensor
    params lr: learning rate for Adam
    params device: device the embeddings and batches are moved to
    """
    # progress display (project-local helper; see sqdm for the meaning of show_process arguments)
    progress = sqdm()
    embed = embed.to(device)
    optimizer = torch.optim.Adam(embed.parameters(), lr=lr)

    for epoch in range(epoch_num):
        running_loss, seen_batches = 0., 0.
        print(f"Epoch [{epoch+1}/{epoch_num}]")
        for batch in data_iter:
            # move the whole batch to the target device
            centers, context_negatives, mask, label = (t.to(device) for t in batch)
            scores = model(centers, context_negatives, embed[0], embed[1])
            # scores has shape (batch, 1, max_len); flatten to label's (batch, max_len)
            batch_loss = loss(scores.view(label.shape), label, mask)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            seen_batches += 1
            # report the running mean loss of this epoch
            progress.show_process(batch_num, 1, running_loss / seen_batches)
        print("\n")

In [159]:
# Initialise the embeddings
d_model = 100

# Two tables of identical shape: trainer passes embed[0] as the center-word embedding
# and embed[1] as the context/noise-word embedding
embed = nn.Sequential(
    nn.Embedding(len(vocab_set), d_model),
    nn.Embedding(len(vocab_set), d_model),
)

# init train_iter (same construction as in the earlier DataLoader cell)
dataset = Dataset(all_centers, all_contexts, all_negatives)
train_iter = Data.DataLoader(dataset, batch_size=512, collate_fn=batchify, num_workers=4, shuffle=True)

# loss
loss = BinarySigmoidEntropyLoss()

In [160]:
# Keyword arguments for trainer()
params = {
    "epoch_num": 10,
    # len() of a DataLoader is its number of batches; the previous len(list(train_iter))
    # ran a full pass over the data (collating every batch) just to count them
    "batch_num": len(train_iter),
    "data_iter": train_iter,
    "model": skip_gram,
    "embed": embed,
    "loss": loss,
    "lr": 0.01,
    # fall back to CPU so the notebook still runs on machines without a GPU
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

In [161]:
# Run training with the settings collected in `params`
trainer(**params)

Epoch [1/10]
732/732 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 1.9712, train_score: -, test_loss: -, test_score: -

Epoch [2/10]
732/732 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 0.6237, train_score: -, test_loss: -, test_score: -

Epoch [3/10]
732/732 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 0.4501, train_score: -, test_loss: -, test_score: -

Epoch [4/10]
732/732 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 0.3951, train_score: -, test_loss: -, test_score: -

Epoch [5/10]
732/732 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 0.3692, train_score: -, test_loss: -, test_score: -

Epoch [6/10]
732/732 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 0.3535, train_score: -, test_loss: -, test_score: -

Epoch [7/10]
732/732 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 0.3419, train_score: -, test_loss: -, test_score: -

Epoch [8/10]
732/732 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - train_loss: 0.3325, train_score: -, test_loss: -, test_score: -

Epoch [9/10]
732/732 [>>>>>>>>>>

### 应用词嵌入模型

可以根据两个词向量的余弦相似度表示词与词在语义上的相似度

In [162]:
def get_similar_token(query_token, k, embed):
    """
    function: print the k tokens whose vectors have the highest cosine similarity to query_token
    params query_token: token string to look up
    params k: number of neighbours to print
    params embed: embedding layer whose weight has shape (len(vocab_set), d_model)
    """
    weights = embed.weight.data
    # vector of the query token, shape-->(d_model, )
    query = weights[token_to_idx[query_token]]
    dots = torch.matmul(weights, query)
    # 1e-9 is added for numerical stability
    # (same quantity without the epsilon: torch.mv(w, x) / (torch.norm(w, dim=1) * torch.norm(x)))
    norms = (torch.sum(weights * weights, dim=1) * torch.sum(query * query) + 1e-9).sqrt()
    cos = dots / norms
    # Ask for k+1 results: the best match is the query itself (similarity 1) and is skipped
    _, nearest = torch.topk(cos, k=k+1)
    for i in nearest.cpu().numpy()[1:]:
        print("cosine sim=%.3f: %s" %(cos[i], vocab_set[i]))

In [167]:
# Five nearest neighbours of "chip" under embed[0], the table trainer uses for center words
get_similar_token("chip", 5, embed[0])

cosine sim=0.490: machines
cosine sim=0.456: caution
cosine sim=0.442: speculation
cosine sim=0.403: mainframes
cosine sim=0.396: cray


小结：
1. 二次采样试图尽可能减轻高频词对训练词嵌入模型的影响
2. 可以将长度不同的样本填充到长度相同的小批量，并通过掩码变量区分非填充和填充项，只让非填充项参加损失函数的计算

## 使用gensim来训练

In [170]:
from gensim.models import word2vec

In [173]:
# Train gensim's Word2Vec on the raw tokenised corpus, with the same dimension (100), window (5),
# minimum count (5) and number of negatives (5) as the hand-written model in this notebook.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x is understood to use `vector_size` —
# confirm against the installed version.
model = word2vec.Word2Vec(corpus, size=100, alpha=0.01, window=5, min_count=5, sg=1, hs=0, negative=5)

In [176]:
# Print the five tokens gensim ranks as most similar to "chip"
for word in model.wv.similar_by_word("chip", topn=5):
    # each item is indexed as (token, similarity)
    print(f"{word[0]}, sim:{word[1]}")

engine, sim:0.9848711490631104
charging, sim:0.9845905303955078
mainframe, sim:0.9842156171798706
suspension, sim:0.9840888381004333
dentsu, sim:0.98377925157547
