# 导入包

In [1]:
import time
import numpy as np
import torch
import torch.nn  as nn
import torch.nn.functional as F


# CBOW

In [2]:
class CBOW(nn.Module):  # 用于上下文求中心词
    def __init__(self, vocab_size, embedding_dim=128):
        super(CBOW, self).__init__()
        # 用于将词汇表中的每个词映射到一个固定维度的向量
        self.emb_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # 用于将嵌入向量转换为词汇表大小的输出，这意味着线性层的输出将是一个向量，其长度与词汇表的大小相同，每个元素代表当前上下文预测词汇表中每个词的概率
        # 将维度再改到与vocabulary 并用sigmoid投射到(0, 1) 表示概率
        self.output_layer = nn.Linear(in_features=embedding_dim, out_features=vocab_size)  # 参数量：embedding_dim*vocab_size

    def forward(self, x):
        """
        前向过程
        :param x: [N,T] long
        :return:
        """
        z1 = self.emb_layer(x)  # [N,T] --> [N,T,embedding_dim]
        z2 = torch.mean(z1, dim=1)  # [N,T,embedding_dim] --> [N,embedding_dim]
        scores = self.output_layer(z2)  # [N,embedding_dim] --> [N,vocab_size]  得到的是每个样本对应各个单词类别的置信度（概率）
        return scores

## scores

In [3]:
vocab_size = 50000  # 词汇表大小，也就是单词类别数目
batch_size=16  # 定义了批处理的大小，即每次输入到模型的数据样本数
window_size = 4     # 多个老师教一个学生

net = CBOW(vocab_size=vocab_size, embedding_dim=128)
# 形状为[batch_size, window_size]，即[16, 4]。这个张量包含了从0到vocab_size-1（即0到49999）之间的随机整数，代表词汇表中的单词索引
x = torch.randint(vocab_size, size=(batch_size, window_size), dtype=torch.long)  # [N,T]
# 形状为[batch_size]，即[16]。这个张量包含了从0到vocab_size-1之间的随机整数，代表每个样本的中心词索引
y = torch.randint(vocab_size, size=(batch_size,), dtype=torch.long)  # [N,]

scores = net(x)  # [N,vocab_size]
print(scores.shape)

torch.Size([16, 50000])


In [4]:
scores[0]

tensor([-0.0255,  0.1011,  0.3229,  ..., -0.1217,  0.1916,  0.1309],
       grad_fn=<SelectBackward0>)

## sigmoid & BCEWithLogitsLoss

In [6]:
# 损失：希望样本预测属于实际类别的置信度/概率要越大越好，如果可以的话，要求预测不属于实际类别的置信度越小越好
# 二分类
loss_fn = nn.BCEWithLogitsLoss()
y_onehot = F.one_hot(y, vocab_size).to(torch.float32)  # [N, vocab_size]
loss = loss_fn(scores, y_onehot)
print(loss)

tensor(0.7041, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [7]:
prob = torch.sigmoid(scores)  # [N, vocab_size]
loss2 = -torch.mean(y_onehot * torch.log(prob + 1e-8) + (1 - y_onehot) * torch.log(1.0 - prob + 1e-8))
print(loss2)

tensor(0.7041, grad_fn=<NegBackward0>)


In [8]:
res = y_onehot * torch.log(prob + 1e-8) + (1 - y_onehot) * torch.log(1.0 - prob + 1e-8)
res.shape

torch.Size([16, 50000])

## softmax & CrossEntropyLoss

In [9]:
# 多分类
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(scores, y)
print(loss)

tensor(10.7697, grad_fn=<NllLossBackward0>)


In [10]:
prob = torch.softmax(scores, dim=1)  # [N, vocab_size]
y_onehot = F.one_hot(y, vocab_size)  # [N, vocab_size]
loss2 = -torch.mean(torch.sum(y_onehot * torch.log(prob), dim=1))
print(loss2)

tensor(10.7697, grad_fn=<NegBackward0>)


# CBOW backward

In [11]:
import time

import torch
import torch.nn  as nn
import torch.nn.functional as F
import torch.optim as optim

In [12]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128):
        super(CBOW, self).__init__()
        # 当前embedding layer和全连接中使用的是同一个w
        weight = nn.Parameter(torch.randn((vocab_size, embedding_dim), dtype=torch.float32))
        self.emb_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.output_layer = nn.Linear(in_features=embedding_dim, out_features=vocab_size, bias=False)
        self.emb_layer.weight = weight
        self.output_layer.weight = weight

    def forward(self, x):
        """
        前向过程
        :param x: [batch_size,window_size] long
        :return:
        """
        z1 = self.emb_layer(x)  # [batch_size,window_size] --> [batch_size,window_size,embedding_dim]
        z2 = torch.mean(z1, dim=1)  # [batch_size,window_size,embedding_dim] --> [batch_size,embedding_dim]
        scores = self.output_layer(z2)  # [batch_size,embedding_dim] --> [batch_size,vocab_size]  得到的是每个样本对应各个单词类别的置信度
        return scores

In [13]:
vocab_size = 500  # 词汇表大小，也就是单词类别数目
batch_size = 1
window_size = 4

net = CBOW(vocab_size=vocab_size, embedding_dim=128)
opt = optim.SGD(net.parameters(), lr=0.1)
for para in net.parameters():
    print(para.shape)
    print (para)

torch.Size([500, 128])
Parameter containing:
tensor([[ 0.5794,  0.5382,  0.0938,  ...,  0.3112, -1.4108,  0.3746],
        [-1.5388, -1.2390,  0.6408,  ..., -1.7857,  0.4686,  0.2899],
        [ 1.1819,  1.3431,  0.3455,  ..., -0.2958, -0.0559,  1.4537],
        ...,
        [ 0.3409, -1.5993,  1.0788,  ...,  1.0522, -0.6956,  0.2514],
        [ 0.5822,  0.3264, -0.0511,  ...,  0.1181, -1.2590,  0.4036],
        [ 1.0954,  0.6918, -0.3589,  ...,  1.1529,  0.1128, -0.4262]],
       requires_grad=True)


In [14]:
for para in net.parameters():
    print(para.shape)
    print (para)

torch.Size([500, 128])
Parameter containing:
tensor([[ 0.5794,  0.5382,  0.0938,  ...,  0.3112, -1.4108,  0.3746],
        [-1.5388, -1.2390,  0.6408,  ..., -1.7857,  0.4686,  0.2899],
        [ 1.1819,  1.3431,  0.3455,  ..., -0.2958, -0.0559,  1.4537],
        ...,
        [ 0.3409, -1.5993,  1.0788,  ...,  1.0522, -0.6956,  0.2514],
        [ 0.5822,  0.3264, -0.0511,  ...,  0.1181, -1.2590,  0.4036],
        [ 1.0954,  0.6918, -0.3589,  ...,  1.1529,  0.1128, -0.4262]],
       requires_grad=True)


In [15]:
para[0][0]

tensor(0.5794, grad_fn=<SelectBackward0>)

In [16]:
x = torch.tensor([[3, 5, 8, 1]], dtype=torch.long)
y = torch.tensor([3], dtype=torch.long)

scores = net(x)  # [n,vocab_size]
print(scores.shape)

torch.Size([1, 500])


In [17]:
# 损失：希望样本预测属于实际类别的置信度要越大越好，如果可以的话，要求预测不属于实际类别的置信度越小越好
y_onehot = F.one_hot(y, vocab_size).to(torch.float32)  # [n, vocab_size]
loss_fn = nn.BCEWithLogitsLoss()
loss = loss_fn(scores, y_onehot)
print(loss)

tensor(2.6414, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [18]:
prob = torch.sigmoid(scores)  # [n, vocab_size]
# loss2 = -torch.mean(torch.sum(y_onehot * torch.log(prob + 1e-8), dim=1))  # 只更新当前样本对应类别的参数w
loss2 = -torch.mean(y_onehot * torch.log(prob + 1e-8) + (1 - y_onehot) * torch.log(1.0 - prob + 1e-8))
print(loss2)

tensor(2.5586, grad_fn=<NegBackward0>)


In [19]:
opt.zero_grad()
loss2.backward()
opt.step()

In [20]:
print("debug查看梯度值")
for name, param in net.named_parameters():
    if param.requires_grad:
        print(name, param.grad)

debug查看梯度值
emb_layer.weight tensor([[-1.5216e-06, -1.6656e-07, -1.9131e-07,  ...,  4.1428e-08,
          6.4631e-07,  2.4276e-07],
        [-1.1703e-02,  8.0924e-03, -2.6683e-02,  ...,  2.1885e-03,
          2.6752e-03,  3.7523e-03],
        [-2.9923e-07, -3.2756e-08, -3.7622e-08,  ...,  8.1473e-09,
          1.2710e-07,  4.7741e-08],
        ...,
        [-5.2767e-04, -5.7762e-05, -6.6343e-05,  ...,  1.4367e-05,
          2.2413e-04,  8.4185e-05],
        [-2.4033e-07, -2.6309e-08, -3.0217e-08,  ...,  6.5436e-09,
          1.0208e-07,  3.8343e-08],
        [-2.8944e-06, -3.1685e-07, -3.6392e-07,  ...,  7.8808e-08,
          1.2294e-06,  4.6179e-07]])


In [21]:
param.grad.shape

torch.Size([500, 128])

# SkipGram

In [22]:
import time

import torch
import torch.nn  as nn
import torch.nn.functional as F

In [23]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128):
        super(SkipGram, self).__init__()
        self.emb_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.output_layer = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, x):
        """
        前向过程
        :param x: [N,1] long
        :return:
        """
        z1 = self.emb_layer(x)  # [N,1] --> [N,1,embedding_dim]
        z2 = z1[:, 0, :]  # [N,1,embedding_dim] --> [N,embedding_dim]
        scores = self.output_layer(z2)  # [N,embedding_dim] --> [N,vocab_size]  得到的是每个样本对应各个单词类别的置信度
        return scores

In [24]:
vocab_size = 50000  # 词汇表大小，也就是单词类别数目
batch_size = 16
window_size = 4  # 一个老师教多个学生

net = SkipGram(vocab_size=vocab_size, embedding_dim=128)

x = torch.randint(vocab_size, size=(batch_size, 1), dtype=torch.long)  # [N,1]
y = torch.randint(vocab_size, size=(batch_size, window_size), dtype=torch.long)  # [N,T]

In [25]:
print (y[0])
for i in range(window_size):
    y[0, i] = 2 + i
    y[1, i] = 0 + i
print (f"y shape is {y.shape}")
print (y[0])

tensor([43908, 20204, 45632, 14861])
y shape is torch.Size([16, 4])
tensor([2, 3, 4, 5])


In [26]:
scores = net(x)  # [N,vocab_size] N个样本每个样本在vocab_size个类别上的置信度
print(scores.shape)

torch.Size([16, 50000])


## BCEWithLogitsLoss

y降维

In [27]:
loss_fn = nn.BCEWithLogitsLoss()
y_onehot = F.one_hot(y, vocab_size).to(torch.float32)  # [N, T, vocab_size]
y_onehot_2 = torch.sum(y_onehot, dim=1)  # [N, T, vocab_size] -> [N, vocab_size]
loss = loss_fn(scores, y_onehot_2)   # [N, vocab_size]
print(loss) 

tensor(0.7330, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


## sigmoid

In [28]:
ori_prob = torch.sigmoid(scores)  # [N, vocab_size]
loss3 = y_onehot_2 * torch.log(ori_prob) + (1 - y_onehot_2) * torch.log(1.0 - ori_prob)
print(loss3[0].detach().numpy()[:20])
loss3 = -torch.mean(loss3)
print(loss3)

[-0.67824835 -0.91499376 -0.5686203  -0.79391515 -0.5762589  -0.5153297
 -0.7610896  -0.9129652  -0.4830457  -0.46847996 -0.68798566 -0.50904596
 -0.6055947  -0.72263414 -0.81640095 -1.5237985  -0.5609002  -0.48854607
 -0.7609143  -0.5639324 ]
tensor(0.7330, grad_fn=<NegBackward0>)


## unsqueeze & tile

x升维

In [29]:
prob = torch.unsqueeze(ori_prob, dim=1)  # [N, vocab_size] -> [N, 1, vocab_size]
prob = torch.tile(prob, [1, y.shape[1], 1])  # [N, 1, vocab_size] -> [N, T, vocab_size]
loss2 = y_onehot * torch.log(prob) + (1 - y_onehot) * torch.log(1.0 - prob)  # [N,T,vocab_size]
loss2 = torch.mean(loss2, dim=1)  # [N,T,vocab_size] -> [N, vocab_size]
print(loss2[0].detach().numpy()[:20])
loss2 = -torch.mean(loss2)
print(loss2)

[-0.67824835 -0.91499376 -0.7687175  -0.6496866  -0.7632118  -0.8110118
 -0.7610896  -0.9129652  -0.4830457  -0.46847996 -0.68798566 -0.50904596
 -0.6055947  -0.72263414 -0.81640095 -1.5237985  -0.5609002  -0.48854607
 -0.7609143  -0.5639324 ]
tensor(0.7330, grad_fn=<NegBackward0>)
