In [14]:
import torch
import numpy
import torch.nn as nn
import torch.nn.functional as F

# 关于word embedding，以序列建模为例
# 考虑source sentence和target sentence
# 构建序列，序列的字符以其在词表中的索引的形式表示
batch_size = 2

# 单词表大小
max_num_src_words = 8
max_num_tgt_words = 8
model_dim = 8

# 序列的最大长度
max_src_seq_len = 5
max_tgt_seq_len = 5
max_postion_len = 5

src_len = torch.Tensor([2, 4]).to(torch.int32)
tgt_len = torch.Tensor([4, 3]).to(torch.int32)

# 单词索引构成源句子和目标句子，并且做了padding，默认值为0
src_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_src_words, (L,)), (0, max_src_seq_len-L)), 0) \
                    for L in src_len])
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_tgt_words, (L,)), (0, max_tgt_seq_len-L)), 0) \
                    for L in tgt_len])

# 构造embedding
src_embedding_table = nn.Embedding(max_num_src_words+1, model_dim) # max_num_src_words+1是因为进行了padding
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1, model_dim)
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

# 构造position embedding
pos_mat = torch.arange(max_postion_len).reshape(-1, 1)
i_mat = torch.pow(10000, torch.arange(0, 8, 2).reshape(1, -1)/model_dim)
pe_embedding_table = torch.zeros(max_postion_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat) # 偶数
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat) # 奇数
# print(pe_embedding_table)

pe_embedding = nn.Embedding(max_postion_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

# 应该传入的是源句子或目标句子中单词的位置信息，而不是源句子或目标句子中的单词索引
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)), 0) for _ in src_len]).to(torch.int32)
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)), 0) for _ in tgt_len]).to(torch.int32)

src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)

# 构造encoder的self-attention mask
# mask的shape：[batch_size, max_src_len, max_src_len]，值为1或-1e9
valid_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len)-L)), 0) \
                                               for L in src_len]), 2)
valid_encoder_pos_matric = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matric = 1 - valid_encoder_pos_matric
mask_encoder_self_attention = invalid_encoder_pos_matric.to(torch.bool)

score = torch.randn(batch_size, max(src_len), max(src_len))
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, -1)

print(mask_encoder_self_attention)
print(score)
print(masked_score)
print(prob)

tensor([[[False, False,  True,  True],
         [False, False,  True,  True],
         [ True,  True,  True,  True],
         [ True,  True,  True,  True]],

        [[False, False, False, False],
         [False, False, False, False],
         [False, False, False, False],
         [False, False, False, False]]])
tensor([[[ 0.0909, -2.3515, -0.5661,  0.0241],
         [ 0.5446, -0.6315, -1.2620,  0.4159],
         [ 1.6499,  0.7056, -0.8306, -0.2571],
         [ 0.3915, -0.3187, -0.2924, -2.7262]],

        [[ 0.3916, -1.0063,  0.7366,  0.0394],
         [-0.1752,  0.7153, -0.7634,  0.7525],
         [-0.5505,  0.6407, -0.3635,  0.9172],
         [ 0.2768,  0.2097,  1.0610, -1.1848]]])
tensor([[[ 9.0882e-02, -2.3515e+00, -1.0000e+09, -1.0000e+09],
         [ 5.4460e-01, -6.3148e-01, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],

        [[ 3.9160e-01, -1.0063e+00,  7.3659e-01,  

In [3]:
# softmax演示
alpha1 = 0.1
alpha2 = 10
score = torch.randn(5)
pro1 = F.softmax(score*alpha1, -1)
pro2 = F.softmax(score*alpha2, -1)
def softmax_func(score):
    return F.softmax(score)
jaco_mat1 = torch.autograd.functional.jacobian(softmax_func, score*alpha1)
jaco_mat2 = torch.autograd.functional.jacobian(softmax_func, score*alpha2)


print(jaco_mat1)
print(jaco_mat2)
print(score*alpha1)
print(score*alpha2)
print(pro1)
print(pro2)

tensor([0.0391, 0.0880, 0.1694, 0.0239, 0.1872])
tensor([ 3.9077,  8.7990, 16.9395,  2.3876, 18.7243])
tensor([0.1875, 0.1969, 0.2136, 0.1847, 0.2174])
tensor([3.1463e-07, 4.1889e-05, 1.4371e-01, 6.8807e-08, 8.5625e-01])


In [3]:
print(src_seq)
print(src_embedding_table.weight)
print(src_embedding)

tensor([[4, 2, 0, 0, 0],
        [7, 6, 2, 6, 0]])
Parameter containing:
tensor([[ 0.3002, -1.0248,  1.4143, -0.8677,  0.9682, -1.7056,  0.6163,  0.1819],
        [-1.0613,  0.2050,  0.5995,  0.6321, -0.4734, -0.4036, -0.6138, -0.9511],
        [-1.3335,  1.1304,  0.1107, -0.5590,  0.7528, -0.8151,  1.0783,  0.5061],
        [ 1.2633, -0.2112, -0.2402,  0.3951, -1.7609,  1.0837, -2.3911,  0.8222],
        [ 0.3705, -0.8774,  2.6517,  1.4675, -1.9497,  0.2747,  0.2293,  1.1696],
        [ 0.8290,  0.1008, -0.4555,  1.2346, -0.1740, -1.0024, -0.8419, -1.1522],
        [-0.0954, -0.4630,  1.3113,  0.0877,  1.1034,  1.2267, -1.0285, -0.2257],
        [ 1.4446,  0.8707,  1.6861,  0.1696, -0.8664, -1.1610,  0.4321, -0.0873],
        [-0.5472,  0.3052,  0.9246, -0.9573, -0.9849,  1.2678, -0.0442,  0.9531]],
       requires_grad=True)
tensor([[[ 0.3705, -0.8774,  2.6517,  1.4675, -1.9497,  0.2747,  0.2293,
           1.1696],
         [-1.3335,  1.1304,  0.1107, -0.5590,  0.7528, -0.8151,  1.0