In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
batch_size = 2
num_src_words = 8
num_tgt_words = 8
model_dim = 8

# Step1: word embedding
src_len = torch.tensor([2, 4])
tgt_len = torch.tensor([4, 3])

src_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(2, num_src_words, (L, )), (0, max(src_len) - L)), 0 )for L in src_len])
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(2, num_tgt_words, (L, )), (0, max(tgt_len) - L)), 0 )for L in tgt_len])

src_embedding_table = nn.Embedding(num_src_words + 1, model_dim)
tgt_embedding_table = nn.Embedding(num_tgt_words + 1, model_dim)

src_embedding = src_embedding_table(src_seq)
tgt_embedding = src_embedding_table(tgt_seq)

src_embedding, tgt_embedding

(tensor([[[ 0.7942, -0.2543,  0.1098, -0.2321, -1.0910, -0.2508, -0.8620,
            1.6607],
          [ 1.4593,  0.6327,  0.1173, -0.5725, -0.4923, -1.3177,  0.9695,
            0.2554],
          [ 0.0973, -0.1601,  1.3130,  0.7643, -0.5781, -0.1826, -0.7810,
            1.6035],
          [ 0.0973, -0.1601,  1.3130,  0.7643, -0.5781, -0.1826, -0.7810,
            1.6035]],
 
         [[ 0.9817, -0.2296, -0.2355, -0.9257,  0.9638,  1.6550, -0.6574,
           -0.3325],
          [ 0.7942, -0.2543,  0.1098, -0.2321, -1.0910, -0.2508, -0.8620,
            1.6607],
          [ 0.6115,  1.4670,  0.4795,  0.5715, -0.4108, -0.1104, -0.5819,
           -0.7797],
          [ 1.4593,  0.6327,  0.1173, -0.5725, -0.4923, -1.3177,  0.9695,
            0.2554]]], grad_fn=<EmbeddingBackward0>),
 tensor([[[ 0.6115,  1.4670,  0.4795,  0.5715, -0.4108, -0.1104, -0.5819,
           -0.7797],
          [ 0.7942, -0.2543,  0.1098, -0.2321, -1.0910, -0.2508, -0.8620,
            1.6607],
          [ 1.

In [3]:
# Step2:position embedding
max_position_len = 5
pos_mat = torch.arange(max_position_len).reshape((-1, 1))
i_mat = torch.arange(0, model_dim, 2).reshape((1, -1)) / model_dim
i_mat = torch.pow(10000, i_mat)
pe_embedding_table = torch.zeros(max_position_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)

pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)),0) for L in src_len])
src_pe_embedding = pe_embedding(src_pos)
src_pe_embedding

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00]],

        [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00]

In [27]:
# Step3:mask encoder attention
# valid_encoder_pos / valid_encoder_pos_matrix / 
valid_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len) - L)), 0) for L in src_len]), 2)
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)

score = torch.randn( batch_size,max(src_len), max(src_len))
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, -1)
prob

tensor([[[0.0878, 0.9122, 0.0000, 0.0000],
         [0.1845, 0.8155, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500],
         [0.2500, 0.2500, 0.2500, 0.2500]],

        [[0.0332, 0.1706, 0.5612, 0.2349],
         [0.3350, 0.2386, 0.3562, 0.0702],
         [0.4325, 0.3008, 0.1594, 0.1074],
         [0.3641, 0.0343, 0.1105, 0.4911]]])

In [33]:
# Step4:构造intra-attention的mask
# Q @ K^T  [batch_size, tgt_len, src_len]
valid_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len) - L)), 0) for L in src_len]), 2)
valid_decoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(tgt_len) - L)), 0) for L in tgt_len]), 2)
valid_cross_pos_matrix = torch.bmm(valid_decoder_pos, valid_encoder_pos.transpose(1, 2))
invalid_cross_pos_matrix = 1 - valid_cross_pos_matrix
mask_cross_attention = invalid_cross_pos_matrix.to(torch.bool)

In [57]:
# 构造decoder self-attention的mask
# Q @ K^T
valid_decoder_tri_matrix = torch.cat([torch.unsqueeze(F.pad(torch.tril(torch.ones(L, L)), (0, max(tgt_len) - L, 0, max(tgt_len) - L)), 0) for L in tgt_len])
invalid_decoder_tri_matrix = 1 - valid_decoder_tri_matrix
invalid_decoder_tri_matrix = invalid_decoder_tri_matrix.to(torch.bool)
score = torch.randn(batch_size, max(tgt_len), max(tgt_len))
masked_score = score.masked_fill(invalid_decoder_tri_matrix, -1e9)
prob = F.softmax(masked_score, -1)
prob

tensor([[[1.0000, 0.0000, 0.0000, 0.0000],
         [0.9043, 0.0957, 0.0000, 0.0000],
         [0.3807, 0.1550, 0.4643, 0.0000],
         [0.1101, 0.4788, 0.1303, 0.2808]],

        [[1.0000, 0.0000, 0.0000, 0.0000],
         [0.3866, 0.6134, 0.0000, 0.0000],
         [0.3449, 0.5049, 0.1502, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500]]])

In [None]:
def scaled_dot_product(Q, K, V, atten_mask):
    score = torch.bmm(Q, K.transpose(-2, -1)) / torch.sqrt(model_dim)
    masked_score = score.masked_fill(atten_mask, -1e9)
    prob = F.softmax(masked_score, -1)
    context = torch.bmm(prob, V)
    return context