In [65]:
import torch
import numpy as np
import math

In [66]:
class Embedding(torch.nn.Module):
    def __init__(self, vocab_size, embed_size, dropout=0.1):
        super(Embedding,self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.dropout = torch.nn.Dropout(p=dropout)
        self.lut = torch.nn.Embedding(vocab_size, embed_size)

    def forward(self,x):
        x = self.lut(x)
        x = self.dropout(x)
        return x * math.sqrt(self.embed_size)
embedding = Embedding(16,512)
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
res = embedding(input)

In [67]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        # 先创建一个全零矩阵
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        # 用sin cos来填充原来的pos矩阵
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        # 这样我们就得到了位置编码矩阵pe, pe现在还只是一个二维矩阵，要想和embedding的输出（一个三维张量）相加，
        # 就必须拓展一个维度，所以这里使用unsqueeze拓展维度.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)


    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
res1 = PositionalEncoding(512, 0.2)
res2 = res1(res)

In [68]:
def subsequent_mask(size):
    """生成向后遮掩的掩码张量, 参数size是掩码张量最后两个维度的大小, 它的最后两维形成一个方阵"""
    # 在函数中, 首先定义掩码张量的形状
    attn_shape = (1, size, size)

    # 然后使用np.ones方法向这个形状中添加1元素,形成上三角阵, 最后为了节约空间,
    # 再使其中的数据类型变为无符号8位整形unit8
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')

    # 最后将numpy类型转化为torch中的tensor, 内部做一个1 - 的操作,
    # 在这个其实是做了一个三角阵的反转, subsequent_mask中的每个元素都会被1减,
    # 如果是0, subsequent_mask中的该位置由0变成1
    # 如果是1, subsequent_mask中的该位置由1变成0
    return torch.from_numpy(1 - subsequent_mask)
mask = subsequent_mask(5)

In [73]:
def attention(query, key, value, mask=None, dropout=None):

    dk = query.size(1)

    scores = torch.matmul(query,key.transpose(-2, -1)) / math.sqrt(dk)

    if mask is not None:
        #这里为了方便保证后面的softmax等于0 需要将掩码的位置设置为一个-无穷
        scores = scores.masked_fill(mask == 0, -1e9)
    att = torch.nn.functional.softmax(scores, dim=-1)

    if dropout is not None:
        att = dropout(att)

    print(att.shape,value.shape)
    return torch.matmul(att, value)
query = key = value = res2

mask = torch.zeros(2, 4, 4)
attn = attention(query, key, value,mask=mask)
print("attn:", attn)

torch.Size([2, 4, 4]) torch.Size([2, 4, 512])
attn: tensor([[[  1.6945,  -6.0764,  18.8077,  ...,   2.0801,  12.3604,  15.9042],
         [  1.6945,  -6.0764,  18.8077,  ...,   2.0801,  12.3604,  15.9042],
         [  1.6945,  -6.0764,  18.8077,  ...,   2.0801,  12.3604,  15.9042],
         [  1.6945,  -6.0764,  18.8077,  ...,   2.0801,  12.3604,  15.9042]],

        [[-12.3425,  10.2197,  -2.4821,  ..., -13.9962,   0.9311,  26.9661],
         [-12.3425,  10.2197,  -2.4821,  ..., -13.9962,   0.9311,  26.9661],
         [-12.3425,  10.2197,  -2.4821,  ..., -13.9962,   0.9311,  26.9661],
         [-12.3425,  10.2197,  -2.4821,  ..., -13.9962,   0.9311,  26.9661]]],
       grad_fn=<UnsafeViewBackward0>)


RuntimeError: The size of tensor a (5) must match the size of tensor b (4) at non-singleton dimension 2