In [54]:
import torch
import numpy as np
import math

In [55]:
class Embedding(torch.nn.Module):
    def __init__(self, vocab_size, embed_size, dropout=0.1):
        super(Embedding,self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.dropout = torch.nn.Dropout(p=dropout)
        self.lut = torch.nn.Embedding(vocab_size, embed_size)

    def forward(self,x):
        x = self.lut(x)
        x = self.dropout(x)
        return x * math.sqrt(self.embed_size)
embedding = Embedding(16,512)
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
res = embedding(input)

In [56]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        # 先创建一个全零矩阵
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        # 用sin cos来填充原来的pos矩阵
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        # 这样我们就得到了位置编码矩阵pe, pe现在还只是一个二维矩阵，要想和embedding的输出（一个三维张量）相加，
        # 就必须拓展一个维度，所以这里使用unsqueeze拓展维度.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)


    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
res1 = PositionalEncoding(512, 0.2)
res2 = res1(res)

In [57]:
def subsequent_mask(size):
    """生成向后遮掩的掩码张量, 参数size是掩码张量最后两个维度的大小, 它的最后两维形成一个方阵"""
    # 在函数中, 首先定义掩码张量的形状
    attn_shape = (1, size, size)

    # 然后使用np.ones方法向这个形状中添加1元素,形成上三角阵, 最后为了节约空间,
    # 再使其中的数据类型变为无符号8位整形unit8
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')

    # 最后将numpy类型转化为torch中的tensor, 内部做一个1 - 的操作,
    # 在这个其实是做了一个三角阵的反转, subsequent_mask中的每个元素都会被1减,
    # 如果是0, subsequent_mask中的该位置由0变成1
    # 如果是1, subsequent_mask中的该位置由1变成0
    return torch.from_numpy(1 - subsequent_mask)
mask = subsequent_mask(5)

In [58]:
def attention(query, key, value, mask=None, dropout=None):

    dk = query.size(1)

    scores = torch.matmul(query,key.transpose(-2, -1)) / math.sqrt(dk)

    if mask is not None:
        #这里为了方便保证后面的softmax等于0 需要将掩码的位置设置为一个-无穷
        scores = scores.masked_fill(mask == 0, -1e9)

    att = torch.nn.functional.softmax(scores, dim=-1)

    if dropout is not None:
        att = dropout(att)

    print(att.shape,value.shape)

    return torch.matmul(att, value)

query = key = value = res2

mask = torch.zeros(2, 4, 4)
attn = attention(query, key, value,mask=mask)
print("attn:", attn)

torch.Size([2, 4, 4]) torch.Size([2, 4, 512])
attn: tensor([[[ -5.6566,   3.1390,  -3.0825,  ...,  -6.0775,  -8.8436,  -1.9522],
         [ -5.6566,   3.1390,  -3.0825,  ...,  -6.0775,  -8.8436,  -1.9522],
         [ -5.6566,   3.1390,  -3.0825,  ...,  -6.0775,  -8.8436,  -1.9522],
         [ -5.6566,   3.1390,  -3.0825,  ...,  -6.0775,  -8.8436,  -1.9522]],

        [[  5.9229, -20.5859,  -0.6315,  ...,   7.3740, -10.7739,  13.3334],
         [  5.9229, -20.5859,  -0.6315,  ...,   7.3740, -10.7739,  13.3334],
         [  5.9229, -20.5859,  -0.6315,  ...,   7.3740, -10.7739,  13.3334],
         [  5.9229, -20.5859,  -0.6315,  ...,   7.3740, -10.7739,  13.3334]]],
       grad_fn=<UnsafeViewBackward0>)


In [68]:
import copy

def clone_linear(linear,size):
    return torch.nn.ModuleList([copy.deepcopy(linear) for _ in range(size)])
class MutiHeadAttention(torch.nn.Module):
    def __init__(self,heads,embed_size,dropout=0.1):
        super(MutiHeadAttention, self).__init__()
        self.heads = heads
        self.embed_size = embed_size

        assert embed_size % heads == 0
        self.d_k = embed_size // heads

        self.linear = clone_linear(torch.nn.Linear(embed_size,embed_size),4)

        self.attn = None

        self.dropout = torch.nn.Dropout(p=dropout)
    def forward(self,query,key,value,mask=None):
        if mask is not None:
            mask = mask.unsqueeze(0)
        # 样本个数
        batch_size = query.size(0)
        # 切割多头，然后矩阵变换到，切割的次数
        # 这里只所有需要进行seq,self.heads的转，是因为，heads其实并不重要，seq可以放在后面，这样一seq和dk就可以在一个头中矩阵相乘
        # 论文中的解释就是效果好
        # 将Q K V 扔到model的线性层
        query,key,value = [model(x).view(batch_size,-1,self.heads,self.d_k).transpose(1,2)  for model,x in zip(self.linear,(query,key,value))]
        # 注意力机制
        x = attention(query,key,value,mask,self.dropout)


        # 逆向操作，回到三维张量
        x = x.transpose(1,2).contiguous().view(batch_size,-1,self.heads * self.d_k)


        # 这里再使用最后一个线性变换方法进行线性变换
        return  self.linear[-1](x)


In [71]:
head = 8
embedding = 512
query = key = value = res2
mutiHeadAttention = MutiHeadAttention(heads=head,embed_size=embedding,dropout=0.1)
mask = torch.zeros([8,4,4])
x = mutiHeadAttention(query,key,value,mask=mask)

torch.Size([2, 8, 4, 4]) torch.Size([2, 8, 4, 64])


torch.Size([2, 4, 512])

In [75]:
class PositionWiseFeedForward(torch.nn.Module):
    def __init__(self,d_model,d_ff,dropout=0.1):
        super(PositionWiseFeedForward, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = torch.nn.Linear(d_model,d_ff)
        self.fc2 = torch.nn.Linear(d_ff,d_model)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.relu = torch.nn.ReLU()

    def forward(self,x):
        x = self.relu(self.fc1(x))
        x = self.fc2(self.dropout(x))
        return x
positionWiseFeedForward = PositionWiseFeedForward(512,512*4,dropout=0.1)
wise_res = positionWiseFeedForward(x)

In [80]:
class LayerNorm(torch.nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super(LayerNorm,self).__init__()

        # 定义一个a b 参数并需要训练
        self.a_1 = torch.nn.Parameter(torch.ones(d_model))
        self.b_1 = torch.nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)

        return  self.a_1 * (x - mean) / (std + self.eps) + self.b_1

features = d_model = 512
eps = 1e-6
layerNorm = LayerNorm(features)
x = layerNorm(wise_res)

tensor([[[ 0.2281, -0.6313,  0.4876,  ...,  0.3830,  0.0751,  1.7192],
         [ 0.0059,  0.8614, -0.3627,  ..., -0.1831,  0.5181,  0.0025],
         [ 0.2073, -0.4178,  0.2634,  ...,  0.4646, -0.8108,  1.4696],
         [-0.3626,  0.3036,  0.5932,  ...,  0.5363, -0.1390,  0.3032]],

        [[-0.5965, -0.8341, -0.3698,  ...,  0.6870, -0.2825,  1.7752],
         [-0.5176, -0.6484,  0.1244,  ...,  0.6189,  0.4491,  1.5456],
         [-0.1680, -1.9283,  0.1998,  ...,  0.4091, -0.1784,  1.4479],
         [-0.5876, -0.5583, -0.2078,  ..., -0.2927, -0.5400,  1.8870]]],
       grad_fn=<AddBackward0>)