In [21]:
import torch
import torch.nn as nn
import math

from numpy.lib.utils import source
from torch.autograd import Variable
import copy
import torch.nn.functional as F
import numpy as np

In [22]:
class Embedding(nn.Module):
    """Token-embedding lookup followed by dropout.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of each embedding vector.
        dropout: Dropout probability applied to the looked-up vectors.
    """

    def __init__(self, vocab_size, embed_size, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return the embedding vectors for the indices in ``x`` after dropout."""
        looked_up = self.embedding(x)
        return self.dropout(looked_up)

In [23]:
# Demo: embed a toy batch of token indices and display the resulting shape.
vocab_size = 16
embed_size = 512
# Three rows of four token indices each.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the kernel session.
input = torch.LongTensor([[1,3,4,5],[2,4,5,6],[3,5,6,7]])
embedding = Embedding(vocab_size,embed_size)
embedding_res = embedding(input)
embedding_res.shape

torch.Size([3, 4, 512])

In [24]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once: even
    feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with ``freq = exp(-2i * ln(10000) / d_model)``.

    Args:
        d_model: Feature width of the inputs (may be even or odd).
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) *
                             -(math.log(10000.0) / d_model))
        angles = positions * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the angles to the number of cosine slots available.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Register as a buffer so the table follows .to()/.cuda()/.double()
        # with the module. persistent=False keeps it out of state_dict, so
        # checkpoints keep the same keys as when `pe` was a plain attribute.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions, then dropout.

        The table is sliced along its position axis to ``x.size(1)`` and
        broadcast-added to ``x``. Buffers never require grad, so no explicit
        wrapper is needed to keep the table out of autograd.
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [25]:
# Demo: add positional encodings to the embedded batch from the earlier cell.
d_model = 512
positionalEncoding = PositionalEncoding(d_model)
pe_res = positionalEncoding(embedding_res)
# Self-attention setup: query, key and value are all the same tensor object.
query = key = value = pe_res

In [26]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Computes ``softmax(query @ key^T / sqrt(d_k)) @ value`` over the last two
    axes, where ``d_k`` is the size of the last (feature) axis of ``query``.

    Args:
        query: Tensor whose last two axes are ``(q_len, d_k)``.
        key: Tensor whose last two axes are ``(k_len, d_k)``.
        value: Tensor whose last two axes are ``(k_len, d_v)``.
        mask: Optional tensor broadcastable to the score tensor; positions
            where ``mask == 0`` are excluded from the softmax.
        dropout: Optional callable applied to the attention weights.

    Returns:
        Tensor whose last two axes are ``(q_len, d_v)``.
    """
    # The scale must use the feature width (last axis). Axis 1 is the head
    # count for 4-D inputs or the sequence length for 3-D inputs.
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)

    if mask is not None:
        # -1e9 is kept for float32/float64; for narrower dtypes (e.g. float16)
        # fall back to the most negative representable value to avoid overflow.
        fill_value = max(-1e9, torch.finfo(scores.dtype).min)
        scores = scores.masked_fill(mask == 0, fill_value)

    att = F.softmax(scores, dim=-1)

    if dropout is not None:
        att = dropout(att)

    return torch.matmul(att, value)

def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = []
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return nn.ModuleList(replicas)
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on the module-level ``attention`` function.

    Four ``nn.Linear(embed_size, embed_size)`` layers are created: the first
    three project query, key and value, the last projects the concatenated
    heads back to ``embed_size``.

    Args:
        head: Number of attention heads; must divide ``embed_size``.
        embed_size: Feature width of the inputs and outputs.
        dropout: Probability for the dropout applied to attention weights.
    """

    def __init__(self, head, embed_size, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        # Validate this layer's own width; relying on a notebook-global
        # `d_model` would check the wrong value (or raise NameError).
        assert embed_size % head == 0, "embed_size must be divisible by head"

        self.head = head
        self.embed_size = embed_size
        self.dropout = nn.Dropout(p=dropout)
        self.d_k = embed_size // head

        self.linear = clones(nn.Linear(embed_size, embed_size), 4)

    def forward(self, query, key, value, mask=None, dropout=None):
        """Run multi-head attention.

        Args:
            query, key, value: Tensors whose first axis is the batch and whose
                last axis has ``embed_size`` features.
            mask: Optional tensor; positions equal to 0 are hidden. A mask
                with fewer than four dimensions gets a leading axis added and
                is therefore shared across the batch, e.g.
                ``(head, q_len, k_len)`` or ``(q_len, k_len)``. A 4-D mask is
                used as given and must broadcast against
                ``(batch, head, q_len, k_len)``.
            dropout: Optional callable applied to the attention weights.
                When omitted, this layer's own ``nn.Dropout`` is used.

        Returns:
            Tensor of shape ``(batch, q_len, head * d_k)`` after the output
            projection.
        """
        batch_size = query.size(0)
        if mask is not None and mask.dim() < 4:
            mask = mask.unsqueeze(0)
        if dropout is None:
            # The configured dropout was previously constructed but never
            # applied unless the caller passed one in explicitly.
            dropout = self.dropout

        # Project, then split the feature axis into (head, d_k) and move the
        # head axis ahead of the sequence axis.
        query, key, value = [
            proj(tensor).view(batch_size, -1, self.head, self.d_k).transpose(1, 2)
            for proj, tensor in zip(self.linear, (query, key, value))
        ]

        x = attention(query, key, value, mask=mask, dropout=dropout)
        # Merge the heads back into a single feature axis.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.head * self.d_k)

        return self.linear[-1](x)
# Demo: run multi-head self-attention on the position-encoded batch.
head = 8
embed_size = 512
multiHeadAttention = MultiHeadAttention(head, embed_size, dropout=0.1)
# attention() hides every position where mask == 0, so an all-zeros mask hid
# all keys and reduced attention to a uniform average. Ones keep every
# position visible. Shape is (head, q_len, k_len).
mask = torch.ones([head, query.size(1), key.size(1)])
x = multiHeadAttention(query, key, value, mask)

In [27]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.w_2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, then project back."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [28]:
# Demo: pass the multi-head attention output through the feed-forward block
# (512 features in and out, hidden width 512*4).
positionwiseFeedForward = PositionwiseFeedForward(512,512*4,dropout=0.1)
# NOTE(review): `x` is both the input and the output here, so re-running this
# cell feeds the previous result back through the layer.
x = positionwiseFeedForward(x)

In [29]:
class SublayerConnection(nn.Module):
    """Residual wrapper: ``x + dropout(sublayer(layer_norm(x)))``.

    Args:
        size: Feature width passed to ``nn.LayerNorm``.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Normalise ``x``, run ``sublayer`` on it, and add the result back to ``x``."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [30]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each residual-wrapped.

    Args:
        size: Feature width handed to each ``SublayerConnection``.
        self_attn: Module called as ``self_attn(x, x, x, mask)``.
        feed_forward: Module called as ``feed_forward(x)``.
        dropout: Dropout probability for the two ``SublayerConnection`` wrappers.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply the attention sublayer, then the feed-forward sublayer."""
        attn_connection, ff_connection = self.sublayer

        def self_attend(normed):
            # Query, key and value are all the same normalised input.
            return self.self_attn(normed, normed, normed, mask)

        # Residual connection around self-attention.
        x = attn_connection(x, self_attend)
        # Residual connection around the feed-forward layer.
        return ff_connection(x, self.feed_forward)

In [31]:
class Encoder(nn.Module):
    """Stack of ``N`` deep-copied layers followed by a final ``nn.LayerNorm``.

    Args:
        layer: Prototype layer; must expose a ``size`` attribute and be
            callable as ``layer(x, mask)``.
        N: Number of copies to stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = nn.LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` through every layer in order, then normalise the result."""
        hidden = x
        # Each of the N layers consumes the previous layer's output.
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [36]:
class TransformerEncoder(nn.Module):
    """Embedding + positional encoding + a stack of ``N`` encoder layers.

    Args:
        vocab_size: Size of the token vocabulary for the embedding table.
        embed_size: Width of the token embeddings; must equal ``d_model``.
        d_model: Feature width used by the positional encoding and encoder.
        d_ff: Hidden width of each position-wise feed-forward block.
        head: Number of attention heads.
        N: Number of encoder layers.
        dropout: Dropout probability shared by every sub-module.

    Raises:
        ValueError: If ``embed_size`` and ``d_model`` differ.

    Note:
        Dropout is applied inside ``Embedding`` and again inside
        ``PositionalEncoding``, so the encoder input passes through dropout
        twice.
    """

    def __init__(self, vocab_size, embed_size, d_model, d_ff, head, N, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        # The embedding output is added to a d_model-wide positional table and
        # fed to d_model-wide layers, so the two widths must agree. Fail here
        # with a clear message instead of a shape error inside forward().
        if embed_size != d_model:
            raise ValueError(
                "embed_size (%d) must equal d_model (%d)" % (embed_size, d_model)
            )
        self.embedding = Embedding(vocab_size, embed_size, dropout)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.encoder = Encoder(
            EncoderLayer(
                size=d_model,
                self_attn=MultiHeadAttention(head, d_model, dropout),
                feed_forward=PositionwiseFeedForward(d_model, d_ff, dropout),
                dropout=dropout
            ),
            N
        )

    def forward(self, src, mask):
        """Encode token indices ``src`` using attention mask ``mask``.

        ``mask`` is passed unchanged to every encoder layer.
        """
        x = self.embedding(src)
        x = self.positional_encoding(x)
        x = self.encoder(x, mask)
        return x

# Hyperparameters for the end-to-end encoder demo.
vocab_size = 16
embed_size = 512
d_model = 512
d_ff = 2048
head = 8
N = 6  # number of encoder layers
def make_mask(src, pad_token=0):
    """Return a boolean mask that is True wherever ``src`` differs from ``pad_token``.

    A singleton axis is inserted before the last axis, so a ``(batch, seq)``
    input yields a ``(batch, 1, seq)`` mask.
    """
    not_padding = src.ne(pad_token)
    return torch.unsqueeze(not_padding, dim=-2)
# Demo: build the full encoder and run a toy batch through it.
model = TransformerEncoder(vocab_size, embed_size, d_model, d_ff, head, N)

src = torch.LongTensor([[1, 3, 4, 5], [2, 4, 5, 6], [3, 5, 6, 7]])
# attention() hides every position where mask == 0, so an all-zeros mask hid
# all keys and reduced each layer's attention to a uniform average. Ones keep
# every position visible. Shape is (head, seq_len, seq_len).
mask = torch.ones([head, src.size(1), src.size(1)])
output = model(src, mask)
output.size(),output.shape

(torch.Size([3, 4, 512]), torch.Size([3, 4, 512]))