In [3]:
import math
import torch
import numpy as np

### 1. 数据预处理

In [4]:
# Special tokens used by the toy corpus:
#   S - start-of-sequence marker (first token of every decoder input)
#   E - end-of-sequence marker (last token of every decoder target)
#   P - padding placeholder that fills a sequence up to the longest length
sentence = [
    # [encoder input, decoder input, decoder target]
    ['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
    ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E'],
]

# Vocabularies. Index 0 is the padding token in both of them.

# Source-language vocabulary.
src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4, 'cola': 5}
src_vocab_size = len(src_vocab)  # 6

# Target-language vocabulary (includes the special tokens S, E and P).
tgt_vocab = {
    'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4,
    'coke': 5, 'S': 6, 'E': 7, '.': 8,
}
tgt_vocab_size = len(tgt_vocab)  # 9

# Reverse lookup for the target vocabulary, index -> word:
#   {0: 'P', 1: 'i', 2: 'want', 3: 'a', 4: 'beer',
#    5: 'coke', 6: 'S', 7: 'E', 8: '.'}
idx2word = dict((index, word) for word, index in tgt_vocab.items())

# Longest encoder input (enc_input), counted in tokens.
src_len = 5
# Longest decoder input / decoder target (dec_input / dec_output), counted in tokens.
tgt_len = 6


In [4]:
# Display the raw sentence triples: [encoder input, decoder input, decoder target].
sentence

[['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
 ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']]

In [5]:
# Display the source vocabulary (word -> index).
src_vocab

{'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4, 'cola': 5}

In [6]:
# Display the target vocabulary (word -> index), including the special tokens.
tgt_vocab

{'P': 0,
 'i': 1,
 'want': 2,
 'a': 3,
 'beer': 4,
 'coke': 5,
 'S': 6,
 'E': 7,
 '.': 8}

In [5]:
# Convert the raw sentence triples into token-index tensors.
def make_data(sentence):
    """Encode every sentence triple as vocabulary indices.

    Args:
        sentence: sequence of triples; element 0 is the encoder input,
            element 1 the decoder input and element 2 the decoder target,
            each a whitespace-separated string.

    Returns:
        Tuple ``(enc_inputs, dec_inputs, dec_outputs)`` of ``torch.LongTensor``,
        one row per triple. Encoder words are looked up in the module-level
        ``src_vocab``; decoder words in the module-level ``tgt_vocab``.

    Raises:
        KeyError: if a word is missing from the corresponding vocabulary.
    """
    def encode(text, vocab):
        # Split on whitespace and map every word to its index.
        return [vocab[token] for token in text.split()]

    enc_inputs, dec_inputs, dec_outputs = [], [], []
    for triple in sentence:
        enc_ids = encode(triple[0], src_vocab)
        dec_in_ids = encode(triple[1], tgt_vocab)
        dec_out_ids = encode(triple[2], tgt_vocab)

        enc_inputs.append(enc_ids)
        dec_inputs.append(dec_in_ids)
        dec_outputs.append(dec_out_ids)

    # LongTensor holds integer indices, which is what embedding lookups expect.
    return (
        torch.LongTensor(enc_inputs),
        torch.LongTensor(dec_inputs),
        torch.LongTensor(dec_outputs),
    )

# Tokenize the toy corpus once; the three tensors are reused by the Dataset below.
enc_inputs, dec_inputs, dec_outputs = make_data(sentence)

print(' enc_inputs: \n', enc_inputs)  # shape [2, 5]
print(' dec_inputs: \n', dec_inputs)  # shape [2, 6]
print(' dec_outputs: \n', dec_outputs) # shape [2, 6]

 enc_inputs: 
 tensor([[1, 2, 3, 4, 0],
        [1, 2, 3, 5, 0]])
 dec_inputs: 
 tensor([[6, 1, 2, 3, 4, 8],
        [6, 1, 2, 3, 5, 8]])
 dec_outputs: 
 tensor([[1, 2, 3, 4, 8, 7],
        [1, 2, 3, 5, 8, 7]])


In [6]:
# Wrap the tokenized tensors in a torch Dataset.
class MyDataSet(torch.utils.data.Dataset):
    """Dataset yielding aligned ``(enc_input, dec_input, dec_output)`` samples.

    The three containers are stored as given and indexed with the same
    ``idx``, so row ``i`` of each must belong to the same sentence.
    """

    def __init__(self, enc_inputs, dec_inputs, dec_outputs):
        super().__init__()
        self.enc_inputs, self.dec_inputs, self.dec_outputs = (
            enc_inputs,
            dec_inputs,
            dec_outputs,
        )

    def __len__(self):
        # Sample count is the first dimension of enc_inputs
        # (2 for the [2, 5] tensor built in this notebook).
        return self.enc_inputs.shape[0]

    def __getitem__(self, idx):
        # Index all three fields with the same idx to keep them aligned.
        fields = (self.enc_inputs, self.dec_inputs, self.dec_outputs)
        return tuple(field[idx] for field in fields)

# Serve the dataset through a DataLoader: batches of 2 samples, reshuffled each epoch.
loader = torch.utils.data.DataLoader(
    dataset=MyDataSet(enc_inputs, dec_inputs, dec_outputs),
    batch_size=2,
    shuffle=True,
)

### 2. 模型参数

In [9]:
# Length of the vector that represents one token.
d_model = 512

# Number of hidden units in the feed-forward network (FFN).
d_ff = 2048

# Number of Encoder layers and of Decoder layers.
n_layers = 6

# Number of attention heads.
# Paper: "we employ h = 8 parallel attention layers, or heads".
n_heads = 8

# Per-head length of the q, k and v vectors: 512 / 8 = 64.
# Paper: "queries and keys of dimension d_k, and values of dimension d_v",
# so q and k share the length d_k.
d_k = 64
d_v = 64


- Transformer包含Encoder和Decoder
- Encoder和Decoder各自包含6个Layer
- Encoder Layer中包含 Self Attention 和 FFN 两个Sub Layer
- Decoder Layer中包含 Masked Self Attention、 Cross Attention、 FFN 三个Sub Layer

### 3. Positional Encoding

> 用于为输入的词向量进行位置编码

原文：The positional encodings have the same dimension d_model as the embeddings, so that the two can be summed

In [None]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position information to token embeddings, then apply dropout.

    The table is precomputed once for ``max_len`` positions:

        PE[pos, 2i]     = sin(pos / 10000^(2i / d_model))
        PE[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))

    It is stored as the buffer ``pe`` with shape ``[1, max_len, d_model]`` so
    that it broadcasts over the batch dimension.

    Args:
        d_model: length of one token vector.
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions to precompute.
    """

    # The original spelled this `__init`, so it was never called as the
    # constructor and neither `self.dropout` nor `self.pe` was created.
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        # One row per position, one column per channel: [max_len, d_model].
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
        # Angle pos / 10000^(2i / d_model) for every even channel index 2i.
        angles = pos / (10000.0 ** (torch.arange(0, d_model, 2).float() / d_model))

        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus angle column; for an even d_model this slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis so the table broadcasts against a whole batch.
        pe = pe.unsqueeze(0)  # [max_len, d_model] -> [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])``.

        Args:
            x: tensor of shape ``[batch_size, seq_len, d_model]``.
        """
        # The table covers max_len positions; take only the first seq_len of them.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [17]:
# Small-scale walk-through of the positional-encoding table: 10 positions, 6 channels.
pe = torch.zeros(10, 6)

pos = torch.arange(10, dtype=torch.float).unsqueeze(1)  # pos: [10, 1]
div_term = pos / (10000.0 ** (torch.arange(0, 6, 2).float() / 6))  # div_term: [10, 3]
print(div_term)

# Only the even (sine) columns are filled; the cosine step below is disabled,
# so the odd columns stay zero in the printed table.
pe[:, 0::2] = div_term.sin()
# pe[:, 1::2] = torch.cos(div_term)
print(pe)

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0000e+00, 4.6416e-02, 2.1544e-03],
        [2.0000e+00, 9.2832e-02, 4.3089e-03],
        [3.0000e+00, 1.3925e-01, 6.4633e-03],
        [4.0000e+00, 1.8566e-01, 8.6177e-03],
        [5.0000e+00, 2.3208e-01, 1.0772e-02],
        [6.0000e+00, 2.7850e-01, 1.2927e-02],
        [7.0000e+00, 3.2491e-01, 1.5081e-02],
        [8.0000e+00, 3.7133e-01, 1.7235e-02],
        [9.0000e+00, 4.1774e-01, 1.9390e-02]])
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0000,  0.0464,  0.0000,  0.0022,  0.0000],
        [ 0.9093,  0.0000,  0.0927,  0.0000,  0.0043,  0.0000],
        [ 0.1411,  0.0000,  0.1388,  0.0000,  0.0065,  0.0000],
        [-0.7568,  0.0000,  0.1846,  0.0000,  0.0086,  0.0000],
        [-0.9589,  0.0000,  0.2300,  0.0000,  0.0108,  0.0000],
        [-0.2794,  0.0000,  0.2749,  0.0000,  0.0129,  0.0000],
        [ 0.6570,  0.0000,  0.3192,  0.0000,  0.0151,  0.0000],
        [ 0.9894,  0.0000, 

In [14]:
# Denominators 10000^(2i / 6) for the even channel indices 2i = 0, 2, 4 of a 6-channel encoding.
pow(10000.0, torch.arange(0, 6, 2).float() / 6)

tensor([  1.0000,  21.5443, 464.1590])