In [1]:
import random
import torch
from torch import nn

In [2]:
# 构建语料库，包含中文，解码器输入英文，翻译后的目标输出英文
corpus = [
    ['咖哥 喜欢 小冰', '<sos> KaGe likes XiaoBing', 'KaGe likes XiaoBing <eos>'],
    ['我 爱 学习 人工智能', '<sos> I love studying AI', 'I love studying AI <eos>'],
    ['深度学习 改变 世界', '<sos> DL changed the world', 'DL changed the world <eos>'],
    ['自然 语言 处理 很 强大', '<sos> NLP is so powerful', 'NLP is so powerful <eos>'],
    ['神将网络 非常 复杂', '<sos> Neural-Nets are complex', 'Neural-Nets are complex <eos>']
]
print('语料库句子数量：', len(corpus))

语料库句子数量： 5


In [3]:
# 构建中英词汇表
word_vocabulary_cn = []
word_vocabulary_en = []

for sentence in corpus:
    for token in sentence[0].split():
        if token not in word_vocabulary_cn:
            word_vocabulary_cn.append(token)
    
    for token in sentence[1].split():
        if token not in word_vocabulary_en:
            word_vocabulary_en.append(token)

    for token in sentence[2].split():
        if token not in word_vocabulary_en:
            word_vocabulary_en.append(token)

len_word_vocabulary_cn = len(word_vocabulary_cn)
len_word_vocabulary_en = len(word_vocabulary_en)
print('中文词汇表词汇数量：', len_word_vocabulary_cn)
print('英文词汇表词汇数量：', len_word_vocabulary_cn)

中文词汇表词汇数量： 18
英文词汇表词汇数量： 18


In [4]:
# 构建中英字典词表
word_vocabulary_cn_to_index_vocabulary = {token : index for index, token in enumerate(word_vocabulary_cn)}
word_vocabulary_en_to_index_vocabulary = {token : index for index, token in enumerate(word_vocabulary_en)}
index_vocabulary_to_word_vocabulary_cn = {index : token for index, token in enumerate(word_vocabulary_cn)}
index_vocabulary_to_word_vocabulary_en = {index : token for index, token in enumerate(word_vocabulary_en)}
print('中文词汇索引表', word_vocabulary_cn_to_index_vocabulary)
print('英文词汇索引表', word_vocabulary_en_to_index_vocabulary)
print('索引中文词汇表', index_vocabulary_to_word_vocabulary_cn)
print('索引英文词汇表', index_vocabulary_to_word_vocabulary_en)

中文词汇索引表 {'咖哥': 0, '喜欢': 1, '小冰': 2, '我': 3, '爱': 4, '学习': 5, '人工智能': 6, '深度学习': 7, '改变': 8, '世界': 9, '自然': 10, '语言': 11, '处理': 12, '很': 13, '强大': 14, '神将网络': 15, '非常': 16, '复杂': 17}
英文词汇索引表 {'<sos>': 0, 'KaGe': 1, 'likes': 2, 'XiaoBing': 3, '<eos>': 4, 'I': 5, 'love': 6, 'studying': 7, 'AI': 8, 'DL': 9, 'changed': 10, 'the': 11, 'world': 12, 'NLP': 13, 'is': 14, 'so': 15, 'powerful': 16, 'Neural-Nets': 17, 'are': 18, 'complex': 19}
索引中文词汇表 {0: '咖哥', 1: '喜欢', 2: '小冰', 3: '我', 4: '爱', 5: '学习', 6: '人工智能', 7: '深度学习', 8: '改变', 9: '世界', 10: '自然', 11: '语言', 12: '处理', 13: '很', 14: '强大', 15: '神将网络', 16: '非常', 17: '复杂'}
索引英文词汇表 {0: '<sos>', 1: 'KaGe', 2: 'likes', 3: 'XiaoBing', 4: '<eos>', 5: 'I', 6: 'love', 7: 'studying', 8: 'AI', 9: 'DL', 10: 'changed', 11: 'the', 12: 'world', 13: 'NLP', 14: 'is', 15: 'so', 16: 'powerful', 17: 'Neural-Nets', 18: 'are', 19: 'complex'}


In [5]:
# 生成训练数据
def make_train_data(corpus):
    sentence = random.choice(corpus)
    encoder_input = torch.tensor([[word_vocabulary_cn_to_index_vocabulary[token] for token in sentence[0].split()]])
    decoder_input = torch.tensor([[word_vocabulary_en_to_index_vocabulary[token] for token in sentence[1].split()]])
    target = torch.tensor([[word_vocabulary_en_to_index_vocabulary[token] for token in sentence[2].split()]])
    return encoder_input, decoder_input, target

# 验证
encoder_input, decoder_input, target = make_train_data(corpus)
print('编码器输入:', encoder_input)
print('编码器输入形状：', encoder_input.size())
print('解码器输入:', encoder_input)
print('解码器输入形状：', decoder_input.size())
print('目标:', encoder_input)
print('目标形状：', target.size())
for sentence in corpus:
    if all(word_vocabulary_cn_to_index_vocabulary[token] in encoder_input for token in sentence[0].split()):
    # all(iterable)判断可迭代对象所有元素是否满足某个条件，如果所有元素都为True或等价于True的值，则返回True，否则返回False
    # 如果可迭代对象为空，则返回True，没有元素违反条件
    # 短路操作，一旦遇到第一个False，就会立即返回False
        original_sentence = sentence
        break
print('原始句子：', original_sentence)

编码器输入: tensor([[7, 8, 9]])
编码器输入形状： torch.Size([1, 3])
解码器输入: tensor([[7, 8, 9]])
解码器输入形状： torch.Size([1, 5])
目标: tensor([[7, 8, 9]])
目标形状： torch.Size([1, 5])
原始句子： ['深度学习 改变 世界', '<sos> DL changed the world', 'DL changed the world <eos>']


In [6]:
# 构建编码器和解码器架构
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)

    def forward(self, inputs, hidden):
        output, hidden = self.rnn(self.embedding(inputs), hidden)
        return output, hidden
    
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, inputs, hidden):
        output, hidden = self.rnn(self.embedding(inputs), hidden)
        output = self.fc(output)
        return output, hidden

In [7]:
hidden_size = 128
encoder = Encoder(len_word_vocabulary_cn, hidden_size)
decoder = Decoder(len_word_vocabulary_en, hidden_size)
print('编码器结构：\n', encoder)
print('解码器结构：\n', decoder)

编码器结构：
 Encoder(
  (embedding): Embedding(18, 128)
  (rnn): RNN(128, 128, batch_first=True)
)
解码器结构：
 Decoder(
  (embedding): Embedding(20, 128)
  (rnn): RNN(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=20, bias=True)
)


In [8]:
# 构建编码器-解码器架构
class Seq2Seq(nn.Module):
    def __init__(self, Encoder, Decoder):
        super().__init__()
        self.Encoder = Encoder
        self.Decoder = Decoder

    def forward(self, encoder_input, hidden, decoder_input):
        _, encoder_hidden = self.Encoder(encoder_input, hidden)
        decoder_hidden = encoder_hidden
        decoder_output, _ = self.Decoder(decoder_input, decoder_hidden)
        return decoder_output

In [9]:
model = Seq2Seq(encoder, decoder)
print('编码器-解码器架构：\n', model)

编码器-解码器架构：
 Seq2Seq(
  (Encoder): Encoder(
    (embedding): Embedding(18, 128)
    (rnn): RNN(128, 128, batch_first=True)
  )
  (Decoder): Decoder(
    (embedding): Embedding(20, 128)
    (rnn): RNN(128, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=20, bias=True)
  )
)


In [10]:
# 模型训练
def train(model, criterion, optimizer, epochs):
    for epoch in range(epochs):
        encoder_input, decoder_input, target = make_train_data(corpus)
        hidden = torch.zeros(1, encoder_input.size(0), hidden_size)
        optimizer.zero_grad()
        output = model(encoder_input, hidden, decoder_input)
        loss = criterion(output.view(-1, len_word_vocabulary_en), target.view(-1))
        if (epoch + 1) % 40 == 0:
            print(f'Epoch：{epoch + 1} cost = {loss:.6f}')
        loss.backward()
        optimizer.step()

In [11]:
epochs = 400
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train(model, criterion, optimizer, epochs)

Epoch：40 cost = 0.241193
Epoch：80 cost = 0.064824
Epoch：120 cost = 0.032962
Epoch：160 cost = 0.035060
Epoch：200 cost = 0.019002
Epoch：240 cost = 0.015831
Epoch：280 cost = 0.011321
Epoch：320 cost = 0.009127
Epoch：360 cost = 0.008043
Epoch：400 cost = 0.005759


In [12]:
def text(model, text_sentence):
    encoder_input = torch.tensor([[word_vocabulary_cn_to_index_vocabulary[token] for token in text_sentence.split()]])
    decoder_input = torch.tensor([word_vocabulary_en_to_index_vocabulary['<sos>']] +
                                  [word_vocabulary_en_to_index_vocabulary['<eos>']] *
                                  (len(encoder_input[0] - 1)))
    decoder_input = decoder_input.unsqueeze(0)
    hidden = torch.zeros(1, encoder_input.size(0), hidden_size)
    predict = model(encoder_input, hidden, decoder_input)
    predict = predict.max(2, keepdim=True)[1]
    print(text_sentence, '--->', [index_vocabulary_to_word_vocabulary_en[n.item()] for n in predict.squeeze()])

In [13]:
text(model, '咖哥 喜欢 小冰')

咖哥 喜欢 小冰 ---> ['KaGe', 'likes', 'XiaoBing', '<eos>']


In [14]:
text(model, '自然 语言 处理 很 强大')

自然 语言 处理 很 强大 ---> ['NLP', 'is', 'so', '<eos>', '<eos>', '<eos>']
