# seq2seq模型——机器翻译

高级改进：

beamsearch

## 环境依赖

In [1]:
import unicodedata
import string
import re
import random
import time
import math
import jieba
import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

## 数据预处理

In [2]:
USE_CUDA = torch.cuda.is_available()

In [3]:
print('USE_CUDA: %s' % USE_CUDA)

USE_CUDA: True


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
print('device: %s' % device)

device: cuda


In [6]:
SEGMENTATION = False    # 是否分词

### 文本预处理

丢弃除了中文、字母和常用标点之外的符号。

In [7]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z\u4e00-\u9fa5.!?，。？]+", r" ", s)
    return s

### 构建词表

引入三个特殊的Token:

1. `SOS`, "Start of sentence”，标识句子开始
2. `EOS`, “End of sentence”，表示句子结束
3. `UNK`, "Unknown Token"，标识未登录词

In [8]:
SOS_token = 0
EOS_token = 1
UNK_token = 2

class Lang(object):
    """
    词表Vocabulary.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", '2': 'UNK'}
        self.n_words = 3 # Count SOS and EOS
      
    def index_words(self, sentence):
        if self.name == 'cn':
            words = list(jieba.cut(sentence)) if SEGMENTATION else sentence    
            for word in words:
                self.index_word(word)
        else:
            words = sentence.split(' ')
            for word in words:
                self.index_word(word)

    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

读取平行语料，并进行清理。

In [9]:
def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s.txt' % (lang1, lang2)).read().strip().split('\n')
    
    # Split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
        
    return input_lang, output_lang, pairs

### 准备数据集

样例为了加快训练，只保留了不长于10个单词的句对，真正实验中将更多数据考虑进来可能获得更好的效果。

In [10]:
MAX_LENGTH = 10

def filter_pair(p):
    return len(p[1].split(' ')) < MAX_LENGTH

def filter_pairs(pairs):
    return [pair for pair in pairs if filter_pair(pair)]

处理数据的全过程：

- 读取数据，每一行分别处理，将其转换成句对
- 对于文本进行处理，过滤无用符号
- 根据已有文本对于单词进行编号，构建符号到编号的映射


In [11]:
def prepare_data(lang1_name, lang2_name, reverse=False):
    input_lang, output_lang, pairs = read_langs(lang1_name, lang2_name, reverse)
    print("Read %s sentence pairs" % len(pairs))
    
    pairs = filter_pairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    
    print("Indexing words...")
    for pair in pairs:
        input_lang.index_words(pair[0])
        output_lang.index_words(pair[1])

    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepare_data('cn', 'eng', False)

# Print an example pair
print(random.choice(pairs))

Reading lines...
Read 90000 sentence pairs
Trimmed to 68898 sentence pairs
Indexing words...
['汤姆不尊重玛丽的观点。', 'tom doesn t respect mary s opinion .']


从数据集中sample出200条数据作为验证集

In [12]:
def sample_test_dataset(size=100):

    with open('cn-eng-test.txt', 'w+') as f:
        f.write('\n'.join(['\t'.join(pair) for pair in random.sample(pairs, k=size)]))

In [13]:
# sample_test_dataset()

### 将文本数据转换为张量

为了训练，我们需要将句子变成神经网络可以理解的东西（数字）。每个句子将被分解成单词，然后变成张量，其中每个单词都被索引替换（来自之前的Lang索引）。在创建这些张量时，我们还将附加EOS令牌以表示该句子已结束。

![](https://i.imgur.com/LzocpGH.png)

In [14]:
# Return a list of indexes, one for each word in the sentence
def indexes_from_sentence(lang, sentence):
    """
    根据词表，将句子转化成索引列表。

    :reutrn list，e.g. [1, 2, 3, 4]
    """
    if lang.name == 'cn':
        words = list(jieba.cut(sentence)) if SEGMENTATION else sentence
        return [lang.word2index[word] if word in lang.word2index else UNK_token for word in words ]
    else:
        words = sentence.split(' ')
        return [lang.word2index[word] if word in lang.word2index else UNK_token for word in words]

def variable_from_sentence(lang, sentence):
    """
    将句子转换成Tensor.
    
    :return Tensor, shape(n, 1)
    """
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(EOS_token)
    var = torch.LongTensor(indexes).view(-1, 1)
    if USE_CUDA: var = var.cuda()
    return var

def variables_from_pair(pair):
    """
    将平行语料对转化成Tensors.
    
    :return (input_tensor, output_tensor)
    """
    input_variable = variable_from_sentence(input_lang, pair[0])
    target_variable = variable_from_sentence(output_lang, pair[1])
    return (input_variable, target_variable)

In [15]:
pair = random.choice(pairs)
print('pair: %s' % pair)

input_tensor, target_tensor = variables_from_pair(pair)
print('input_tensor shape: %s, output_tensor shap: %s' % (input_tensor.shape, target_tensor.shape))
print('input_tensor: %s' % input_tensor)

pair: ['我在这里闻见了什么。', 'i smell something here .']
input_tensor shape: torch.Size([11, 1]), output_tensor shap: torch.Size([6, 1])
input_tensor: tensor([[   3],
        [  15],
        [ 138],
        [  70],
        [1854],
        [ 742],
        [  12],
        [  17],
        [ 255],
        [  13],
        [   1]], device='cuda:0')


## 模型

### 编码器

In [16]:
class EncoderGRU(nn.Module):
    """GRU 编码器"""

    def __init__(self, input_size, hidden_size, n_layers=1, bidirectional=False):
        """
        初始化
        :param input_size, 输入词表大
        :param hidden_size, Embedding维度大小，RNN hidden大小
        :param n_layers, RNN层数
        """
        super(EncoderGRU, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        
        # 用GRU替换RNN
        # self.rnn = nn.RNN(hidden_size, hidden_size, n_layers)
        self.rnn = nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirectional)
        
    def forward(self, word_inputs, hidden):
        """
        前向传播
        :param word_inputs, 输入序列 shape(n, 1)
        :param hidden, 隐层 shape(seq_len*n_layers, batch_size, hidden_size)
        :return output(seq_len, batch, num_directions*hidden_size),
                hidden(num_layers*num_directions, hidden_size)
        """
        # Note: we run this all at once (over the whole input sequence)
        seq_len = len(word_inputs)
        embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        num_directions = 2 if self.bidirectional else 1
        hidden = torch.zeros(self.n_layers*num_directions, 1, self.hidden_size)
        if USE_CUDA: hidden = hidden.cuda()
        return hidden

### 解码器

In [17]:
class DecoderGRU(nn.Module):
    """GRU 解码器"""

    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(DecoderGRU, self).__init__()
        
        # Keep parameters for reference
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        
        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)

        # 使用GRU替换RNN
        # self.rnn = nn.RNN(hidden_size, hidden_size, n_layers, dropout=dropout_p)
        self.rnn = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size, output_size)
    
    def forward(self, word_input, last_hidden):
        # Note: we run this one step at a time        
        word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
        rnn_output, hidden = self.rnn(word_embedded, last_hidden)

        rnn_output = rnn_output.squeeze(0)
        output = F.log_softmax(self.out(rnn_output))

        return output, hidden

### 加载模型

In [18]:
encoder = torch.load('model_storage/nmt-seq2seq-bigru-encoder.pkl')
decoder = torch.load('model_storage/nmt-seq2seq-bigru-decoder.pkl')

## 模型验证

In [19]:
def evaluate(sentence, max_length=MAX_LENGTH):
    input_variable = variable_from_sentence(input_lang, sentence)
    input_length = input_variable.size()[0]
    
    # Run through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

    # Create starting vectors for decoder
    decoder_input = torch.LongTensor([[SOS_token]]) # SOS
    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    decoder_hidden = encoder_hidden.view(encoder.n_layers, encoder_hidden.shape[1], -1)
    
    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)
    
    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == EOS_token:
            decoded_words.append('EOS')
            break
        else:
            decoded_words.append(output_lang.index2word[ni.item()])
            
        # Next input is chosen word
        decoder_input = torch.LongTensor([[ni]])
        if USE_CUDA: decoder_input = decoder_input.cuda()
    
    return decoded_words

In [20]:
import queue

class BeamSearchNode(object):
    """Beamsearch Node"""
    
    def __init__(self, decoder_input, decoder_hidden, score, decode_words=None):
        self.decoder_input = decoder_input
        self.decoder_hidden = decoder_hidden
        self.decode_words = decode_words or []
        self.score = score
        
    def __str__(self):
        return 'BeamNode score: %s, words: %s' % (self.score, ' '.join(self.decode_words))



def evaluate_bs(sentence, max_length=MAX_LENGTH,  bs_size=2, debug=False):
    """使用beamsearch提高预测能力."""
    
    # 队列缓存BeamSeach Node
    q = queue.Queue()

    input_variable = variable_from_sentence(input_lang, sentence)
    input_length = input_variable.size()[0]
    
    # Run through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

    # Create starting vectors for decoder
    decoder_input = torch.LongTensor([[SOS_token]]) # SOS
    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    decoder_hidden = encoder_hidden.view(encoder.n_layers, encoder_hidden.shape[1], -1)
    
    decoder_attentions = torch.zeros(max_length, max_length)

    # First Node
    bs_node = BeamSearchNode(decoder_input, decoder_hidden, 1.0)
    q.put(bs_node)
    
    # Run through decoder
    for di in range(max_length):
        if q.empty():
            break

        # 候选集
        candidates = []
        while not q.empty():
            bs_node = q.get()
        
            if bs_node.decode_words and bs_node.decode_words[-1] == 'EOS':
                candidates.append(bs_node)
                continue

            decoder_output, decoder_hidden = decoder(bs_node.decoder_input, bs_node.decoder_hidden)

            # Choose top word from output
            # topv: top values
            # topi: top value indices
            topv, topi = decoder_output.data.topk(bs_size)
            for i in range(bs_size):
                ni = topi[0][i]
                score = abs(topv[0][i].item())   # Note: 解码器输出的是log_softmax
                decode_word = output_lang.index2word[ni.item()]
                decoder_input = torch.LongTensor([[ni]])
                if USE_CUDA:
                    decoder_input = decoder_input.cuda()

                if debug:
                    print('score: %.6f, words: %s' % (
                        bs_node.score * score, ' '.join(bs_node.decode_words + [decode_word])))

                candidates.append(BeamSearchNode(decoder_input, 
                                                 decoder_hidden,
                                                 bs_node.score*score,
                                                 bs_node.decode_words[:] + [decode_word]))

        if debug: print('\n')
        # 排序
        candidates = sorted(candidates, key=lambda x: x.score)
        for i in range(bs_size):
            q.put(candidates[i])

    return candidates[0].decode_words

In [34]:
print(' '.join(evaluate_bs('我的爸爸只有十五岁。')))
print(' '.join(evaluate_bs('我们的爸爸只有十五岁。')))

my father is only fifteen years old . EOS
our father is only ten years old . EOS




In [22]:
print(' '.join(evaluate('我的爸爸只有十五岁。')))
print(' '.join(evaluate('我们的爸爸只有十五岁。')))

my father is only fifteen . EOS
our only is is just . EOS




In [23]:
' '.join(evaluate_bs('我们的爸爸只有十五岁。', debug=True))

score: 0.051014, words: our
score: 3.799890, words: my


score: 0.068516, words: our only
score: 0.117402, words: our father
score: 2.397962, words: my father
score: 8.895734, words: my only


score: 0.059149, words: our only is
score: 0.216661, words: our only only
score: 0.029143, words: our father is
score: 0.374103, words: our father has


score: 0.017970, words: our father is only
score: 0.075150, words: our father is just
score: 0.079870, words: our only is is
score: 0.132378, words: our only is only


score: 0.047692, words: our father is only ten
score: 0.061728, words: our father is only one
score: 0.206803, words: our father is just ten
score: 0.209009, words: our father is just years


score: 0.040267, words: our father is only ten years
score: 0.121811, words: our father is only ten .
score: 0.118071, words: our father is only one years
score: 0.240102, words: our father is only one of


score: 0.015129, words: our father is only ten years .
score: 0.093163, words: our fath



'our father is only ten years old . EOS'

随机选取一个句子进行验证。

In [24]:
def evaluate_randomly():
    pair = random.choice(pairs)
    
    output_words = evaluate(pair[0])
    output_sentence = ' '.join(output_words)
    
    print('>', pair[0])
    print('=', pair[1])
    print('<', output_sentence)
    print('')

In [25]:
evaluate_randomly()

> 生活多么复杂 
= life is so complicated .
< life is so nice ! EOS





In [26]:
print(' '.join(evaluate('人生是有趣的。')))
print(' '.join(evaluate_bs('人生是有趣的。')))


life is fun . EOS
life is fun . EOS




In [27]:
import collections
from torchtext.data.metrics import bleu_score


# 读取测试数据集
with open('cn-eng-test.txt') as f:
    lines = f.read().strip().split('\n')
    
    test_pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]



test_pairs_dict = collections.defaultdict(lambda : [])

for pair in test_pairs:
    test_pairs_dict[pair[0]].append(pair[1].split(' '))


In [28]:
def evaluate_bleu_score():
    candicates = []
    references = []

    for i, pair in enumerate(test_pairs_dict.items(), start=1):
        candicate = evaluate(pair[0])
        if candicate[-1] == 'EOS':
            candicate.pop(-1)
        candicates.append(candicate)
        references.append(pair[1])
    
    score = bleu_score(candicates, references)
    return score


def evaluate_bs_bleu_score(bs_size=5):
    candicates = []
    references = []

    for i, pair in enumerate(test_pairs_dict.items(), start=1):
        candicate = evaluate_bs(pair[0], bs_size=bs_size)
        if candicate[-1] == 'EOS':
            candicate.pop(-1)
        candicates.append(candicate)
        references.append(pair[1])
    
    score = bleu_score(candicates, references)
    return score

In [29]:
print('test dataset bleu score: %s' % evaluate_bleu_score())



test dataset bleu score: 0.4404138090273204


In [30]:
print('test dataset beam search bleu score: %s' % evaluate_bs_bleu_score(bs_size=2))



test dataset beam search bleu score: 0.4699782729148865


使用beamsearch预测，BLEU值略有提高。

## 预测

In [32]:
def predict():
    """预测"""
    with open('test.txt') as f:
        sentences =  [line.strip() for line in f.readlines()]

    output_sentences = []
    for sentence in sentences:
        output_sentence = ' '.join(evaluate_bs(sentence))
        output_sentences.append(output_sentence.strip('EOS'))
    
    with open('result-bigru.txt', 'w') as f:
        f.write('\n'.join(output_sentences))

In [33]:
predict()

