## 环境依赖

In [1]:
import unicodedata
import string
import re
import random
import time
import math
import jieba
import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

## 数据预处理

In [2]:
USE_CUDA = torch.cuda.is_available()

In [3]:
print('USE_CUDA: %s' % USE_CUDA)

USE_CUDA: True


In [4]:
SEGMENTATION = True    # 是否分词

### 文本预处理

丢弃除了中文、字母和常用标点之外的符号。

In [5]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z\u4e00-\u9fa5.!?，。？]+", r" ", s)
    return s

### 构建词表

引入三个特殊的Token:

1. `SOS`, "Start of sentence”，标识句子开始
2. `EOS`, “End of sentence”，表示句子结束
3. `UNK`, "Unknown Token"，标识未登录词

In [6]:
SOS_token = 0
EOS_token = 1
UNK_token = 2

class Lang(object):
    """
    词表Vocabulary.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", '2': 'UNK'}
        self.n_words = 3 # Count SOS and EOS
      
    def index_words(self, sentence):
        if self.name == 'cn':
            words = list(jieba.cut(sentence)) if SEGMENTATION else sentence    
            for word in words:
                self.index_word(word)
        else:
            words = sentence.split(' ')
            for word in words:
                self.index_word(word)

    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

读取平行语料，并进行清理。

In [7]:
def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s.txt' % (lang1, lang2)).read().strip().split('\n')
    
    # Split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
        
    return input_lang, output_lang, pairs

### 准备数据集

样例为了加快训练，只保留了不长于10个单词的句对，真正实验中将更多数据考虑进来可能获得更好的效果。

In [8]:
MAX_LENGTH = 10

def filter_pair(p):
    return len(p[1].split(' ')) < MAX_LENGTH

def filter_pairs(pairs):
    return [pair for pair in pairs if filter_pair(pair)]

处理数据的全过程：

- 读取数据，每一行分别处理，将其转换成句对
- 对于文本进行处理，过滤无用符号
- 根据已有文本对于单词进行编号，构建符号到编号的映射


In [9]:
def prepare_data(lang1_name, lang2_name, reverse=False):
    input_lang, output_lang, pairs = read_langs(lang1_name, lang2_name, reverse)
    print("Read %s sentence pairs" % len(pairs))
    
    pairs = filter_pairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    
    print("Indexing words...")
    for pair in pairs:
        input_lang.index_words(pair[0])
        output_lang.index_words(pair[1])

    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepare_data('cn', 'eng', False)

# Print an example pair
print(random.choice(pairs))

Reading lines...


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


Read 90000 sentence pairs
Trimmed to 68898 sentence pairs
Indexing words...


Loading model cost 0.440 seconds.
Prefix dict has been built succesfully.


['你在嚇我。', 'you re frightening me .']


从数据集中sample出200条数据作为验证集

In [10]:
def sample_test_dataset(size=100):

    with open('cn-eng-test.txt', 'w+') as f:
        f.write('\n'.join(['\t'.join(pair) for pair in random.sample(pairs, k=size)]))

In [11]:
# sample_test_dataset()

### 将文本数据转换为张量

为了训练，我们需要将句子变成神经网络可以理解的东西（数字）。每个句子将被分解成单词，然后变成张量，其中每个单词都被索引替换（来自之前的Lang索引）。在创建这些张量时，我们还将附加EOS令牌以表示该句子已结束。

![](https://i.imgur.com/LzocpGH.png)

In [12]:
# Return a list of indexes, one for each word in the sentence
def indexes_from_sentence(lang, sentence):
    """
    根据词表，将句子转化成索引列表。

    :reutrn list，e.g. [1, 2, 3, 4]
    """
    if lang.name == 'cn':
        words = list(jieba.cut(sentence)) if SEGMENTATION else sentence
        return [lang.word2index[word] if word in lang.word2index else UNK_token for word in words ]
    else:
        words = sentence.split(' ')
        return [lang.word2index[word] if word in lang.word2index else UNK_token for word in words]

def variable_from_sentence(lang, sentence):
    """
    将句子转换成Tensor.
    
    :return Tensor, shape(n, 1)
    """
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(EOS_token)
    var = torch.LongTensor(indexes).view(-1, 1)
    if USE_CUDA: var = var.cuda()
    return var

def variables_from_pair(pair):
    """
    将平行语料对转化成Tensors.
    
    :return (input_tensor, output_tensor)
    """
    input_variable = variable_from_sentence(input_lang, pair[0])
    target_variable = variable_from_sentence(output_lang, pair[1])
    return (input_variable, target_variable)

In [13]:
pair = random.choice(pairs)
print('pair: %s' % pair)

input_tensor, target_tensor = variables_from_pair(pair)
print('input_tensor shape: %s, output_tensor shap: %s' % (input_tensor.shape, target_tensor.shape))
print('input_tensor: %s' % input_tensor)

pair: ['他累了，但是他繼續工作。', 'he was tired but he kept working .']
input_tensor shape: torch.Size([10, 1]), output_tensor shap: torch.Size([9, 1])
input_tensor: tensor([[ 118],
        [ 453],
        [  11],
        [  21],
        [1801],
        [ 118],
        [3907],
        [ 595],
        [  12],
        [   1]], device='cuda:0')
