In [1]:
"""
1. preprocessing data
2. build model
2.1 encoder
2.2 attention
2.3 decoder
3.evalution
3.1 given sentence, return translated result
3.2 visualize results(attention)
"""

'\n1. preprocessing data\n2. build model\n2.1 encoder\n2.2 attention\n2.3 decoder\n3.evalution\n3.1 given sentence, return translated result\n3.2 visualize results(attention)\n'

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.0.0-beta1
sys.version_info(major=3, minor=6, micro=7, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.16.2
pandas 0.25.3
sklearn 0.22
tensorflow 2.0.0-beta1
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
en_spa_file_path = "./data/spa-eng/spa.txt"

import unicodedata
# 转成asc是为了减小词表
def unicode_to_ascii(s):
    # NFD 如果有一个unicode是多个asc组成的，就把这个拆开， Mn 重音
    return "".join(c for c in unicodedata.normalize("NFD",s) if unicodedata.category(c) != "Mn")

en_sentence = "Put it on"
sp_sentence = "Ponéoslo"
# 比如é 是一个e和一个重音符号，分开因为重音符号是Mn，所以忽略 所以é => e
print(unicode_to_ascii(en_sentence))
print(unicode_to_ascii(sp_sentence))

Put it on
Poneoslo


In [5]:
import re
def preprocess_sentence(s):
    s = unicode_to_ascii(s.lower().strip())
    # []任意一个， （）替换操作, \1 本身
    # 标点符号前后加空格
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    # 多余空格变成一个空格
    s = re.sub(r'[" "]+', " ", s)
    # 除了标点符号和字母以外都是空格
    s = re.sub(r'[^a-zA-Z?.!,¿]+', " ", s)
    # 去掉前后空格
    s = s.rstrip().strip()
    s = '<start> ' + s + ' <end>'
    return s

def make_list(x,y):
    return [x,y]

print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))

<start> put it on <end>
<start> poneoslo <end>


In [6]:
def parse_data(filename):
    lines = open(filename, encoding="UTF-8").read().strip().split("\n")
    sentence_pairs = [line.split("\t")[0:-1] for line in lines]
    preprocessed_sentence_pairs = [ (preprocess_sentence(en),preprocess_sentence(sp)) for en, sp  in sentence_pairs]
    return zip(*preprocessed_sentence_pairs)
en_dataset, sp_dataset = parse_data(en_spa_file_path)
print(en_dataset[-1])
print(sp_dataset[-1])

<start> it may be impossible to get a completely error free corpus due to the nature of this kind of collaborative effort . however , if we encourage members to contribute sentences in their own languages rather than experiment in languages they are learning , we might be able to minimize errors . <end>
<start> puede que sea imposible obtener un corpus completamente libre de errores debido a la naturaleza de este tipo de esfuerzo de colaboracion . sin embargo , si animamos a los miembros a contribuir frases en sus propios idiomas en lugar de experimentar con los idiomas que estan aprendiendo , podriamos ser capaces de minimizar los errores . <end>


In [7]:
# 补充
a = [(1,2), (3,4),(5,6)]
# 单星号能够将这个变量拆分成单个元素
print(*a)
# zip可以转置
c,d = zip(*a)
print(c,d)

(1, 2) (3, 4) (5, 6)
(1, 3, 5) (2, 4, 6)


In [8]:
# 文本式数据要被model读取要变成id式
def tokenizer(lang):
    lang_tokenizer = keras.preprocessing.text.Tokenizer(
        num_words = None, filters="", split=" "
    )
    
    lang_tokenizer.fit_on_texts(lang) 
    #序列的列表，列表中每个序列对应于一段输入文本 
    tensor = lang_tokenizer.texts_to_sequences(lang) #得到词索引[[1, 2, 3, 4], [1, 2, 3, 5]]
    tensor = keras.preprocessing.sequence.pad_sequences(tensor,
                                                   padding = "post")
    return tensor, lang_tokenizer
input_tensor, input_tokenizer = tokenizer(sp_dataset[0:30000])
output_tensor, output_tokenizer = tokenizer(en_dataset[0:30000])


def max_length(tensor):
    return max(len(t) for t in tensor)

def len_test(tensor):
    return [len(t) for t in tensor[0:10]]

max_len_input = max_length(input_tensor)
max_len_output = max_length(output_tensor)

print(max_len_input, max_len_output)
print(len_test(input_tensor), len_test(output_tensor))
# dir(input_tokenizer)

16 11
[16, 16, 16, 16, 16, 16, 16, 16, 16, 16] [11, 11, 11, 11, 11, 11, 11, 11, 11, 11]


In [9]:
# 切分训练集和验证集
from sklearn.model_selection import train_test_split
input_train, input_eval, output_train, output_eval = train_test_split(input_tensor, 
                                                                      output_tensor, test_size=0.2)

len(input_train), len(input_eval), len(output_train), len(output_eval) 

(24000, 6000, 24000, 6000)

In [10]:
# 验证tokenizer是否起作用
def convert(example, tokenizer):
    for t in example:
        if t != 0:
            print("%d --> %s" %(t, tokenizer.index_word[t]))

convert(input_train[1], input_tokenizer)
print()
convert(output_train[1], output_tokenizer)

1 --> <start>
53 --> eres
39 --> muy
706 --> valiente
3 --> .
2 --> <end>

1 --> <start>
5 --> you
25 --> are
49 --> very
550 --> brave
3 --> .
2 --> <end>


In [11]:
def make_dataset(input_tensor, output_tensor, 
                 batch_size, epochs, shuffle):
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, 
                                                 output_tensor))
    if shuffle:
        dataset.shuffle(30000)
    dataset = dataset.repeat(epochs).batch(batch_size, 
                                           drop_remainder=True)
    return dataset

batch_size = 64
epochs = 20
    
train_dataset = make_dataset(input_train, output_train,
                             batch_size, epochs, True)
eval_dataset = make_dataset(input_eval, output_eval,
                             batch_size, 1, False)

In [12]:
for x, y in train_dataset.take(1):
    print(x.shape)
    print(y.shape)
    print(x)
    print(y)

(64, 16)
(64, 11)
tf.Tensor(
[[  1   5  22 ...   0   0   0]
 [  1  53  39 ...   0   0   0]
 [  1   6 613 ...   0   0   0]
 ...
 [  1   8 604 ...   0   0   0]
 [  1  28 648 ...   0   0   0]
 [  1   5  16 ...   0   0   0]], shape=(64, 16), dtype=int32)
tf.Tensor(
[[   1   79   22    5   22    9    6    2    0    0    0]
 [   1    5   25   49  550    3    2    0    0    0    0]
 [   1    7   99   10   67  261    3    2    0    0    0]
 [   1  189   25   34 1142    3    2    0    0    0    0]
 [   1  124   55   13  761    3    2    0    0    0    0]
 [   1    7   23   12   74    5    3    2    0    0    0]
 [   1   31   76  187    3    2    0    0    0    0    0]
 [   1   40    5   43   47    6    2    0    0    0    0]
 [   1   79   40    7   22    9    6    2    0    0    0]
 [   1   79   22    5   95   39    6    2    0    0    0]
 [   1    7 1098   46    3    2    0    0    0    0    0]
 [   1   30   98    8   48    3    2    0    0    0    0]
 [   1    4   35 1975    3    2    0    0 

In [13]:
# 定义超参数
embedding_units = 256
units = 1024
input_vocab_size = len(input_tokenizer.word_index)+1
output_vocab_size = len(output_tokenizer.word_index)+1

print(input_vocab_size)
print(output_vocab_size)

9403
4834


In [14]:
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):
    # 使用super().__init__()手动执行父类的构造方法, 
    # 不然会由于子类重写父类的__init__的方法导致父类在构造方法中定义的默认属性无法继承（不能使用）
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        # embedding将大型稀疏向量转换为保留语义关系的低维空间
        self.embedding = keras.layers.Embedding(vocab_size,
                                                embedding_units)
        self.gru = keras.layers.GRU(self.encoding_units, 
                                    return_sequences= True,
                                   return_state = True,
                                   recurrent_initializer="glorot_uniform")
    def call(self, x, hidden):
        # 函数式调用
        # a Layer instance is callable on a tensor , and returns a tensor 
        x = self.embedding(x)
        output, state = self.gru(x,initial_state = hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoding_units))
encoder = Encoder(input_vocab_size, embedding_units,
                  units, batch_size )
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(x, sample_hidden)

print("sample_output.shape:", sample_output.shape)
print("sample_hidden.shape:", sample_hidden.shape)

sample_output.shape: (64, 16, 1024)
sample_hidden.shape: (64, 1024)


In [15]:
class BahdanauAttention(keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
        
    def call(self.decoder_hidden, encoder_outputs):
        # dencoder_hidden.shape: (batch_size, units)
        # encoder_outputs.shape: (batch_size, length, units)
        
        # before V: (batch_size, length, units)
        # after V: (batch_size, length, 1)
        # tf.expand_dims:在指定索引出增加一维度，值为1，从索引0开始
        # axis: 取值范围是[-阶数，阶数]，二维的时候0指的是列，1指的是行，
        decoder_hidden_with_time_axis = tf.expend_dims(decoder_hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(encoder_outputs) + self.W2(decoder_hidden_with_time_axis)))
        
        # shape: (batch_size, length, 1)
        attention_weights = tf.nn.softmax(score, axis = 1)
        
        # 加权
        # context_vector.shape: (batch_size, length, units)
        context_vector = attention_weights * encoder_outputs
        
        # 平均  在length 维度上求和
        #  context_vector.shape: (batch_size, units)
        context_vector = tf.reduce_sum(context_vector, axis = 1)

SyntaxError: invalid syntax (<ipython-input-15-83c2746ad144>, line 8)