In [1]:
from distutils.version import LooseVersion
import tensorflow as tf
from tensorflow.python.layers.core import Dense


# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 1.8.0


In [7]:
import numpy as np
import time
import tensorflow as tf

with open("./data/letters_source.txt", 'r', encoding='utf-8') as f:
    source_data = f.read()
with open("./data/letters_target.txt", 'r', encoding='utf-8') as f:
    target_data = f.read()

In [9]:
source_data.split("\n")[:10]
target_data.split("\n")[:10]

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 'cklmq']

In [13]:
# 数据预处理
def extract_character_vocab(data):
    special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    set_words = list(set([character for line in data.split('\n') for character in line]))
    int_to_vocab = {idx:word for idx,word in enumerate(special_words + set_words)}
    vocab_to_int = {word:idx for idx,word in int_to_vocab.items()}
    
    return int_to_vocab,vocab_to_int

In [14]:
# 构造映射表
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

# 字母数字转换
source_int = [[source_letter_to_int.get(letter, source_letter_to_int.get('<UNK>')) for letter in line] for line in source_data.split('\n')]
target_int = [[target_letter_to_int.get(letter, target_letter_to_int.get('<UNK>')) for letter in line] + [target_letter_to_int.get('<EOS>')] for line in target_data.split('\n')]


In [16]:
target_int

[[20, 6, 29, 29, 25, 3],
 [19, 4, 26, 3],
 [6, 22, 17, 7, 27, 3],
 [6, 29, 9, 3],
 [20, 10, 23, 17, 3],
 [20, 28, 28, 15, 24, 3],
 [28, 16, 21, 22, 4, 4, 13, 3],
 [19, 4, 25, 9, 3],
 [8, 17, 18, 7, 11, 3],
 [14, 23, 17, 15, 29, 3],
 [16, 10, 12, 3],
 [9, 3],
 [17, 4, 3],
 [20, 6, 16, 19, 3],
 [20, 14, 23, 17, 3],
 [20, 8, 8, 24, 3],
 [5, 3],
 [23, 25, 25, 24, 11, 11, 3],
 [17, 29, 9, 3],
 [16, 23, 19, 29, 24, 11, 3],
 [16, 21, 22, 4, 11, 3],
 [21, 17, 3],
 [6, 23, 25, 24, 26, 3],
 [5, 23, 9, 3],
 [28, 3],
 [8, 8, 10, 13, 3],
 [8, 23, 23, 27, 3],
 [22, 27, 3],
 [14, 14, 17, 29, 3],
 [22, 17, 17, 19, 18, 13, 3],
 [5, 19, 29, 24, 3],
 [20, 3],
 [21, 12, 27, 3],
 [6, 4, 29, 12, 27, 3],
 [14, 28, 17, 18, 24, 9, 26, 3],
 [16, 16, 18, 12, 27, 26, 3],
 [28, 8, 17, 3],
 [16, 8, 23, 17, 19, 11, 11, 3],
 [4, 3],
 [28, 23, 17, 27, 3],
 [4, 3],
 [6, 3],
 [14, 10, 23, 3],
 [20, 18, 4, 25, 24, 24, 3],
 [14, 8, 8, 22, 22, 19, 3],
 [14, 10, 17, 19, 18, 29, 11, 3],
 [15, 19, 7, 3],
 [20, 10, 10, 22, 18,

In [17]:
# 构建模型
def get_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    # 定义target序列最大长度
    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_sequence_lgenght')
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
    
    return inputs, targets, learning_rate, target_sequence_length, max_target_sequence_length, source_sequence_length

In [25]:
# Encoder
def get_encoder_layer(input_data, rnn_size, num_layers, 
                      source_sequence_length, source_vocab_size, 
                      encoding_embedding_size):
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, 
                                                          source_vocab_size,
                                                          encoding_embedding_size)
    # RNN cell
    def get_lstm_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell
    cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])
    encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input,
                                                    sequence_length=source_sequence_length,dtype=tf.float32)
    return encoder_output, encoder_state

In [31]:
# decoder
# cut  <PAD> 和 <EOS>
# 添加 <GO>
def process_decoder_input(target, vocab_to_int, batch_size):
    ending = tf.strided_slice(target, [0,0], [batch_size, -1], [1,1])
    decoder_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    return decoder_input

In [41]:
# 对数据进行Embedding
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers,
                  rnn_size, target_sequence_length, max_target_sequence_length,
                  encoder_state, decoder_input):
    # 1 embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)
    # 2 构造decode rnn
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return decoder_cell
    
    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])
    
    # 3 output 接全连接层
    output_layer = Dense(target_vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    # 4 Training decoder
    with tf.variable_scope('decode'):
        # 训练helper
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                           sequence_length=target_sequence_length,
                                                           time_major=False)
        # 训练decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell, 
                                                           training_helper,
                                                           encoder_state, 
                                                           output_layer)
        
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                                      impute_finished=True,
                                                                      maximum_iterations=max_target_sequence_length)
    # 5 预测 decoder , 与training 共享参数
    with tf.variable_scope('decode', reuse=True):
        # 创建一个常量tensor并复制为batch_size的大小
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], name='start_tokens')
        # 预测的helper
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                    start_tokens,
                                                                    target_letter_to_int['<EOS>'])
        # 预测decoder
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                               predicting_helper,
                                                               encoder_state,
                                                               output_layer
                                                              )
        predicting_decoder_output,_,_ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder,
                                                                       impute_finished=True,
                                                                       maximum_iterations=max_target_sequence_length)
        return training_decoder_output, predicting_decoder_output

In [42]:
# seqseq encode拼接decoder
def seq2seq_model(input_data, targets, lr, target_sequence_length, max_target_sequence_length,
                 source_sequence_length, source_vocab_size, target_vocab_size,
                 encoder_embedding_size, decoder_embedding_size,
                 rnn_size, num_layers):
    _, encoder_state = get_encoder_layer(input_data,rnn_size,num_layers,
                                        source_sequence_length,
                                        source_vocab_size,
                                        encoding_embedding_size)
    
    # 预处理 decoder 输入
    decoder_input = process_decoder_input(targets, target_letter_to_int, batch_size)
    
    # 将状态向量与输入传递给decoder
    training_decoder_output, predicting_decoder_output = decoding_layer(target_letter_to_int,
                                                                       decoding_embedding_size,
                                                                       num_layers,
                                                                       rnn_size,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       encoder_state,
                                                                       decoder_input)
    return training_decoder_output, predicting_decoder_output

In [43]:
# 超参数
epochs = 60
batch_size = 128
rnn_size = 50
num_layers = 2
encoding_embedding_size = 15
decoding_embedding_size = 15
learning_rate = 0.001

In [46]:
train_graph = tf.Graph()
with train_graph.as_default():
    # 模型输入
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_inputs()
    training_decoder_output, predicting_decoder_output = seq2seq_model(input_data,
                                                                      targets,
                                                                      lr,
                                                                      target_sequence_length,
                                                                      max_target_sequence_length,
                                                                      source_sequence_length,
                                                                      len(source_letter_to_int),
                                                                      len(target_letter_to_int),
                                                                      encoding_embedding_size,
                                                                      decoding_embedding_size,
                                                                      rnn_size,
                                                                      num_layers)
    
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32,name='masks')
    
    with tf.name_scope('optimization'):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        
        optimizer = tf.train.AdamOptimizer(lr)
        
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_norm(grad, 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [47]:
# batches
def pad_sentense_batch(sentence_batch, pad_int):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int]*(max_sentence - len(sentence)) for sentence in sentence_batch]

In [52]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i*batch_size
        source_batch = sources[start_i:start_i+batch_size]
        target_batch = targets[start_i:start_i+batch_size]
        # 序列补齐
        pad_source_batch = np.array(pad_sentense_batch(source_batch, source_pad_int))
        pad_target_batch = np.array(pad_sentense_batch(target_batch, target_pad_int))
        
        target_length = []
        for target in target_batch:
            target_length.append(len(target))
        
        source_length = []
        for source in source_batch:
            source_length.append(len(source))
        
        yield pad_target_batch, pad_source_batch, target_length, source_length
            

In [53]:
# taind
# 将数据集分割为train和validation
train_source = source_int[batch_size:]
train_target = target_int[batch_size:]
# 留出一个batch进行验证
valid_source = source_int[:batch_size]
valid_target = target_int[:batch_size]

(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>']))
display_step = 50 # 每隔50轮输出loss

checkpoint = "trained_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
        
    for epoch_i in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                get_batches(train_target, train_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>'])):
            
            _, loss = sess.run(
                [train_op, cost],
                {input_data: sources_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths})

            if batch_i % display_step == 0:
                
                # 计算validation loss
                validation_loss = sess.run(
                [cost],
                {input_data: valid_sources_batch,
                 targets: valid_targets_batch,
                 lr: learning_rate,
                 target_sequence_length: valid_targets_lengths,
                 source_sequence_length: valid_sources_lengths})
                
                print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(train_source) // batch_size, 
                              loss, 
                              validation_loss[0]))

    
    
    # 保存模型
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model Trained and Saved')

Epoch   1/60 Batch    0/77 - Training Loss:  3.398  - Validation loss:  3.395
Epoch   1/60 Batch   50/77 - Training Loss:  2.924  - Validation loss:  2.913
Epoch   2/60 Batch    0/77 - Training Loss:  2.574  - Validation loss:  2.579
Epoch   2/60 Batch   50/77 - Training Loss:  2.210  - Validation loss:  2.215
Epoch   3/60 Batch    0/77 - Training Loss:  2.034  - Validation loss:  2.042
Epoch   3/60 Batch   50/77 - Training Loss:  1.842  - Validation loss:  1.825
Epoch   4/60 Batch    0/77 - Training Loss:  1.702  - Validation loss:  1.731
Epoch   4/60 Batch   50/77 - Training Loss:  1.606  - Validation loss:  1.576
Epoch   5/60 Batch    0/77 - Training Loss:  1.481  - Validation loss:  1.514
Epoch   5/60 Batch   50/77 - Training Loss:  1.425  - Validation loss:  1.392
Epoch   6/60 Batch    0/77 - Training Loss:  1.301  - Validation loss:  1.340
Epoch   6/60 Batch   50/77 - Training Loss:  1.271  - Validation loss:  1.232
Epoch   7/60 Batch    0/77 - Training Loss:  1.148  - Validation

Epoch  54/60 Batch    0/77 - Training Loss:  0.006  - Validation loss:  0.013
Epoch  54/60 Batch   50/77 - Training Loss:  0.007  - Validation loss:  0.012
Epoch  55/60 Batch    0/77 - Training Loss:  0.006  - Validation loss:  0.012
Epoch  55/60 Batch   50/77 - Training Loss:  0.006  - Validation loss:  0.011
Epoch  56/60 Batch    0/77 - Training Loss:  0.006  - Validation loss:  0.012
Epoch  56/60 Batch   50/77 - Training Loss:  0.006  - Validation loss:  0.011
Epoch  57/60 Batch    0/77 - Training Loss:  0.005  - Validation loss:  0.012
Epoch  57/60 Batch   50/77 - Training Loss:  0.005  - Validation loss:  0.011
Epoch  58/60 Batch    0/77 - Training Loss:  0.005  - Validation loss:  0.011
Epoch  58/60 Batch   50/77 - Training Loss:  0.005  - Validation loss:  0.011
Epoch  59/60 Batch    0/77 - Training Loss:  0.005  - Validation loss:  0.011
Epoch  59/60 Batch   50/77 - Training Loss:  0.005  - Validation loss:  0.010
Epoch  60/60 Batch    0/77 - Training Loss:  0.005  - Validation

ValueError: Parent directory of trained_model.ckpt doesn't exist, can't save.