In [1]:
import tensorflow as tf


  return f(*args, **kwds)


In [2]:
def read_file(filename):
    with tf.gfile.Open(filename) as f:
        return [x.strip() for x in f]
    
def extract_char_vocab(filedata):
    special_words = ['<PAD>', '<UNK>',  '<GO>', '<EOS>']
    s = list(set([x for line in filedata for x in line]))
    index2char = {index:chr for index, chr in enumerate(special_words + s)}
    char2index = {chr:index for index, chr in index2char.items()}
    return char2index, index2char
source_filedata = read_file("data/letters_source.txt")
print(source_filedata[:5])
source_char2index, source_index2char = extract_char_vocab(source_filedata)
target_filedata= read_file("data/letters_target.txt")
print(target_filedata[:5])
target_char2index, target_index2char = extract_char_vocab(target_filedata)

print(source_char2index)
print(target_char2index)

['bsaqq', 'npy', 'lbwuj', 'bqv', 'kial']
['abqqs', 'npy', 'bjluw', 'bqv', 'aikl']
{'<PAD>': 0, '<UNK>': 1, '<GO>': 2, '<EOS>': 3, 'q': 4, 'd': 5, 'e': 6, 'z': 7, 'k': 8, 'c': 9, 'r': 10, 'l': 11, 'b': 12, 'y': 13, 'x': 14, 'j': 15, 'w': 16, 'a': 17, 'v': 18, 'p': 19, 'n': 20, 'f': 21, 'h': 22, 's': 23, 'u': 24, 't': 25, 'g': 26, 'o': 27, 'i': 28, 'm': 29}
{'<PAD>': 0, '<UNK>': 1, '<GO>': 2, '<EOS>': 3, 'q': 4, 'd': 5, 'e': 6, 'z': 7, 'k': 8, 'c': 9, 'r': 10, 'l': 11, 'b': 12, 'y': 13, 'x': 14, 'j': 15, 'w': 16, 'a': 17, 'v': 18, 'p': 19, 'n': 20, 'f': 21, 'h': 22, 's': 23, 'u': 24, 't': 25, 'g': 26, 'o': 27, 'i': 28, 'm': 29}


In [3]:
# 将输入变化为数字符号
def input_2_ids(inputdata, vocab_char2index):
    return [[vocab_char2index.get(chr, vocab_char2index.get("<UNK>")) for chr in sentence] for sentence in inputdata]
source_input_ids = input_2_ids(source_filedata, source_char2index)
target_input_ids = input_2_ids(target_filedata, target_char2index)
print(source_input_ids[:5])
print(target_input_ids[:5])

[[12, 23, 17, 4, 4], [20, 19, 13], [11, 12, 16, 24, 15], [12, 4, 18], [8, 28, 17, 11]]
[[17, 12, 4, 4, 23], [20, 19, 13], [12, 15, 11, 24, 16], [12, 4, 18], [17, 28, 8, 11]]


In [4]:
def get_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name = "input")
    targets = tf.placeholder(tf.int32, [None, None], name = "target")
    target_sequence_length = tf.placeholder(tf.int32, [None], name = "target_sequence_length")
    max_target_length = tf.reduce_max(target_sequence_length, name = "max_target_len")
    source_sequence_length = tf.placeholder(tf.int32, [None], name = "source_sequence_length")
    max_source_length = tf.reduce_max(target_sequence_length, name = "max_source_len")
    return inputs, targets, target_sequence_length, max_target_length, source_sequence_length, max_source_length

In [5]:
def get_encoder_layer(inputdata, rnn_size, num_layers, source_sequence_length, source_vocab_size, encoder_embedding_size):
    input_embedding = tf.contrib.layers.embed_sequence(inputdata, vocab_size = source_vocab_size, embed_dim = encoder_embedding_size)
    rnn_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for x in range(num_layers)])
    encoder_output, encoder_state = tf.nn.dynamic_rnn(rnn_cell, input_embedding, sequence_length = source_sequence_length, dtype=tf.float32 )
    return encoder_output, encoder_state

In [6]:
def process_target_input(targetdata, vocab_to_int, batch_size):
    no_ending = tf.strided_slice(targetdata, [0,0], [batch_size, -1], [1,1])
    decode_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int["<GO>"]), no_ending], 1)
    return decode_input

In [7]:
def decoding_layer(target_char2index, decoding_embedding_size, rnn_size, num_layers, target_sequence_length, max_target_length, 
                   encoder_state, decoder_input):
    # 1. embedding 
    target_vocab_size = len(target_char2index)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)
    
    # 2. 构造Decoder中的rnn
    rnn_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for i in range(num_layers)])
    
    # 3. Output 全连接层
    output_layer = tf.layers.Dense(target_vocab_size, kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev = 0.1))
    
    # 4. Training 
    with tf.variable_scope("decode"):
        # helper
        training_helper = tf.contrib.seq2seq.TrainingHelper(decoder_embed_input, sequence_length = target_sequence_length)
        # 构造decoder
        train_decode = tf.contrib.seq2seq.BasicDecoder(rnn_cell, training_helper, encoder_state, output_layer)
        training_decode_output, _, _ = tf.contrib.seq2seq.dynamic_decode(train_decode, impute_finished = True,maximum_iterations=max_target_length)
        
    with tf.variable_scope("decode", reuse = True):
        start_tokens = tf.tile(tf.constant([target_char2index['<GO>']], dtype = tf.int32), [batch_size], name = "start_tokens")
        predict_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings, start_tokens, target_char2index["<EOS>"])
        predict_decode = tf.contrib.seq2seq.BasicDecoder(rnn_cell, predict_helper, encoder_state, output_layer)
        predict_decode_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predict_decode, impute_finished = True, maximum_iterations=max_target_length)
        
    return training_decode_output, predict_decode_output


In [8]:
def seq2seq_model(input_data, targets, lr, target_sequence_length, max_target_length, source_sequence_length, 
                  source_vocab_size, target_vocab_size, encoder_embedding_size, decoder_embedding_size, 
                 rnn_size, num_layers):
    _, encoder_state = get_encoder_layer(input_data, rnn_size, num_layers, source_sequence_length, source_vocab_size, encoder_embedding_size)
    decoder_input = process_target_input(targets, target_char2index, batch_size)
    training_decode_output, predict_decode_output = decoding_layer(target_char2index, decoder_embedding_size, rnn_size, num_layers, 
                                                                   target_sequence_length, max_target_length, 
                                                                   encoder_state, decoder_input)
    return training_decode_output, predict_decode_output

In [9]:
# 超参数
# Number of Epochs
epochs = 60
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 50
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 15
decoding_embedding_size = 15
# Learning Rate
learning_rate = 0.001

In [10]:
train_graph = tf.Graph()
with train_graph.as_default():
    inputs, targets, target_sequence_length, max_target_length, source_sequence_length, max_source_length = get_inputs()
    training_decoder_output, predicting_decoder_output = seq2seq_model(inputs, targets, None, target_sequence_length, max_target_length, 
                                                                  source_sequence_length, len(source_char2index), len(target_char2index),
                                                                 encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers)
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_length, dtype=tf.float32, name='masks')
    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        
        optimizer = tf.train.AdamOptimizer(0.05)
        
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
        


In [11]:
def pad_sentence_batch(sentence_batch, pad_int):
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence_len - len(sentence)) for sentence in sentence_batch]

def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    for batch_i in range(0, len(sources)// batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i : start_i + batch_size]
        targets_batch = targets[start_i : start_i + batch_size]
        pad_sources_batch = pad_sentence_batch(sources_batch, source_pad_int)
        pad_targets_batch = pad_sentence_batch(targets_batch, target_pad_int)
        
        targets_lengths = []
        for target in targets_batch:
            targets_lengths.append(len(target))
        
        sources_lengths = []
        for source in sources_batch:
            sources_lengths.append(len(source))
        
        yield pad_sources_batch, sources_lengths, pad_targets_batch, targets_lengths
        

In [12]:
train_source = source_input_ids[batch_size:]
train_target = target_input_ids[batch_size:]
valid_source = source_input_ids[:batch_size]
valid_target = target_input_ids[:batch_size]

(valid_sources_batch, valid_sources_lengths, valid_targets_batch, valid_targets_lengths) = next(get_batches(
    valid_target, valid_source, batch_size, source_char2index["<PAD>"], target_char2index["<PAD>"]))
display_step = 50

check_point = "./abc.ckpt"
with tf.Session(graph = train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(1, epochs + 1):
        for batch_i, (source_batch, source_lengths, target_batch, target_lengths) in enumerate(
            get_batches(train_target, train_source, batch_size, source_char2index["<PAD>"], target_char2index["<PAD>"])):
            _, loss = sess.run([train_op, cost], feed_dict = {
                inputs:source_batch,
                targets:target_batch,
                target_sequence_length : target_lengths,
                source_sequence_length : source_lengths
            })
            
            if batch_i % display_step == 0:
                valid_loss = sess.run([cost], feed_dict = {
                    inputs: valid_sources_batch,
                    targets: valid_targets_batch,
                    target_sequence_length : valid_targets_lengths,
                    source_sequence_length : valid_sources_lengths
                })
                print('Epoch {:>3}/{} Batch {:>4}/{} - Training loss:{:>6.3f} - Valid loss:{:>6.3f}'
                     .format(epoch_i, epochs,
                            batch_i, len(train_source) // batch_size,
                            loss, valid_loss[0]))
            
    
    saver = tf.train.Saver()
    saver.save(sess, check_point)
    print('Model Trained and Saved')
    
        


Epoch   1/60 Batch    0/77 - Training loss: 3.403 - Valid loss: 3.293
Epoch   1/60 Batch   50/77 - Training loss: 1.962 - Valid loss: 1.933
Epoch   2/60 Batch    0/77 - Training loss: 1.430 - Valid loss: 1.406
Epoch   2/60 Batch   50/77 - Training loss: 0.699 - Valid loss: 0.638
Epoch   3/60 Batch    0/77 - Training loss: 0.460 - Valid loss: 0.428
Epoch   3/60 Batch   50/77 - Training loss: 0.302 - Valid loss: 0.292
Epoch   4/60 Batch    0/77 - Training loss: 0.230 - Valid loss: 0.230
Epoch   4/60 Batch   50/77 - Training loss: 0.194 - Valid loss: 0.171
Epoch   5/60 Batch    0/77 - Training loss: 0.116 - Valid loss: 0.117
Epoch   5/60 Batch   50/77 - Training loss: 0.116 - Valid loss: 0.101
Epoch   6/60 Batch    0/77 - Training loss: 0.112 - Valid loss: 0.110
Epoch   6/60 Batch   50/77 - Training loss: 0.079 - Valid loss: 0.073
Epoch   7/60 Batch    0/77 - Training loss: 0.065 - Valid loss: 0.072
Epoch   7/60 Batch   50/77 - Training loss: 0.066 - Valid loss: 0.065
Epoch   8/60 Batch  

Epoch  60/60 Batch    0/77 - Training loss: 0.022 - Valid loss: 0.053
Epoch  60/60 Batch   50/77 - Training loss: 0.010 - Valid loss: 0.036
Model Trained and Saved


In [13]:
# predict
def source_to_seq(text):
    seq_length = 7
    return [source_char2index.get(c, source_char2index.get("<UNK>")) for c in text] + [source_char2index.get("<PAD>") for i in range(seq_length - len(text))]

def predict_sequece(word):
    input_word = word
    text = source_to_seq(input_word)
    loaded_graph = tf.Graph()
    with tf.Session(graph = loaded_graph) as sess:
        loader = tf.train.import_meta_graph(check_point + '.meta')
        loader.restore(sess, check_point)
        inputdata = [text] * batch_size
        
        input_node = loaded_graph.get_tensor_by_name("input:0")
        logits = loaded_graph.get_tensor_by_name("predictions:0")
        source_seq_length = loaded_graph.get_tensor_by_name("source_sequence_length:0")
        target_seq_length = loaded_graph.get_tensor_by_name("target_sequence_length:0")
        
        answer = sess.run(logits, feed_dict = {input_node :inputdata,
                                       source_seq_length : [len(word)] * batch_size,
                                                target_seq_length: [len(word)] * batch_size})[0]
        
        print("原始输入：", word)
        print("\nSource")
        print(" word 编号：{}".format([x for x in text]))
        print(" Input word:{}".format([source_index2char.get(x) for x in text]))
        
        print("\nTarget")
        print(" word 编号：{}".format([x for x in answer]))
        print(" output word:{}".format([target_index2char.get(x) for x in answer]))
predict_sequece("common")

INFO:tensorflow:Restoring parameters from ./abc.ckpt
原始输入： common

Source
 word 编号：[9, 27, 29, 29, 27, 20, 0]
 Input word:['c', 'o', 'm', 'm', 'o', 'n', '<PAD>']

Target
 word 编号：[9, 29, 29, 20, 27, 27]
 output word:['c', 'm', 'm', 'n', 'o', 'o']


In [14]:
predict_sequece("target")

INFO:tensorflow:Restoring parameters from ./abc.ckpt
原始输入： target

Source
 word 编号：[25, 17, 10, 26, 6, 25, 0]
 Input word:['t', 'a', 'r', 'g', 'e', 't', '<PAD>']

Target
 word 编号：[17, 6, 26, 10, 25, 25]
 output word:['a', 'e', 'g', 'r', 't', 't']
