# Todo
- data shuffle

In [1]:
import tensorflow as tf
import numpy as np
import os, time

In [2]:
phn_61 = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau', 'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v', 'w', 'y', 'z', 'zh']
phn_39 = ['ae', 'ao', 'aw', 'ax', 'ay', 'b', 'ch', 'd', 'dh', 'dx', 'eh', 'er', 'ey', 'f', 'g', 'h#', 'hh', 'ix', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh']
mapping = {'ah': 'ax', 'ax-h': 'ax', 'ux': 'uw', 'aa': 'ao', 'ih': 'ix', 'axr': 'er', 'el': 'l', 'em': 'm', 'en': 'n', 'nx': 'n', 'eng': 'ng', 'sh': 'zh', 'hv': 'hh', 'bcl': 'h#', 'pcl': 'h#', 'dcl': 'h#', 'tcl': 'h#', 'gcl': 'h#', 'kcl': 'h#', 'q': 'h#', 'epi': 'h#', 'pau': 'h#'}

train_featPath = os.path.join('data','train','feats','mfcc')
train_labelPath = os.path.join('data','train','labels')
test_featPath = os.path.join('data','test','feats','mfcc')
test_labelPath = os.path.join('data','test','labels')

feat_type = 'mfcc'
feats_dim = 39 if feat_type=='mfcc' else 123 # filter bank
num_classes = len(phn_61)
sos_token_idx = num_classes
eos_token_idx = num_classes + 1
voca_size = num_classes + 2

num_unit_encoder = 128
num_unit_decoder = 128
learning_rate = 0.001
n_hidden_layer = 3
decoder_embedding_dim = 16
beam_width = 5
batch_size = 128
epochs = 100

In [3]:
def load_batched_data(featPath, labelPath, batchSize):
    ''' return 3-element tuple: batched data (list), maxTimeLengths (list), total number of samples (int)
        The shape of batched_data's elements is (batchSize, maxLength, nFeatures)
    '''
    def data_lists_to_batches(inputList, targetList, batchSize):
        nFeatures = inputList[0].shape[1]
        input_target = list(zip(inputList, targetList))
        input_target.sort(key=lambda x: x[0].shape[0])
        inputList = []
        targetList = []
        for inp, tar in input_target:
            inputList.append(inp)
            targetList.append(tar)
            
        targetInList = []
        targetOutList = []
        for tar in targetList:
            targetInList.append(np.pad(tar, (1,0), 'constant', constant_values=sos_token_idx))
            targetOutList.append(np.pad(tar, (0,1), 'constant', constant_values=eos_token_idx))
    
        start, end = (0, batchSize)
        dataBatches = []
        maxLengths = []

        while end <= len(inputList):
            sourceSeqLengths = np.zeros(batchSize)
            targetSeqLengths = np.zeros(batchSize)
                
            maxLength = [0, 0]
            for batchI, i in enumerate(range(start, end)):
                sourceSeqLengths[batchI] = inputList[i].shape[0]
                maxLength[0] = max(maxLength[0], inputList[i].shape[0])
            for batchI, i in enumerate(range(start, end)):
                targetSeqLengths[batchI] = len(targetInList[i])
                maxLength[1] = max(maxLength[1], len(targetInList[i]))
            maxLengths.append(maxLength)

            batchInputs = np.zeros((batchSize, maxLength[0], nFeatures))
            batchTargetIn = np.zeros((batchSize, maxLength[1]))
            batchTargetOut = np.zeros((batchSize, maxLength[1]))
            for batchI, i in enumerate(range(start, end)):
                padSecs = maxLength[0] - inputList[i].shape[0]
                batchInputs[batchI,:,:] = np.pad(inputList[i], ((0,padSecs),(0,0)), 'constant', constant_values=0)
                
                padSecs = maxLength[1] - len(targetInList[i])
                batchTargetIn[batchI, :] = np.pad(targetInList[i], (0, padSecs), 'constant', constant_values=eos_token_idx)
                batchTargetOut[batchI, :] = np.pad(targetOutList[i], (0, padSecs), 'constant', constant_values=eos_token_idx)
            dataBatches.append((batchInputs, batchTargetIn, batchTargetOut, sourceSeqLengths, targetSeqLengths))
            start += batchSize
            end += batchSize
            
            if end > len(inputList) and start < len(inputList):
                end = len(inputList)
                batchSize = end - start
            
        return (dataBatches, maxLengths)
    
    return data_lists_to_batches([np.load(os.path.join(featPath, fn)) for fn in os.listdir(featPath)],
                                [np.load(os.path.join(labelPath, fn)) for fn in os.listdir(labelPath)],
                                batchSize) + (len(os.listdir(featPath)),)

In [4]:
train_batchedData, train_maxTimeLengths, train_totalN = load_batched_data(train_featPath, train_labelPath, batch_size)
test_batchedData, test_maxTimeLengths, test_totalN = load_batched_data(test_featPath, test_labelPath, batch_size)

In [5]:
phn_61_tensor = tf.constant(phn_61, dtype=tf.string)
phn_39_tensor = tf.constant(phn_39, dtype=tf.string)
mapping_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(list(mapping.keys()), list(mapping.values())), default_value='')

inputs = tf.placeholder(tf.float32, shape=[None, None, feats_dim])
targets_in = tf.placeholder(tf.int32, shape=[None, None])
targets_out = tf.placeholder(tf.int32, shape=[None, None])
source_seq_len = tf.placeholder(tf.int32, shape=[None])
target_seq_len = tf.placeholder(tf.int32, shape=[None])
keep_prob = tf.placeholder(tf.float32)

with tf.variable_scope('root'):
    batch_size_tensor = tf.shape(inputs)[0]
    embedding_decoder = tf.Variable(tf.random_uniform([voca_size, decoder_embedding_dim], dtype=tf.float32))
    decoder_emb_inp = tf.nn.embedding_lookup(embedding_decoder, targets_in)

    cells_fw = [tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(num_unit_encoder, kernel_initializer=tf.orthogonal_initializer()), output_keep_prob=keep_prob) for _ in range(n_hidden_layer)]
    cells_bw = [tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(num_unit_encoder, kernel_initializer=tf.orthogonal_initializer()), output_keep_prob=keep_prob) for _ in range(n_hidden_layer)]
    encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, sequence_length=source_seq_len, dtype=tf.float32)

    memory = encoder_outputs
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_unit_decoder, memory, memory_sequence_length=source_seq_len)
    #decoder_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(num_unit_decoder), output_keep_prob=keep_prob) for _ in range(n_hidden_layer)])
    decoder_cell = tf.nn.rnn_cell.GRUCell(num_unit_decoder, kernel_initializer=tf.orthogonal_initializer())
    att_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, attention_layer_size=num_unit_decoder)
    helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, target_seq_len, time_major=False)
    decoder = tf.contrib.seq2seq.BasicDecoder(att_decoder_cell, helper, initial_state=att_decoder_cell.zero_state(batch_size_tensor, dtype=tf.float32))
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=False)

    output_layer = tf.contrib.keras.layers.Dense(voca_size, use_bias=False)
    logits = output_layer(outputs.rnn_output)
    max_time = tf.shape(targets_out)[1]
    target_weights = tf.sequence_mask(target_seq_len, max_time, dtype=logits.dtype)
    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets_out, logits=logits)
    loss = tf.reduce_sum(crossent * target_weights) / tf.to_float(batch_size_tensor)

    params = tf.trainable_variables()
    gradients = tf.gradients(loss, params)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    update_step = optimizer.apply_gradients(zip(clipped_gradients, params))

with tf.variable_scope('root', reuse=True):
    tiled_memory = tf.contrib.seq2seq.tile_batch(memory, multiplier=beam_width)
    tiled_source_seq_len = tf.contrib.seq2seq.tile_batch(source_seq_len, multiplier=beam_width)
    tiled_batch_size = batch_size_tensor * beam_width
    tiled_attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_unit_decoder, tiled_memory, memory_sequence_length=tiled_source_seq_len)
    tiled_att_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, tiled_attention_mechanism, attention_layer_size=num_unit_decoder)
    decoder_initial_state = tiled_att_decoder_cell.zero_state(tiled_batch_size, tf.float32)

    beam_decoder = tf.contrib.seq2seq.BeamSearchDecoder(tiled_att_decoder_cell, embedding_decoder, tf.fill([batch_size_tensor], sos_token_idx),
                                                       eos_token_idx, decoder_initial_state, beam_width, output_layer=output_layer)
    decoded, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(beam_decoder, maximum_iterations=100, output_time_major=False)
    predicted_ids = decoded.predicted_ids # shape = [batch, max_time, beam_width]
    
    def map_to_reduced_phn(p):
        val = mapping_table.lookup(phn_61_tensor[p])
        f1 = lambda: tf.to_int32(tf.reduce_min(tf.where(tf.equal(val, phn_39_tensor))))
        f2 = lambda: tf.to_int32(tf.reduce_min(tf.where(tf.equal(phn_61_tensor[p], phn_39_tensor))))
        return tf.cond(tf.not_equal(val, ''), f1, f2)
    
    indices = tf.to_int64(tf.where(tf.logical_and(tf.not_equal(predicted_ids[:,:,0], -1), tf.not_equal(predicted_ids[:,:,0], eos_token_idx))))
    vals = tf.to_int32(tf.gather_nd(predicted_ids[:,:,0], indices))
    shape = tf.to_int64(tf.shape(predicted_ids[:,:,0]))
    decoded_sparse = tf.SparseTensor(indices, tf.map_fn(map_to_reduced_phn, vals), shape)
    
    indices = tf.to_int64(tf.where(tf.not_equal(targets_out, eos_token_idx)))
    vals = tf.to_int32(tf.gather_nd(targets_out, indices))
    shape = tf.to_int64(tf.shape(targets_out))
    targets_out_sparse = tf.SparseTensor(indices, tf.map_fn(map_to_reduced_phn, vals), shape)

    per = tf.reduce_sum(tf.edit_distance(decoded_sparse, targets_out_sparse, normalize=False)) / tf.to_float(tf.size(targets_out_sparse.values))
    
saver = tf.train.Saver()

In [6]:
with tf.Session() as sess:
    sess.run(mapping_table.init)
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(epochs):
        start = time.time()
        batchRandIdx = np.random.permutation(len(train_batchedData))
        
        for batch, batchOrgI in enumerate(batchRandIdx):
            batchInputs, batchTargetIn, batchTargetOut, sourceSeqLen, targetSeqLen = train_batchedData[batchOrgI]
            feedDict = {inputs: batchInputs, targets_in: batchTargetIn, targets_out: batchTargetOut,
                        source_seq_len: sourceSeqLen, target_seq_len: targetSeqLen, keep_prob: 0.6}
            update_step.run(feed_dict=feedDict)
            
            if batch == len(batchRandIdx) - 1:
                feedDict[keep_prob] = 1.0
                train_cost = sess.run(loss, feedDict)
            
        test_cost = 0
        for i in range(len(test_batchedData)):
            batchInputs, batchTargetIn, batchTargetOut, sourceSeqLen, targetSeqLen = test_batchedData[i]
            feedDict = {inputs: batchInputs, targets_in: batchTargetIn, targets_out: batchTargetOut,
                        source_seq_len: sourceSeqLen, target_seq_len: targetSeqLen, keep_prob: 1.0}
            
            batch_cost = sess.run(loss, feedDict)
            test_cost += batch_cost
        test_cost /= len(test_batchedData)
        
        end = time.time()
        log = "Epoch {}/{}, train_cost={:.3f}, test_cost={:.3f}, time = {:.0f}s"
        print(log.format(epoch+1, epochs, train_cost, test_cost, end-start))
        
        if (epoch+1) % 5 == 0:
            save_path = saver.save(sess, 'model/seq2seq/model.ckpt')
            #print('Epoch {} - model saved in file: {}'.format(epoch+1, save_path))

Epoch 1/100, train_cost=109.563, test_cost=151.554, time = 59s
Epoch 2/100, train_cost=81.121, test_cost=144.629, time = 58s
Epoch 3/100, train_cost=164.809, test_cost=133.549, time = 58s
Epoch 4/100, train_cost=153.571, test_cost=124.007, time = 58s
Epoch 5/100, train_cost=94.404, test_cost=120.609, time = 58s
Epoch 6/100, train_cost=175.919, test_cost=117.658, time = 59s
Epoch 7/100, train_cost=91.327, test_cost=113.345, time = 58s
Epoch 8/100, train_cost=92.332, test_cost=110.889, time = 59s
Epoch 9/100, train_cost=132.245, test_cost=106.820, time = 58s
Epoch 10/100, train_cost=84.041, test_cost=104.936, time = 58s
Epoch 11/100, train_cost=91.236, test_cost=96.582, time = 59s
Epoch 12/100, train_cost=110.556, test_cost=89.209, time = 59s
Epoch 13/100, train_cost=65.421, test_cost=85.337, time = 59s
Epoch 14/100, train_cost=59.316, test_cost=79.227, time = 59s
Epoch 15/100, train_cost=53.269, test_cost=71.181, time = 58s
Epoch 16/100, train_cost=91.395, test_cost=65.152, time = 59s
E

KeyboardInterrupt: 

In [7]:
with tf.Session() as sess:
    saver.restore(sess, 'model/seq2seq/model.ckpt')
    sess.run(mapping_table.init)
    
    start = time.time()
    train_cost = train_per = 0
    for i in range(len(train_batchedData)):
            batchInputs, batchTargetIn, batchTargetOut, sourceSeqLen, targetSeqLen = train_batchedData[i]
            feedDict = {inputs: batchInputs, targets_in: batchTargetIn, targets_out: batchTargetOut,
                        source_seq_len: sourceSeqLen, target_seq_len: targetSeqLen, keep_prob: 1.0}
            batch_cost, batch_per = sess.run([loss, per], feedDict)
            train_cost += batch_cost
            train_per += batch_per
    
    test_cost = test_per = 0
    for i in range(len(test_batchedData)):
            batchInputs, batchTargetIn, batchTargetOut, sourceSeqLen, targetSeqLen = test_batchedData[i]
            feedDict = {inputs: batchInputs, targets_in: batchTargetIn, targets_out: batchTargetOut,
                        source_seq_len: sourceSeqLen, target_seq_len: targetSeqLen, keep_prob: 1.0}
            batch_cost, batch_per = sess.run([loss, per], feedDict)
            test_cost += batch_cost
            test_per += batch_per
            
    train_cost /= len(train_batchedData)
    train_per /= len(train_batchedData)
    test_cost /= len(test_batchedData)
    test_per /= len(test_batchedData)
    
    end = time.time()
    log = "Epoch {}/{}, train_cost={:.3f}, train_per={:.3f}, test_cost={:.3f}, test_per={:.3f}, time = {:.0f}s"
    print(log.format(epoch+1, epochs, train_cost, train_per, test_cost, test_per, end-start))

INFO:tensorflow:Restoring parameters from model/seq2seq/model.ckpt
Epoch 61/100, train_cost=11.305, train_per=0.069, test_cost=37.788, test_per=0.244, time = 229s
