# Connectionist Temporal Classification

In [1]:
import numpy as np
import tensorflow as tf
import os, time

In [2]:
# --- Phoneme inventories -------------------------------------------------
# Full 61-symbol label set; a symbol's list position is its integer class id.
phn_61 = [
    'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b',
    'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en',
    'eng', 'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv',
    'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng',
    'nx', 'ow', 'oy', 'p', 'pau', 'pcl', 'q', 'r', 's', 'sh',
    't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v', 'w', 'y', 'z',
    'zh',
]

# Reduced 39-symbol set used when scoring.
phn_39 = [
    'ae', 'ao', 'aw', 'ax', 'ay', 'b', 'ch', 'd', 'dh', 'dx',
    'eh', 'er', 'ey', 'f', 'g', 'h#', 'hh', 'ix', 'iy', 'jh',
    'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's',
    't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh',
]

# Folding rules: every phn_61 symbol that is NOT itself in phn_39 maps to the
# phn_39 symbol it is merged into.
mapping = {
    # vowels
    'ah': 'ax', 'ax-h': 'ax', 'ux': 'uw', 'aa': 'ao', 'ih': 'ix', 'axr': 'er',
    # syllabic consonants / nasals
    'el': 'l', 'em': 'm', 'en': 'n', 'nx': 'n', 'eng': 'ng',
    # fricatives
    'sh': 'zh', 'hv': 'hh',
    # closures, glottal stop and pauses all collapse to the 'h#' symbol
    'bcl': 'h#', 'pcl': 'h#', 'dcl': 'h#', 'tcl': 'h#', 'gcl': 'h#', 'kcl': 'h#',
    'q': 'h#', 'epi': 'h#', 'pau': 'h#',
}

# --- Input features ------------------------------------------------------
feat_type = 'mfcc'
if feat_type == 'mfcc':
    feats_dim = 39
else:
    feats_dim = 123 # filter bank

# --- Data locations ------------------------------------------------------
train_featPath = os.path.join('data', 'train', 'feats', 'mfcc')
train_labelPath = os.path.join('data', 'train', 'labels')
test_featPath = os.path.join('data', 'test', 'feats', 'mfcc')
test_labelPath = os.path.join('data', 'test', 'labels')

# --- Model / training hyper-parameters -----------------------------------
num_classes = len(phn_61) + 1 # num of phoneme + blank
num_hidden = 128
n_hidden_layer = 3
batchSize = 128
learning_rate = 0.001
epochs = 50

In [3]:
def load_batched_data(featPath, labelPath, batchSize):
    ''' Load every array under featPath / labelPath and group them into zero-padded batches.

        Both directory listings are sorted by file name and then paired by position, so the
        i-th feature file is always matched with the i-th label file (os.listdir on its own
        returns entries in arbitrary order, which could pair an utterance with the wrong labels).

        Args:
            featPath: directory of np.load-able 2-D arrays, one per sample, shaped (timeSteps, nFeatures).
            labelPath: directory of np.load-able label sequences, one per sample.
            batchSize: maximum number of samples per batch; the final batch may hold fewer.

        Returns:
            3-element tuple: batched data (list), maxTimeLengths (list), total number of samples (int).
            Each element of the batched data is (batchInputs, sparseTargets, batchSeqLengths):
              * batchInputs has shape (samplesInBatch, maxLength, nFeatures), zero padded along time,
              * sparseTargets is an (indices, values, dense_shape) triple of numpy arrays,
              * batchSeqLengths holds the unpadded number of time steps of each sample.
            An empty feature directory yields ([], [], 0).

        Raises:
            ValueError: if batchSize is smaller than 1.
    '''
    def list_to_sparse_tensor(targetList):
        ''' Convert a list of label sequences into an (indices, values, dense_shape) triple. '''
        indices = []
        vals = []
        maxTargetLen = 0
        for tI, target in enumerate(targetList):
            for seqI, val in enumerate(target):
                indices.append([tI, seqI])
                vals.append(val)
                maxTargetLen = max(maxTargetLen, seqI + 1)
        # reshape keeps the (N, 2) layout even when every target in the batch is empty
        indices = np.array(indices, dtype=np.int64).reshape(-1, 2)
        shape = np.array([len(targetList), maxTargetLen], dtype=np.int64)
        return (indices, np.array(vals), shape)

    def data_lists_to_batches(inputList, targetList, batchSize):
        ''' Sort samples by length, then cut them into consecutive batches of at most batchSize. '''
        if batchSize < 1:
            raise ValueError('batchSize must be >= 1, got {}'.format(batchSize))
        if not inputList:
            return ([], [])

        nFeatures = inputList[0].shape[1]
        # Sorting by number of time steps puts similar lengths together, which minimises padding.
        pairs = sorted(zip(inputList, targetList), key=lambda pair: pair[0].shape[0])

        dataBatches = []
        maxLengths = []
        # Stepping with range() also emits the trailing partial batch, including the case
        # where the whole data set is smaller than a single batch.
        for start in range(0, len(pairs), batchSize):
            chunk = pairs[start:start + batchSize]
            lengths = [inp.shape[0] for inp, _ in chunk]
            maxLength = max(lengths)
            maxLengths.append(maxLength)

            batchInputs = np.zeros((len(chunk), maxLength, nFeatures))
            for batchI, (inp, _) in enumerate(chunk):
                batchInputs[batchI, :inp.shape[0], :] = inp

            batchSeqLengths = np.array(lengths, dtype=np.float64)
            batchTargetList = [tar for _, tar in chunk]
            dataBatches.append((batchInputs, list_to_sparse_tensor(batchTargetList), batchSeqLengths))
        return (dataBatches, maxLengths)

    featFiles = sorted(os.listdir(featPath))
    labelFiles = sorted(os.listdir(labelPath))
    feats = [np.load(os.path.join(featPath, fn)) for fn in featFiles]
    labels = [np.load(os.path.join(labelPath, fn)) for fn in labelFiles]
    return data_lists_to_batches(feats, labels, batchSize) + (len(featFiles),)

In [4]:
# Read both splits from disk once and keep the padded batches in memory.
train_batchedData, train_maxTimeLengths, train_totalN = load_batched_data(
    featPath=train_featPath,
    labelPath=train_labelPath,
    batchSize=batchSize,
)
test_batchedData, test_maxTimeLengths, test_totalN = load_batched_data(
    featPath=test_featPath,
    labelPath=test_labelPath,
    batchSize=batchSize,
)

In [5]:
# ---- Lookup structures used to fold 61-symbol ids into 39-symbol ids ----
phn_61_tensor = tf.constant(phn_61, dtype=tf.string)
phn_39_tensor = tf.constant(phn_39, dtype=tf.string)
# String -> string table built from `mapping`; keys that are not in `mapping` come back as ''.
mapping_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(list(mapping.keys()), list(mapping.values())), default_value='')

# ---- Graph inputs ----
inputs = tf.placeholder(tf.float32, [None, None, feats_dim]) # shape=[batch, max_time, features]
targets = tf.sparse_placeholder(tf.int32)   # label sequences, fed as (indices, values, dense_shape)
seq_len = tf.placeholder(tf.int32, [None])  # one length per batch entry
keep_prob = tf.placeholder(tf.float32)      # dropout keep probability (fed as 0.6 for training, 1.0 for evaluation)

# ---- Encoder: n_hidden_layer stacked bidirectional GRU layers, dropout on each cell's output ----
cells_fw = [tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(num_hidden), output_keep_prob=keep_prob) for _ in range(n_hidden_layer)]
cells_bw = [tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(num_hidden), output_keep_prob=keep_prob) for _ in range(n_hidden_layer)]
outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, dtype=tf.float32, sequence_length=seq_len)

# ---- Per-frame linear projection onto num_classes scores ----
# Collapse batch and time into one axis so a single matmul handles every frame, then restore them.
# NOTE(review): W and b are created without explicit names, so the keys written by `saver`
# presumably depend on variable creation order -- confirm before reordering this cell.
shape = tf.shape(outputs)
outputs = tf.reshape(outputs, [-1, shape[2]])
# num_hidden*2 input rows: forward and backward outputs of the last layer side by side
W = tf.Variable(tf.truncated_normal([num_hidden*2, num_classes], stddev=0.1))
b = tf.Variable(tf.constant(0., shape=[num_classes]))
logits = tf.matmul(outputs, W) + b
logits = tf.reshape(logits, [shape[0], shape[1], num_classes])
logits = tf.transpose(logits, [1,0,2]) # time major shape

# ---- Objective and optimiser: batch-mean CTC loss, Adam, gradients clipped to global norm 1.0 ----
loss = tf.reduce_mean(tf.nn.ctc_loss(labels=targets, inputs=logits, sequence_length=seq_len, time_major=True))
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
optimizer = tf.train.AdamOptimizer(learning_rate)
update_step = optimizer.apply_gradients(zip(clipped_gradients, params))
# ---- Greedy decoding; only the first returned sparse tensor is kept, cast to int32 ----
decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)
decoded = tf.to_int32(decoded[0])

def map_to_reduced_phn(p):
    """Translate one index into `phn_61` to the matching index into `phn_39`.

    The symbol `phn_61_tensor[p]` is looked up in `mapping_table`. If the table
    returns a non-empty string, the position of that string in `phn_39_tensor`
    is returned; otherwise the position of the original symbol in
    `phn_39_tensor` is returned.

    Args:
        p: scalar integer tensor indexing `phn_61_tensor`.

    Returns:
        Scalar int32 tensor.
    """
    val = mapping_table.lookup(phn_61_tensor[p])
    # f1: symbol is folded -> locate its replacement in phn_39
    f1 = lambda: tf.to_int32(tf.reduce_min(tf.where(tf.equal(val, phn_39_tensor))))
    # f2: symbol has no entry in the table -> locate the symbol itself in phn_39
    f2 = lambda: tf.to_int32(tf.reduce_min(tf.where(tf.equal(phn_61_tensor[p], phn_39_tensor))))
    return tf.cond(tf.not_equal(val, ''), f1, f2)
        
# ---- Phoneme error rate, measured after folding both sides to the 39-symbol set ----
# map_fn applies the scalar mapping to every stored label value; indices and dense shape are reused.
decoded_reduced = tf.SparseTensor(decoded.indices, tf.map_fn(map_to_reduced_phn, decoded.values), decoded.dense_shape)
targets_reduced = tf.SparseTensor(targets.indices, tf.map_fn(map_to_reduced_phn, targets.values), targets.dense_shape)
# Sum of raw (un-normalised) edit distances over the batch divided by the number of target labels in the batch.
per = tf.reduce_sum(tf.edit_distance(decoded_reduced, targets_reduced, normalize=False)) / tf.to_float(tf.size(targets_reduced.values))

# Used by the later cells to write and restore 'model/model.ckpt'.
saver = tf.train.Saver()

In [6]:
# Train for `epochs` passes over the training batches, reporting the CTC cost after each
# pass and writing a checkpoint every 10th epoch.
with tf.Session() as sess:
    sess.run(mapping_table.init)
    sess.run(tf.global_variables_initializer())

    # Saver.save fails when the checkpoint's parent directory is missing; create it now
    # instead of discovering that only after the first 10 epochs of training.
    os.makedirs('model', exist_ok=True)

    for epoch in range(epochs):
        start = time.time()
        # Visit the pre-built batches in a fresh random order each epoch.
        batchRandIdx = np.random.permutation(len(train_batchedData))

        # Stays NaN only if there are no training batches at all.
        train_cost = float('nan')
        for batch, batchOrgI in enumerate(batchRandIdx):
            batchInputs, batchTargetSparse, batchSeqLengths = train_batchedData[batchOrgI]
            feedDict = {inputs: batchInputs, targets: batchTargetSparse, seq_len: batchSeqLengths, keep_prob: 0.6}
            update_step.run(feed_dict=feedDict)

            # The reported train_cost is the loss of the LAST batch of the epoch only,
            # re-evaluated with dropout switched off. Because batches hold sequences of
            # similar length and the order is shuffled, this figure is noisy across epochs.
            if batch == len(batchRandIdx) - 1:
                feedDict[keep_prob] = 1.0
                train_cost = sess.run(loss, feedDict)

        # Mean per-batch loss over the whole test split, dropout off.
        test_cost = 0
        for batchInputs, batchTargetSparse, batchSeqLengths in test_batchedData:
            feedDict = {inputs: batchInputs, targets: batchTargetSparse, seq_len: batchSeqLengths, keep_prob: 1.0}
            test_cost += sess.run(loss, feedDict)
        # max(..., 1) avoids a ZeroDivisionError when the test split produced no batches.
        test_cost /= max(len(test_batchedData), 1)

        end = time.time()
        log = "Epoch {}/{}, train_cost={:.3f}, test_cost={:.3f}, time = {:.0f}s"
        print(log.format(epoch+1, epochs, train_cost, test_cost, end-start))

        if (epoch+1) % 10 == 0:
            saver.save(sess, 'model/model.ckpt')

Epoch 1/50, train_cost=135.213, test_cost=154.960, time = 53s
Epoch 2/50, train_cost=99.441, test_cost=124.134, time = 53s
Epoch 3/50, train_cost=82.660, test_cost=104.870, time = 54s
Epoch 4/50, train_cost=76.198, test_cost=94.064, time = 55s
Epoch 5/50, train_cost=128.602, test_cost=85.950, time = 55s
Epoch 6/50, train_cost=56.312, test_cost=79.640, time = 54s
Epoch 7/50, train_cost=91.204, test_cost=75.002, time = 55s
Epoch 8/50, train_cost=107.524, test_cost=72.128, time = 55s
Epoch 9/50, train_cost=40.024, test_cost=68.252, time = 55s
Epoch 10/50, train_cost=71.514, test_cost=66.389, time = 54s
Epoch 11/50, train_cost=67.001, test_cost=62.702, time = 53s
Epoch 12/50, train_cost=58.535, test_cost=60.919, time = 53s
Epoch 13/50, train_cost=41.886, test_cost=57.760, time = 53s
Epoch 14/50, train_cost=47.788, test_cost=55.223, time = 53s
Epoch 15/50, train_cost=49.230, test_cost=53.121, time = 53s
Epoch 16/50, train_cost=67.848, test_cost=51.610, time = 53s
Epoch 17/50, train_cost=22.

KeyboardInterrupt: 

In [7]:
def _evaluate(sess, batchedData):
    """Return (mean loss, mean PER) over `batchedData`, with dropout disabled.

    Both figures are unweighted means of the per-batch values. Returns
    (nan, nan) when `batchedData` is empty.
    """
    if not batchedData:
        return float('nan'), float('nan')
    total_cost = total_per = 0
    for batchInputs, batchTargetSparse, batchSeqLengths in batchedData:
        feedDict = {inputs: batchInputs, targets: batchTargetSparse, seq_len: batchSeqLengths, keep_prob: 1.0}
        batch_cost, batch_per = sess.run([loss, per], feedDict)
        total_cost += batch_cost
        total_per += batch_per
    return total_cost / len(batchedData), total_per / len(batchedData)


# Score the saved checkpoint on both splits.
with tf.Session() as sess:
    saver.restore(sess, 'model/model.ckpt')
    sess.run(mapping_table.init)

    start = time.time()
    train_cost, train_per = _evaluate(sess, train_batchedData)
    test_cost, test_per = _evaluate(sess, test_batchedData)
    end = time.time()

    # The epoch counter is deliberately not printed: it is a leftover variable from the
    # training cell (undefined on a fresh kernel) and does not identify the epoch at which
    # the restored checkpoint was written.
    log = "Restored model: train_cost={:.3f}, train_per={:.3f}, test_cost={:.3f}, test_per={:.3f}, time = {:.0f}s"
    print(log.format(train_cost, train_per, test_cost, test_per, end-start))

INFO:tensorflow:Restoring parameters from model/model.ckpt
Epoch 41/50, train_cost=24.743, train_per=0.164, test_cost=42.794, test_per=0.260, time = 213s
