# Todo
- Add batch normalization (batchnorm).
- Prepare each batch by slicing the full data set (`data[start:end]`).

In [1]:
import numpy as np
import tensorflow as tf
import os, time

In [2]:
# Directories holding the feature files ('feats') and the label files ('labels')
# for the training and the test split, all relative to the working directory.
train_mfccPath, train_labelPath = (
    os.path.join('data', 'train', sub) for sub in ('feats', 'labels')
)
test_mfccPath, test_labelPath = (
    os.path.join('data', 'test', sub) for sub in ('feats', 'labels')
)

In [3]:
def load_batched_data(mfccPath, labelPath, batchSize):
    ''' Load every feature/label file pair and group them into fixed-size, zero-padded batches.

        Args:
            mfccPath: directory whose files are loaded with np.load; each array is
                indexed as (timeSteps, nFeatures).
            labelPath: directory whose files are loaded with np.load; each array is
                iterated as one label sequence.
            batchSize: number of samples per batch (must be >= 1).

        Returns:
            3-element tuple: batched data (list), maxTimeLength (int), total number of samples (int).
            Each element of the batched data is a tuple
            (batchInputs, (indices, values, shape), batchSeqLengths) where
            batchInputs has shape (batchSize, maxLength, nFeatures), the middle
            element is the sparse representation of the batch's label sequences
            and batchSeqLengths holds the unpadded length of every input.

        Raises:
            ValueError: if batchSize < 1 or the two directories hold a different
                number of files.

        Notes:
            * Files are paired by *sorted* file name. os.listdir() returns names in
              arbitrary order, so the two listings must be ordered explicitly or a
              feature file may be matched with another utterance's labels.
            * Samples are shuffled once here; a trailing group smaller than
              batchSize is dropped, but the returned total still counts all files.
    '''

    def list_to_sparse_tensor(targetList):
        # (row, position) coordinate of every label in the batch.
        indices = [[tI, seqI]
                   for tI, target in enumerate(targetList)
                   for seqI in range(len(target))]
        vals = [val for target in targetList for val in target]
        # Dense shape is (number of sequences, longest sequence). Computed from the
        # lengths so that a batch made only of empty targets does not crash.
        maxTargetLength = max((len(target) for target in targetList), default=0)
        shape = [len(targetList), maxTargetLength]
        # reshape keeps the (N, 2) layout even when there are no labels at all.
        return (np.array(indices, dtype=np.int64).reshape(-1, 2),
                np.array(vals),
                np.array(shape, dtype=np.int64))

    def data_lists_to_batches(inputList, targetList, batchSize):
        nFeatures = inputList[0].shape[1]
        # Every batch is padded to the longest input of the whole data set.
        maxLength = max(inp.shape[0] for inp in inputList)

        randIdx = np.random.permutation(len(inputList))
        start, end = (0, batchSize)
        dataBatches = []

        # Only full batches are produced; a shorter remainder is skipped.
        while end <= len(inputList):
            batchSeqLengths = np.zeros(batchSize)
            batchInputs = np.zeros((batchSize, maxLength, nFeatures))
            batchTargetList = []
            for batchI, origI in enumerate(randIdx[start:end]):
                nFrames = inputList[origI].shape[0]
                batchSeqLengths[batchI] = nFrames
                # The buffer is already zero-filled, so writing the leading frames
                # is equivalent to zero-padding the sample at its end.
                batchInputs[batchI, :nFrames, :] = inputList[origI]
                batchTargetList.append(targetList[origI])
            dataBatches.append((batchInputs, list_to_sparse_tensor(batchTargetList), batchSeqLengths))
            start += batchSize
            end += batchSize
        return (dataBatches, maxLength)

    if batchSize < 1:
        # A zero batch size would make the batching loop above spin forever.
        raise ValueError('batchSize must be at least 1, got {}'.format(batchSize))

    mfccFiles = sorted(os.listdir(mfccPath))
    labelFiles = sorted(os.listdir(labelPath))
    if len(mfccFiles) != len(labelFiles):
        raise ValueError('{} feature files in {!r} but {} label files in {!r}'.format(
            len(mfccFiles), mfccPath, len(labelFiles), labelPath))

    inputList = [np.load(os.path.join(mfccPath, fn)) for fn in mfccFiles]
    targetList = [np.load(os.path.join(labelPath, fn)) for fn in labelFiles]
    return data_lists_to_batches(inputList, targetList, batchSize) + (len(mfccFiles),)

In [4]:
# --- data ---
batchSize = 128        # samples per training batch
num_features = 39      # size of the last axis of the network input

# --- model ---
n_hidden_layer = 3     # stacked recurrent layers per direction
num_hidden = 128       # units in every recurrent cell
num_classes = 39 + 1   # presumably 39 labels plus one extra class for the CTC blank -- TODO confirm

# --- optimisation ---
learning_rate = 0.001

In [5]:
# Build the batches once up front; the test split uses a fixed batch size of 64.
train_batchedData, train_maxTimeSteps, train_totalN = load_batched_data(
    mfccPath=train_mfccPath, labelPath=train_labelPath, batchSize=batchSize)
test_batchedData, test_maxTimeSteps, test_totalN = load_batched_data(
    mfccPath=test_mfccPath, labelPath=test_labelPath, batchSize=64)

In [6]:
# ---- Graph inputs ----
# Network input; the code below treats axis 0 as batch and axis 1 as time
# (see the reshape/transpose of the logits), the last axis has num_features entries.
inputs = tf.placeholder(tf.float32, [None, None, num_features])
# The label sequences are fed as the three components of a sparse tensor.
targetIdx = tf.placeholder(tf.int64)
targetVals = tf.placeholder(tf.int32)
targetShape = tf.placeholder(tf.int64)
targets = tf.SparseTensor(targetIdx, targetVals, targetShape)
# One length per batch entry; shared by the RNN, the CTC loss and the decoder.
seq_len = tf.placeholder(tf.int32, [None])

# ---- Stacked bidirectional LSTM: n_hidden_layer cells of num_hidden units per direction ----
cells_fw = [tf.nn.rnn_cell.LSTMCell(num_hidden) for _ in range(n_hidden_layer)]
cells_bw = [tf.nn.rnn_cell.LSTMCell(num_hidden) for _ in range(n_hidden_layer)]
outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, dtype=tf.float32, sequence_length=seq_len)

# ---- Per-frame linear projection to class scores ----
# Flatten the first two axes so a single matmul projects every frame, then restore them.
shape = tf.shape(outputs)
outputs = tf.reshape(outputs, [-1, shape[2]])
# num_hidden*2 input rows: presumably the forward and backward outputs concatenated -- TODO confirm.
W = tf.Variable(tf.truncated_normal([num_hidden*2, num_classes], stddev=0.1))
b = tf.Variable(tf.constant(0., shape=[num_classes]))
logits = tf.matmul(outputs, W) + b
logits = tf.reshape(logits, [shape[0], shape[1], num_classes])
# Swap the first two axes because the CTC loss below is called with time_major=True.
logits = tf.transpose(logits, [1,0,2])

# ---- CTC loss, averaged over the batch ----
loss = tf.nn.ctc_loss(labels=targets, inputs=logits, sequence_length=seq_len, time_major=True)
cost = tf.reduce_mean(loss)
# ---- Adam update on gradients clipped to a global norm of 1 ----
params = tf.trainable_variables()
gradients = tf.gradients(cost, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1)
optimizer = tf.train.AdamOptimizer(learning_rate)
update_step = optimizer.apply_gradients(zip(clipped_gradients, params))
# ---- Error rate: summed (unnormalized) edit distance between the greedy decoding
# and the targets, divided by the total number of target labels in the batch ----
decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)
per = tf.reduce_sum(tf.edit_distance(tf.to_int32(decoded[0]), targets, normalize=False)) / tf.to_float(tf.size(targets.values))

In [7]:
epochs = 120


def _feed_for(batch_entry):
    """Map one (inputs, sparse targets, sequence lengths) batch onto the graph placeholders."""
    batch_inputs, (sparse_idx, sparse_vals, sparse_shape), batch_lengths = batch_entry
    return {inputs: batch_inputs, targetIdx: sparse_idx, targetVals: sparse_vals,
            targetShape: sparse_shape, seq_len: batch_lengths}


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(epochs):
        start = time.time()

        # One optimisation step per training batch, visiting the batches in a
        # fresh random order every epoch.
        last_feed = None
        for batchOrgI in np.random.permutation(len(train_batchedData)):
            last_feed = _feed_for(train_batchedData[batchOrgI])
            sess.run(update_step, feed_dict=last_feed)

        # The reported training metrics come from the final batch only, evaluated
        # after its update has been applied; they stay 0 when there are no batches.
        if last_feed is None:
            train_cost = train_per = 0
        else:
            train_cost, train_per = sess.run([cost, per], last_feed)

        # Test metrics are the plain mean over all test batches.
        test_costs, test_pers = [], []
        for test_batch in test_batchedData:
            batch_cost, batch_per = sess.run([cost, per], _feed_for(test_batch))
            test_costs.append(batch_cost)
            test_pers.append(batch_per)

        test_cost = sum(test_costs) / len(test_batchedData)
        test_per = sum(test_pers) / len(test_batchedData)

        end = time.time()
        log = "Epoch {}/{}, train_cost={:.3f}, train_per={:.3f}, test_cost={:.3f}, test_per={:.3f}, time = {:.0f}s"
        print(log.format(epoch+1, epochs, train_cost, train_per, test_cost, test_per, end-start))

Epoch 1/120, train_cost=124.297, train_per=1.000, test_cost=125.121, test_per=1.000, time = 114s
Epoch 2/120, train_cost=109.059, train_per=0.999, test_cost=109.020, test_per=1.000, time = 115s
Epoch 3/120, train_cost=81.753, train_per=0.858, test_cost=83.085, test_per=0.856, time = 116s
Epoch 4/120, train_cost=66.654, train_per=0.728, test_cost=67.788, test_per=0.744, time = 115s
Epoch 5/120, train_cost=59.508, train_per=0.658, test_cost=60.095, test_per=0.670, time = 116s
Epoch 6/120, train_cost=51.146, train_per=0.614, test_cost=54.804, test_per=0.634, time = 116s
Epoch 7/120, train_cost=44.707, train_per=0.502, test_cost=50.538, test_per=0.538, time = 116s
Epoch 8/120, train_cost=44.400, train_per=0.432, test_cost=47.478, test_per=0.454, time = 116s
Epoch 9/120, train_cost=39.020, train_per=0.366, test_cost=45.082, test_per=0.396, time = 116s
Epoch 10/120, train_cost=38.661, train_per=0.339, test_cost=43.659, test_per=0.365, time = 116s
Epoch 11/120, train_cost=36.629, train_per=0.

KeyboardInterrupt: 