# Todo
- Add batch normalization (batchnorm)
- Prepare batches by slicing the full data set (`data[start:end]`)

In [1]:
import numpy as np
import tensorflow as tf
import os, time

In [2]:
# Dataset layout on disk: data/<split>/feats holds the feature files and
# data/<split>/labels holds the matching label files.
train_mfccPath, train_labelPath = (os.path.join('data', 'train', kind) for kind in ('feats', 'labels'))
test_mfccPath, test_labelPath = (os.path.join('data', 'test', kind) for kind in ('feats', 'labels'))

In [3]:
def load_batched_data(mfccPath, labelPath, batchSize):
    ''' Load per-sample feature/label .npy files and group them into padded batches.

        Feature and label files are paired by sorted file name, so both
        directories are expected to contain the same number of files with
        matching names.

        Args:
            mfccPath:  directory of feature files, each a 2-D array (time, nFeatures)
            labelPath: directory of label files, each a 1-D label sequence
            batchSize: number of samples per batch (must be positive)

        Returns a 3-element tuple:
            batched data (list), maxTimeLength (int), total number of samples (int)

        Each element of the batched data is a tuple
            (batchInputs, (indices, values, shape), batchSeqLengths)
        where batchInputs has shape (batchSize, maxLength, nFeatures) and is
        zero-padded along the time axis, the middle tuple is the sparse
        representation of the batch's label sequences, and batchSeqLengths
        holds the unpadded length of every sample.

        Samples are shuffled before batching.  A trailing group smaller than
        batchSize is dropped, so the returned total number of samples (a count
        of the files on disk) can exceed the number of samples in the batches.

        Raises:
            ValueError: if batchSize is not positive, if no feature files are
                found, or if the two directories hold different numbers of files.
    '''

    def list_to_sparse_tensor(targetList):
        ''' Convert a list of label sequences to (indices, values, dense shape) arrays. '''
        indices = [[tI, seqI] for tI, target in enumerate(targetList) for seqI in range(len(target))]
        vals = [val for target in targetList for val in target]
        # Widest label sequence in the batch; 0 when every sequence is empty.
        maxTargetLength = max((len(target) for target in targetList), default=0)
        shape = [len(targetList), maxTargetLength]
        # reshape keeps the (N, 2) layout even when there are no entries at all.
        return (np.array(indices, dtype=np.int64).reshape(-1, 2),
                np.array(vals),
                np.array(shape, dtype=np.int64))

    def data_lists_to_batches(inputList, targetList, batchSize):
        ''' Shuffle the samples and pack them into zero-padded batches of batchSize. '''
        nFeatures = inputList[0].shape[1]
        # Every batch is padded to the longest sequence of the whole data set.
        maxLength = max(inp.shape[0] for inp in inputList)

        randIdx = np.random.permutation(len(inputList))
        dataBatches = []

        # Only full batches are produced; a trailing partial batch is skipped.
        for start in range(0, len(inputList) - batchSize + 1, batchSize):
            batchSeqLengths = np.zeros(batchSize)
            batchInputs = np.zeros((batchSize, maxLength, nFeatures))
            batchTargetList = []
            for batchI, origI in enumerate(randIdx[start:start + batchSize]):
                seqLength = inputList[origI].shape[0]
                batchSeqLengths[batchI] = seqLength
                # The rest of the time axis stays zero, i.e. constant zero padding.
                batchInputs[batchI, :seqLength, :] = inputList[origI]
                batchTargetList.append(targetList[origI])
            dataBatches.append((batchInputs, list_to_sparse_tensor(batchTargetList), batchSeqLengths))
        return (dataBatches, maxLength)

    if batchSize <= 0:
        raise ValueError('batchSize must be positive, got {}'.format(batchSize))

    # os.listdir returns entries in arbitrary order; sort both listings so that
    # the i-th feature file is paired with the i-th label file.
    mfccFiles = sorted(os.listdir(mfccPath))
    labelFiles = sorted(os.listdir(labelPath))
    if not mfccFiles:
        raise ValueError('no feature files found in {}'.format(mfccPath))
    if len(mfccFiles) != len(labelFiles):
        raise ValueError('found {} feature files in {} but {} label files in {}'.format(
            len(mfccFiles), mfccPath, len(labelFiles), labelPath))

    inputList = [np.load(os.path.join(mfccPath, fn)) for fn in mfccFiles]
    targetList = [np.load(os.path.join(labelPath, fn)) for fn in labelFiles]
    return data_lists_to_batches(inputList, targetList, batchSize) + (len(mfccFiles),)

In [4]:
# --- Data ---
batchSize = 128           # samples per mini-batch
num_features = 39         # size of the last axis of the `inputs` placeholder

# --- Model ---
num_hidden = 128          # LSTM units per direction
num_classes = 39 + 1      # 39 label classes plus one extra class (the CTC blank)
n_hidden_layer = None     # not referenced by the cells shown here

# --- Optimisation ---
learning_rate = 0.001     # step size handed to the Adam optimizer

In [5]:
# Read both splits from disk once and pre-pack them into padded batches.
# Each call yields (list of batches, longest sequence length, sample count).
train_batchedData, train_maxTimeSteps, train_totalN = load_batched_data(
    mfccPath=train_mfccPath,
    labelPath=train_labelPath,
    batchSize=batchSize,
)
test_batchedData, test_maxTimeSteps, test_totalN = load_batched_data(
    mfccPath=test_mfccPath,
    labelPath=test_labelPath,
    batchSize=batchSize,
)

In [6]:
# ---- Graph inputs ----
# Feature batch; fed with the zero-padded (batchSize, maxLength, nFeatures)
# arrays produced by load_batched_data.
inputs = tf.placeholder(tf.float32, [None, None, num_features])
# The label sequences are fed as the three components of a sparse tensor:
# indices, values and dense shape.
targetIdx = tf.placeholder(tf.int64)
targetVals = tf.placeholder(tf.int32)
targetShape = tf.placeholder(tf.int64)
targets = tf.SparseTensor(targetIdx, targetVals, targetShape)
# Unpadded length of every sequence in the batch.
seq_len = tf.placeholder(tf.int32, [None])

# ---- Model: one bidirectional LSTM layer ----
cells_fw = [tf.nn.rnn_cell.LSTMCell(num_hidden)]
cells_bw = [tf.nn.rnn_cell.LSTMCell(num_hidden)]
outputs, output_state_fw, output_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
                                                                                           sequence_length=seq_len, dtype=tf.float32)

# ---- Per-frame linear projection to class logits ----
# Flatten (batch, time) into one axis so a single matmul handles every frame;
# W expects num_hidden*2 inputs (forward and backward outputs combined).
shape = tf.shape(outputs)
outputs = tf.reshape(outputs, [-1, shape[2]])
W = tf.Variable(tf.truncated_normal([num_hidden*2, num_classes], stddev=0.1))
b = tf.Variable(tf.constant(0., shape=[num_classes]))
logits = tf.matmul(outputs, W) + b
# Restore the (batch, time, classes) layout, then swap the first two axes
# because ctc_loss below is called with time_major=True.
logits = tf.reshape(logits, [shape[0], shape[1], num_classes])
logits = tf.transpose(logits, [1,0,2])

# ---- Loss, optimizer and error metric ----
loss = tf.nn.ctc_loss(labels=targets, inputs=logits, sequence_length=seq_len, time_major=True)
cost = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)
# Error rate: summed (unnormalized) edit distance between the greedy decoding
# and the targets, divided by the total number of target labels in the batch.
per = tf.reduce_sum(tf.edit_distance(tf.to_int32(decoded[0]), targets, normalize=False)) / tf.to_float(tf.size(targets.values))

In [7]:
epochs = 120


def _make_feed_dict(batchedSample):
    """Map one (inputs, sparse targets, sequence lengths) batch onto the graph placeholders."""
    batchInputs, batchTargetSparse, batchSeqLengths = batchedSample
    batchTargetIdx, batchTargetVals, batchTargetShape = batchTargetSparse
    return {inputs: batchInputs, targetIdx: batchTargetIdx, targetVals: batchTargetVals,
            targetShape: batchTargetShape, seq_len: batchSeqLengths}


# Train for `epochs` passes over the pre-built training batches and print
# train/test cost and error rate after every epoch.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(epochs):
        start = time.time()
        # Visit the pre-built batches in a fresh random order every epoch.
        batchRandIdx = np.random.permutation(len(train_batchedData))

        train_cost = train_per = 0
        for batch, batchOrgI in enumerate(batchRandIdx):
            feedDict = _make_feed_dict(train_batchedData[batchOrgI])
            sess.run(optimizer, feed_dict=feedDict)

            # The reported training metrics are those of the epoch's final
            # batch only, evaluated after its parameter update.
            if batch == len(batchRandIdx) - 1:
                train_cost, train_per = sess.run([cost, per], feedDict)

        # Average cost and error rate over all test batches.
        test_cost = test_per = 0
        for testBatch in test_batchedData:
            batch_cost, batch_per = sess.run([cost, per], _make_feed_dict(testBatch))
            test_cost += batch_cost
            test_per += batch_per

        n_test_batches = len(test_batchedData)
        if n_test_batches:
            test_cost /= n_test_batches
            test_per /= n_test_batches
        else:
            # No full test batch exists (fewer test samples than batchSize):
            # report NaN instead of failing with ZeroDivisionError.
            test_cost = test_per = float('nan')

        end = time.time()
        log = "Epoch {}/{}, train_cost={:.3f}, train_per={:.3f}, test_cost={:.3f}, test_per={:.3f}, time = {:.0f}s"
        print(log.format(epoch+1, epochs, train_cost, train_per, test_cost, test_per, end-start))

Epoch 1/120, train_cost=162.030, train_per=0.961, test_cost=162.989, test_per=0.960, time = 39s
Epoch 2/120, train_cost=148.483, train_per=0.923, test_cost=146.534, test_per=0.930, time = 39s
Epoch 3/120, train_cost=140.089, train_per=0.884, test_cost=140.830, test_per=0.893, time = 39s
Epoch 4/120, train_cost=134.948, train_per=0.872, test_cost=138.217, test_per=0.879, time = 39s
Epoch 5/120, train_cost=136.085, train_per=0.866, test_cost=135.837, test_per=0.869, time = 39s
Epoch 6/120, train_cost=133.789, train_per=0.864, test_cost=133.716, test_per=0.863, time = 39s
Epoch 7/120, train_cost=123.811, train_per=0.853, test_cost=131.713, test_per=0.856, time = 39s
Epoch 8/120, train_cost=124.632, train_per=0.845, test_cost=129.825, test_per=0.851, time = 40s
Epoch 9/120, train_cost=127.003, train_per=0.852, test_cost=128.454, test_per=0.852, time = 39s
Epoch 10/120, train_cost=124.944, train_per=0.846, test_cost=127.423, test_per=0.845, time = 39s
Epoch 11/120, train_cost=123.485, train

Epoch 87/120, train_cost=30.387, train_per=0.254, test_cost=44.724, test_per=0.346, time = 39s
Epoch 88/120, train_cost=28.168, train_per=0.236, test_cost=44.612, test_per=0.346, time = 39s
Epoch 89/120, train_cost=29.788, train_per=0.251, test_cost=44.565, test_per=0.341, time = 39s
Epoch 90/120, train_cost=29.034, train_per=0.253, test_cost=44.632, test_per=0.345, time = 40s
Epoch 91/120, train_cost=27.195, train_per=0.230, test_cost=44.562, test_per=0.343, time = 39s
Epoch 92/120, train_cost=26.976, train_per=0.240, test_cost=44.759, test_per=0.341, time = 40s
Epoch 93/120, train_cost=28.978, train_per=0.239, test_cost=44.555, test_per=0.343, time = 40s
Epoch 94/120, train_cost=27.863, train_per=0.240, test_cost=44.757, test_per=0.345, time = 39s
Epoch 95/120, train_cost=27.588, train_per=0.228, test_cost=44.866, test_per=0.340, time = 39s
Epoch 96/120, train_cost=27.759, train_per=0.242, test_cost=44.817, test_per=0.345, time = 39s
Epoch 97/120, train_cost=28.611, train_per=0.237, 