# Todo
- Add batch normalization
- Prepare batches by slicing the full data set (data[start:end])

In [1]:
import numpy as np
import tensorflow as tf
import os, time

In [2]:
# Directories read by load_batched_data: one 'feats' and one 'labels' folder
# under data/train and under data/test (np.load is applied to every file found there).
train_mfccPath, train_labelPath = (os.path.join('data', 'train', leaf) for leaf in ('feats', 'labels'))
test_mfccPath, test_labelPath = (os.path.join('data', 'test', leaf) for leaf in ('feats', 'labels'))

In [3]:
def load_batched_data(mfccPath, labelPath, batchSize):
    ''' Load every array file in the two directories and group the samples into padded batches.

        Files are paired by position after sorting both directory listings by name, so the
        i-th feature file is matched with the i-th label file (os.listdir alone gives no
        ordering guarantee). NOTE(review): this assumes feature and label files follow the
        same naming scheme -- confirm against the data layout.

        Samples are shuffled once; trailing samples that do not fill a whole batch are dropped.

        Returns a 3-element tuple (dataBatches, maxLength, totalN):
            dataBatches -- list of (batchInputs, sparseTargets, batchSeqLengths) tuples where
                batchInputs     is a zero-padded array of shape (batchSize, maxLength, nFeatures),
                sparseTargets   is an (indices, values, shape) triple describing the labels,
                batchSeqLengths is an int32 array holding each sample's unpadded length
            maxLength   -- length of the longest input sequence over the whole directory
            totalN      -- number of feature files found in mfccPath

        Raises ValueError if batchSize < 1, if mfccPath is empty, or if the two directories
        hold a different number of files.
    '''

    def list_to_sparse_tensor(targetList):
        ''' Turn a list of label sequences into the (indices, values, shape) triple of a sparse tensor. '''
        indices = []
        vals = []

        for tI, target in enumerate(targetList):
            for seqI, val in enumerate(target):
                indices.append([tI, seqI])
                vals.append(val)
        # Longest label sequence; the trailing [0] keeps this defined when there are no labels at all.
        maxTargetLength = max([len(target) for target in targetList] + [0])
        shape = [len(targetList), maxTargetLength]
        # reshape(-1, 2) keeps the indices two-dimensional even when no labels exist.
        return (np.array(indices, dtype=np.int64).reshape(-1, 2),
                np.array(vals),
                np.array(shape, dtype=np.int64))

    def data_lists_to_batches(inputList, targetList, batchSize):
        ''' Shuffle the samples and cut them into full batches padded to the global maximum length. '''
        nFeatures = inputList[0].shape[1]
        maxLength = max(inp.shape[0] for inp in inputList)

        randIdx = np.random.permutation(len(inputList))
        dataBatches = []

        # Only start positions that leave room for a complete batch are visited.
        for start in range(0, len(inputList) - batchSize + 1, batchSize):
            batchInputs = np.zeros((batchSize, maxLength, nFeatures))
            batchSeqLengths = np.zeros(batchSize, dtype=np.int32)
            batchTargetList = []

            for batchI, origI in enumerate(randIdx[start:start + batchSize]):
                seqLength = inputList[origI].shape[0]
                batchSeqLengths[batchI] = seqLength
                # Rows past seqLength stay zero, which is the padding.
                batchInputs[batchI, :seqLength, :] = inputList[origI]
                batchTargetList.append(targetList[origI])

            dataBatches.append((batchInputs, list_to_sparse_tensor(batchTargetList), batchSeqLengths))
        return (dataBatches, maxLength)

    if batchSize < 1:
        raise ValueError('batchSize must be a positive integer, got {}'.format(batchSize))

    # Sort both listings so features and labels are paired deterministically.
    mfccFiles = sorted(os.listdir(mfccPath))
    labelFiles = sorted(os.listdir(labelPath))
    if not mfccFiles:
        raise ValueError('no feature files found in {}'.format(mfccPath))
    if len(mfccFiles) != len(labelFiles):
        raise ValueError('found {} feature files but {} label files'.format(len(mfccFiles), len(labelFiles)))

    inputList = [np.load(os.path.join(mfccPath, fn)) for fn in mfccFiles]
    targetList = [np.load(os.path.join(labelPath, fn)) for fn in labelFiles]
    return data_lists_to_batches(inputList, targetList, batchSize) + (len(mfccFiles),)

In [4]:
# Settings shared by the data-loading, graph-building and training cells.
n_hidden_layer = None    # not referenced by any code visible in this notebook section
num_features = 39        # size of the last axis of the `inputs` placeholder
num_hidden = 128         # units of each LSTMCell (one forward, one backward)
num_classes = 1 + 39     # width of the logits; presumably 39 labels plus the CTC blank -- TODO confirm
batchSize = 64           # samples per batch produced by load_batched_data
learning_rate = 1e-3     # step size handed to the MomentumOptimizer

In [5]:
# Build the shuffled, padded batches for both splits; each call returns
# (list of batches, longest sequence length, number of feature files).
train_batchedData, train_maxTimeSteps, train_totalN = load_batched_data(
    mfccPath=train_mfccPath, labelPath=train_labelPath, batchSize=batchSize)
test_batchedData, test_maxTimeSteps, test_totalN = load_batched_data(
    mfccPath=test_mfccPath, labelPath=test_labelPath, batchSize=batchSize)

In [6]:
# --- Placeholders: one padded feature batch, the sparse label triple, and per-sample lengths ---
inputs = tf.placeholder(tf.float32, shape=[None, None, num_features])
targetIdx = tf.placeholder(tf.int64)
targetVals = tf.placeholder(tf.int32)
targetShape = tf.placeholder(tf.int64)
targets = tf.SparseTensor(targetIdx, targetVals, targetShape)
seq_len = tf.placeholder(tf.int32, shape=[None])

# --- Single-layer bidirectional LSTM over the input sequences ---
cells_fw = [tf.nn.rnn_cell.LSTMCell(num_hidden)]
cells_bw = [tf.nn.rnn_cell.LSTMCell(num_hidden)]
outputs, output_state_fw, output_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
    cells_fw, cells_bw, inputs, sequence_length=seq_len, dtype=tf.float32)

# --- Per-frame linear projection to class scores ---
# The recurrent output is flattened to 2-D so one matmul covers every frame,
# then restored to 3-D and transposed so that axis 0 and axis 1 are swapped,
# matching the time_major=True setting used by the CTC loss below.
shape = tf.shape(outputs)
outputs = tf.reshape(outputs, [-1, shape[2]])
W = tf.Variable(tf.truncated_normal([num_hidden*2, num_classes], stddev=0.1))
b = tf.Variable(tf.constant(0., shape=[num_classes]))
logits = tf.transpose(
    tf.reshape(tf.add(tf.matmul(outputs, W), b), [shape[0], shape[1], num_classes]),
    perm=[1, 0, 2])

# --- CTC objective, momentum optimiser, and greedy-decoding error rate ---
loss = tf.nn.ctc_loss(labels=targets, inputs=logits, sequence_length=seq_len, time_major=True)
cost = tf.reduce_mean(loss)
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9).minimize(cost)
decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)
# per = summed (un-normalised) edit distance of the batch divided by the number of target labels.
edit_errors = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets, normalize=False)
per = tf.reduce_sum(edit_errors) / tf.cast(tf.size(targets.values), tf.float32)

In [7]:
epochs = 100


def _make_feed(batchData):
    ''' Map one (inputs, sparse targets, sequence lengths) batch tuple onto the graph placeholders. '''
    batchInputs, batchTargetSparse, batchSeqLengths = batchData
    batchTargetIdx, batchTargetVals, batchTargetShape = batchTargetSparse
    return {inputs: batchInputs, targetIdx: batchTargetIdx, targetVals: batchTargetVals,
            targetShape: batchTargetShape, seq_len: batchSeqLengths}


# Train for a fixed number of epochs; after each epoch print the cost and error
# rate of the last training batch and the averages over all test batches.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(epochs):
        start = time.time()
        # Visit the pre-built training batches in a fresh random order every epoch.
        batchRandIdx = np.random.permutation(len(train_batchedData))

        # Stay at 0 when there are no training batches.
        train_cost = train_per = 0
        for batch, batchOrgI in enumerate(batchRandIdx):
            feedDict = _make_feed(train_batchedData[batchOrgI])
            sess.run(optimizer, feedDict)

            # Training metrics are sampled on the final batch only, after its update step.
            if batch == len(batchRandIdx) - 1:
                train_cost, train_per = sess.run([cost, per], feedDict)

        test_cost = test_per = 0
        for batchData in test_batchedData:
            batch_cost, batch_per = sess.run([cost, per], _make_feed(batchData))
            test_cost += batch_cost
            test_per += batch_per

        # load_batched_data drops partial batches, so a small test set can yield no
        # batches at all; skip the averaging then instead of dividing by zero.
        if test_batchedData:
            test_cost /= len(test_batchedData)
            test_per /= len(test_batchedData)

        end = time.time()
        log = "Epoch {}/{}, train_cost={:.3f}, train_per={:.3f}, test_cost={:.3f}, test_per={:.3f}, time = {:.0f}s"
        print(log.format(epoch+1, epochs, train_cost, train_per, test_cost, test_per, end-start))

Epoch 1/100, train_cost=128.559, train_per=0.911, test_cost=141.041, test_per=0.907, time = 72s
Epoch 2/100, train_cost=133.743, train_per=0.848, test_cost=136.012, test_per=0.847, time = 73s
Epoch 3/100, train_cost=131.283, train_per=0.846, test_cost=132.353, test_per=0.840, time = 73s
Epoch 4/100, train_cost=132.713, train_per=0.838, test_cost=128.578, test_per=0.836, time = 73s
Epoch 5/100, train_cost=133.845, train_per=0.847, test_cost=127.092, test_per=0.835, time = 74s
Epoch 6/100, train_cost=125.500, train_per=0.831, test_cost=125.113, test_per=0.834, time = 73s
Epoch 7/100, train_cost=124.064, train_per=0.832, test_cost=122.936, test_per=0.833, time = 73s
Epoch 8/100, train_cost=120.641, train_per=0.818, test_cost=119.451, test_per=0.819, time = 73s
Epoch 9/100, train_cost=101.984, train_per=0.837, test_cost=99.118, test_per=0.844, time = 73s
Epoch 10/100, train_cost=79.823, train_per=0.816, test_cost=89.072, test_per=0.821, time = 73s
Epoch 11/100, train_cost=84.132, train_per

Epoch 88/100, train_cost=40.191, train_per=0.358, test_cost=50.316, test_per=0.412, time = 73s
Epoch 89/100, train_cost=38.882, train_per=0.374, test_cost=50.708, test_per=0.419, time = 73s
Epoch 90/100, train_cost=39.657, train_per=0.351, test_cost=50.122, test_per=0.413, time = 73s
Epoch 91/100, train_cost=41.207, train_per=0.356, test_cost=50.179, test_per=0.410, time = 73s
Epoch 92/100, train_cost=40.734, train_per=0.347, test_cost=49.933, test_per=0.408, time = 74s
Epoch 93/100, train_cost=42.481, train_per=0.367, test_cost=50.317, test_per=0.419, time = 73s
Epoch 94/100, train_cost=45.343, train_per=0.364, test_cost=50.749, test_per=0.400, time = 73s
Epoch 95/100, train_cost=44.174, train_per=0.365, test_cost=49.650, test_per=0.398, time = 73s
Epoch 96/100, train_cost=42.320, train_per=0.351, test_cost=49.433, test_per=0.401, time = 73s
Epoch 97/100, train_cost=42.693, train_per=0.365, test_cost=49.609, test_per=0.399, time = 73s
Epoch 98/100, train_cost=36.640, train_per=0.330, 