* batch normalization degrade performance when used with residual convnet
* he_normal, he_uniform initializer slow down convergence

In [1]:
import tensorflow as tf
import numpy as np
import time

In [2]:
phn_61 = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau', 'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v', 'w', 'y', 'z', 'zh']
phn_39 = ['ae', 'ao', 'aw', 'ax', 'ay', 'b', 'ch', 'd', 'dh', 'dx', 'eh', 'er', 'ey', 'f', 'g', 'h#', 'hh', 'ix', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh']
mapping = {'ah': 'ax', 'ax-h': 'ax', 'ux': 'uw', 'aa': 'ao', 'ih': 'ix', 'axr': 'er', 'el': 'l', 'em': 'm', 'en': 'n', 'nx': 'n', 'eng': 'ng', 'sh': 'zh', 'hv': 'hh', 'bcl': 'h#', 'pcl': 'h#', 'dcl': 'h#', 'tcl': 'h#', 'gcl': 'h#', 'kcl': 'h#', 'q': 'h#', 'epi': 'h#', 'pau': 'h#'}

TRAIN_FILE = './data/fbank/train.tfrecords'
TEST_FILE = './data/fbank/test.tfrecords'
checkpoints_path = './model/cnn+ctc/ckpt'

feat_type = 'fbank'
feats_dim = 39 if feat_type=='mfcc' else 123 # log filter bank + energy term

batch_size = 20
num_hidden = 128
n_hidden_layer = 3
learning_rate = 0.0001
num_classes = len(phn_61)+1 # num of phoneme + blank
epochs = 100

In [3]:
class Model(object):
    def __init__(self, batch_size, num_hidden, n_hidden_layer, feats_dim, num_classes, learning_rate=0.001, 
                 phn_61=None, phn_39=None, mapping=None, file_type=None, model_type=None):
        
        iterator = self._get_iterator(batch_size, feats_dim, file_type, model_type)
        self.iterator_initializer = iterator.initializer
        
        batched_data = iterator.get_next()
        features = batched_data[0]
        labels = batched_data[1]
        feats_seq_len = tf.to_int32(batched_data[2])
        labels_sparse = self._get_sparse_tensor(labels, -1)
        
        logits = self._compute_logits(features, feats_seq_len, num_hidden, n_hidden_layer, num_classes)
        self.d = logits
        self.loss = self._compute_loss(labels_sparse, feats_seq_len, logits)
        if model_type=='train':
            self.update_step = self._get_update_step(self.loss, learning_rate)
        else:
            self.per = self._compute_per(labels_sparse, feats_seq_len, logits, phn_61, phn_39, mapping)
            
        self.saver = tf.train.Saver()
        
    def _get_sparse_tensor(self, dense, default):
        indices = tf.to_int64(tf.where(tf.not_equal(dense, default)))
        vals = tf.to_int32(tf.gather_nd(dense, indices))
        shape = tf.to_int64(tf.shape(dense))
        return tf.SparseTensor(indices, vals, shape)
                
    def _get_iterator(self, batch_size, feats_dim, file_type, model_type):
        dataset = tf.contrib.data.TFRecordDataset(file_type)
        context_features = {'feats_seq_len': tf.FixedLenFeature([], dtype=tf.int64),
                           'labels_seq_len': tf.FixedLenFeature([], dtype=tf.int64)}
        sequence_features = {'features': tf.FixedLenSequenceFeature([feats_dim], dtype=tf.float32),
                            'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)}
        dataset = dataset.map(lambda serialized_example: tf.parse_single_sequence_example(serialized_example,
                                                                        context_features=context_features,
                                                                        sequence_features=sequence_features))
        dataset = dataset.map(lambda context, sequence: (sequence['features'], sequence['labels'],
                                                        context['feats_seq_len'], context['labels_seq_len']))
        def batching_func(x):
            return x.padded_batch(batch_size,
                                 padded_shapes=(tf.TensorShape([None, feats_dim]),
                                               tf.TensorShape([None]),
                                               tf.TensorShape([]),
                                               tf.TensorShape([])),
                                 padding_values=(tf.cast(0, tf.float32),
                                                tf.cast(-1, tf.int64),
                                                tf.cast(0, tf.int64),
                                                tf.cast(0, tf.int64)))
        def key_func(features, labels, feats_seq_len, labels_seq_len):
            f0 = lambda: tf.constant(0, tf.int64)
            f1 = lambda: tf.constant(1, tf.int64)
            f2 = lambda: tf.constant(2, tf.int64)
            f3 = lambda: tf.constant(3, tf.int64)
            f4 = lambda: tf.constant(4, tf.int64)
            f5 = lambda: tf.constant(5, tf.int64)
            f6 = lambda: tf.constant(6, tf.int64)
            
            return tf.case([(tf.less_equal(feats_seq_len, 200), f0),
                   (tf.less_equal(feats_seq_len, 250), f1),
                   (tf.less_equal(feats_seq_len, 300), f2),
                   (tf.less_equal(feats_seq_len, 350), f3),
                   (tf.less_equal(feats_seq_len, 400), f4),
                   (tf.less_equal(feats_seq_len, 500), f5)], default=f6)
        def reduce_func(bucket_id, windowed_data):
            return batching_func(windowed_data)
        
        if model_type=='train':
            dataset = dataset.shuffle(10000)
            batched_dataset = dataset.group_by_window(key_func=key_func, reduce_func=reduce_func, window_size=batch_size)
            batched_dataset = batched_dataset.shuffle(10000)
        else:
            batched_dataset = batching_func(dataset)
            
        return batched_dataset.make_initializable_iterator()
    
    def _compute_logits(self, features, feats_seq_len, num_hidden, n_hidden_layer, num_classes):
        self.keep_prob = tf.placeholder(tf.float32)
        self.training = tf.placeholder(tf.bool)
        
        features = tf.stack(tf.split(features, num_or_size_splits=3, axis=-1), axis=0) # shape = [3, batch, max_time, feats_dim/3]
        features = tf.transpose(features, [1,3,2,0]) # shape = [batch, feats_dim/3, max_time, channels]
        
        # input shape = [batch, in_height, in_width, in_channels]
        # filter shape = [filter_height, filter_width, in_channels, out_channels]
        # strides shape = input shape = ksize shape
        
        conv = tf.layers.conv2d(features, filters=128, kernel_size=(3,5), strides=(1,1), activation=tf.nn.relu, padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv = tf.layers.max_pooling2d(conv, pool_size=(3,1), strides=(3,1), padding='valid') # do not use SAME!!!!
        conv = tf.layers.dropout(conv, rate=1-self.keep_prob)
        
        inp = conv
        conv1 = tf.layers.conv2d(inp, filters=128, kernel_size=(3,5), strides=(1,1), activation=tf.nn.relu, padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv1 = tf.layers.dropout(conv1, rate=1-self.keep_prob)
        conv1 = tf.layers.conv2d(conv1, filters=128, kernel_size=(3,5), strides=(1,1), padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv1 = conv1 + inp
        conv1 = tf.nn.relu(conv1)
        conv1= tf.layers.dropout(conv1, rate=1-self.keep_prob)
        
        inp = conv1
        conv2 = tf.layers.conv2d(inp, filters=128, kernel_size=(3,5), activation=tf.nn.relu, strides=(1,1), padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv2 = tf.layers.dropout(conv2, rate=1-self.keep_prob)
        conv2 = tf.layers.conv2d(conv2, filters=128, kernel_size=(3,5), strides=(1,1), padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv2 = conv2 + inp
        conv2 = tf.nn.relu(conv2)
        conv2 = tf.layers.dropout(conv2, rate=1-self.keep_prob)
        
        inp = conv2
        inp = tf.layers.conv2d(inp, filters=256, kernel_size=(1,1), strides=(1,1), padding='same', activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(inp, filters=256, kernel_size=(3,5), strides=(1,1), activation=tf.nn.relu, padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv3 = tf.layers.dropout(conv3, rate=1-self.keep_prob)
        conv3 = tf.layers.conv2d(conv3, filters=256, kernel_size=(3,5), strides=(1,1), padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv3 = conv3 + inp
        conv3 = tf.nn.relu(conv3)
        conv3 = tf.layers.dropout(conv3, rate=1-self.keep_prob)
        
        inp = conv3
        conv4 = tf.layers.conv2d(inp, filters=256, kernel_size=(3,5), strides=(1,1), activation=tf.nn.relu, padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())   
        conv4 = tf.layers.dropout(conv4, rate=1-self.keep_prob)
        conv4 = tf.layers.conv2d(conv4, filters=256, kernel_size=(3,5), strides=(1,1), padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv4 = conv4 + inp
        conv4 = tf.nn.relu(conv4)
        conv4 = tf.layers.dropout(conv4, rate=1-self.keep_prob)
        
        inp = conv4
        conv5 = tf.layers.conv2d(inp, filters=256, kernel_size=(3,5), strides=(1,1), activation=tf.nn.relu, padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv5 = tf.layers.dropout(conv5, rate=1-self.keep_prob)
        conv5 = tf.layers.conv2d(conv5, filters=256, kernel_size=(3,5), strides=(1,1), activation=tf.nn.relu, padding='same', kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        conv5 = conv5 + inp
        conv5 = tf.nn.relu(conv5)
        conv5 = tf.layers.dropout(conv5, rate=1-self.keep_prob)
        
        conv_final = tf.transpose(conv5, [0,2,1,3]) # shape = [batch, width, height, channels] = [None, None, 13, 256]
        conv_flattend = tf.reshape(conv_final, [tf.shape(conv_final)[0], tf.shape(conv_final)[1], 13*256]) # 41 // 3 = 13
        
        # shape = [batch, width, 1024] = [batch, max_time, 1024]
        fc1 = tf.layers.dense(conv_flattend, 1024, activation=tf.nn.relu, kernel_initializer=tf.contrib.keras.initializers.glorot_normal())
        fc1 = tf.layers.dropout(fc1, rate=1-self.keep_prob)
        
        fc2 = tf.layers.dense(fc1, 1024, activation=tf.nn.relu, kernel_initializer=tf.contrib.keras.initializers.glorot_normal())      
        fc2 = tf.layers.dropout(fc2, rate=1-self.keep_prob)
        
        return tf.layers.dense(fc2, num_classes)
    
    def _compute_loss(self, labels_sparse, feats_seq_len, logits):
        return tf.reduce_mean(tf.nn.ctc_loss(labels=labels_sparse, inputs=logits, sequence_length=feats_seq_len, time_major=False))
    
    def _get_update_step(self, loss, learning_rate):
        #reg_term = 0.01* tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        #loss += reg_term
        
        params = tf.trainable_variables()
        gradients = tf.gradients(loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.apply_gradients(zip(clipped_gradients, params))
    
    def _compute_per(self, labels_sparse, feats_seq_len, logits, phn_61, phn_39, mapping):
        phn_61_tensor = tf.constant(phn_61, dtype=tf.string)
        phn_39_tensor = tf.constant(phn_39, dtype=tf.string)
        mapping_table = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(list(mapping.keys()), list(mapping.values())), default_value='')
        self.mapping_table_init = mapping_table.init
        
        logits = tf.transpose(logits, [1,0,2])
        decoded, _ = tf.nn.ctc_greedy_decoder(logits, feats_seq_len)
        decoded = tf.to_int32(decoded[0])
        
        def map_to_reduced_phn(p):
            val = mapping_table.lookup(phn_61_tensor[p])
            f1 = lambda: tf.to_int32(tf.reduce_min(tf.where(tf.equal(val, phn_39_tensor))))
            f2 = lambda: tf.to_int32(tf.reduce_min(tf.where(tf.equal(phn_61_tensor[p], phn_39_tensor))))
            return tf.cond(tf.not_equal(val, ''), f1, f2)

        decoded_reduced = tf.SparseTensor(decoded.indices, tf.map_fn(map_to_reduced_phn, decoded.values), decoded.dense_shape)
        labels_reduced = tf.SparseTensor(labels_sparse.indices, tf.map_fn(map_to_reduced_phn, labels_sparse.values), labels_sparse.dense_shape)
        return tf.reduce_sum(tf.edit_distance(decoded_reduced, labels_reduced, normalize=False)) / tf.to_float(tf.size(labels_reduced.values))

In [4]:
train_graph = tf.Graph()
eval_graph = tf.Graph()

with train_graph.as_default():
    train_model = Model(batch_size, num_hidden, n_hidden_layer, feats_dim, num_classes, learning_rate,
                        file_type=TRAIN_FILE, model_type='train')
    initializer = tf.global_variables_initializer()
    
with eval_graph.as_default():
    eval_model = Model(batch_size, num_hidden, n_hidden_layer, feats_dim, num_classes, 
                       phn_61=phn_61, phn_39=phn_39, mapping=mapping, file_type=TEST_FILE, model_type='eval')

train_sess = tf.Session(graph=train_graph)
train_sess.run(initializer)

eval_sess = tf.Session(graph=eval_graph)
eval_sess.run(eval_model.mapping_table_init)

for epoch in range(epochs):
    train_sess.run(train_model.iterator_initializer)
    train_loss = []
    start = time.time()
    while True:
        try:
            _, cost = train_sess.run([train_model.update_step, train_model.loss], feed_dict={train_model.keep_prob: 0.6})
            train_loss.append(cost)
        except tf.errors.OutOfRangeError: # Finished going through the training dataset.  Go to next epoch. 
            end = time.time()
            log = "Epoch {}/{}, train_loss={:.3f}, time = {:.0f}s"
            print(log.format(epoch+1, epochs, np.mean(train_loss), end-start))
            
            checkpoint_path = train_model.saver.save(train_sess, checkpoints_path, global_step=epoch+1)
            
            eval_model.saver.restore(eval_sess, checkpoint_path)
            eval_sess.run(eval_model.iterator_initializer)
            eval_loss = []
            eval_start = time.time()
            while True:
                try:
                    cost = eval_sess.run(eval_model.loss, feed_dict={eval_model.keep_prob: 1.0})
                    eval_loss.append(cost)
                except tf.errors.OutOfRangeError:
                    eval_end = time.time()
                    log = "\ttest_loss={:.3f}, time = {:.0f}s"
                    print(log.format(np.mean(eval_loss), eval_end-eval_start))
                    break
            
            break
train_sess.close()
eval_sess.close()

Epoch 1/100, train_loss=170.520, time = 188s
INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-1
	test_loss=109.421, time = 45s
Epoch 2/100, train_loss=82.074, time = 166s
INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-2
	test_loss=66.955, time = 29s
Epoch 3/100, train_loss=60.986, time = 164s
INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-3
	test_loss=58.391, time = 29s
Epoch 4/100, train_loss=52.970, time = 159s
INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-4
	test_loss=53.783, time = 29s
Epoch 5/100, train_loss=47.444, time = 159s
INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-5
	test_loss=49.679, time = 29s
Epoch 6/100, train_loss=42.965, time = 158s
INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-6
	test_loss=46.368, time = 29s
Epoch 7/100, train_loss=39.710, time = 156s
INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-7
	test_loss=44.237, time = 29s
Epoch 8/100, train_loss=3

KeyboardInterrupt: 

In [5]:
eval_train_graph = tf.Graph()
eval_test_graph = tf.Graph()

with eval_train_graph.as_default():
    eval_train_model = Model(batch_size*2, num_hidden, n_hidden_layer, feats_dim, num_classes, 
                       phn_61=phn_61, phn_39=phn_39, mapping=mapping, file_type=TRAIN_FILE, model_type='eval')
with eval_test_graph.as_default():
    eval_test_model = Model(batch_size*2, num_hidden, n_hidden_layer, feats_dim, num_classes, 
                       phn_61=phn_61, phn_39=phn_39, mapping=mapping, file_type=TEST_FILE, model_type='eval')
    
eval_train_sess = tf.Session(graph=eval_train_graph)
eval_test_sess = tf.Session(graph=eval_test_graph)

eval_train_sess.run(eval_train_model.mapping_table_init)
eval_test_sess.run(eval_test_model.mapping_table_init)

for saved_model_path in train_model.saver.last_checkpoints[-1:]:
    eval_train_model.saver.restore(eval_train_sess, saved_model_path)
    eval_train_sess.run(eval_train_model.iterator_initializer)
    eval_train_loss = []
    eval_train_per = []
    start = time.time()
    while True:
        try:
            cost, _per = eval_train_sess.run([eval_train_model.loss, eval_train_model.per], feed_dict={eval_train_model.keep_prob: 1.0 })
            eval_train_loss.append(cost)
            eval_train_per.append(_per)
        except tf.errors.OutOfRangeError:
            end = time.time()
            log = "{} train_loss={:.3f}, train_per={:.3f}, time = {:.0f}s"
            print(log.format(saved_model_path, np.mean(eval_train_loss), np.mean(eval_train_per), end-start))
            break
    eval_test_model.saver.restore(eval_test_sess, saved_model_path)
    eval_test_sess.run(eval_test_model.iterator_initializer)
    eval_test_loss = []
    eval_test_per = []
    start = time.time()
    while True:
        try:
            cost, _per = eval_test_sess.run([eval_test_model.loss, eval_test_model.per], feed_dict={eval_test_model.keep_prob: 1.0})
            eval_test_loss.append(cost)
            eval_test_per.append(_per)
        except tf.errors.OutOfRangeError:
            end = time.time()
            log = "{} test_loss={:.3f}, test_per={:.3f}, time = {:.0f}s"
            print(log.format(saved_model_path, np.mean(eval_test_loss), np.mean(eval_test_per), end-start))
            break

INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-12
./model/cnn+ctc/ckpt-12 train_loss=23.832, train_per=0.170, time = 260s
INFO:tensorflow:Restoring parameters from ./model/cnn+ctc/ckpt-12
./model/cnn+ctc/ckpt-12 test_loss=44.278, test_per=0.286, time = 88s


In [None]:
debug_graph = tf.Graph()
with debug_graph.as_default():
    debug_model = Model(batch_size*2, num_hidden, n_hidden_layer, feats_dim, num_classes, 
                       phn_61=phn_61, phn_39=phn_39, mapping=mapping, file_type=TEST_FILE, model_type='eval')
    initializer = tf.global_variables_initializer()

debug_sess = tf.Session(graph=debug_graph)
debug_sess.run(initializer)

#debug_model.saver.restore(debug_sess, train_model.saver.last_checkpoints[-1])
debug_sess.run(debug_model.iterator_initializer)
r, r2 = debug_sess.run([debug_model.d, debug_model.d2],feed_dict={debug_model.keep_prob: 1.0, debug_model.training: True})

In [None]:
r[0,10,:]

In [None]:
r2[0,20,:]

In [None]:
r.shape

In [None]:
t = tf.nn.softmax(r[0,3,:])
t.eval(session=tf.Session())

In [None]:
t = tf.nn.softmax([0.1,1000,0.1])
t.eval(session=tf.Session())