In [1]:
import numpy as np
import os

In [2]:
spectrogram = os.listdir('spectrogram')
len(spectrogram)

17399

In [3]:
X, Y = [], []
for spec in spectrogram:
    Y.append(' '.join([i for i in os.path.splitext(spec)[0].split('-') if not i.isdigit()]))
    X.append(np.load('spectrogram/' + spec))

In [4]:
import tensorflow as tf
from tqdm import tqdm

In [None]:
chars = list(set([c for target in Y for c in target]))
num_classes = len(chars) + 2

idx2char = {idx + 1: char for idx, char in enumerate(chars)}
idx2char[0] = '<PAD>'
char2idx = {char: idx for idx, char in idx2char.items()}

Y = [[char2idx[c] for c in target] for target in Y]

In [None]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)

    return indices, values, shape

In [None]:
tf.__version__

'1.10.0'

In [None]:
def pad_causal(x, size, rate):
    pad_len = (size - 1) * rate
    return tf.pad(x, [[0, 0], [pad_len, 0], [0, 0]])

def pad_second_dim(x, desired_size):
    padding = tf.tile([[0]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1]], 0))
    return tf.concat([x, padding], 1)

class Model:
    def __init__(
        self,
        num_layers,
        size_layers,
        learning_rate,
        num_features,
        num_blocks = 3,
        block_size = 128,
        dropout = 1.0,
    ):
        self.X = tf.placeholder(tf.float32, [None, None, num_features])
        self.Y = tf.sparse_placeholder(tf.int32)
        seq_lens = tf.count_nonzero(
            tf.reduce_sum(self.X, -1), 1, dtype = tf.int32
        ) + 10
        filled = tf.fill(tf.shape(seq_lens), tf.shape(self.X)[1])
        seq_lens = tf.where(seq_lens > tf.shape(self.X)[1], filled, seq_lens)
        self.label = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])

        def residual_block(x, size, rate, block):
            with tf.variable_scope('block_%d_%d' % (block, rate), reuse = False):
                conv_filter = tf.layers.conv1d(
                    x,
                    x.shape[2] // 4,
                    kernel_size = size,
                    strides = 1,
                    padding = 'same',
                    dilation_rate = rate,
                    activation = tf.nn.tanh,
                )
                conv_gate = tf.layers.conv1d(
                    x,
                    x.shape[2] // 4,
                    kernel_size = size,
                    strides = 1,
                    padding = 'same',
                    dilation_rate = rate,
                    activation = tf.nn.sigmoid,
                )
                out = tf.multiply(conv_filter, conv_gate)
                out = tf.layers.conv1d(
                    out,
                    block_size,
                    kernel_size = 1,
                    strides = 1,
                    padding = 'same',
                    activation = tf.nn.tanh,
                )
                return tf.add(x, out), out
        forward = tf.layers.conv1d(self.X, block_size, kernel_size = 1, strides = 1, padding = 'SAME')
        zeros = tf.zeros_like(forward)
        for i in range(num_blocks):
            for r in [1, 2, 4, 8, 16]:
                forward, s = residual_block(forward, size=7, rate=r, block=i)
                zeros = tf.add(zeros,s)
        logits = tf.layers.conv1d(zeros, num_classes, kernel_size = 1, strides = 1, padding = 'SAME')
        time_major = tf.transpose(logits, [1, 0, 2])
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(time_major, seq_lens)
        decoded = tf.to_int32(decoded[0])
        self.preds = tf.sparse_tensor_to_dense(decoded)
        self.cost = tf.reduce_mean(
            tf.nn.ctc_loss(
                self.Y,
                time_major,
                seq_lens
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        preds = self.preds[:, :tf.reduce_max(self.Y_seq_len)]
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        preds = pad_second_dim(preds, tf.reduce_max(self.Y_seq_len))
        y_t = tf.cast(preds, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.label, masks)
        self.mask_label = mask_label
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [None]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

size_layers = 256
learning_rate = 1e-4
num_layers = 2
batch_size = 32
epoch = 50

model = Model(num_layers, size_layers, learning_rate, X[0].shape[1])
sess.run(tf.global_variables_initializer())

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(X, Y)
del X

In [None]:
for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    total_cost, total_accuracy = 0, 0
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, len(train_X))]
        batch_x = tf.keras.preprocessing.sequence.pad_sequences(batch_x, dtype = 'float32', padding = 'post')
        y = train_Y[i : min(i + batch_size, len(train_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        total_cost += cost
        total_accuracy += accuracy
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    total_cost /= (len(train_X) / batch_size)
    total_accuracy /= (len(train_X) / batch_size)
    print('epoch %d, training average cost %f, training average accuracy %f'%(e + 1, total_cost, total_accuracy))
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'testing minibatch loop')
    total_cost, total_accuracy = 0, 0
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, len(test_X))]
        batch_x = tf.keras.preprocessing.sequence.pad_sequences(batch_x, dtype = 'float32', padding = 'post')
        y = test_Y[i : min(i + batch_size, len(test_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        total_cost += cost
        total_accuracy += accuracy
        pbar.set_postfix(cost = cost, accuracy = accuracy)
        
    total_cost /= (len(test_X) / batch_size)
    total_accuracy /= (len(test_X) / batch_size)
    print('epoch %d, testing average cost %f, testing average accuracy %f'%(e + 1, total_cost, total_accuracy))

minibatch loop: 100%|██████████| 408/408 [03:38<00:00,  1.92it/s, accuracy=0.252, cost=51.1] 
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 1, training average cost 63.704233, training average accuracy 0.131253


testing minibatch loop: 100%|██████████| 136/136 [01:10<00:00,  1.88it/s, accuracy=0.303, cost=36.4]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 1, testing average cost 43.260290, testing average accuracy 0.290187


minibatch loop: 100%|██████████| 408/408 [03:39<00:00,  1.96it/s, accuracy=0.387, cost=41.6]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 2, training average cost 38.157898, training average accuracy 0.378563


testing minibatch loop: 100%|██████████| 136/136 [01:13<00:00,  1.90it/s, accuracy=0.425, cost=28.3]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 2, testing average cost 34.860609, testing average accuracy 0.418955


minibatch loop: 100%|██████████| 408/408 [03:45<00:00,  1.90it/s, accuracy=0.428, cost=37.1]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 3, training average cost 32.953661, training average accuracy 0.453282


testing minibatch loop: 100%|██████████| 136/136 [01:15<00:00,  1.85it/s, accuracy=0.517, cost=24.8]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 3, testing average cost 31.226677, testing average accuracy 0.465359


minibatch loop: 100%|██████████| 408/408 [03:50<00:00,  1.87it/s, accuracy=0.434, cost=33.4]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 4, training average cost 30.002567, training average accuracy 0.482901


testing minibatch loop: 100%|██████████| 136/136 [01:16<00:00,  1.81it/s, accuracy=0.524, cost=22.9]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 4, testing average cost 28.731859, testing average accuracy 0.488078


minibatch loop: 100%|██████████| 408/408 [03:54<00:00,  1.84it/s, accuracy=0.454, cost=31.1]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 5, training average cost 27.769928, training average accuracy 0.501716


testing minibatch loop: 100%|██████████| 136/136 [01:18<00:00,  1.77it/s, accuracy=0.553, cost=20.9]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 5, testing average cost 26.753767, testing average accuracy 0.508466


minibatch loop: 100%|██████████| 408/408 [03:56<00:00,  1.83it/s, accuracy=0.464, cost=29.1]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 6, training average cost 25.912681, training average accuracy 0.517164


testing minibatch loop: 100%|██████████| 136/136 [01:18<00:00,  1.77it/s, accuracy=0.589, cost=19.2]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 6, testing average cost 25.031035, testing average accuracy 0.523560


minibatch loop: 100%|██████████| 408/408 [04:00<00:00,  1.80it/s, accuracy=0.472, cost=26.5]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 7, training average cost 24.292728, training average accuracy 0.532367


testing minibatch loop: 100%|██████████| 136/136 [01:20<00:00,  1.74it/s, accuracy=0.599, cost=18.2]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 7, testing average cost 23.545396, testing average accuracy 0.535405


minibatch loop: 100%|██████████| 408/408 [04:04<00:00,  1.76it/s, accuracy=0.487, cost=25.1]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 8, training average cost 22.828858, training average accuracy 0.545901


testing minibatch loop: 100%|██████████| 136/136 [01:21<00:00,  1.71it/s, accuracy=0.594, cost=17]  
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 8, testing average cost 22.188690, testing average accuracy 0.547551


minibatch loop: 100%|██████████| 408/408 [04:08<00:00,  1.75it/s, accuracy=0.537, cost=23.6]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 9, training average cost 21.526225, training average accuracy 0.558248


testing minibatch loop: 100%|██████████| 136/136 [01:22<00:00,  1.68it/s, accuracy=0.628, cost=16]  
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 9, testing average cost 20.970053, testing average accuracy 0.561718


minibatch loop: 100%|██████████| 408/408 [04:11<00:00,  1.73it/s, accuracy=0.525, cost=22.2]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 10, training average cost 20.312274, training average accuracy 0.572059


testing minibatch loop: 100%|██████████| 136/136 [01:23<00:00,  1.68it/s, accuracy=0.626, cost=15]  
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 10, testing average cost 19.822101, testing average accuracy 0.574078


minibatch loop: 100%|██████████| 408/408 [04:13<00:00,  1.71it/s, accuracy=0.543, cost=21.2]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 11, training average cost 19.213326, training average accuracy 0.583404


testing minibatch loop: 100%|██████████| 136/136 [01:24<00:00,  1.66it/s, accuracy=0.641, cost=13.9]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 11, testing average cost 18.787493, testing average accuracy 0.584828


minibatch loop: 100%|██████████| 408/408 [04:16<00:00,  1.69it/s, accuracy=0.531, cost=20.4]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 12, training average cost 18.195799, training average accuracy 0.592464


testing minibatch loop: 100%|██████████| 136/136 [01:25<00:00,  1.64it/s, accuracy=0.661, cost=13]  
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 12, testing average cost 17.809989, testing average accuracy 0.593923


minibatch loop: 100%|██████████| 408/408 [04:18<00:00,  1.69it/s, accuracy=0.543, cost=19.1]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 13, training average cost 17.252686, training average accuracy 0.602320


testing minibatch loop: 100%|██████████| 136/136 [01:25<00:00,  1.63it/s, accuracy=0.668, cost=12.1]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 13, testing average cost 16.889668, testing average accuracy 0.604958


minibatch loop: 100%|██████████| 408/408 [04:19<00:00,  1.68it/s, accuracy=0.564, cost=18.3]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 14, training average cost 16.370016, training average accuracy 0.610656


testing minibatch loop: 100%|██████████| 136/136 [01:25<00:00,  1.63it/s, accuracy=0.691, cost=11.5]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 14, testing average cost 16.027165, testing average accuracy 0.613294


minibatch loop: 100%|██████████| 408/408 [04:21<00:00,  1.67it/s, accuracy=0.555, cost=17.2]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 15, training average cost 15.546713, training average accuracy 0.618359


testing minibatch loop: 100%|██████████| 136/136 [01:26<00:00,  1.61it/s, accuracy=0.687, cost=11]  
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 15, testing average cost 15.232227, testing average accuracy 0.618411


minibatch loop: 100%|██████████| 408/408 [04:23<00:00,  1.65it/s, accuracy=0.558, cost=16.1]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 16, training average cost 14.799888, training average accuracy 0.625644


testing minibatch loop: 100%|██████████| 136/136 [01:27<00:00,  1.60it/s, accuracy=0.703, cost=10.5]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 16, testing average cost 14.524589, testing average accuracy 0.624917


minibatch loop: 100%|██████████| 408/408 [04:24<00:00,  1.65it/s, accuracy=0.59, cost=15.4] 
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 17, training average cost 14.090364, training average accuracy 0.630748


testing minibatch loop: 100%|██████████| 136/136 [01:27<00:00,  1.58it/s, accuracy=0.699, cost=10.1]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 17, testing average cost 13.835726, testing average accuracy 0.630068


minibatch loop: 100%|██████████| 408/408 [04:26<00:00,  1.64it/s, accuracy=0.587, cost=14.5]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 18, training average cost 13.389068, training average accuracy 0.635353


testing minibatch loop: 100%|██████████| 136/136 [01:27<00:00,  1.58it/s, accuracy=0.705, cost=9.68]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 18, testing average cost 13.156154, testing average accuracy 0.635583


minibatch loop: 100%|██████████| 408/408 [04:27<00:00,  1.62it/s, accuracy=0.578, cost=13.8]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 19, training average cost 12.753446, training average accuracy 0.640851


testing minibatch loop: 100%|██████████| 136/136 [01:28<00:00,  1.57it/s, accuracy=0.695, cost=9.03]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 19, testing average cost 12.557222, testing average accuracy 0.638520


minibatch loop: 100%|██████████| 408/408 [04:28<00:00,  1.62it/s, accuracy=0.587, cost=12.8]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 20, training average cost 12.129370, training average accuracy 0.646369


testing minibatch loop: 100%|██████████| 136/136 [01:28<00:00,  1.55it/s, accuracy=0.714, cost=8.61]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 20, testing average cost 11.948158, testing average accuracy 0.643731


minibatch loop: 100%|██████████| 408/408 [04:29<00:00,  1.61it/s, accuracy=0.59, cost=11.8] 
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 21, training average cost 11.548273, training average accuracy 0.650572


testing minibatch loop: 100%|██████████| 136/136 [01:29<00:00,  1.56it/s, accuracy=0.712, cost=8.14]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 21, testing average cost 11.375299, testing average accuracy 0.648017


minibatch loop: 100%|██████████| 408/408 [04:30<00:00,  1.62it/s, accuracy=0.595, cost=11.5]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 22, training average cost 10.989844, training average accuracy 0.656106


testing minibatch loop: 100%|██████████| 136/136 [01:29<00:00,  1.55it/s, accuracy=0.712, cost=7.86]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 22, testing average cost 10.824283, testing average accuracy 0.652666


minibatch loop: 100%|██████████| 408/408 [04:31<00:00,  1.62it/s, accuracy=0.605, cost=10.3]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 23, training average cost 10.437272, training average accuracy 0.659412


testing minibatch loop: 100%|██████████| 136/136 [01:29<00:00,  1.55it/s, accuracy=0.716, cost=7.39]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 23, testing average cost 10.310511, testing average accuracy 0.656126


minibatch loop: 100%|██████████| 408/408 [04:32<00:00,  1.62it/s, accuracy=0.601, cost=10]  
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 24, training average cost 9.932191, training average accuracy 0.663587


testing minibatch loop: 100%|██████████| 136/136 [01:29<00:00,  1.55it/s, accuracy=0.709, cost=7.14]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 24, testing average cost 9.807833, testing average accuracy 0.659555


minibatch loop: 100%|██████████| 408/408 [04:33<00:00,  1.61it/s, accuracy=0.593, cost=9.19]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 25, training average cost 9.449029, training average accuracy 0.666897


testing minibatch loop: 100%|██████████| 136/136 [01:30<00:00,  1.55it/s, accuracy=0.716, cost=6.85]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 25, testing average cost 9.337638, testing average accuracy 0.662240


minibatch loop:  89%|████████▉ | 364/408 [04:04<00:28,  1.52it/s, accuracy=0.602, cost=9.35]

In [13]:
import random

random_index = random.randint(0, len(test_X) - 1)
batch_x = test_X[random_index : random_index + 1]
print(
    'real:',
    ''.join(
        [idx2char[no] for no in test_Y[random_index : random_index + 1][0]]
    ),
)
batch_y = sparse_tuple_from(test_Y[random_index : random_index + 1])
pred = sess.run(model.preds, feed_dict = {model.X: batch_x})[0]
print('predicted:', ''.join([idx2char[no] for no in pred]))

real: sebut perkataan man pengaktifan
predicted: sebut perkatan man aengaki
