In [1]:
import numpy as np
import os

In [2]:
spectrogram = os.listdir('spectrogram')
len(spectrogram)

17399

In [3]:
X, Y = [], []
for spec in spectrogram:
    Y.append(' '.join([i for i in os.path.splitext(spec)[0].split('-') if not i.isdigit()]))
    X.append(np.load('spectrogram/' + spec))

In [4]:
import tensorflow as tf
from tqdm import tqdm

In [5]:
X = tf.keras.preprocessing.sequence.pad_sequences(
    X, dtype = 'float32', padding = 'post'
)

chars = list(set([c for target in Y for c in target]))
num_classes = len(chars) + 2

idx2char = {idx + 1: char for idx, char in enumerate(chars)}
idx2char[0] = '<PAD>'
char2idx = {char: idx for idx, char in idx2char.items()}

Y = [[char2idx[c] for c in target] for target in Y]

In [6]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)

    return indices, values, shape

In [7]:
def pad_second_dim(x, desired_size):
    padding = tf.tile([[0]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1]], 0))
    return tf.concat([x, padding], 1)

class Model:
    def __init__(
        self,
        num_layers,
        size_layers,
        learning_rate,
        num_features,
        dropout = 1.0,
    ):
        self.X = tf.placeholder(tf.float32, [None, None, num_features])
        self.label = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])
        self.Y = tf.sparse_placeholder(tf.int32)
        seq_lens = tf.count_nonzero(
            tf.reduce_sum(self.X, -1), 1, dtype = tf.int32
        ) + 10
        filled = tf.fill(tf.shape(seq_lens), tf.shape(self.X)[1])
        seq_lens = tf.where(seq_lens > tf.shape(self.X)[1], filled, seq_lens)

        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                state_keep_prob = dropout,
                output_keep_prob = dropout,
            )
        features = self.X
        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layers),
                cell_bw = cells(size_layers),
                inputs = features,
                sequence_length = seq_lens,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d' % (n),
            )
            features = tf.concat((out_fw, out_bw), 2)

        logits = tf.layers.dense(features, num_classes)
        time_major = tf.transpose(logits, [1, 0, 2])
        self.time_major = time_major
        decoded, log_prob = tf.nn.ctc_greedy_decoder(time_major, seq_lens)
        decoded = tf.to_int32(decoded[0])
        self.preds = tf.sparse.to_dense(decoded)
        self.cost = tf.reduce_mean(
            tf.nn.ctc_loss(
                self.Y,
                time_major,
                seq_lens
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        preds = self.preds[:, :tf.reduce_max(self.Y_seq_len)]
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        preds = pad_second_dim(preds, tf.reduce_max(self.Y_seq_len))
        y_t = tf.cast(preds, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.label, masks)
        self.mask_label = mask_label
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [8]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(X, Y)
del X

In [9]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

size_layers = 256
learning_rate = 1e-3
num_layers = 2
batch_size = 32
epoch = 20

model = Model(num_layers, size_layers, learning_rate, train_X.shape[2])
sess.run(tf.global_variables_initializer())

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


In [10]:
for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    total_cost, total_accuracy = 0, 0
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, len(train_X))]
        y = train_Y[i : min(i + batch_size, len(train_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        total_cost += cost
        total_accuracy += accuracy
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    total_cost /= (len(train_X) / batch_size)
    total_accuracy /= (len(train_X) / batch_size)
    print('epoch %d, training average cost %f, training average accuracy %f'%(e + 1, total_cost, total_accuracy))
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'testing minibatch loop')
    total_cost, total_accuracy = 0, 0
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, len(test_X))]
        y = test_Y[i : min(i + batch_size, len(test_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        total_cost += cost
        total_accuracy += accuracy
        pbar.set_postfix(cost = cost, accuracy = accuracy)
        
    total_cost /= (len(test_X) / batch_size)
    total_accuracy /= (len(test_X) / batch_size)
    print('epoch %d, testing average cost %f, testing average accuracy %f'%(e + 1, total_cost, total_accuracy))

minibatch loop: 100%|██████████| 408/408 [01:42<00:00,  4.30it/s, accuracy=0.639, cost=24]  
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 1, training average cost 35.114443, training average accuracy 0.437517


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.10it/s, accuracy=0.635, cost=26.3]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 1, testing average cost 25.218217, testing average accuracy 0.643782


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.28it/s, accuracy=0.716, cost=19.4]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 2, training average cost 22.899654, training average accuracy 0.672331


testing minibatch loop: 100%|██████████| 136/136 [00:34<00:00,  4.08it/s, accuracy=0.691, cost=20.1]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 2, testing average cost 20.412450, testing average accuracy 0.694773


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.25it/s, accuracy=0.766, cost=14.8]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 3, training average cost 18.182770, training average accuracy 0.716202


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.14it/s, accuracy=0.744, cost=15]  
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 3, testing average cost 15.626259, testing average accuracy 0.740581


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.26it/s, accuracy=0.79, cost=10.2] 
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 4, training average cost 13.836273, training average accuracy 0.762375


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.13it/s, accuracy=0.763, cost=12]  
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 4, testing average cost 12.328527, testing average accuracy 0.774494


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.22it/s, accuracy=0.827, cost=7.5] 
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 5, training average cost 10.894144, training average accuracy 0.800927


testing minibatch loop: 100%|██████████| 136/136 [00:34<00:00,  4.11it/s, accuracy=0.776, cost=10]  
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 5, testing average cost 9.536489, testing average accuracy 0.816837


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.25it/s, accuracy=0.894, cost=5.31]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 6, training average cost 8.849042, training average accuracy 0.828287


testing minibatch loop: 100%|██████████| 136/136 [00:34<00:00,  4.10it/s, accuracy=0.834, cost=8.54]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 6, testing average cost 7.799711, testing average accuracy 0.846244


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.24it/s, accuracy=0.884, cost=5.06]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 7, training average cost 7.151040, training average accuracy 0.855891


testing minibatch loop: 100%|██████████| 136/136 [00:34<00:00,  4.14it/s, accuracy=0.878, cost=6.26]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 7, testing average cost 6.492719, testing average accuracy 0.863009


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.29it/s, accuracy=0.914, cost=3.41]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 8, training average cost 5.922403, training average accuracy 0.872089


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.11it/s, accuracy=0.844, cost=4.63]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 8, testing average cost 5.174364, testing average accuracy 0.881609


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.30it/s, accuracy=0.938, cost=2.6] 
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 9, training average cost 4.920441, training average accuracy 0.886653


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.13it/s, accuracy=0.88, cost=4.24] 
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 9, testing average cost 4.580041, testing average accuracy 0.887560


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.31it/s, accuracy=0.921, cost=2.13]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 10, training average cost 4.161407, training average accuracy 0.894864


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.13it/s, accuracy=0.909, cost=3.13]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 10, testing average cost 3.537773, testing average accuracy 0.909963


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.24it/s, accuracy=0.946, cost=1.79]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 11, training average cost 3.585703, training average accuracy 0.906560


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.09it/s, accuracy=0.932, cost=2.13]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 11, testing average cost 3.183928, testing average accuracy 0.913407


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.23it/s, accuracy=0.963, cost=1.25]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 12, training average cost 3.145460, training average accuracy 0.911170


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.15it/s, accuracy=0.917, cost=1.99]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 12, testing average cost 2.762684, testing average accuracy 0.920866


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.28it/s, accuracy=0.948, cost=0.997]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 13, training average cost 2.571675, training average accuracy 0.925053


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.09it/s, accuracy=0.936, cost=2.1] 
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 13, testing average cost 2.185514, testing average accuracy 0.933096


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.32it/s, accuracy=0.945, cost=0.933]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 14, training average cost 2.353962, training average accuracy 0.927918


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.09it/s, accuracy=0.967, cost=1.38]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 14, testing average cost 2.308791, testing average accuracy 0.926613


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.31it/s, accuracy=0.978, cost=0.556]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 15, training average cost 2.083375, training average accuracy 0.935075


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.12it/s, accuracy=0.956, cost=1.34] 
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 15, testing average cost 1.793387, testing average accuracy 0.941247


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.31it/s, accuracy=0.936, cost=0.829]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 16, training average cost 1.986355, training average accuracy 0.935977


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.08it/s, accuracy=0.946, cost=2.84]
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 16, testing average cost 1.890359, testing average accuracy 0.937322


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.18it/s, accuracy=0.988, cost=0.521]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 17, training average cost 1.878024, training average accuracy 0.937826


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.14it/s, accuracy=0.929, cost=2.35] 
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 17, testing average cost 1.570755, testing average accuracy 0.947188


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.31it/s, accuracy=0.958, cost=1.09] 
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 18, training average cost 1.830582, training average accuracy 0.940515


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.09it/s, accuracy=0.933, cost=2.04] 
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 18, testing average cost 1.660526, testing average accuracy 0.940811


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.28it/s, accuracy=0.973, cost=0.497]
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 19, training average cost 1.603198, training average accuracy 0.945181


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.13it/s, accuracy=0.956, cost=1.29] 
minibatch loop:   0%|          | 0/408 [00:00<?, ?it/s]

epoch 19, testing average cost 1.342898, testing average accuracy 0.951278


minibatch loop: 100%|██████████| 408/408 [01:41<00:00,  4.31it/s, accuracy=0.998, cost=0.29] 
testing minibatch loop:   0%|          | 0/136 [00:00<?, ?it/s]

epoch 20, training average cost 1.207862, training average accuracy 0.955496


testing minibatch loop: 100%|██████████| 136/136 [00:33<00:00,  4.12it/s, accuracy=0.961, cost=0.952]

epoch 20, testing average cost 1.022993, testing average accuracy 0.959654





In [12]:
import random

random_index = random.randint(0, len(test_X) - 1)
batch_x = test_X[random_index : random_index + 1]
print(
    'real:',
    ''.join(
        [idx2char[no] for no in test_Y[random_index : random_index + 1][0]]
    ),
)
batch_y = sparse_tuple_from(test_Y[random_index : random_index + 1])
pred = sess.run(model.preds, feed_dict = {model.X: batch_x})[0]
print('predicted:', ''.join([idx2char[no] for no in pred]))

real: tolong sebut barbeku
predicted: tolong sebut bareku
