In [1]:
import numpy as np
import glob

In [2]:
# Collect the paths of all training spectrogram files (any name ending in "npy").
# glob returns the paths in arbitrary, filesystem-dependent order.
spectrogram = glob.glob('spectrogram-train/*npy')
# Display how many training files were found.
len(spectrogram)

8184

In [3]:
def filter_text(string):
    """Derive the space-separated transcript encoded in a spectrogram file path.

    The path is lower-cased, reduced to its final component, cut at the first
    '.', and split on '-' (with '<>' treated as '-'). Purely numeric tokens and
    the tags 'man', 'woman' and 'augment' are dropped; the remaining tokens are
    joined with single spaces.

    Args:
        string: File path such as 'spectrogram-train/tolong-sebut-x-man-1.npy'.
            Any number of leading directories (including none) is accepted.

    Returns:
        The transcript as a lower-case string.
    """
    # Use the last path component instead of a fixed index so that bare
    # filenames and nested directories are handled as well.
    filename = string.lower().rsplit('/', 1)[-1]
    stem = filename.split('.')[0]
    tokens = stem.replace('<>', '-').split('-')
    ignored = {'man', 'woman', 'augment'}
    return ' '.join(w for w in tokens if not w.isdigit() and w not in ignored)

# Sanity check: display the transcript derived from the last globbed training path.
filter_text(spectrogram[-1])

'tolong sebut pariahship'

In [4]:
# Training set: one transcript string and one loaded spectrogram array per
# file, both in the order of the `spectrogram` path list.
train_Y = [filter_text(path) for path in spectrogram]
train_X = [np.load(path) for path in spectrogram]

In [5]:
# Re-bind `spectrogram` to the test-set paths; the training paths collected
# earlier are no longer reachable through this name.
spectrogram = glob.glob('spectrogram-test/*npy')
# Display how many test files were found.
len(spectrogram)

293

In [6]:
# Test set: one transcript string and one loaded spectrogram array per file,
# both in the order of the (now test) `spectrogram` path list.
test_Y = [filter_text(path) for path in spectrogram]
test_X = [np.load(path) for path in spectrogram]

In [7]:
import tensorflow as tf
from tqdm import tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [8]:
# Zero-pad every spectrogram at the end ('post') so each split becomes a single
# float32 array. The two splits are padded independently of each other.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

train_X = pad_sequences(train_X, dtype = 'float32', padding = 'post')
test_X = pad_sequences(test_X, dtype = 'float32', padding = 'post')

In [9]:
# Character vocabulary over both splits. Sorting makes the char <-> index
# mapping identical on every run; iterating a bare set of strings gives an
# order that can change between interpreter sessions.
chars = sorted({c for target in train_Y + test_Y for c in target})
# Index 0 is '<PAD>', indices 1..len(chars) are the characters, and one further
# class is left over at the end (presumably the CTC blank, which tf.nn.ctc_loss
# places at num_classes - 1 — confirm against the TF version in use).
num_classes = len(chars) + 2

idx2char = {idx + 1: char for idx, char in enumerate(chars)}
idx2char[0] = '<PAD>'
char2idx = {char: idx for idx, char in idx2char.items()}

In [10]:
# Replace every transcript string by its list of character indices.
# Note: this rebinds train_Y / test_Y, so the cell must only be run once per
# kernel session (a second run would look up integers in char2idx).
train_Y = [list(map(char2idx.__getitem__, target)) for target in train_Y]
test_Y = [list(map(char2idx.__getitem__, target)) for target in test_Y]

In [11]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence in a batch to the length of the longest one.

    Args:
        sentence_batch: Sequence of lists of label indices.
        pad_int: Value appended to the shorter lists.

    Returns:
        A tuple ``(padded_seqs, seq_lens)``: the padded copies (the inputs are
        not modified) and the original, unpadded length of each sequence.
        An empty batch yields ``([], [])``.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError in max().
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        sentence + [pad_int] * (max_sentence_len - len(sentence))
        for sentence in sentence_batch
    ]
    return padded_seqs, seq_lens

def sparse_tuple_from(sequences, dtype=np.int32):
    """Convert a batch of label sequences into a (indices, values, shape) triple.

    The triple is in the layout accepted when feeding a sparse placeholder.

    Args:
        sequences: Sequence of label-index lists, one per batch element.
        dtype: NumPy dtype of the returned ``values`` array.

    Returns:
        indices: int64 array of shape (N, 2) holding (batch row, position)
            for each of the N labels.
        values: Array of shape (N,) with the labels in row-major order.
        shape: int64 array ``[len(sequences), longest sequence length]``.
    """
    indices = [
        (row, col)
        for row, seq in enumerate(sequences)
        for col in range(len(seq))
    ]
    values = [label for seq in sequences for label in seq]
    # Derive the width from the sequence lengths rather than from
    # indices.max(), which fails when there are no labels at all.
    max_len = max((len(seq) for seq in sequences), default=0)

    # reshape keeps the (N, 2) layout even when N == 0.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return indices, values, shape

In [12]:
# Network hyper-parameters. The dilation schedule is five repetitions of the
# powers of two from 1 to 512 (50 entries in total).
settings = dict(
    filter_width = 2,
    sample_rate = 16000,
    dilations = [2 ** exponent for exponent in range(10)] * 5,
    residual_channels = 32,
    dilation_channels = 32,
    quantization_channels = 256,
    skip_channels = 512,
    initial_filter_width = 32,
)

In [13]:
def pad_second_dim(x, desired_size):
    """Zero-pad a 2-D tensor on the right of axis 1 up to ``desired_size`` columns.

    Args:
        x: 2-D tensor of any dtype supported by ``tf.pad``.
        desired_size: Scalar (Python int or int32 tensor) giving the target
            width; it must be at least the current width of ``x``.

    Returns:
        A tensor with the same dtype and row count as ``x`` whose second
        dimension equals ``desired_size``.
    """
    # tf.pad fills with zeros of x's own dtype, so this is not restricted to
    # int32 inputs the way concatenating a tiled [[0]] constant would be.
    missing = desired_size - tf.shape(x)[1]
    return tf.pad(x, [[0, 0], [0, missing]])

class Model:
    """Stack of gated, dilated 1-D convolutions trained with CTC loss (TF1 graph mode).

    Building an instance adds all ops to the current default graph and exposes:

    * ``X`` - float32 placeholder ``[batch, time, num_features]``.
    * ``Y`` - int32 sparse placeholder holding the CTC target labels.
    * ``label`` / ``Y_seq_len`` - dense int32 labels and their lengths; they are
      only used to compute ``accuracy``.
    * ``preds`` - dense tensor of beam-search decoded label indices.
    * ``cost`` - mean CTC loss over the batch.
    * ``optimizer`` - Adam training op minimising ``cost``.
    * ``prediction`` / ``mask_label`` - decoded and reference labels at the
      positions selected by the label-length mask.
    * ``accuracy`` - mean position-wise equality of the two tensors above.

    Reads the module-level ``settings`` dict (keys ``residual_channels``,
    ``dilations``, ``filter_width``) and the module-level ``num_classes``.
    """
    def __init__(
        self,
        learning_rate,
        num_features,
        block_size = 128,
        dropout = 1.0,
    ):
        """Build the graph.

        Args:
            learning_rate: Learning rate passed to ``tf.train.AdamOptimizer``.
            num_features: Size of the last axis of the input placeholder.
            block_size: Not referenced anywhere in this constructor.
            dropout: Not referenced anywhere in this constructor.
        """
        self.X = tf.placeholder(tf.float32, [None, None, num_features])
        self.Y = tf.sparse_placeholder(tf.int32)
        # Per-example input length: number of time steps whose feature sum is
        # non-zero, plus 10. A real frame whose features sum to exactly zero is
        # therefore not counted.
        # NOTE(review): the reason for the +10 margin is not visible here — confirm.
        seq_lens = tf.count_nonzero(
            tf.reduce_sum(self.X, -1), 1, dtype = tf.int32
        ) + 10
        # Clip the lengths so they never exceed the time dimension of X.
        filled = tf.fill(tf.shape(seq_lens), tf.shape(self.X)[1])
        seq_lens = tf.where(seq_lens > tf.shape(self.X)[1], filled, seq_lens)
        self.label = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])

        def residual_block(x, size, rate, block):
            # One gated unit: tanh "filter" conv multiplied by sigmoid "gate"
            # conv, followed by a 1x1 conv. Returns (x + out, out): the residual
            # output for the next block and the branch output that the caller
            # accumulates. All convs use 'same' padding.
            with tf.variable_scope('block_%d_%d' % (block, rate), reuse = False):
                conv_filter = tf.layers.conv1d(
                    x,
                    settings['residual_channels'],
                    kernel_size = size,
                    strides = 1,
                    padding = 'same',
                    dilation_rate = rate,
                    activation = tf.nn.tanh,
                )
                conv_gate = tf.layers.conv1d(
                    x,
                    settings['residual_channels'],
                    kernel_size = size,
                    strides = 1,
                    padding = 'same',
                    dilation_rate = rate,
                    activation = tf.nn.sigmoid,
                )
                out = tf.multiply(conv_filter, conv_gate)
                out = tf.layers.conv1d(
                    out,
                    settings['residual_channels'],
                    kernel_size = 1,
                    strides = 1,
                    padding = 'same',
                )
                return tf.add(x, out), out
            
        # Project the input features to `residual_channels` with a 1x1 conv.
        forward = tf.layers.conv1d(self.X, settings['residual_channels'], 
                                   kernel_size = 1, strides = 1, padding = 'SAME',
                                  activation = tf.nn.relu)
        # `zeros` is the running sum of every block's branch output.
        zeros = tf.zeros_like(forward)
        for i, r in enumerate(settings['dilations']):
            forward, s = residual_block(forward, size=settings['filter_width'], 
                                        rate=r, block=i)
            zeros = tf.add(zeros,s)
        zeros = tf.nn.relu(zeros)
        # Two 1x1 convs map the summed branch outputs to per-frame class logits.
        logits = tf.layers.conv1d(zeros, zeros.shape[-1], kernel_size = 1, strides = 1, padding = 'SAME',
                                 activation = tf.nn.relu)
        logits = tf.layers.conv1d(logits, num_classes, kernel_size = 1, strides = 1, padding = 'SAME')
        # The CTC ops below are given [time, batch, classes].
        time_major = tf.transpose(logits, [1, 0, 2])
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(time_major, seq_lens)
        # Keep the first decoded path and densify it (missing entries become 0).
        decoded = tf.to_int32(decoded[0])
        self.preds = tf.sparse_tensor_to_dense(decoded)
        self.cost = tf.reduce_mean(
            tf.nn.ctc_loss(
                self.Y,
                time_major,
                seq_lens
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        # Accuracy: cut the decoded labels to at most the longest label length,
        # pad them back up to exactly that length, then compare with `label`
        # position by position at the positions selected by the length mask.
        preds = self.preds[:, :tf.reduce_max(self.Y_seq_len)]
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        preds = pad_second_dim(preds, tf.reduce_max(self.Y_seq_len))
        y_t = tf.cast(preds, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.label, masks)
        self.mask_label = mask_label
        correct_pred = tf.equal(self.prediction, mask_label)
        # `correct_index` is assigned but not used afterwards.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [14]:
# Training hyper-parameters (`batch_size` and `epoch` are read by the training loop).
learning_rate, batch_size, epoch = 1e-3, 128, 20

# Start from an empty default graph and open an interactive session on it.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Build the model for the feature width of the padded training array, then
# initialise every variable the constructor created.
model = Model(learning_rate = learning_rate, num_features = train_X.shape[2])
sess.run(tf.global_variables_initializer())

W0818 00:27:51.045110 139708866160448 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
W0818 00:27:51.063123 139708866160448 deprecation.py:323] From <ipython-input-13-8ba3e1e3c3c9>:19: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0818 00:27:51.066738 139708866160448 deprecation.py:323] From <ipython-input-13-8ba3e1e3c3c9>:55: conv1d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
W0818 00:27:51.071345 139708866160448 deprecati

In [15]:
def run_epoch(features, targets, desc, train):
    """Run one pass over a dataset in minibatches of `batch_size`.

    Args:
        features: Padded input array, indexed along the first axis.
        targets: List of label-index lists aligned with `features`.
        desc: Description shown on the tqdm progress bar.
        train: When True, `model.optimizer` is run as well so the weights are
            updated; when False only cost and accuracy are evaluated.

    Returns:
        Tuple (mean cost, mean accuracy) over the minibatches. Each minibatch
        counts equally, including a shorter final one.
    """
    fetches = [model.cost, model.accuracy]
    if train:
        fetches = [model.optimizer] + fetches

    costs, accuracies = [], []
    pbar = tqdm(range(0, len(features), batch_size), desc = desc)
    for i in pbar:
        # Slicing already stops at the end of the data for the last batch.
        batch_x = features[i : i + batch_size]
        y = targets[i : i + batch_size]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        # The last two fetched values are always cost and accuracy.
        cost, accuracy = sess.run(
            fetches,
            feed_dict = {model.X: batch_x, model.Y: batch_y,
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )[-2:]
        costs.append(cost)
        accuracies.append(accuracy)
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    return np.mean(costs), np.mean(accuracies)


for e in range(epoch):
    train_cost, train_accuracy = run_epoch(
        train_X, train_Y, 'minibatch loop', train = True)
    test_cost, test_accuracy = run_epoch(
        test_X, test_Y, 'testing minibatch loop', train = False)

    print('epoch %d, training avg cost %f, training avg accuracy %f'%(e + 1, train_cost,
                                                                      train_accuracy))

    print('epoch %d, testing avg cost %f, testing avg accuracy %f'%(e + 1, test_cost,
                                                                    test_accuracy))

minibatch loop: 100%|██████████| 64/64 [02:01<00:00,  1.70s/it, accuracy=0.235, cost=45.6] 
testing minibatch loop: 100%|██████████| 3/3 [00:04<00:00,  1.77s/it, accuracy=0.0501, cost=55.1]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 1, training avg cost 73.757271, training avg accuracy 0.087161
epoch 1, testing avg cost 57.603886, testing avg accuracy 0.052210


minibatch loop: 100%|██████████| 64/64 [01:50<00:00,  1.70s/it, accuracy=0.519, cost=29.4]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it, accuracy=0.116, cost=59]   
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 2, training avg cost 36.712948, training avg accuracy 0.411255
epoch 2, testing avg cost 61.431828, testing avg accuracy 0.099589


minibatch loop: 100%|██████████| 64/64 [01:53<00:00,  1.73s/it, accuracy=0.55, cost=26.2] 
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it, accuracy=0.0941, cost=67.5]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 3, training avg cost 29.202019, training avg accuracy 0.519884
epoch 3, testing avg cost 69.686897, testing avg accuracy 0.087342


minibatch loop: 100%|██████████| 64/64 [01:53<00:00,  1.73s/it, accuracy=0.558, cost=24.7]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it, accuracy=0.0721, cost=72.9]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 4, training avg cost 26.836681, training avg accuracy 0.546231
epoch 4, testing avg cost 75.610390, testing avg accuracy 0.078453


minibatch loop: 100%|██████████| 64/64 [01:54<00:00,  1.78s/it, accuracy=0.565, cost=23.8]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.34s/it, accuracy=0.0807, cost=79.2]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 5, training avg cost 25.642921, training avg accuracy 0.556141
epoch 5, testing avg cost 81.497337, testing avg accuracy 0.077070


minibatch loop: 100%|██████████| 64/64 [01:54<00:00,  1.77s/it, accuracy=0.567, cost=23.5]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.33s/it, accuracy=0.0807, cost=83.3]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 6, training avg cost 24.932297, training avg accuracy 0.560447
epoch 6, testing avg cost 85.687080, testing avg accuracy 0.074502


minibatch loop: 100%|██████████| 64/64 [01:55<00:00,  1.79s/it, accuracy=0.573, cost=23.2]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.29s/it, accuracy=0.0795, cost=84.1]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 7, training avg cost 24.394341, training avg accuracy 0.562530
epoch 7, testing avg cost 87.431053, testing avg accuracy 0.075308


minibatch loop: 100%|██████████| 64/64 [01:55<00:00,  1.80s/it, accuracy=0.573, cost=22.8]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it, accuracy=0.0807, cost=86.7]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 8, training avg cost 23.890350, training avg accuracy 0.564559
epoch 8, testing avg cost 90.656105, testing avg accuracy 0.074279


minibatch loop: 100%|██████████| 64/64 [01:56<00:00,  1.78s/it, accuracy=0.573, cost=22.5]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.31s/it, accuracy=0.0844, cost=90.1]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 9, training avg cost 23.465965, training avg accuracy 0.566088
epoch 9, testing avg cost 94.360939, testing avg accuracy 0.075735


minibatch loop: 100%|██████████| 64/64 [01:56<00:00,  1.79s/it, accuracy=0.574, cost=22.2]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it, accuracy=0.0782, cost=87.6]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 10, training avg cost 23.255550, training avg accuracy 0.566200
epoch 10, testing avg cost 91.583313, testing avg accuracy 0.078466


minibatch loop: 100%|██████████| 64/64 [01:57<00:00,  1.80s/it, accuracy=0.572, cost=22]  
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.36s/it, accuracy=0.077, cost=84.6] 
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 11, training avg cost 22.857719, training avg accuracy 0.567013
epoch 11, testing avg cost 89.300804, testing avg accuracy 0.082295


minibatch loop: 100%|██████████| 64/64 [01:57<00:00,  1.81s/it, accuracy=0.573, cost=21.8]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.36s/it, accuracy=0.099, cost=88.3] 
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 12, training avg cost 22.409367, training avg accuracy 0.568516
epoch 12, testing avg cost 92.792778, testing avg accuracy 0.090520


minibatch loop: 100%|██████████| 64/64 [01:58<00:00,  1.83s/it, accuracy=0.575, cost=21.3]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.35s/it, accuracy=0.106, cost=88.3] 
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 13, training avg cost 22.042625, training avg accuracy 0.569338
epoch 13, testing avg cost 91.758759, testing avg accuracy 0.100547


minibatch loop: 100%|██████████| 64/64 [01:58<00:00,  1.81s/it, accuracy=0.576, cost=21.1]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.37s/it, accuracy=0.099, cost=85.6]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 14, training avg cost 21.742012, training avg accuracy 0.569931
epoch 14, testing avg cost 90.373497, testing avg accuracy 0.101324


minibatch loop: 100%|██████████| 64/64 [01:58<00:00,  1.81s/it, accuracy=0.574, cost=20.8]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.36s/it, accuracy=0.115, cost=86.7]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 15, training avg cost 21.358143, training avg accuracy 0.570922
epoch 15, testing avg cost 91.317894, testing avg accuracy 0.117449


minibatch loop: 100%|██████████| 64/64 [01:59<00:00,  1.84s/it, accuracy=0.573, cost=20.5]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.37s/it, accuracy=0.11, cost=89]  
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 16, training avg cost 20.918570, training avg accuracy 0.572405
epoch 16, testing avg cost 93.527657, testing avg accuracy 0.119690


minibatch loop: 100%|██████████| 64/64 [02:00<00:00,  1.86s/it, accuracy=0.577, cost=20.1]
testing minibatch loop: 100%|██████████| 3/3 [00:04<00:00,  1.38s/it, accuracy=0.155, cost=89.2]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 17, training avg cost 20.517712, training avg accuracy 0.573615
epoch 17, testing avg cost 93.358742, testing avg accuracy 0.146370


minibatch loop: 100%|██████████| 64/64 [02:01<00:00,  1.86s/it, accuracy=0.576, cost=19.8]
testing minibatch loop: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it, accuracy=0.101, cost=98.5]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 18, training avg cost 20.398012, training avg accuracy 0.573949
epoch 18, testing avg cost 102.212433, testing avg accuracy 0.106286


minibatch loop: 100%|██████████| 64/64 [02:01<00:00,  1.86s/it, accuracy=0.576, cost=20.3]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.36s/it, accuracy=0.122, cost=88.1]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 19, training avg cost 20.530340, training avg accuracy 0.574338
epoch 19, testing avg cost 93.829498, testing avg accuracy 0.116883


minibatch loop: 100%|██████████| 64/64 [02:01<00:00,  1.90s/it, accuracy=0.587, cost=19.7]
testing minibatch loop: 100%|██████████| 3/3 [00:03<00:00,  1.37s/it, accuracy=0.0978, cost=88.8]

epoch 20, training avg cost 20.272179, training avg accuracy 0.573711
epoch 20, testing avg cost 95.344048, testing avg accuracy 0.102146





In [16]:
import random

# Decode one randomly chosen test utterance and print it next to its reference.
random_index = random.randint(0, len(test_X) - 1)
sample = slice(random_index, random_index + 1)

batch_x = test_X[sample]
reference = test_Y[sample][0]
print('real:', ''.join(idx2char[no] for no in reference))

# batch_y is built here but is not fed to the session run below.
batch_y = sparse_tuple_from(test_Y[sample])
pred = sess.run(model.preds, feed_dict = {model.X: batch_x})[0]
print('predicted:', ''.join(idx2char[no] for no in pred))

real: tolong sebut asternata
predicted: seb sebut ertiri
