In [1]:
import numpy as np
import os
import glob

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# I have multi-gpus

In [3]:
spectrogram = glob.glob('spectrogram-train/*npy')
len(spectrogram)

8184

In [4]:
def filter_text(string):
    string = string.lower()
    splitted = string.split('/')[1].split('.')[0].replace('<>','-').split('-')
    splitted = [w for w in splitted if not w.isdigit() and w not in ['man', 'woman', 'augment']]
    return ' '.join(splitted)

filter_text(spectrogram[-1])

'tolong sebut pariahship'

In [5]:
train_X, train_Y = [], []
for spec in spectrogram:
    train_Y.append(filter_text(spec))
    train_X.append(np.load(spec))

In [6]:
spectrogram = glob.glob('spectrogram-test/*npy')
len(spectrogram)

293

In [7]:
test_X, test_Y = [], []
for spec in spectrogram:
    test_Y.append(filter_text(spec))
    test_X.append(np.load(spec))

In [8]:
import tensorflow as tf
from tqdm import tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
train_X = tf.keras.preprocessing.sequence.pad_sequences(
    train_X, dtype = 'float32', padding = 'post'
)

test_X = tf.keras.preprocessing.sequence.pad_sequences(
    test_X, dtype = 'float32', padding = 'post'
)

In [10]:
chars = list(set([c for target in train_Y + test_Y for c in target]))
num_classes = len(chars) + 2

idx2char = {idx + 1: char for idx, char in enumerate(chars)}
idx2char[0] = '<PAD>'
char2idx = {char: idx for idx, char in idx2char.items()}

In [11]:
train_Y = [[char2idx[c] for c in target] for target in train_Y]
test_Y = [[char2idx[c] for c in target] for target in test_Y]

In [12]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)

    return indices, values, shape

In [13]:
def pad_second_dim(x, desired_size):
    padding = tf.tile([[0]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1]], 0))
    return tf.concat([x, padding], 1)

_BATCH_NORM_EPSILON = 1e-5
_BATCH_NORM_DECAY = 0.997
_CONV_FILTERS = 32

def batch_norm(inputs, training):
    return tf.layers.batch_normalization(
      inputs=inputs, momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON,
      fused=True, training=training)

def _conv_bn_layer(inputs, padding, filters, kernel_size, strides, layer_id,
                   training):
    inputs = tf.pad(
      inputs,
      [[0, 0], [padding[0], padding[0]], [padding[1], padding[1]], [0, 0]])
    inputs = tf.layers.conv2d(
      inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
      padding="valid", use_bias=False, activation=tf.nn.relu6,
      name="cnn_{}".format(layer_id))
    return batch_norm(inputs, training)

def _rnn_layer(inputs, rnn_cell, rnn_hidden_size, layer_id, is_batch_norm,
               is_bidirectional, training):
    if is_batch_norm:
        inputs = batch_norm(inputs, training)
    
    fw_cell = rnn_cell(num_units=rnn_hidden_size,
                     name="rnn_fw_{}".format(layer_id))
    bw_cell = rnn_cell(num_units=rnn_hidden_size,
                     name="rnn_bw_{}".format(layer_id))

    if is_bidirectional:
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=fw_cell, cell_bw=bw_cell, inputs=inputs, dtype=tf.float32,
        swap_memory=True)
        rnn_outputs = tf.concat(outputs, -1)
    else:
        rnn_outputs = tf.nn.dynamic_rnn(
        fw_cell, inputs, dtype=tf.float32, swap_memory=True)

    return rnn_outputs

class Model:
    def __init__(
        self,
        size_layers,
        learning_rate,
        num_features,
        dropout = 1.0,
    ):
        self.X = tf.placeholder(tf.float32, [None, None, num_features])
        self.label = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])
        self.training = tf.placeholder(tf.bool, None)
        self.Y = tf.sparse_placeholder(tf.int32)
        x = tf.expand_dims(self.X, -1)

        inputs = _conv_bn_layer(
            x, padding=(20, 5), filters=_CONV_FILTERS, kernel_size=(41, 11),
            strides=(2, 2), layer_id=1, training=self.training)
        
        inputs = _conv_bn_layer(
            inputs, padding=(10, 5), filters=_CONV_FILTERS, kernel_size=(21, 11),
            strides=(2, 1), layer_id=2, training=self.training)
        
        batch_size = tf.shape(inputs)[0]
        feat_size = inputs.get_shape().as_list()[2]
        inputs = tf.reshape(
            inputs,
            [batch_size, -1, feat_size * _CONV_FILTERS // 4])
        print(inputs)
        
        seq_lens = tf.count_nonzero(
            tf.reduce_sum(inputs, -1), 1, dtype = tf.int32
        ) + 30
        filled = tf.fill(tf.shape(seq_lens), tf.shape(inputs)[1])
        seq_lens = tf.where(seq_lens > tf.shape(inputs)[1], filled, seq_lens)
        
        rnn_cell = tf.nn.rnn_cell.GRUCell
        for layer_counter in range(5):
            is_batch_norm = (layer_counter != 0)
            inputs = _rnn_layer(
              inputs, rnn_cell, size_layers, layer_counter + 1,
              is_batch_norm, True, self.training)
        

        logits = tf.layers.dense(inputs, num_classes)
        self.logits = logits
        time_major = tf.transpose(logits, [1, 0, 2])
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(time_major, seq_lens)
        decoded = tf.to_int32(decoded[0])
        self.preds = tf.sparse_tensor_to_dense(decoded)
        self.cost = tf.reduce_mean(
            tf.nn.ctc_loss(
                self.Y,
                time_major,
                seq_lens
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        preds = self.preds[:, :tf.reduce_max(self.Y_seq_len)]
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        preds = pad_second_dim(preds, tf.reduce_max(self.Y_seq_len))
        y_t = tf.cast(preds, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.label, masks)
        self.mask_label = mask_label
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [14]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

size_layers = 512
learning_rate = 1e-3
num_layers = 2
batch_size = 128
epoch = 20

model = Model(size_layers, learning_rate, train_X[0].shape[1])
sess.run(tf.global_variables_initializer())

W0818 00:21:09.685058 140441058170688 deprecation.py:323] From <ipython-input-13-82342b2245c7>:22: conv2d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
W0818 00:21:09.689858 140441058170688 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0818 00:21:09.915712 140441058170688 deprecation.py:323] From <ipython-input-13-82342b2245c7>:12: batch_normalization (from tensorflow.python.layers.normalization) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_de

Tensor("Reshape:0", shape=(?, ?, 1600), dtype=float32)


W0818 00:21:11.819599 140441058170688 deprecation.py:323] From <ipython-input-13-82342b2245c7>:90: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0818 00:21:12.094319 140441058170688 deprecation.py:323] From <ipython-input-13-82342b2245c7>:94: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


In [15]:
for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_cost, train_accuracy, test_cost, test_accuracy = [], [], [], []
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, len(train_X))]
        y = train_Y[i : min(i + batch_size, len(train_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len,
                        model.training: True},
        )
        train_cost.append(cost)
        train_accuracy.append(accuracy)
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'testing minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, len(test_X))]
        y = test_Y[i : min(i + batch_size, len(test_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        cost, accuracy = sess.run(
            [model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len,
                        model.training: True},
        )
        
        test_cost.append(cost)
        test_accuracy.append(accuracy)
        
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    
    print('epoch %d, training avg cost %f, training avg accuracy %f'%(e + 1, np.mean(train_cost), 
                                                                      np.mean(train_accuracy)))
    
    print('epoch %d, testing avg cost %f, testing avg accuracy %f'%(e + 1, np.mean(test_cost), 
                                                                    np.mean(test_accuracy)))

minibatch loop: 100%|██████████| 64/64 [03:55<00:00,  3.63s/it, accuracy=0.507, cost=29.7] 
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.39s/it, accuracy=0.527, cost=29.7]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 1, training avg cost 56.826046, training avg accuracy 0.291741
epoch 1, testing avg cost 33.655415, testing avg accuracy 0.528531


minibatch loop: 100%|██████████| 64/64 [03:55<00:00,  3.63s/it, accuracy=0.571, cost=26.2]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.21s/it, accuracy=0.575, cost=28.2]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 2, training avg cost 28.403610, training avg accuracy 0.556301
epoch 2, testing avg cost 30.760641, testing avg accuracy 0.567929


minibatch loop: 100%|██████████| 64/64 [03:57<00:00,  3.67s/it, accuracy=0.568, cost=24.7]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.27s/it, accuracy=0.562, cost=27.5]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 3, training avg cost 26.134878, training avg accuracy 0.562667
epoch 3, testing avg cost 29.765184, testing avg accuracy 0.563527


minibatch loop: 100%|██████████| 64/64 [03:58<00:00,  3.66s/it, accuracy=0.57, cost=24.1] 
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.24s/it, accuracy=0.544, cost=27.1]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 4, training avg cost 25.135914, training avg accuracy 0.566498
epoch 4, testing avg cost 29.284140, testing avg accuracy 0.546186


minibatch loop: 100%|██████████| 64/64 [03:59<00:00,  3.69s/it, accuracy=0.572, cost=23.1]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.26s/it, accuracy=0.565, cost=28.5]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 5, training avg cost 24.079517, training avg accuracy 0.567874
epoch 5, testing avg cost 28.961710, testing avg accuracy 0.556997


minibatch loop: 100%|██████████| 64/64 [04:00<00:00,  3.71s/it, accuracy=0.573, cost=22.4]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.24s/it, accuracy=0.545, cost=26.8]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 6, training avg cost 23.452354, training avg accuracy 0.569967
epoch 6, testing avg cost 28.446762, testing avg accuracy 0.543706


minibatch loop: 100%|██████████| 64/64 [04:00<00:00,  3.70s/it, accuracy=0.576, cost=21.9]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.25s/it, accuracy=0.561, cost=28.1]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 7, training avg cost 23.057459, training avg accuracy 0.571221
epoch 7, testing avg cost 29.153875, testing avg accuracy 0.552550


minibatch loop: 100%|██████████| 64/64 [04:00<00:00,  3.69s/it, accuracy=0.572, cost=21.6]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.25s/it, accuracy=0.575, cost=28.6]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 8, training avg cost 22.712942, training avg accuracy 0.570491
epoch 8, testing avg cost 27.985113, testing avg accuracy 0.564273


minibatch loop: 100%|██████████| 64/64 [04:02<00:00,  3.73s/it, accuracy=0.574, cost=21.5]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.29s/it, accuracy=0.583, cost=27.5]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 9, training avg cost 21.943815, training avg accuracy 0.574517
epoch 9, testing avg cost 27.629715, testing avg accuracy 0.566216


minibatch loop: 100%|██████████| 64/64 [04:03<00:00,  3.75s/it, accuracy=0.587, cost=20.7]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.27s/it, accuracy=0.57, cost=24.5] 
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 10, training avg cost 21.678368, training avg accuracy 0.574352
epoch 10, testing avg cost 27.116419, testing avg accuracy 0.559392


minibatch loop: 100%|██████████| 64/64 [04:03<00:00,  3.71s/it, accuracy=0.578, cost=20.2]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.26s/it, accuracy=0.572, cost=27.6]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 11, training avg cost 21.170765, training avg accuracy 0.577402
epoch 11, testing avg cost 27.250391, testing avg accuracy 0.563014


minibatch loop: 100%|██████████| 64/64 [04:03<00:00,  3.77s/it, accuracy=0.59, cost=19.7] 
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.28s/it, accuracy=0.572, cost=27.5]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 12, training avg cost 20.354532, training avg accuracy 0.581516
epoch 12, testing avg cost 27.418417, testing avg accuracy 0.564796


minibatch loop: 100%|██████████| 64/64 [04:03<00:00,  3.77s/it, accuracy=0.593, cost=19.2]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.32s/it, accuracy=0.573, cost=28]  
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 13, training avg cost 19.635582, training avg accuracy 0.582762
epoch 13, testing avg cost 28.595886, testing avg accuracy 0.564175


minibatch loop: 100%|██████████| 64/64 [04:04<00:00,  3.74s/it, accuracy=0.595, cost=18.6]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.27s/it, accuracy=0.575, cost=27.9]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 14, training avg cost 19.300629, training avg accuracy 0.584145
epoch 14, testing avg cost 27.463984, testing avg accuracy 0.567382


minibatch loop: 100%|██████████| 64/64 [04:04<00:00,  3.75s/it, accuracy=0.597, cost=17.7]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.27s/it, accuracy=0.583, cost=26.9]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 15, training avg cost 18.425629, training avg accuracy 0.586362
epoch 15, testing avg cost 27.066568, testing avg accuracy 0.571014


minibatch loop: 100%|██████████| 64/64 [04:05<00:00,  3.78s/it, accuracy=0.595, cost=17.6]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.31s/it, accuracy=0.567, cost=28.8]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 16, training avg cost 17.594818, training avg accuracy 0.589440
epoch 16, testing avg cost 27.866278, testing avg accuracy 0.564799


minibatch loop: 100%|██████████| 64/64 [04:05<00:00,  3.76s/it, accuracy=0.602, cost=16.5]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.32s/it, accuracy=0.575, cost=26.7]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 17, training avg cost 17.199781, training avg accuracy 0.591143
epoch 17, testing avg cost 27.648123, testing avg accuracy 0.569038


minibatch loop: 100%|██████████| 64/64 [04:06<00:00,  3.78s/it, accuracy=0.602, cost=16.1]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.32s/it, accuracy=0.567, cost=26.1]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 18, training avg cost 16.425949, training avg accuracy 0.593123
epoch 18, testing avg cost 28.353882, testing avg accuracy 0.567812


minibatch loop: 100%|██████████| 64/64 [04:06<00:00,  3.77s/it, accuracy=0.609, cost=15.7]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.36s/it, accuracy=0.579, cost=28.5]
minibatch loop:   0%|          | 0/64 [00:00<?, ?it/s]

epoch 19, training avg cost 15.584755, training avg accuracy 0.596256
epoch 19, testing avg cost 28.705843, testing avg accuracy 0.570126


minibatch loop: 100%|██████████| 64/64 [04:07<00:00,  3.80s/it, accuracy=0.605, cost=14.5]
testing minibatch loop: 100%|██████████| 3/3 [00:06<00:00,  2.34s/it, accuracy=0.572, cost=24.7]

epoch 20, training avg cost 14.778521, training avg accuracy 0.599138
epoch 20, testing avg cost 28.008181, testing avg accuracy 0.565119





In [16]:
import random

random_index = random.randint(0, len(test_X) - 1)
batch_x = test_X[random_index : random_index + 1]
print(
    'real:',
    ''.join(
        [idx2char[no] for no in test_Y[random_index : random_index + 1][0]]
    ),
)
batch_y = sparse_tuple_from(test_Y[random_index : random_index + 1])
pred = sess.run(model.preds, feed_dict = {model.X: batch_x, model.training: False})[0]
print('predicted:', ''.join([idx2char[no] for no in pred]))

real: sebut perkataan ambul
predicted: se
