In [1]:
import numpy as np
import glob

In [2]:
spectrogram = glob.glob('spectrogram-train/*npy')
len(spectrogram)

8184

In [3]:
def filter_text(string):
    string = string.lower()
    splitted = string.split('/')[1].split('.')[0].replace('<>','-').split('-')
    splitted = [w for w in splitted if not w.isdigit() and w not in ['man', 'woman', 'augment']]
    return ' '.join(splitted)

filter_text(spectrogram[-1])

'tolong sebut pariahship'

In [4]:
train_X, train_Y = [], []
for spec in spectrogram:
    train_Y.append(filter_text(spec))
    train_X.append(np.load(spec))

In [5]:
train_X[0].shape

(56, 400)

In [6]:
spectrogram = glob.glob('spectrogram-test/*npy')
len(spectrogram)

293

In [7]:
test_X, test_Y = [], []
for spec in spectrogram:
    test_Y.append(filter_text(spec))
    test_X.append(np.load(spec))

In [8]:
import tensorflow as tf
from tqdm import tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
train_X = tf.keras.preprocessing.sequence.pad_sequences(
    train_X, maxlen = 150, dtype = 'float32', padding = 'post'
)

test_X = tf.keras.preprocessing.sequence.pad_sequences(
    test_X, maxlen = 150, dtype = 'float32', padding = 'post'
)

In [10]:
chars = list(set([c for target in train_Y + test_Y for c in target]))

idx2char = {idx + 4: char for idx, char in enumerate(chars)}
idx2char[0] = '<PAD>'
idx2char[1] = '<GO>'
idx2char[2] = '<EOS>'
idx2char[3] = '<UNK>'
char2idx = {char: idx for idx, char in idx2char.items()}

In [11]:
idx2char

{4: 'n',
 5: 'c',
 6: 'r',
 7: 'a',
 8: 'j',
 9: 'd',
 10: 'k',
 11: 's',
 12: 'q',
 13: 'm',
 14: 'y',
 15: 'h',
 16: 'u',
 17: 'x',
 18: 'o',
 19: 't',
 20: ' ',
 21: 'f',
 22: 'l',
 23: 'w',
 24: 'i',
 25: 'e',
 26: 'p',
 27: 'g',
 28: 'z',
 29: 'b',
 30: 'v',
 0: '<PAD>',
 1: '<GO>',
 2: '<EOS>',
 3: '<UNK>'}

In [12]:
train_Y = [[char2idx[c] for c in target] + [2] for target in train_Y]
test_Y = [[char2idx[c] for c in target] + [2] for target in test_Y]

In [13]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [14]:
train_X.shape, test_X.shape

((8184, 150, 400), (293, 150, 400))

In [15]:
# https://github.com/guillaumegenthial/im2latex/blob/master/model/components/attention_mechanism.py

class AttentionMechanism(object):
    """Class to compute attention over an image"""

    def __init__(self, img, dim_e, tiles=1):
        """Stores the image under the right shape.
        We loose the H, W dimensions and merge them into a single
        dimension that corresponds to "regions" of the image.
        Args:
            img: (tf.Tensor) image
            dim_e: (int) dimension of the intermediary vector used to
                compute attention
            tiles: (int) default 1, input to context h may have size
                    (tile * batch_size, ...)
        """
        if len(img.shape) == 2:
            self._img = img
        elif len(img.shape) == 3:
            N    = tf.shape(img)[0]
            H    = tf.shape(img)[1]
            C    = img.shape[2].value
            self._img = tf.reshape(img, shape=[N, H, C])
        else:
            print("Image shape not supported")
            raise NotImplementedError

        # dimensions
        self._n_regions  = tf.shape(self._img)[1]
        self._n_channels = self._img.shape[2].value
        self._dim_e      = dim_e
        self._tiles      = tiles
        self._scope_name = "att_mechanism"

        # attention vector over the image
        self._att_img = tf.layers.dense(
            inputs=self._img,
            units=self._dim_e,
            use_bias=False,
            name="att_img")


    def context(self, h):
        """Computes attention
        Args:
            h: (batch_size, num_units) hidden state
        Returns:
            c: (batch_size, channels) context vector
        """
        with tf.variable_scope(self._scope_name):
            if self._tiles > 1:
                att_img = tf.expand_dims(self._att_img, axis=1)
                att_img = tf.tile(att_img, multiples=[1, self._tiles, 1, 1])
                att_img = tf.reshape(att_img, shape=[-1, self._n_regions,
                        self._dim_e])
                img = tf.expand_dims(self._img, axis=1)
                img = tf.tile(img, multiples=[1, self._tiles, 1, 1])
                img = tf.reshape(img, shape=[-1, self._n_regions,
                        self._n_channels])
            else:
                att_img = self._att_img
                img     = self._img

            # computes attention over the hidden vector
            att_h = tf.layers.dense(inputs=h, units=self._dim_e, use_bias=False)

            # sums the two contributions
            att_h = tf.expand_dims(att_h, axis=1)
            att = tf.tanh(att_img + att_h)

            # computes scalar product with beta vector
            # works faster with a matmul than with a * and a tf.reduce_sum
            att_beta = tf.get_variable("att_beta", shape=[self._dim_e, 1],
                    dtype=tf.float32)
            att_flat = tf.reshape(att, shape=[-1, self._dim_e])
            e = tf.matmul(att_flat, att_beta)
            e = tf.reshape(e, shape=[-1, self._n_regions])

            # compute weights
            a = tf.nn.softmax(e)
            a = tf.expand_dims(a, axis=-1)
            c = tf.reduce_sum(a * img, axis=1)

            return c


    def initial_cell_state(self, cell):
        """Returns initial state of a cell computed from the image
        Assumes cell.state_type is an instance of named_tuple.
        Ex: LSTMStateTuple
        Args:
            cell: (instance of RNNCell) must define _state_size
        """
        _states_0 = []
        for hidden_name in cell._state_size._fields:
            hidden_dim = getattr(cell._state_size, hidden_name)
            h = self.initial_state(hidden_name, hidden_dim)
            _states_0.append(h)

        initial_state_cell = type(cell.state_size)(*_states_0)

        return initial_state_cell


    def initial_state(self, name, dim):
        """Returns initial state of dimension specified by dim"""
        with tf.variable_scope(self._scope_name):
            img_mean = tf.reduce_mean(self._img, axis=1)
            W = tf.get_variable("W_{}_0".format(name), shape=[self._n_channels,
                    dim])
            b = tf.get_variable("b_{}_0".format(name), shape=[dim])
            h = tf.tanh(tf.matmul(img_mean, W) + b)

            return h

In [16]:
# https://github.com/guillaumegenthial/im2latex/blob/master/model/components/attention_cell.py

import collections
from tensorflow.contrib.rnn import RNNCell, LSTMStateTuple


AttentionState = collections.namedtuple("AttentionState", ("cell_state", "o"))


class AttentionCell(RNNCell):
    def __init__(self, cell, attention_mechanism, dropout, dim_e,
                 dim_o, num_units,
        num_proj, dtype=tf.float32):
        """
        Args:
            cell: (RNNCell)
            attention_mechanism: (AttentionMechanism)
            dropout: (tf.float)
            attn_cell_config: (dict) hyper params
        """
        # variables and tensors
        self._cell                = cell
        self._attention_mechanism = attention_mechanism
        self._dropout             = dropout

        # hyperparameters and shapes
        self._n_channels     = self._attention_mechanism._n_channels
        self._dim_e          = dim_e
        self._dim_o          = dim_o
        self._num_units      = num_units
        self._num_proj       = num_proj
        self._dtype          = dtype

        # for RNNCell
        self._state_size = AttentionState(self._cell._state_size, self._dim_o)


    @property
    def state_size(self):
        return self._state_size


    @property
    def output_size(self):
        return self._num_proj


    @property
    def output_dtype(self):
        return self._dtype


    def initial_state(self):
        """Returns initial state for the lstm"""
        initial_cell_state = self._attention_mechanism.initial_cell_state(self._cell)
        initial_o          = self._attention_mechanism.initial_state("o", self._dim_o)

        return AttentionState(initial_cell_state, initial_o)


    def step(self, embedding, attn_cell_state):
        """
        Args:
            embedding: shape = (batch_size, dim_embeddings) embeddings
                from previous time step
            attn_cell_state: (AttentionState) state from previous time step
        """
        prev_cell_state, o = attn_cell_state

        scope = tf.get_variable_scope()
        with tf.variable_scope(scope):
            # compute new h
            x                     = tf.concat([embedding, o], axis=-1)
            new_h, new_cell_state = self._cell.__call__(x, prev_cell_state)
            new_h = tf.nn.dropout(new_h, self._dropout)

            # compute attention
            c = self._attention_mechanism.context(new_h)

            # compute o
            o_W_c = tf.get_variable("o_W_c", dtype=tf.float32,
                    shape=(self._n_channels, self._dim_o))
            o_W_h = tf.get_variable("o_W_h", dtype=tf.float32,
                    shape=(self._num_units, self._dim_o))

            new_o = tf.tanh(tf.matmul(new_h, o_W_h) + tf.matmul(c, o_W_c))
            new_o = tf.nn.dropout(new_o, self._dropout)

            y_W_o = tf.get_variable("y_W_o", dtype=tf.float32,
                    shape=(self._dim_o, self._num_proj))
            logits = tf.matmul(new_o, y_W_o)

            # new Attn cell state
            new_state = AttentionState(new_cell_state, new_o)

            return logits, new_state


    def __call__(self, inputs, state):
        """
        Args:
            inputs: the embedding of the previous word for training only
            state: (AttentionState) (h, o) where h is the hidden state and
                o is the vector used to make the prediction of
                the previous word
        """
        new_output, new_state = self.step(inputs, state)

        return (new_output, new_state)

In [17]:
from __future__ import division
import math
import numpy as np
from six.moves import xrange
import tensorflow as tf


# taken from https://github.com/tensorflow/tensor2tensor/blob/37465a1759e278e8f073cd04cd9b4fe377d3c740/tensor2tensor/layers/common_attention.py

# taken from https://raw.githubusercontent.com/guillaumegenthial/im2latex/master/model/components/positional.py

def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4):
    """Adds a bunch of sinusoids of different frequencies to a Tensor.

    Each channel of the input Tensor is incremented by a sinusoid of a difft
    frequency and phase in one of the positional dimensions.

    This allows attention to learn to use absolute and relative positions.
    Timing signals should be added to some precursors of both the query and the
    memory inputs to attention.

    The use of relative position is possible because sin(a+b) and cos(a+b) can
    be experessed in terms of b, sin(a) and cos(a).

    x is a Tensor with n "positional" dimensions, e.g. one dimension for a
    sequence or two dimensions for an image

    We use a geometric sequence of timescales starting with
    min_timescale and ending with max_timescale.  The number of different
    timescales is equal to channels // (n * 2). For each timescale, we
    generate the two sinusoidal signals sin(timestep/timescale) and
    cos(timestep/timescale).  All of these sinusoids are concatenated in
    the channels dimension.

    Args:
        x: a Tensor with shape [batch, d1 ... dn, channels]
        min_timescale: a float
        max_timescale: a float

    Returns:
        a Tensor the same shape as x.

    """
    static_shape = x.get_shape().as_list()
    num_dims = len(static_shape) - 2
    channels = tf.shape(x)[-1]
    num_timescales = channels // (num_dims * 2)
    log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            (tf.to_float(num_timescales) - 1))
    inv_timescales = min_timescale * tf.exp(
            tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
    for dim in xrange(num_dims):
        length = tf.shape(x)[dim + 1]
        position = tf.to_float(tf.range(length))
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
                inv_timescales, 0)
        signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        prepad = dim * 2 * num_timescales
        postpad = channels - (dim + 1) * 2 * num_timescales
        signal = tf.pad(signal, [[0, 0], [prepad, postpad]])
        for _ in xrange(1 + dim):
            signal = tf.expand_dims(signal, 0)
        for _ in xrange(num_dims - 1 - dim):
            signal = tf.expand_dims(signal, -2)
        x += signal
    return x

In [18]:
attention_size = 256
size_layer = 256
embedded_size = 256
beam_width = 15
learning_rate = 1e-3

In [19]:
GO = 1
PAD = 0
EOS = 2

In [24]:
# CNN part I took from https://github.com/guillaumegenthial/im2latex/blob/master/model/encoder.py
# I use tf.contrib.seq2seq as decoder part

class Model:
    def __init__(self):
        self.X = tf.placeholder(tf.float32, shape=(None, 150, 400))
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        x_len = tf.shape(self.X)[1] // 2
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        decoder_embeddings = tf.Variable(tf.random_uniform([len(idx2char), embedded_size], -1, 1))
        
        img = self.X
        
        out = tf.layers.conv1d(img, 64, 3, 1, "SAME",
                activation=tf.nn.relu)
        out = tf.layers.max_pooling1d(out, 2, 2, "SAME")

        out = tf.layers.conv1d(out, 128, 3, 1, "SAME",
                activation=tf.nn.relu)
        out = tf.layers.max_pooling1d(out, 2, 2, "SAME")

        out = tf.layers.conv1d(out, 256, 3, 1, "SAME",
                activation=tf.nn.relu)

        out = tf.layers.conv1d(out, 256, 3, 1, "SAME",
                activation=tf.nn.relu)
        out = tf.layers.max_pooling1d(out, 2, 2, "SAME")
        out = tf.layers.conv1d(out, 512, 3, 1, "SAME",
                activation=tf.nn.relu)
        out = tf.layers.max_pooling1d(out, 1, 1, "SAME")
        out = tf.layers.conv1d(out, 512, 3, 1, "VALID",
                activation=tf.nn.relu)
        img = add_timing_signal_nd(out)
        print(img)
        
        with tf.variable_scope("attn_cell", reuse=False):
            attn_meca = AttentionMechanism(img, attention_size)
            recu_cell = tf.nn.rnn_cell.LSTMCell(size_layer)
            attn_cell = AttentionCell(recu_cell, attn_meca, 1.0,
                        attention_size, attention_size, size_layer, len(idx2char))

            encoder_state = attn_cell.initial_state()

            training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs = tf.nn.embedding_lookup(decoder_embeddings, decoder_input),
                    sequence_length = self.Y_seq_len,
                    embedding = decoder_embeddings,
                    sampling_probability = 0.5,
                    time_major = False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell = attn_cell,
                    helper = training_helper,
                    initial_state = encoder_state,
                    output_layer = None)
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder = training_decoder,
                    impute_finished = True,
                    maximum_iterations = tf.reduce_max(self.Y_seq_len))
        
        with tf.variable_scope("attn_cell", reuse=True):
            attn_meca = AttentionMechanism(img, attention_size, tiles=beam_width)
            recu_cell = tf.nn.rnn_cell.LSTMCell(size_layer, reuse = True)
            attn_cell = AttentionCell(recu_cell, attn_meca, 1.0,
                        attention_size, attention_size, size_layer, len(idx2char))
            
            encoder_state = attn_cell.initial_state()
            
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = attn_cell,
                embedding = decoder_embeddings,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS,
                initial_state = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width),
                beam_width = beam_width,
                output_layer = None,
                length_penalty_weight = 0.0)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                maximum_iterations = x_len)
            
        self.training_logits = training_decoder_output.rnn_output
        self.predicting_ids = predicting_decoder_output.predicted_ids
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [25]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()
sess.run(tf.global_variables_initializer())

W0825 11:10:06.413872 140316960016192 deprecation.py:323] From <ipython-input-17-b2dd412390f9>:50: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
W0825 11:10:06.456164 140316960016192 deprecation.py:323] From <ipython-input-15-105cb114c84a>:40: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.


Tensor("add:0", shape=(?, 17, 512), dtype=float32)


W0825 11:10:06.727786 140316960016192 deprecation.py:323] From <ipython-input-24-e0bbc207c3cd>:42: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0825 11:10:06.906118 140316960016192 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/ops/rnn_cell_impl.py:961: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0825 11:10:07.590789 140316960016192 deprecation.py:506] From <ipython-input-16-d84cd8088212>:75: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please u

In [26]:
epoch = 20
batch_size = 32

In [27]:
for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i : index]
        y = train_Y[i : index]
        batch_y, _ = pad_sentence_batch(y, 0)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = train_X[i : index]
        y = test_Y[i : index]
        batch_y, _ = pad_sentence_batch(y, 0)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 256/256 [00:36<00:00,  5.27it/s, accuracy=0.687, cost=1.02] 
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 11.95it/s, accuracy=0.515, cost=1.75]
minibatch loop:   0%|          | 1/256 [00:00<00:36,  6.98it/s, accuracy=0.692, cost=1.07]

epoch 1, training avg loss 1.196397, training avg acc 0.650139
epoch 1, testing avg loss 1.544306, testing avg acc 0.590845


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.86it/s, accuracy=0.719, cost=0.985]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.70it/s, accuracy=0.545, cost=1.68]
minibatch loop:   0%|          | 1/256 [00:00<00:37,  6.84it/s, accuracy=0.712, cost=1.01]

epoch 2, training avg loss 0.943901, training avg acc 0.720767
epoch 2, testing avg loss 1.579947, testing avg acc 0.586623


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.45it/s, accuracy=0.709, cost=0.981]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.51it/s, accuracy=0.463, cost=2.34]
minibatch loop:   0%|          | 1/256 [00:00<00:34,  7.48it/s, accuracy=0.699, cost=1.02]

epoch 3, training avg loss 0.915839, training avg acc 0.727767
epoch 3, testing avg loss 1.698179, testing avg acc 0.582225


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.71it/s, accuracy=0.712, cost=0.949]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.81it/s, accuracy=0.418, cost=2.75]
minibatch loop:   0%|          | 1/256 [00:00<00:32,  7.85it/s, accuracy=0.706, cost=0.994]

epoch 4, training avg loss 0.893956, training avg acc 0.734067
epoch 4, testing avg loss 1.846647, testing avg acc 0.573757


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.57it/s, accuracy=0.726, cost=0.901]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.59it/s, accuracy=0.463, cost=2.43]
minibatch loop:   0%|          | 1/256 [00:00<00:31,  8.18it/s, accuracy=0.692, cost=0.987]

epoch 5, training avg loss 0.878862, training avg acc 0.738105
epoch 5, testing avg loss 1.866804, testing avg acc 0.564869


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.39it/s, accuracy=0.724, cost=0.923]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.86it/s, accuracy=0.567, cost=1.71]
minibatch loop:   0%|          | 1/256 [00:00<00:33,  7.72it/s, accuracy=0.715, cost=0.941]

epoch 6, training avg loss 0.863553, training avg acc 0.741698
epoch 6, testing avg loss 1.823491, testing avg acc 0.577541


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.73it/s, accuracy=0.736, cost=0.874]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 14.95it/s, accuracy=0.463, cost=2.19]
minibatch loop:   0%|          | 1/256 [00:00<00:28,  8.88it/s, accuracy=0.716, cost=0.963]

epoch 7, training avg loss 0.843804, training avg acc 0.747745
epoch 7, testing avg loss 1.879867, testing avg acc 0.572658


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.77it/s, accuracy=0.748, cost=0.868]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.75it/s, accuracy=0.575, cost=1.79]
minibatch loop:   0%|          | 1/256 [00:00<00:33,  7.60it/s, accuracy=0.719, cost=0.917]

epoch 8, training avg loss 0.826598, training avg acc 0.751882
epoch 8, testing avg loss 1.819068, testing avg acc 0.585992


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.52it/s, accuracy=0.741, cost=0.843]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.48it/s, accuracy=0.515, cost=1.94]
minibatch loop:   0%|          | 1/256 [00:00<00:32,  7.96it/s, accuracy=0.72, cost=0.892]

epoch 9, training avg loss 0.806639, training avg acc 0.757633
epoch 9, testing avg loss 1.930075, testing avg acc 0.581690


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.67it/s, accuracy=0.761, cost=0.805]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.41it/s, accuracy=0.552, cost=1.9]
minibatch loop:   0%|          | 1/256 [00:00<00:33,  7.51it/s, accuracy=0.72, cost=0.929]

epoch 10, training avg loss 0.789566, training avg acc 0.763155
epoch 10, testing avg loss 1.954740, testing avg acc 0.576721


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.30it/s, accuracy=0.748, cost=0.877]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.11it/s, accuracy=0.396, cost=2.81]
minibatch loop:   0%|          | 1/256 [00:00<00:33,  7.64it/s, accuracy=0.718, cost=0.925]

epoch 11, training avg loss 0.781211, training avg acc 0.764982
epoch 11, testing avg loss 2.042783, testing avg acc 0.563247


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.62it/s, accuracy=0.746, cost=0.833]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.31it/s, accuracy=0.53, cost=2.03]
minibatch loop:   0%|          | 1/256 [00:00<00:31,  7.98it/s, accuracy=0.72, cost=0.894]

epoch 12, training avg loss 0.753856, training avg acc 0.772853
epoch 12, testing avg loss 2.023567, testing avg acc 0.577558


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.64it/s, accuracy=0.775, cost=0.788]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 14.99it/s, accuracy=0.485, cost=2.79]
minibatch loop:   0%|          | 1/256 [00:00<00:30,  8.26it/s, accuracy=0.728, cost=0.87]

epoch 13, training avg loss 0.734312, training avg acc 0.778937
epoch 13, testing avg loss 2.124653, testing avg acc 0.575388


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.56it/s, accuracy=0.755, cost=0.827]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 16.06it/s, accuracy=0.448, cost=2.64]
minibatch loop:   0%|          | 1/256 [00:00<00:33,  7.60it/s, accuracy=0.744, cost=0.842]

epoch 14, training avg loss 0.713604, training avg acc 0.784708
epoch 14, testing avg loss 2.136160, testing avg acc 0.568153


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.35it/s, accuracy=0.772, cost=0.794]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.75it/s, accuracy=0.552, cost=1.75]
minibatch loop:   0%|          | 1/256 [00:00<00:32,  7.83it/s, accuracy=0.752, cost=0.819]

epoch 15, training avg loss 0.699374, training avg acc 0.788987
epoch 15, testing avg loss 2.015028, testing avg acc 0.577803


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.75it/s, accuracy=0.768, cost=0.742]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.81it/s, accuracy=0.545, cost=2.27]
minibatch loop:   0%|          | 1/256 [00:00<00:37,  6.76it/s, accuracy=0.767, cost=0.79]

epoch 16, training avg loss 0.675886, training avg acc 0.795950
epoch 16, testing avg loss 2.261504, testing avg acc 0.562516


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.63it/s, accuracy=0.779, cost=0.741]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.47it/s, accuracy=0.455, cost=2.44]
minibatch loop:   0%|          | 1/256 [00:00<00:32,  7.95it/s, accuracy=0.754, cost=0.83]

epoch 17, training avg loss 0.655022, training avg acc 0.801809
epoch 17, testing avg loss 2.374006, testing avg acc 0.541985


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.61it/s, accuracy=0.79, cost=0.709] 
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.62it/s, accuracy=0.5, cost=2.29] 
minibatch loop:   0%|          | 1/256 [00:00<00:33,  7.72it/s, accuracy=0.758, cost=0.835]

epoch 18, training avg loss 0.635742, training avg acc 0.807895
epoch 18, testing avg loss 2.198693, testing avg acc 0.564262


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.27it/s, accuracy=0.813, cost=0.635]
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.65it/s, accuracy=0.522, cost=2.62]
minibatch loop:   0%|          | 1/256 [00:00<00:35,  7.28it/s, accuracy=0.793, cost=0.72]

epoch 19, training avg loss 0.617268, training avg acc 0.814121
epoch 19, testing avg loss 2.424324, testing avg acc 0.560308


minibatch loop: 100%|██████████| 256/256 [00:34<00:00,  7.75it/s, accuracy=0.811, cost=0.63] 
minibatch loop: 100%|██████████| 10/10 [00:00<00:00, 15.65it/s, accuracy=0.522, cost=2.28]

epoch 20, training avg loss 0.600661, training avg acc 0.819045
epoch 20, testing avg loss 2.161145, testing avg acc 0.578070



