In [1]:
import json
import tensorflow as tf
import numpy as np
from sklearn.cross_validation import train_test_split



In [2]:
with open('dictionary.json') as fopen:
    x = json.load(fopen)
dictionary_english = x['english']
dictionary_bahasa = x['bahasa']

In [3]:
with open('english-malay.json') as fopen:
    x = json.load(fopen)
english = x[0]
bahasa = x[1]

In [4]:
GO = dictionary_english['dictionary']['GO']
PAD = dictionary_english['dictionary']['PAD']
EOS = dictionary_english['dictionary']['EOS']
UNK = dictionary_english['dictionary']['UNK']

In [5]:
from tqdm import tqdm

for i in tqdm(range(len(bahasa))):
    bahasa[i].append('EOS')

100%|██████████| 100000/100000 [00:00<00:00, 1107722.86it/s]


In [6]:
def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))

    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    
    outputs = gamma * normalized + beta
    return outputs

def multihead_attn(queries, keys, q_masks, k_masks, future_binding, num_units, num_heads):
    
    T_q = tf.shape(queries)[1]                                      
    T_k = tf.shape(keys)[1]                  

    Q = tf.layers.dense(queries, num_units, name='Q')                              
    K_V = tf.layers.dense(keys, 2*num_units, name='K_V')    
    K, V = tf.split(K_V, 2, -1)        

    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)                         
    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)                    
    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)                      

    align = tf.matmul(Q_, tf.transpose(K_, [0,2,1]))                      
    align = align / np.sqrt(K_.get_shape().as_list()[-1])                 

    paddings = tf.fill(tf.shape(align), 0.0)                   

    key_masks = k_masks                                                 
    key_masks = tf.tile(key_masks, [num_heads, 1])                       
    key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, T_q, 1])            
    align = tf.where(tf.equal(key_masks, 0), paddings, align)       

    if future_binding:
        lower_tri = tf.ones([T_q, T_k])                                          
        lower_tri = tf.linalg.LinearOperatorLowerTriangular(lower_tri).to_dense()  
        masks = tf.tile(tf.expand_dims(lower_tri,0), [tf.shape(align)[0], 1, 1]) 
        align = tf.where(tf.equal(masks, 0), paddings, align)

    align = tf.nn.softmax(align)                                            
    query_masks = tf.to_float(q_masks)                                             
    query_masks = tf.tile(query_masks, [num_heads, 1])                             
    query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, T_k])            
    align *= query_masks                                                           
          
    outputs = tf.matmul(align, V_)                                                 
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)             
    outputs += queries                                                             
    outputs = layer_norm(outputs)                                                 
    return outputs


def pointwise_feedforward(inputs, hidden_units, activation=None):
    outputs = tf.layers.dense(inputs, 4*hidden_units, activation=activation)
    outputs = tf.layers.dense(outputs, hidden_units, activation=None)
    outputs += inputs
    outputs = layer_norm(outputs)
    return outputs


def learned_position_encoding(inputs, mask, embed_dim):
    T = tf.shape(inputs)[1]
    outputs = tf.range(tf.shape(inputs)[1])                # (T_q)
    outputs = tf.expand_dims(outputs, 0)                   # (1, T_q)
    outputs = tf.tile(outputs, [tf.shape(inputs)[0], 1])   # (N, T_q)
    outputs = embed_seq(outputs, T, embed_dim, zero_pad=False, scale=False)
    return tf.expand_dims(tf.to_float(mask), -1) * outputs

def sinusoidal_position_encoding(inputs, mask, repr_dim):
    T = tf.shape(inputs)[1]
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1]) * tf.expand_dims(tf.to_float(mask), -1)


def label_smoothing(inputs, epsilon=0.1):
    C = inputs.get_shape().as_list()[-1]
    return ((1 - epsilon) * inputs) + (epsilon / C)


class Model:
    def __init__(self, size_layer, embedded_size, from_dict_size, to_dict_size, learning_rate,
                 num_blocks = 4,
                 num_heads = 8,
                 min_freq = 50):
        self.X = tf.placeholder(tf.int32,[None,None])
        self.Y = tf.placeholder(tf.int32,[None,None])
        
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        def forward(x, y):
            encoder_embedded = tf.nn.embedding_lookup(encoder_embedding, x)
            en_masks = tf.sign(x)
            encoder_embedded += sinusoidal_position_encoding(x, en_masks, embedded_size)
        
            for i in range(num_blocks):
                with tf.variable_scope('encoder_self_attn_%d'%i,reuse=tf.AUTO_REUSE):
                    encoder_embedded = multihead_attn(queries = encoder_embedded,
                                             keys = encoder_embedded,
                                             q_masks = en_masks,
                                             k_masks = en_masks,
                                             future_binding = False,
                                             num_units = size_layer,
                                             num_heads = num_heads)

                with tf.variable_scope('encoder_feedforward_%d'%i,reuse=tf.AUTO_REUSE):
                    encoder_embedded = pointwise_feedforward(encoder_embedded,
                                                    embedded_size,
                                                    activation = tf.nn.relu)
            
            decoder_embedded = tf.nn.embedding_lookup(decoder_embedding, y)
            de_masks = tf.sign(y)
            decoder_embedded += sinusoidal_position_encoding(y, de_masks, embedded_size)
            
            for i in range(num_blocks):
                with tf.variable_scope('decoder_self_attn_%d'%i,reuse=tf.AUTO_REUSE):
                    decoder_embedded = multihead_attn(queries = decoder_embedded,
                                         keys = decoder_embedded,
                                         q_masks = de_masks,
                                         k_masks = de_masks,
                                         future_binding = True,
                                         num_units = size_layer,
                                         num_heads = num_heads)
                
                with tf.variable_scope('decoder_attn_%d'%i,reuse=tf.AUTO_REUSE):
                    decoder_embedded = multihead_attn(queries = decoder_embedded,
                                         keys = encoder_embedded,
                                         q_masks = de_masks,
                                         k_masks = en_masks,
                                         future_binding = False,
                                         num_units = size_layer,
                                         num_heads = num_heads)
                
                with tf.variable_scope('decoder_feedforward_%d'%i,reuse=tf.AUTO_REUSE):
                    decoder_embedded = pointwise_feedforward(decoder_embedded,
                                                    embedded_size,
                                            activation = tf.nn.relu)
            
            return tf.layers.dense(decoder_embedded, to_dict_size, reuse=tf.AUTO_REUSE)
        
        self.training_logits = forward(self.X, decoder_input)
        
        def cond(i, y, temp):
            return i < 2 * tf.reduce_max(self.X_seq_len)
        
        def body(i, y, temp):
            logits = forward(self.X, y)
            ids = tf.argmax(logits, -1)[:, i]
            ids = tf.expand_dims(ids, -1)
            temp = tf.concat([temp[:, 1:], ids], -1)
            y = tf.concat([temp[:, -(i+1):], temp[:, :-(i+1)]], -1)
            y = tf.reshape(y, [tf.shape(temp)[0], 2 * tf.reduce_max(self.X_seq_len)])
            i += 1
            return i, y, temp
        
        target = tf.fill([batch_size, 2 * tf.reduce_max(self.X_seq_len)], GO)
        target = tf.cast(target, tf.int64)
        self.target = target
        
        _, self.predicting_ids, _ = tf.while_loop(cond, body, 
                                                  [tf.constant(0), target, target])
        self.logits = forward(self.X, self.Y)
        self.k = tf.placeholder(dtype = tf.int32)
        p = tf.nn.softmax(self.logits)
        self.topk_logprobs, self.topk_ids = tf.nn.top_k(tf.log(p), self.k)
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [7]:
embedded_size = 256
learning_rate = 1e-3
batch_size = 128

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size, embedded_size, len(dictionary_english['dictionary']), 
                len(dictionary_bahasa['dictionary']), learning_rate)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use keras.layers.dense instead.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.cast instead.


In [9]:
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        for k in i:
            ints.append(dic.get(k,UNK))
        X.append(ints)
    return X

In [10]:
english = str_idx(english, dictionary_english['dictionary'])
bahasa = str_idx(bahasa, dictionary_bahasa['dictionary'])

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(english, bahasa, test_size = 0.2)

In [12]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [13]:
import time

for EPOCH in range(20):
    lasttime = time.time()

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x, _ = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, _ = pad_sentence_batch(train_Y[i : index], PAD)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, _ = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, _ = pad_sentence_batch(test_Y[i : index], PAD)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 625/625 [02:04<00:00,  5.23it/s, accuracy=0.091, cost=6.52] 
test minibatch loop: 100%|██████████| 157/157 [00:17<00:00,  9.16it/s, accuracy=0.085, cost=6.47] 
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.49it/s, accuracy=0.0861, cost=6.46]

time taken: 141.23740458488464
epoch: 0, training loss: 6.646932, training acc: 0.082438, valid loss: 6.543547, valid acc: 0.093060



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.26it/s, accuracy=0.0936, cost=6.46]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.0866, cost=6.51]
train minibatch loop:   0%|          | 1/625 [00:00<01:51,  5.58it/s, accuracy=0.0892, cost=6.4]

time taken: 139.74103617668152
epoch: 1, training loss: 6.459286, training acc: 0.092726, valid loss: 6.484749, valid acc: 0.095872



train minibatch loop: 100%|██████████| 625/625 [02:03<00:00,  5.23it/s, accuracy=0.0947, cost=6.43]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.37it/s, accuracy=0.0899, cost=6.48]
train minibatch loop:   0%|          | 1/625 [00:00<01:51,  5.61it/s, accuracy=0.0904, cost=6.37]

time taken: 139.78073906898499
epoch: 2, training loss: 6.410651, training acc: 0.094415, valid loss: 6.478459, valid acc: 0.096631



train minibatch loop: 100%|██████████| 625/625 [02:03<00:00,  5.22it/s, accuracy=0.0947, cost=6.4] 
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.37it/s, accuracy=0.0915, cost=6.53]
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.48it/s, accuracy=0.0923, cost=6.34]

time taken: 140.0757007598877
epoch: 3, training loss: 6.379970, training acc: 0.095188, valid loss: 6.477878, valid acc: 0.098065



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.24it/s, accuracy=0.0917, cost=6.41]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.0441, cost=6.67]
train minibatch loop:   0%|          | 1/625 [00:00<01:54,  5.46it/s, accuracy=0.0908, cost=6.34]

time taken: 139.73952436447144
epoch: 4, training loss: 6.363216, training acc: 0.095421, valid loss: 6.495437, valid acc: 0.096802



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.25it/s, accuracy=0.0936, cost=6.38]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.067, cost=6.61] 
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.50it/s, accuracy=0.0923, cost=6.32]

time taken: 139.7378044128418
epoch: 5, training loss: 6.349439, training acc: 0.095410, valid loss: 6.503166, valid acc: 0.097425



train minibatch loop: 100%|██████████| 625/625 [02:03<00:00,  5.25it/s, accuracy=0.0936, cost=6.35]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.0572, cost=6.63]
train minibatch loop:   0%|          | 1/625 [00:00<01:54,  5.45it/s, accuracy=0.0931, cost=6.29]

time taken: 139.83592915534973
epoch: 6, training loss: 6.330181, training acc: 0.095922, valid loss: 6.508650, valid acc: 0.097557



train minibatch loop: 100%|██████████| 625/625 [02:03<00:00,  5.26it/s, accuracy=0.0939, cost=6.33]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.38it/s, accuracy=0.0588, cost=6.64]
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.51it/s, accuracy=0.0935, cost=6.27]

time taken: 139.81159448623657
epoch: 7, training loss: 6.319604, training acc: 0.095893, valid loss: 6.515944, valid acc: 0.097411



train minibatch loop: 100%|██████████| 625/625 [02:03<00:00,  5.26it/s, accuracy=0.0939, cost=6.33]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.38it/s, accuracy=0.0605, cost=6.61]
train minibatch loop:   0%|          | 1/625 [00:00<01:52,  5.52it/s, accuracy=0.0915, cost=6.26]

time taken: 139.89637565612793
epoch: 8, training loss: 6.307637, training acc: 0.095973, valid loss: 6.509979, valid acc: 0.097483



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.24it/s, accuracy=0.0947, cost=6.32]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.0637, cost=6.61]
train minibatch loop:   0%|          | 1/625 [00:00<01:52,  5.56it/s, accuracy=0.0919, cost=6.25]

time taken: 139.69912362098694
epoch: 9, training loss: 6.298974, training acc: 0.095948, valid loss: 6.517687, valid acc: 0.098058



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.25it/s, accuracy=0.0977, cost=6.28]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.0637, cost=6.56]
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.51it/s, accuracy=0.0927, cost=6.22]

time taken: 139.68699288368225
epoch: 10, training loss: 6.281744, training acc: 0.097146, valid loss: 6.493499, valid acc: 0.099063



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.28it/s, accuracy=0.101, cost=6.23] 
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.0735, cost=6.46]
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.48it/s, accuracy=0.102, cost=6.18]

time taken: 139.6750409603119
epoch: 11, training loss: 6.241626, training acc: 0.101898, valid loss: 6.437750, valid acc: 0.105504



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.28it/s, accuracy=0.102, cost=6.15] 
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.39it/s, accuracy=0.085, cost=6.36] 
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.49it/s, accuracy=0.107, cost=6.09]

time taken: 139.6926622390747
epoch: 12, training loss: 6.171450, training acc: 0.107569, valid loss: 6.346534, valid acc: 0.112303



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.23it/s, accuracy=0.115, cost=6]    
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.0964, cost=6.17]
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.52it/s, accuracy=0.115, cost=5.95]

time taken: 139.71290850639343
epoch: 13, training loss: 6.045807, training acc: 0.113830, valid loss: 6.227268, valid acc: 0.120022



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.28it/s, accuracy=0.132, cost=5.77] 
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.38it/s, accuracy=0.111, cost=5.97]
train minibatch loop:   0%|          | 1/625 [00:00<01:52,  5.55it/s, accuracy=0.121, cost=5.76]

time taken: 139.73833870887756
epoch: 14, training loss: 5.878078, training acc: 0.123365, valid loss: 6.045022, valid acc: 0.131453



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.27it/s, accuracy=0.15, cost=5.6]   
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.39it/s, accuracy=0.136, cost=5.79]
train minibatch loop:   0%|          | 1/625 [00:00<01:52,  5.53it/s, accuracy=0.139, cost=5.55]

time taken: 139.64409923553467
epoch: 15, training loss: 5.672334, training acc: 0.138379, valid loss: 5.864244, valid acc: 0.147645



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.24it/s, accuracy=0.163, cost=5.4] 
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.37it/s, accuracy=0.158, cost=5.58]
train minibatch loop:   0%|          | 1/625 [00:00<01:54,  5.46it/s, accuracy=0.147, cost=5.37]

time taken: 139.67336773872375
epoch: 16, training loss: 5.490896, training acc: 0.153581, valid loss: 5.670244, valid acc: 0.165220



train minibatch loop: 100%|██████████| 625/625 [02:03<00:00,  5.25it/s, accuracy=0.171, cost=5.22]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.36it/s, accuracy=0.163, cost=5.46]
train minibatch loop:   0%|          | 1/625 [00:00<01:52,  5.54it/s, accuracy=0.157, cost=5.17]

time taken: 139.7850182056427
epoch: 17, training loss: 5.279509, training acc: 0.170062, valid loss: 5.516410, valid acc: 0.176268



train minibatch loop: 100%|██████████| 625/625 [02:02<00:00,  5.25it/s, accuracy=0.179, cost=5.03]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.37it/s, accuracy=0.167, cost=5.27]
train minibatch loop:   0%|          | 1/625 [00:00<01:53,  5.48it/s, accuracy=0.172, cost=4.97]

time taken: 139.62388563156128
epoch: 18, training loss: 5.114672, training acc: 0.179379, valid loss: 5.334734, valid acc: 0.188212



train minibatch loop: 100%|██████████| 625/625 [02:03<00:00,  5.23it/s, accuracy=0.202, cost=4.83]
test minibatch loop: 100%|██████████| 157/157 [00:16<00:00,  9.33it/s, accuracy=0.199, cost=5.13]

time taken: 139.90587043762207
epoch: 19, training loss: 4.948953, training acc: 0.190932, valid loss: 5.181239, valid acc: 0.203860






In [14]:
class Hypothesis:
    def __init__(self, log_prob, seq):
        self.log_prob = log_prob
        self.seq = seq

    @property
    def step(self):
        return len(self.seq) - 1


def beam_search(
    batch_x,
    beam_size,
    num_ans = 50,
    normalize_by_len = 1.0,
):
    assert 0 <= normalize_by_len <= 1
    batch_size = len(batch_x)
    max_len = len(batch_x[0]) * 2
    dec_inputs = np.ones((batch_size, 2), dtype=np.int32)
    answers = [[] for i in range(batch_size)]
    H = [[] for i in range(batch_size)]
    
    tkl, tkid = sess.run([model.topk_logprobs, 
                          model.topk_ids],
                         feed_dict = {model.X: batch_x,
                                     model.Y: dec_inputs,
                                     model.k: beam_size})
    
    for i in range(batch_size):
        for j, log_prob in enumerate(tkl[i, 0]):
            if tkid[i, 0, j] != EOS:
                h = Hypothesis(log_prob, [1, tkid[i, 0, j]])
                H[i].append(h)
        H[i].sort(key=lambda h: h.log_prob)
    
    done = [False] * batch_size
    while not all(done):
        tkl_beam = []
        tkid_beam = []
        dec_inputs_beam = []
        steps_beam = []
        for i in range(beam_size):
            steps = [1] * batch_size
            prev_log_probs = np.zeros(batch_size, dtype=np.float32)
            dec_inputs = np.ones((batch_size, max_len), dtype=np.int32)
            for j, h in enumerate(H):
                while h:
                    hi = h.pop()
                    lp, step, candidate_seq = hi.log_prob, hi.step, hi.seq
                    if candidate_seq[-1] != EOS:
                        dec_inputs[j, :len(candidate_seq)] = candidate_seq
                        steps[j] = step
                        prev_log_probs[j] = lp
                        break
                    else:
                        answers[j].append((lp, candidate_seq))
            max_step = max(steps)
            dec_inputs = dec_inputs[:, :max_step + 2]
            tkl, tkid = sess.run([model.topk_logprobs, 
                          model.topk_ids],
                         feed_dict = {model.X: batch_x,
                                     model.Y: dec_inputs,
                                     model.k: beam_size})
            tkl_beam.append(tkl + prev_log_probs[:, None, None])
            tkid_beam.append(tkid)
            dec_inputs_beam.append(dec_inputs.copy())
            steps_beam.append(steps)
        for i in range(beam_size):
            tkl = tkl_beam[i]
            tkid = tkid_beam[i]
            dec_inputs = dec_inputs_beam[i]
            steps = steps_beam[i]
            for j in range(batch_size):
                step = steps[j]
                for k in range(tkid.shape[2]):
                    extended_seq = np.hstack((dec_inputs[j, :step+1], [tkid[j, step, k]]))
                    log_prob = tkl[j, step, k]
                    if len(extended_seq) <= max_len and log_prob > -10:
                        h = Hypothesis(log_prob, extended_seq)
                        H[j].append(h)
                H[j].sort(key=lambda h: h.log_prob / (h.step**normalize_by_len))
            
        for i in range(batch_size):
            done[i] = (len(answers[i]) >= num_ans) or (not H[i]) or (len(H[i]) > 100)
            
    return answers

In [15]:
beamed = beam_search(batch_x, 5)

In [16]:
beamed = [i for i in beamed if len(i)]
predicted = [max(b, key = lambda t: t[0])[1] for b in beamed]


In [17]:
predicted

[]