In [1]:
import os
import re
import tensorflow as tf
import numpy as np
import json
import collections
from sklearn.cross_validation import train_test_split
from pointer_generator_helper import (PointerGeneratorDecoder, 
                                      PointerGeneratorGreedyEmbeddingHelper, 
                                      PointerGeneratorBahdanauAttention,
                                      PointerGeneratorAttentionWrapper)
from tensor2tensor.utils import beam_search, rouge



In [2]:
with open('news-30k.json') as fopen:
    news = json.load(fopen)
len(news)

29855

In [3]:
news[0]

{'title': 'Ibu saudara Haziq terharu sokongan rakyat Malaysia',
 'url': 'https://www.themalaysianinsight.com/bahasa/s/142491',
 'news': 'themalaysianinsight',
 'language': 'malay',
 'top-image': 'https://www.themalaysianinsight.com/resources/stories_images/142491/perhimpunanan_solidarity_kedamaian_03__full.jpg',
 'text': 'WARIS keluarga Allahyarham Muhammad Haziq Mohd Tarmizi, 17, yang terkorban dalam tragedi tembakan di Christchurch, New Zealand, pada 15 Mac lepas, melahirkan rasa terharu akan sokongan diberi rakyat Malaysia semasa perhimpunan Solidariti Kedamaian.\n\nZarina Shuib , ibu saudara Muhammad Haziq, memanjatkan kesyukuran kepada Allah SWT kerana berkesempatan menyertai rakyat Malaysia dalam perhimpunan itu di Kuala Lumpur hari ini.',
 'date': '2019-03-23T03:52:02',
 'date_utc': '2019-03-22T19:52:02'}

In [4]:
import malaya
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

accept_tokens = ',-.()"\''

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 1 or w in accept_tokens]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

def clean_label(label):
    string = re.sub('[^A-Za-z\- ]+', ' ', label)
    return re.sub(r'[ ]+', ' ', string.lower()).strip()

In [5]:
from tqdm import tqdm

min_len = 5
max_len = 500

x, y = [], []
for n in tqdm(news):
    if len(n['text'].split()) > min_len:
        p = preprocessing(n['text'])[:max_len]
        x.append(p)
        p = preprocessing(n['title'])
        y.append(p)

100%|██████████| 29855/29855 [00:45<00:00, 662.19it/s]


In [6]:
len(x), len(y)

(29855, 29855)

In [7]:
def build_dataset(words, n_words):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [8]:
import itertools

concat = list(itertools.chain(*x))
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
print('filtered vocab size:',len(dictionary))
print("% of vocab used: {}%".format(round(len(dictionary)/vocabulary_size,4)*100))

vocab from size: 88005
Most common words [(',', 380933), ('.', 338805), ('yang', 158373), ('dan', 147862), ('di', 124501), ('-', 118778)]
Sample data [4340, 287, 1410, 343, 1606, 114, 3583, 4, 10, 4] ['waris', 'keluarga', 'allahyarham', 'muhammad', 'haziq', 'mohd', 'tarmizi', ',', '<NUM>', ',']
filtered vocab size: 88009
% of vocab used: 100.0%


In [9]:
for i in range(len(y)):
    y[i].append('EOS')

In [10]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(x, y, test_size = 0.1)

In [12]:
def sent2idx(sent, vocab, UNK=UNK):
    tokens = sent
    oovs = []
    extend_tokens = []
    tokenized = []
    for token in tokens:
        if token not in vocab:
            tokenized.append(UNK)
            if token not in oovs:
                oovs.append(token)
            extend_tokens.append(len(vocab) + oovs.index(token))
        else:
            extend_tokens.append(vocab[token])
            tokenized.append(vocab[token])
    return tokenized, extend_tokens, oovs

def target2idx(sent, oovs, vocab,UNK=UNK):
    tokens = sent
    tokenized = []
    for token in tokens:
        if token not in vocab:
            if token not in oovs:
                tokenized.append(UNK)
            else:
                tokenized.append(len(vocab) + oovs.index(token))
        else:
            tokenized.append(vocab[token])
    return tokenized

In [13]:
class Summarization:
    def __init__(self, size_layer, num_layers, embedded_size, dict_size):
        
        def lstm_cell(reuse=False):
            return tf.nn.rnn_cell.GRUCell(size_layer, reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        self.source_oov_words = tf.placeholder(tf.int32, shape=[])
        self.source_extend_tokens = tf.placeholder(tf.int32, shape=[None, None])
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        condition = tf.less(decoder_input, dict_size)
        self.decoder_input = decoder_input
        self.decoder_input_length = self.Y_seq_len
        self.predict_count = tf.reduce_sum(self.decoder_input_length)
        
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(embeddings, self.X)
        encoder_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
        self.encoder_out, self.encoder_state = tf.nn.dynamic_rnn(cell = encoder_cells, 
                                                                 inputs = encoder_embedded, 
                                                                 sequence_length = self.X_seq_len,
                                                                 dtype = tf.float32)
        self.decode_initial_state = self.encoder_state[-1]
        print(self.decode_initial_state)
        
        atten_mech = PointerGeneratorBahdanauAttention(
                size_layer, self.encoder_out, memory_sequence_length=self.X_seq_len,
        coverage = True)
        decoder_cells = [lstm_cell() for _ in range(num_layers)]
        decoder_cells[0] = PointerGeneratorAttentionWrapper(
                cell=decoder_cells[0],
                attention_mechanism=atten_mech,
                attention_layer_size=size_layer,
                alignment_history = True,
                coverage = True
            )
        initial_state = [self.decode_initial_state for i in range(num_layers)]
        attention_cell_state = decoder_cells[0].zero_state(
                dtype=tf.float32, batch_size=batch_size)
        initial_state[0] = attention_cell_state.clone(
                cell_state=initial_state[0])
        self.initial_state = tuple(initial_state)
        decoder_cells = tf.contrib.rnn.MultiRNNCell(decoder_cells)
        
        decoded = tf.nn.embedding_lookup(embeddings, self.decoder_input)
        
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            decoded,
            self.decoder_input_length
        )
        dense_layer = tf.layers.Dense(dict_size)
        
        training_decoder = PointerGeneratorDecoder(
            source_extend_tokens = self.source_extend_tokens,
            source_oov_words = self.source_oov_words,
            coverage = True,
            cell=decoder_cells,
            helper=training_helper,
            initial_state=self.initial_state,
            output_layer=dense_layer
        )
        
        maxlen = tf.reduce_max(self.decoder_input_length)
        train_dec_outputs, train_dec_last_state, _ = tf.contrib.seq2seq.dynamic_decode(
            training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=maxlen,
            swap_memory=True)
        logits = train_dec_outputs.rnn_output
        self.training_logits = logits
        
        masks = tf.sequence_mask(
            self.decoder_input_length, maxlen, 
            dtype=tf.float32)
        
        targets = tf.slice(self.Y, [0, 0], [-1, maxlen])
        i1, i2 = tf.meshgrid(tf.range(batch_size),
                     tf.range(maxlen), indexing="ij")
        indices = tf.stack((i1,i2,targets),axis=2)
        probs = tf.gather_nd(logits, indices)
        probs = tf.where(tf.less_equal(probs,0),tf.ones_like(probs)*1e-10,probs)
        crossent = -tf.log(probs)
        self.cost = tf.reduce_sum(crossent * masks) / tf.to_float(batch_size)
        alignment_history = train_dec_last_state[0].alignment_history.stack()
        alignment_history = tf.transpose(alignment_history,[1,2,0])
        coverage_loss = tf.minimum(alignment_history,tf.cumsum(alignment_history, axis=2, exclusive=True))
        self.coverage_loss = tf.reduce_sum(coverage_loss / tf.to_float(batch_size))
        self.cost = self.cost + self.coverage_loss
        
        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)
        
        helper = PointerGeneratorGreedyEmbeddingHelper(
            embedding=embeddings,
            start_tokens=tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
            end_token=EOS
        )
        
        inference_decoder = PointerGeneratorDecoder(
            source_extend_tokens = self.source_extend_tokens,
            source_oov_words = self.source_oov_words,
            coverage = True,
            cell=decoder_cells,
            helper=helper,
            initial_state=self.initial_state,
            output_layer=dense_layer
        )
        
        dec_outputs, dec_last_state, _ = tf.contrib.seq2seq.dynamic_decode(
            inference_decoder,
            output_time_major=False,
            maximum_iterations=tf.reduce_max(self.X_seq_len),
            swap_memory=True)
        
        self.beam_predictions = dec_outputs.sample_id
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        y_t = tf.argmax(logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [14]:
size_layer = 256
num_layers = 2
embedded_size = 256
batch_size = 6

In [15]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Summarization(size_layer, num_layers, embedded_size, len(dictionary))
sess.run(tf.global_variables_initializer())

Tensor("rnn/while/Exit_4:0", shape=(?, 256), dtype=float32)
<pointer_generator_helper.PointerGeneratorBahdanauAttention object at 0x7fdbc9665ba8>
Tensor("decoder/while/PGDecoderStep/decoder/multi_rnn_cell/cell_0/cell_0/pointer_generator_attention_wrapper/Softmax:0", shape=(?, ?), dtype=float32) Tensor("decoder/while/PGDecoderStep/decoder/multi_rnn_cell/cell_0/cell_0/pointer_generator_attention_wrapper/Softmax:0", shape=(?, ?), dtype=float32)
<pointer_generator_helper.PointerGeneratorBahdanauAttention object at 0x7fdbc9665ba8>
Tensor("decoder/while/PGDecoderStep/decoder/multi_rnn_cell/cell_0/cell_0/pointer_generator_attention_wrapper/Softmax_1:0", shape=(?, ?), dtype=float32) Tensor("decoder/while/PGDecoderStep/decoder/multi_rnn_cell/cell_0/cell_0/pointer_generator_attention_wrapper/Softmax_1:0", shape=(?, ?), dtype=float32)


In [16]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [17]:
def batching(X, Y):
    s_, es_, oovs_, target_ = [], [], [], []
    for x, y in zip(X, Y):
        s,es,oovs = sent2idx(x, dictionary)
        target = target2idx(y, oovs,dictionary)
        s_.append(s)
        es_.append(es)
        oovs_.append(oovs)
        target_.append(target)
    s_ = pad_sequences(s_,padding='post')
    es_ = pad_sequences(es_,padding='post')
    target_ = pad_sequences(target_,padding='post')
    maxlen = max([len(o) for o in oovs_])
    return s_, es_, target_, maxlen

In [18]:
from keras.preprocessing import sequence

def calculate_rouges(predicted, batch_y):
    non = np.count_nonzero(batch_y, axis = 1)
    o = []
    for n in non:
        o.append([True for _ in range(n)])
    b = sequence.pad_sequences(o, dtype = np.bool, padding = 'post', value = False)
    batch_y = np.array(batch_y)
    rouges = []
    for i in range(predicted.shape[0]):
        a = batch_y[i][b[i]]
        p = predicted[i][b[i]]
        rouges.append(rouge.rouge_n([p], [a]))
    return np.mean(rouges)

In [19]:
from tqdm import tqdm
from sklearn.utils import shuffle
import time

for EPOCH in range(20):
    lasttime = time.time()
    total_loss, total_accuracy, total_loss_test, total_accuracy_test = 0, 0, 0, 0
    rouge_train, rouge_test = 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for k in pbar:
        index = min(k+batch_size,len(train_X))
        batch_x, batch_es, batch_y, maxlen = batching(train_X[k: index],
                                                     train_Y[k: index])
        l, acc, loss, _ = sess.run([model.training_logits, model.accuracy, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                 model.source_extend_tokens:batch_es,
                                                model.Y:batch_y,
                                                model.source_oov_words:maxlen})
        total_loss += loss
        total_accuracy += acc
        r = calculate_rouges(np.argmax(l, axis = 2), batch_y)
        rouge_train += r
        pbar.set_postfix(cost=loss, accuracy = acc, rouge_2 = r)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for k in pbar:
        index = min(k+batch_size,len(test_X))
        batch_x, batch_es, batch_y, maxlen = batching(test_X[k: index],
                                                     test_Y[k: index])
        l, acc, loss = sess.run([model.training_logits, model.accuracy, model.cost], 
                                      feed_dict={model.X:batch_x,
                                                 model.source_extend_tokens:batch_es,
                                                model.Y:batch_y,
                                                model.source_oov_words:maxlen})
        total_loss_test += loss
        total_accuracy_test += acc
        r = calculate_rouges(np.argmax(l, axis = 2), batch_y)
        rouge_test += r
        pbar.set_postfix(cost=loss, accuracy = acc, rouge_2 = r)
        
    total_loss /= (len(train_X) / batch_size)
    total_accuracy /= (len(train_X) / batch_size)
    total_loss_test /= (len(test_X) / batch_size)
    total_accuracy_test /= (len(test_X) / batch_size)
    rouge_train /= (len(train_X) / batch_size)
    rouge_test /= (len(test_X) / batch_size)
        
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(EPOCH, total_loss, total_accuracy))
    print('epoch: %d, avg loss test: %f, avg accuracy test: %f'%(EPOCH, total_loss_test, total_accuracy_test))
    print('epoch: %d, avg train rouge: %f, avg test rouge: %f'%(EPOCH, rouge_train, rouge_test))

train minibatch loop: 100%|██████████| 4479/4479 [41:54<00:00,  2.20it/s, accuracy=0.111, cost=36.1, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.07it/s, accuracy=0.246, cost=91, rouge_2=0.188]   
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 0, avg loss: 43.339262, avg accuracy: 0.280213
epoch: 0, avg loss test: 38.250575, avg accuracy test: 0.340743
epoch: 0, avg train rouge: 0.109591, avg test rouge: 0.149172


train minibatch loop: 100%|██████████| 4479/4479 [41:36<00:00,  2.21it/s, accuracy=0.556, cost=23.3, rouge_2=0.375]  
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.08it/s, accuracy=0.262, cost=82.9, rouge_2=0.258] 
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 1, avg loss: 38.876651, avg accuracy: 0.326685
epoch: 1, avg loss test: 36.583588, avg accuracy test: 0.355238
epoch: 1, avg train rouge: 0.137114, avg test rouge: 0.158392


train minibatch loop: 100%|██████████| 4479/4479 [41:33<00:00,  2.20it/s, accuracy=0.111, cost=36.9, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.05it/s, accuracy=0.197, cost=106, rouge_2=0.0723] 
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 2, avg loss: 31.740749, avg accuracy: 0.403784
epoch: 2, avg loss test: 48.051658, avg accuracy test: 0.255413
epoch: 2, avg train rouge: 0.195708, avg test rouge: 0.084357


train minibatch loop: 100%|██████████| 4479/4479 [41:34<00:00,  2.21it/s, accuracy=0.667, cost=25.3, rouge_2=0.375] 
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.12it/s, accuracy=0.23, cost=98, rouge_2=0.22]     
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 3, avg loss: 37.722801, avg accuracy: 0.329731
epoch: 3, avg loss test: 42.431249, avg accuracy test: 0.296041
epoch: 3, avg train rouge: 0.134821, avg test rouge: 0.115910


train minibatch loop: 100%|██████████| 4479/4479 [41:34<00:00,  2.21it/s, accuracy=0.333, cost=42.8, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.11it/s, accuracy=0.0984, cost=110, rouge_2=0.178]  
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 4, avg loss: 39.667968, avg accuracy: 0.292814
epoch: 4, avg loss test: 46.339806, avg accuracy test: 0.243682
epoch: 4, avg train rouge: 0.110209, avg test rouge: 0.074491


train minibatch loop: 100%|██████████| 4479/4479 [41:36<00:00,  2.20it/s, accuracy=0.444, cost=32, rouge_2=0.125]    
test minibatch loop: 100%|██████████| 498/498 [01:21<00:00,  5.07it/s, accuracy=0.197, cost=98.8, rouge_2=0.0714]
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 5, avg loss: 41.869029, avg accuracy: 0.265789
epoch: 5, avg loss test: 44.102097, avg accuracy test: 0.266624
epoch: 5, avg train rouge: 0.089265, avg test rouge: 0.093176


train minibatch loop: 100%|██████████| 4479/4479 [41:34<00:00,  2.21it/s, accuracy=0.333, cost=32.2, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:21<00:00,  5.11it/s, accuracy=0.164, cost=102, rouge_2=0.162]   
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 6, avg loss: 40.201741, avg accuracy: 0.274965
epoch: 6, avg loss test: 44.676245, avg accuracy test: 0.258674
epoch: 6, avg train rouge: 0.094593, avg test rouge: 0.088101


train minibatch loop: 100%|██████████| 4479/4479 [41:36<00:00,  2.21it/s, accuracy=0.222, cost=37.9, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.10it/s, accuracy=0.082, cost=106, rouge_2=0.213]  
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 7, avg loss: 43.714808, avg accuracy: 0.242870
epoch: 7, avg loss test: 46.924372, avg accuracy test: 0.234631
epoch: 7, avg train rouge: 0.076977, avg test rouge: 0.074650


train minibatch loop: 100%|██████████| 4479/4479 [41:35<00:00,  2.21it/s, accuracy=0.333, cost=32.2, rouge_2=0]     
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.04it/s, accuracy=0.197, cost=104, rouge_2=0.039]   
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 8, avg loss: 39.921322, avg accuracy: 0.276835
epoch: 8, avg loss test: 46.324896, avg accuracy test: 0.242812
epoch: 8, avg train rouge: 0.099394, avg test rouge: 0.076692


train minibatch loop: 100%|██████████| 4479/4479 [41:39<00:00,  2.22it/s, accuracy=0.444, cost=28.1, rouge_2=0.125]  
test minibatch loop: 100%|██████████| 498/498 [01:21<00:00,  5.09it/s, accuracy=0.18, cost=101, rouge_2=0.117]   
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 9, avg loss: 40.963847, avg accuracy: 0.258915
epoch: 9, avg loss test: 44.381440, avg accuracy test: 0.256690
epoch: 9, avg train rouge: 0.088552, avg test rouge: 0.092548


train minibatch loop: 100%|██████████| 4479/4479 [41:39<00:00,  2.21it/s, accuracy=0.222, cost=35.4, rouge_2=0]     
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.08it/s, accuracy=0.18, cost=105, rouge_2=0.112]    
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 10, avg loss: 41.393298, avg accuracy: 0.256007
epoch: 10, avg loss test: 46.377687, avg accuracy test: 0.238520
epoch: 10, avg train rouge: 0.085633, avg test rouge: 0.072841


train minibatch loop: 100%|██████████| 4479/4479 [41:34<00:00,  2.22it/s, accuracy=0.222, cost=35.5, rouge_2=0.125]  
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.03it/s, accuracy=0.0984, cost=106, rouge_2=0.0278]
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 11, avg loss: 41.463504, avg accuracy: 0.254737
epoch: 11, avg loss test: 48.693768, avg accuracy test: 0.211095
epoch: 11, avg train rouge: 0.084007, avg test rouge: 0.053379


train minibatch loop: 100%|██████████| 4479/4479 [41:33<00:00,  2.21it/s, accuracy=0.111, cost=39.4, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.05it/s, accuracy=0.115, cost=108, rouge_2=0.0995]  
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 12, avg loss: 45.051455, avg accuracy: 0.211254
epoch: 12, avg loss test: 47.529852, avg accuracy test: 0.230337
epoch: 12, avg train rouge: 0.055993, avg test rouge: 0.071783


train minibatch loop: 100%|██████████| 4479/4479 [41:34<00:00,  2.21it/s, accuracy=0.222, cost=39.3, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:21<00:00,  5.17it/s, accuracy=0.18, cost=110, rouge_2=0.0361]   
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 13, avg loss: 47.724544, avg accuracy: 0.189737
epoch: 13, avg loss test: 51.746153, avg accuracy test: 0.185324
epoch: 13, avg train rouge: 0.048624, avg test rouge: 0.050050


train minibatch loop: 100%|██████████| 4479/4479 [41:32<00:00,  2.21it/s, accuracy=0.222, cost=38.8, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.10it/s, accuracy=0.0984, cost=115, rouge_2=0.118]  
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 14, avg loss: 43.576472, avg accuracy: 0.216453
epoch: 14, avg loss test: 51.974185, avg accuracy test: 0.180553
epoch: 14, avg train rouge: 0.064396, avg test rouge: 0.051724


train minibatch loop: 100%|██████████| 4479/4479 [41:33<00:00,  2.21it/s, accuracy=0.333, cost=28.2, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.06it/s, accuracy=0.131, cost=112, rouge_2=0.163]   
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 15, avg loss: 41.314440, avg accuracy: 0.236375
epoch: 15, avg loss test: 47.696124, avg accuracy test: 0.229250
epoch: 15, avg train rouge: 0.079887, avg test rouge: 0.079132


train minibatch loop: 100%|██████████| 4479/4479 [41:33<00:00,  2.21it/s, accuracy=0.222, cost=38.3, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.12it/s, accuracy=0.164, cost=111, rouge_2=0.0714]  
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 16, avg loss: 44.112664, avg accuracy: 0.214864
epoch: 16, avg loss test: 50.285478, avg accuracy test: 0.193462
epoch: 16, avg train rouge: 0.061069, avg test rouge: 0.056818


train minibatch loop: 100%|██████████| 4479/4479 [41:34<00:00,  2.21it/s, accuracy=0.333, cost=37.1, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.10it/s, accuracy=0.148, cost=114, rouge_2=0.188]   
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 17, avg loss: 43.175520, avg accuracy: 0.219689
epoch: 17, avg loss test: 50.357799, avg accuracy test: 0.198479
epoch: 17, avg train rouge: 0.065411, avg test rouge: 0.061611


train minibatch loop: 100%|██████████| 4479/4479 [41:33<00:00,  2.21it/s, accuracy=0.222, cost=32.1, rouge_2=0]      
test minibatch loop: 100%|██████████| 498/498 [01:22<00:00,  5.08it/s, accuracy=0.115, cost=112, rouge_2=0.179]   
train minibatch loop:   0%|          | 0/4479 [00:00<?, ?it/s]

epoch: 18, avg loss: 42.146986, avg accuracy: 0.234126
epoch: 18, avg loss test: 48.526205, avg accuracy test: 0.216265
epoch: 18, avg train rouge: 0.080118, avg test rouge: 0.077062


train minibatch loop: 100%|██████████| 4479/4479 [41:34<00:00,  2.20it/s, accuracy=0.111, cost=30.1, rouge_2=0.125]  
test minibatch loop: 100%|██████████| 498/498 [01:21<00:00,  5.11it/s, accuracy=0.131, cost=108, rouge_2=0.0625]  

epoch: 19, avg loss: 39.921742, avg accuracy: 0.251466
epoch: 19, avg loss test: 48.348149, avg accuracy test: 0.217966
epoch: 19, avg train rouge: 0.089897, avg test rouge: 0.078513





In [25]:
batch_x, batch_es, batch_y, maxlen = batching(test_X[:1], test_Y[:1])

In [26]:
def f7(seq):
    seen = set()
    seen_add = seen.add
    return ' '.join([x for x in seq if not (x in seen or seen_add(x))])

In [30]:
out = sess.run(model.beam_predictions, feed_dict = {model.X: batch_x,
                                             model.Y: batch_y,
                                             model.source_extend_tokens:batch_es,
                                             model.source_oov_words:maxlen})[0]
out = [rev_dictionary[i] for i in out]

In [31]:
out

['putrajaya', ',', ',', 'EOS']

In [32]:
f7(out)

'putrajaya , EOS'

In [33]:
test_Y[0]

['rakyat',
 'akan',
 'sedar',
 'kami',
 'lebih',
 'baik',
 'berbanding',
 'bn',
 ',',
 'kata',
 'kok',
 'EOS']