In [1]:
import json
import numpy as np
import tensorflow as tf
import collections
import os
from sklearn.cross_validation import train_test_split
from tensor2tensor.utils import beam_search, rouge



In [2]:
with open('news-30k.json') as fopen:
    news = json.load(fopen)
len(news)

29855

In [3]:
import malaya
import re
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

accept_tokens = ',-.()"\''

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 1 or w in accept_tokens]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

def clean_label(label):
    string = re.sub('[^A-Za-z\- ]+', ' ', label)
    return re.sub(r'[ ]+', ' ', string.lower()).strip()

In [4]:
from tqdm import tqdm

min_len = 5
max_len = 500

x, y = [], []
for n in tqdm(news):
    if len(n['text'].split()) > min_len:
        p = preprocessing(n['text'])[:max_len]
        x.append(p)
        p = preprocessing(n['title'])
        y.append(p)

100%|██████████| 29855/29855 [00:45<00:00, 663.26it/s]


In [5]:
def build_dataset(words, n_words):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [6]:
import itertools

concat = list(itertools.chain(*x))
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
print('filtered vocab size:',len(dictionary))
print("% of vocab used: {}%".format(round(len(dictionary)/vocabulary_size,4)*100))

vocab from size: 88005
Most common words [(',', 380933), ('.', 338805), ('yang', 158373), ('dan', 147862), ('di', 124501), ('-', 118778)]
Sample data [4340, 287, 1410, 343, 1606, 114, 3583, 4, 10, 4] ['waris', 'keluarga', 'allahyarham', 'muhammad', 'haziq', 'mohd', 'tarmizi', ',', '<NUM>', ',']
filtered vocab size: 88009
% of vocab used: 100.0%


In [7]:
for i in range(len(y)):
    y[i].append('EOS')

In [8]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [9]:
def str_idx(corpus, dic, UNK=3):
    X = []
    for i in corpus:
        ints = []
        for k in i:
            ints.append(dic.get(k, UNK))
        X.append(ints)
    return X

In [10]:
X = str_idx(x, dictionary)
Y = str_idx(y, dictionary)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.1)

In [12]:
def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta


def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size):
    x = layer_norm(x)
    pad = tf.zeros([tf.shape(x)[0], pad_sz, hidden_dim])
    x =  tf.layers.conv1d(inputs = tf.concat([pad, x, pad], 1),
                          filters = hidden_dim,
                          kernel_size = kernel_size,
                          dilation_rate = dilation_rate)
    x = x[:, :-pad_sz, :]
    x = tf.nn.relu(x)
    return x

class Summarization:
    def __init__(self, size_layer, num_layers, embedded_size, 
                 dict_size, learning_rate, 
                 kernel_size = 2, n_attn_heads = 16):

        self.X = tf.placeholder(tf.int32, [None, max_len])
        self.Y = tf.placeholder(tf.int32, [None, None])
        
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype = tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype = tf.int32)
        batch_size = tf.shape(self.X)[0]
        self.batch_size = batch_size
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        self.embedding = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        
        self.num_layers = num_layers
        self.kernel_size = kernel_size
        self.size_layer = size_layer
        self.n_attn_heads = n_attn_heads
        self.dict_size = dict_size
        
        self.training_logits, coverage_loss = self.forward(self.X, decoder_input)
        maxlen = tf.reduce_max(self.Y_seq_len)
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        
        targets = tf.slice(self.Y, [0, 0], [-1, maxlen])
        i1, i2 = tf.meshgrid(tf.range(batch_size),
                     tf.range(maxlen), indexing="ij")
        indices = tf.stack((i1,i2,targets),axis=2)
        probs = tf.gather_nd(self.training_logits, indices)
        probs = tf.where(tf.less_equal(probs,0),tf.ones_like(probs)*1e-10,probs)
        crossent = -tf.log(probs)
        self.cost = tf.reduce_sum(crossent * masks) / tf.to_float(batch_size)
        self.coverage_loss = tf.reduce_sum(coverage_loss / tf.to_float(batch_size))
        self.cost = self.cost + self.coverage_loss
    
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        
    def _calc_final_dist(self, x, gens, vocab_dists, attn_dists):
        with tf.variable_scope('final_distribution', reuse=tf.AUTO_REUSE):
            vocab_dists = gens * vocab_dists
            attn_dists = (1-gens) * attn_dists
            batch_size = tf.shape(attn_dists)[0]
            dec_t = tf.shape(attn_dists)[1]
            attn_len = tf.shape(attn_dists)[2]

            dec = tf.range(0, limit=dec_t) # [dec]
            dec = tf.expand_dims(dec, axis=-1) # [dec, 1]
            dec = tf.tile(dec, [1, attn_len]) # [dec, atten_len]
            dec = tf.expand_dims(dec, axis=0) # [1, dec, atten_len]
            dec = tf.tile(dec, [batch_size, 1, 1]) # [batch_size, dec, atten_len]

            x = tf.expand_dims(x, axis=1) # [batch_size, 1, atten_len]
            x = tf.tile(x, [1, dec_t, 1]) # [batch_size, dec, atten_len]
            x = tf.stack([dec, x], axis=3)

            attn_dists_projected = tf.map_fn(fn=lambda y: \
                                            tf.scatter_nd(y[0], y[1], [dec_t, self.dict_size]),
                                            elems=(x, attn_dists), dtype=tf.float32)

            final_dists = attn_dists_projected + vocab_dists
            return final_dists
    def forward(self, x, y, reuse = False):
        with tf.variable_scope('forward',reuse=reuse):
            with tf.variable_scope('forward',reuse=reuse):
                encoder_embedded = tf.nn.embedding_lookup(self.embedding, x)
                decoder_embedded = tf.nn.embedding_lookup(self.embedding, y)

                encoder_embedded += position_encoding(encoder_embedded)
                decoder_embedded += position_encoding(decoder_embedded)
                
                for i in range(self.num_layers): 
                    dilation_rate = 2 ** i
                    pad_sz = (self.kernel_size - 1) * dilation_rate 
                    with tf.variable_scope('block_%d'%i,reuse=reuse):
                        encoder_embedded += cnn_block(encoder_embedded, dilation_rate, 
                                                      pad_sz, self.size_layer, self.kernel_size)
                
                g = tf.identity(decoder_embedded)
                dec = decoder_embedded
                attn_dists = []
                for i in range(self.num_layers):
                    dilation_rate = 2 ** i
                    pad_sz = (self.kernel_size - 1) * dilation_rate
                    with tf.variable_scope('decode_%d'%i,reuse=reuse):
                        attn_res = h = cnn_block(dec, dilation_rate, 
                                                 pad_sz, self.size_layer, self.kernel_size)
                        C = []
                        for j in range(self.n_attn_heads):
                            h_ = tf.layers.dense(h, self.size_layer//self.n_attn_heads)
                            g_ = tf.layers.dense(g, self.size_layer//self.n_attn_heads)
                            zu_ = tf.layers.dense(encoder_embedded, self.size_layer//self.n_attn_heads)
                            ze_ = tf.layers.dense(encoder_embedded, self.size_layer//self.n_attn_heads)

                            d = tf.layers.dense(h_, self.size_layer//self.n_attn_heads) + g_
                            dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
                            a = tf.nn.softmax(dz)
                            attn_dists.append(a)
                            c_ = tf.matmul(a, ze_)
                            C.append(c_)

                        c = tf.concat(C, 2)
                        h = tf.layers.dense(attn_res + c, self.size_layer)
                        dec += h
                

                weights = tf.transpose(self.embedding)
                logits = tf.einsum('ntd,dk->ntk', dec, weights)
                print(decoder_embedded, dec, attn_dists[-1])

                with tf.variable_scope("gen", reuse=tf.AUTO_REUSE):
                    gens = tf.layers.dense(tf.concat([decoder_embedded, dec, attn_dists[-1]], axis=-1), 
                                               units=1, activation=tf.sigmoid, use_bias=False)
                    
                logits = tf.nn.softmax(logits)
                
                print(gens)
                alignment_history = tf.transpose(attn_dists[-1],[1,2,0])
                coverage_loss = tf.minimum(alignment_history,tf.cumsum(alignment_history, axis=2, exclusive=True))
                        
                return self._calc_final_dist(x, gens, logits, attn_dists[-1]), coverage_loss

In [13]:
size_layer = 256
num_layers = 4
embedded_size = 256
learning_rate = 1e-3
batch_size = 16
epoch = 20

In [14]:
def beam_search_decoding(length = 20, beam_width = 5):
    initial_ids = tf.fill([model.batch_size], GO)
    
    def symbols_to_logits(ids):
        x = tf.contrib.seq2seq.tile_batch(model.X, beam_width)
        logits, _ = model.forward(x, ids, reuse = True)
        return logits[:, tf.shape(ids)[1]-1, :]

    final_ids, final_probs = beam_search.beam_search(
        symbols_to_logits,
        initial_ids,
        beam_width,
        length,
        len(dictionary),
        0.0,
        eos_id = EOS)
    
    return final_ids

In [15]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Summarization(size_layer, num_layers, embedded_size, 
                      len(dictionary), learning_rate)
model.generate = beam_search_decoding()
sess.run(tf.global_variables_initializer())

Tensor("forward/forward/add_1:0", shape=(?, ?, 256), dtype=float32) Tensor("forward/forward/decode_3/add_19:0", shape=(?, ?, 256), dtype=float32) Tensor("forward/forward/decode_3/Reshape_31:0", shape=(?, ?, 500), dtype=float32)
Tensor("forward/forward/gen/dense/Sigmoid:0", shape=(?, ?, 1), dtype=float32)
Tensor("while/forward/forward/add_1:0", shape=(?, ?, 256), dtype=float32) Tensor("while/forward/forward/decode_3/add_19:0", shape=(?, ?, 256), dtype=float32) Tensor("while/forward/forward/decode_3/Reshape_31:0", shape=(?, ?, 500), dtype=float32)
Tensor("while/forward/forward/gen/dense/Sigmoid:0", shape=(?, ?, 1), dtype=float32)


In [16]:
def pad_sentence_batch(sentence_batch, pad_int, maxlen = None):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    if maxlen:
        max_sentence_len = maxlen
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [17]:
from tqdm import tqdm
from sklearn.utils import shuffle
import time

for EPOCH in range(20):
    lasttime = time.time()
    total_loss, total_accuracy, total_loss_test, total_accuracy_test = 0, 0, 0, 0
    rouge_train, rouge_test = 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for k in pbar:
        index = min(k+batch_size,len(train_X))
        batch_x, _ = pad_sentence_batch(train_X[k: index], PAD, maxlen = max_len)
        batch_y, _ = pad_sentence_batch(train_Y[k: index], PAD)
        l, acc, loss, _ = sess.run([model.training_logits, model.accuracy, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss += loss
        total_accuracy += acc
        r = rouge.rouge_n(np.argmax(l, axis = 2), batch_y)
        rouge_train += r
        pbar.set_postfix(cost=loss, accuracy = acc, rouge_2 = r)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for k in pbar:
        index = min(k+batch_size,len(test_X))
        batch_x, _ = pad_sentence_batch(test_X[k: index], PAD, maxlen = max_len)
        batch_y, _ = pad_sentence_batch(test_Y[k: index], PAD)
        l, acc, loss = sess.run([model.training_logits, model.accuracy, model.cost], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss_test += loss
        total_accuracy_test += acc
        r = rouge.rouge_n(np.argmax(l, axis = 2), batch_y)
        rouge_test += r
        pbar.set_postfix(cost=loss, accuracy = acc, rouge_2 = r)
        
    total_loss /= (len(train_X) / batch_size)
    total_accuracy /= (len(train_X) / batch_size)
    total_loss_test /= (len(test_X) / batch_size)
    total_accuracy_test /= (len(test_X) / batch_size)
    rouge_train /= (len(train_X) / batch_size)
    rouge_test /= (len(test_X) / batch_size)
        
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(EPOCH, total_loss, total_accuracy))
    print('epoch: %d, avg loss test: %f, avg accuracy test: %f'%(EPOCH, total_loss_test, total_accuracy_test))
    print('epoch: %d, avg train rouge: %f, avg test rouge: %f'%(EPOCH, rouge_train, rouge_test))

train minibatch loop: 100%|██████████| 1680/1680 [08:47<00:00,  3.35it/s, accuracy=0.267, cost=64.9, rouge_2=0.154]  
test minibatch loop: 100%|██████████| 187/187 [00:25<00:00,  7.68it/s, accuracy=0.275, cost=99.7, rouge_2=0.142]
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 0, avg loss: 86.886271, avg accuracy: 0.223780
epoch: 0, avg loss test: 81.606984, avg accuracy test: 0.290207
epoch: 0, avg train rouge: 0.107952, avg test rouge: 0.185198


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.66it/s, accuracy=0.378, cost=58.7, rouge_2=0.259]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.42it/s, accuracy=0.343, cost=96.5, rouge_2=0.218] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 1, avg loss: 78.740744, avg accuracy: 0.330643
epoch: 1, avg loss test: 79.704315, avg accuracy test: 0.322280
epoch: 1, avg train rouge: 0.207342, avg test rouge: 0.204983


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.70it/s, accuracy=0.556, cost=54.3, rouge_2=0.343]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.31it/s, accuracy=0.363, cost=96.1, rouge_2=0.22] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 2, avg loss: 75.497073, avg accuracy: 0.382726
epoch: 2, avg loss test: 79.292818, avg accuracy test: 0.336576
epoch: 2, avg train rouge: 0.240604, avg test rouge: 0.214688


train minibatch loop: 100%|██████████| 1680/1680 [08:41<00:00,  3.70it/s, accuracy=0.644, cost=51.2, rouge_2=0.438]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.55it/s, accuracy=0.373, cost=96.3, rouge_2=0.229]
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 3, avg loss: 72.983652, avg accuracy: 0.426988
epoch: 3, avg loss test: 79.604103, avg accuracy test: 0.343427
epoch: 3, avg train rouge: 0.271075, avg test rouge: 0.222683


train minibatch loop: 100%|██████████| 1680/1680 [08:41<00:00,  3.71it/s, accuracy=0.667, cost=49.6, rouge_2=0.456]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.39it/s, accuracy=0.402, cost=96.9, rouge_2=0.263]
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 4, avg loss: 70.839966, avg accuracy: 0.468914
epoch: 4, avg loss test: 80.619893, avg accuracy test: 0.347623
epoch: 4, avg train rouge: 0.302369, avg test rouge: 0.229399


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.72it/s, accuracy=0.689, cost=48.1, rouge_2=0.466]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.44it/s, accuracy=0.382, cost=98.4, rouge_2=0.267]
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 5, avg loss: 69.008020, avg accuracy: 0.506389
epoch: 5, avg loss test: 82.161340, avg accuracy test: 0.347463
epoch: 5, avg train rouge: 0.332098, avg test rouge: 0.228164


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.71it/s, accuracy=0.711, cost=47.7, rouge_2=0.494]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  7.98it/s, accuracy=0.363, cost=102, rouge_2=0.242] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 6, avg loss: 67.498303, avg accuracy: 0.538135
epoch: 6, avg loss test: 83.989594, avg accuracy test: 0.345328
epoch: 6, avg train rouge: 0.359017, avg test rouge: 0.229533


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.62it/s, accuracy=0.711, cost=47.7, rouge_2=0.534]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.49it/s, accuracy=0.392, cost=103, rouge_2=0.263] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 7, avg loss: 66.321964, avg accuracy: 0.562008
epoch: 7, avg loss test: 86.405367, avg accuracy test: 0.340035
epoch: 7, avg train rouge: 0.379941, avg test rouge: 0.225653


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.70it/s, accuracy=0.733, cost=47.4, rouge_2=0.529]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.41it/s, accuracy=0.431, cost=105, rouge_2=0.298] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 8, avg loss: 65.396839, avg accuracy: 0.582453
epoch: 8, avg loss test: 88.289230, avg accuracy test: 0.345654
epoch: 8, avg train rouge: 0.398326, avg test rouge: 0.231356


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.71it/s, accuracy=0.733, cost=47.7, rouge_2=0.516]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.51it/s, accuracy=0.382, cost=107, rouge_2=0.235] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 9, avg loss: 64.631575, avg accuracy: 0.599115
epoch: 9, avg loss test: 90.864960, avg accuracy test: 0.343545
epoch: 9, avg train rouge: 0.413573, avg test rouge: 0.232010


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.69it/s, accuracy=0.689, cost=47.7, rouge_2=0.467]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.61it/s, accuracy=0.392, cost=111, rouge_2=0.237] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 10, avg loss: 63.980870, avg accuracy: 0.615343
epoch: 10, avg loss test: 92.486121, avg accuracy test: 0.348880
epoch: 10, avg train rouge: 0.429714, avg test rouge: 0.235515


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.71it/s, accuracy=0.733, cost=48.3, rouge_2=0.534]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  7.96it/s, accuracy=0.373, cost=113, rouge_2=0.256] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 11, avg loss: 63.571819, avg accuracy: 0.625877
epoch: 11, avg loss test: 94.316892, avg accuracy test: 0.350494
epoch: 11, avg train rouge: 0.439594, avg test rouge: 0.237123


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.66it/s, accuracy=0.733, cost=47.7, rouge_2=0.534]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  7.48it/s, accuracy=0.412, cost=115, rouge_2=0.276] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 12, avg loss: 63.195373, avg accuracy: 0.636367
epoch: 12, avg loss test: 95.957268, avg accuracy test: 0.357102
epoch: 12, avg train rouge: 0.450115, avg test rouge: 0.241499


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.68it/s, accuracy=0.756, cost=47.2, rouge_2=0.534]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.54it/s, accuracy=0.402, cost=117, rouge_2=0.274] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 13, avg loss: 62.964358, avg accuracy: 0.643115
epoch: 13, avg loss test: 96.862583, avg accuracy test: 0.355776
epoch: 13, avg train rouge: 0.456431, avg test rouge: 0.243206


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.67it/s, accuracy=0.733, cost=48.2, rouge_2=0.534]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.50it/s, accuracy=0.392, cost=115, rouge_2=0.254] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 14, avg loss: 62.759829, avg accuracy: 0.649096
epoch: 14, avg loss test: 99.870215, avg accuracy test: 0.356450
epoch: 14, avg train rouge: 0.458547, avg test rouge: 0.242300


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.71it/s, accuracy=0.756, cost=47.2, rouge_2=0.539]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  7.96it/s, accuracy=0.412, cost=115, rouge_2=0.267] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 15, avg loss: 62.570542, avg accuracy: 0.654845
epoch: 15, avg loss test: 101.179394, avg accuracy test: 0.362391
epoch: 15, avg train rouge: 0.469231, avg test rouge: 0.247402


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.69it/s, accuracy=0.689, cost=50.6, rouge_2=0.494]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.52it/s, accuracy=0.402, cost=119, rouge_2=0.267] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 16, avg loss: 62.424470, avg accuracy: 0.659636
epoch: 16, avg loss test: 102.360128, avg accuracy test: 0.355939
epoch: 16, avg train rouge: 0.475005, avg test rouge: 0.244429


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.65it/s, accuracy=0.689, cost=49.4, rouge_2=0.444]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  7.95it/s, accuracy=0.412, cost=120, rouge_2=0.276] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 17, avg loss: 62.314220, avg accuracy: 0.662996
epoch: 17, avg loss test: 103.197917, avg accuracy test: 0.360235
epoch: 17, avg train rouge: 0.477931, avg test rouge: 0.246485


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.66it/s, accuracy=0.711, cost=48.6, rouge_2=0.469]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  7.97it/s, accuracy=0.392, cost=120, rouge_2=0.275] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 18, avg loss: 62.291030, avg accuracy: 0.664689
epoch: 18, avg loss test: 105.030493, avg accuracy test: 0.361679
epoch: 18, avg train rouge: 0.467598, avg test rouge: 0.248399


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.69it/s, accuracy=0.756, cost=46.4, rouge_2=0.544]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.50it/s, accuracy=0.422, cost=121, rouge_2=0.288] 

epoch: 19, avg loss: 62.148162, avg accuracy: 0.668568
epoch: 19, avg loss test: 105.179892, avg accuracy test: 0.363310
epoch: 19, avg train rouge: 0.469557, avg test rouge: 0.248378



