In [1]:
import json
import numpy as np
import tensorflow as tf
import collections
import os
from sklearn.cross_validation import train_test_split
from tensor2tensor.utils import beam_search, rouge



In [2]:
with open('news-30k.json') as fopen:
    news = json.load(fopen)
len(news)

29855

In [3]:
import malaya
import re
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

accept_tokens = ',-.()"\''

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 1 or w in accept_tokens]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

def clean_label(label):
    string = re.sub('[^A-Za-z\- ]+', ' ', label)
    return re.sub(r'[ ]+', ' ', string.lower()).strip()

In [4]:
from tqdm import tqdm

min_len = 5
max_len = 500

x, y = [], []
for n in tqdm(news):
    if len(n['text'].split()) > min_len:
        p = preprocessing(n['text'])[:max_len]
        x.append(p)
        p = preprocessing(n['title'])
        y.append(p)

100%|██████████| 29855/29855 [00:44<00:00, 663.75it/s]


In [5]:
def build_dataset(words, n_words):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [6]:
import itertools

concat = list(itertools.chain(*x))
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
print('filtered vocab size:',len(dictionary))
print("% of vocab used: {}%".format(round(len(dictionary)/vocabulary_size,4)*100))

vocab from size: 88005
Most common words [(',', 380933), ('.', 338805), ('yang', 158373), ('dan', 147862), ('di', 124501), ('-', 118778)]
Sample data [4340, 287, 1410, 343, 1606, 114, 3583, 4, 10, 4] ['waris', 'keluarga', 'allahyarham', 'muhammad', 'haziq', 'mohd', 'tarmizi', ',', '<NUM>', ',']
filtered vocab size: 88009
% of vocab used: 100.0%


In [7]:
for i in range(len(y)):
    y[i].append('EOS')

In [8]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [9]:
def str_idx(corpus, dic, UNK=3):
    X = []
    for i in corpus:
        ints = []
        for k in i:
            ints.append(dic.get(k, UNK))
        X.append(ints)
    return X

In [10]:
X = str_idx(x, dictionary)
Y = str_idx(y, dictionary)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.1)

In [12]:
def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta


def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size):
    x = layer_norm(x)
    pad = tf.zeros([tf.shape(x)[0], pad_sz, hidden_dim])
    x =  tf.layers.conv1d(inputs = tf.concat([pad, x, pad], 1),
                          filters = hidden_dim,
                          kernel_size = kernel_size,
                          dilation_rate = dilation_rate)
    x = x[:, :-pad_sz, :]
    x = tf.nn.relu(x)
    return x

class Summarization:
    def __init__(self, size_layer, num_layers, embedded_size, 
                 dict_size, learning_rate, 
                 kernel_size = 2, n_attn_heads = 16):

        self.X = tf.placeholder(tf.int32, [None, max_len])
        self.Y = tf.placeholder(tf.int32, [None, None])
        
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype = tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype = tf.int32)
        batch_size = tf.shape(self.X)[0]
        self.batch_size = batch_size
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        self.embedding = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        
        self.num_layers = num_layers
        self.kernel_size = kernel_size
        self.size_layer = size_layer
        self.n_attn_heads = n_attn_heads
        self.dict_size = dict_size
        
        self.training_logits, coverage_loss = self.forward(self.X, decoder_input)
        maxlen = tf.reduce_max(self.Y_seq_len)
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        
        targets = tf.slice(self.Y, [0, 0], [-1, maxlen])
        i1, i2 = tf.meshgrid(tf.range(batch_size),
                     tf.range(maxlen), indexing="ij")
        indices = tf.stack((i1,i2,targets),axis=2)
        probs = tf.gather_nd(self.training_logits, indices)
        probs = tf.where(tf.less_equal(probs,0),tf.ones_like(probs)*1e-10,probs)
        crossent = -tf.log(probs)
        self.cost = tf.reduce_sum(crossent * masks) / tf.to_float(batch_size)
        self.coverage_loss = tf.reduce_sum(coverage_loss / tf.to_float(batch_size))
        self.cost = self.cost + self.coverage_loss
    
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        
    def _calc_final_dist(self, x, gens, vocab_dists, attn_dists):
        with tf.variable_scope('final_distribution', reuse=tf.AUTO_REUSE):
            vocab_dists = gens * vocab_dists
            attn_dists = (1-gens) * attn_dists
            batch_size = tf.shape(attn_dists)[0]
            dec_t = tf.shape(attn_dists)[1]
            attn_len = tf.shape(attn_dists)[2]

            dec = tf.range(0, limit=dec_t) # [dec]
            dec = tf.expand_dims(dec, axis=-1) # [dec, 1]
            dec = tf.tile(dec, [1, attn_len]) # [dec, atten_len]
            dec = tf.expand_dims(dec, axis=0) # [1, dec, atten_len]
            dec = tf.tile(dec, [batch_size, 1, 1]) # [batch_size, dec, atten_len]

            x = tf.expand_dims(x, axis=1) # [batch_size, 1, atten_len]
            x = tf.tile(x, [1, dec_t, 1]) # [batch_size, dec, atten_len]
            x = tf.stack([dec, x], axis=3)

            attn_dists_projected = tf.map_fn(fn=lambda y: \
                                            tf.scatter_nd(y[0], y[1], [dec_t, self.dict_size]),
                                            elems=(x, attn_dists), dtype=tf.float32)

            final_dists = attn_dists_projected + vocab_dists
            return final_dists
    def forward(self, x, y, reuse = False):
        with tf.variable_scope('forward',reuse=reuse):
            with tf.variable_scope('forward',reuse=reuse):
                encoder_embedded = tf.nn.embedding_lookup(self.embedding, x)
                decoder_embedded = tf.nn.embedding_lookup(self.embedding, y)

                encoder_embedded += position_encoding(encoder_embedded)
                decoder_embedded += position_encoding(decoder_embedded)
                
                for i in range(self.num_layers): 
                    dilation_rate = 2 ** i
                    pad_sz = (self.kernel_size - 1) * dilation_rate 
                    with tf.variable_scope('block_%d'%i,reuse=reuse):
                        encoder_embedded += cnn_block(encoder_embedded, dilation_rate, 
                                                      pad_sz, self.size_layer, self.kernel_size)
                
                g = tf.identity(decoder_embedded)
                dec = decoder_embedded
                attn_dists = []
                for i in range(self.num_layers):
                    dilation_rate = 2 ** i
                    pad_sz = (self.kernel_size - 1) * dilation_rate
                    with tf.variable_scope('decode_%d'%i,reuse=reuse):
                        attn_res = h = cnn_block(dec, dilation_rate, 
                                                 pad_sz, self.size_layer, self.kernel_size)
                        C = []
                        for j in range(self.n_attn_heads):
                            h_ = tf.layers.dense(h, self.size_layer//self.n_attn_heads)
                            g_ = tf.layers.dense(g, self.size_layer//self.n_attn_heads)
                            zu_ = tf.layers.dense(encoder_embedded, self.size_layer//self.n_attn_heads)
                            ze_ = tf.layers.dense(encoder_embedded, self.size_layer//self.n_attn_heads)

                            d = tf.layers.dense(h_, self.size_layer//self.n_attn_heads) + g_
                            dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
                            a = tf.nn.softmax(dz)
                            attn_dists.append(a)
                            c_ = tf.matmul(a, ze_)
                            C.append(c_)

                        c = tf.concat(C, 2)
                        h = tf.layers.dense(attn_res + c, self.size_layer)
                        dec += h
                

                weights = tf.transpose(self.embedding)
                logits = tf.einsum('ntd,dk->ntk', dec, weights)
                print(decoder_embedded, dec, attn_dists[-1])

                with tf.variable_scope("gen", reuse=tf.AUTO_REUSE):
                    gens = tf.layers.dense(tf.concat([decoder_embedded, dec, attn_dists[-1]], axis=-1), 
                                               units=1, activation=tf.sigmoid, use_bias=False)
                    
                logits = tf.nn.softmax(logits)
                
                print(gens)
                alignment_history = tf.transpose(attn_dists[-1],[1,2,0])
                coverage_loss = tf.minimum(alignment_history,tf.cumsum(alignment_history, axis=2, exclusive=True))
                        
                return self._calc_final_dist(x, gens, logits, attn_dists[-1]), coverage_loss

In [13]:
size_layer = 256
num_layers = 4
embedded_size = 256
learning_rate = 1e-3
batch_size = 16
epoch = 20

In [14]:
def beam_search_decoding(length = 20, beam_width = 5):
    initial_ids = tf.fill([model.batch_size], GO)
    
    def symbols_to_logits(ids):
        x = tf.contrib.seq2seq.tile_batch(model.X, beam_width)
        logits, _ = model.forward(x, ids, reuse = True)
        return logits[:, tf.shape(ids)[1]-1, :]

    final_ids, final_probs = beam_search.beam_search(
        symbols_to_logits,
        initial_ids,
        beam_width,
        length,
        len(dictionary),
        0.0,
        eos_id = EOS)
    
    return final_ids

In [15]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Summarization(size_layer, num_layers, embedded_size, 
                      len(dictionary), learning_rate)
model.generate = beam_search_decoding()
sess.run(tf.global_variables_initializer())

Tensor("forward/forward/add_1:0", shape=(?, ?, 256), dtype=float32) Tensor("forward/forward/decode_3/add_19:0", shape=(?, ?, 256), dtype=float32) Tensor("forward/forward/decode_3/Reshape_31:0", shape=(?, ?, 500), dtype=float32)
Tensor("forward/forward/gen/dense/Sigmoid:0", shape=(?, ?, 1), dtype=float32)
Tensor("while/forward/forward/add_1:0", shape=(?, ?, 256), dtype=float32) Tensor("while/forward/forward/decode_3/add_19:0", shape=(?, ?, 256), dtype=float32) Tensor("while/forward/forward/decode_3/Reshape_31:0", shape=(?, ?, 500), dtype=float32)
Tensor("while/forward/forward/gen/dense/Sigmoid:0", shape=(?, ?, 1), dtype=float32)


In [16]:
def pad_sentence_batch(sentence_batch, pad_int, maxlen = None):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    if maxlen:
        max_sentence_len = maxlen
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [17]:
from keras.preprocessing import sequence

def calculate_rouges(predicted, batch_y):
    non = np.count_nonzero(batch_y, axis = 1)
    o = []
    for n in non:
        o.append([True for _ in range(n)])
    b = sequence.pad_sequences(o, dtype = np.bool, padding = 'post', value = False)
    batch_y = np.array(batch_y)
    rouges = []
    for i in range(predicted.shape[0]):
        a = batch_y[i][b[i]]
        p = predicted[i][b[i]]
        rouges.append(rouge.rouge_n([p], [a]))
    return np.mean(rouges)

Using TensorFlow backend.


In [18]:
from tqdm import tqdm
from sklearn.utils import shuffle
import time

for EPOCH in range(20):
    lasttime = time.time()
    total_loss, total_accuracy, total_loss_test, total_accuracy_test = 0, 0, 0, 0
    rouge_train, rouge_test = 0, 0
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for k in pbar:
        index = min(k+batch_size,len(train_X))
        batch_x, _ = pad_sentence_batch(train_X[k: index], PAD, maxlen = max_len)
        batch_y, _ = pad_sentence_batch(train_Y[k: index], PAD)
        l, acc, loss, _ = sess.run([model.training_logits, model.accuracy, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss += loss
        total_accuracy += acc
        r = calculate_rouges(np.argmax(l, axis = 2), batch_y)
        rouge_train += r
        pbar.set_postfix(cost=loss, accuracy = acc, rouge_2 = r)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for k in pbar:
        index = min(k+batch_size,len(test_X))
        batch_x, _ = pad_sentence_batch(test_X[k: index], PAD, maxlen = max_len)
        batch_y, _ = pad_sentence_batch(test_Y[k: index], PAD)
        l, acc, loss = sess.run([model.training_logits, model.accuracy, model.cost], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss_test += loss
        total_accuracy_test += acc
        r = calculate_rouges(np.argmax(l, axis = 2), batch_y)
        rouge_test += r
        pbar.set_postfix(cost=loss, accuracy = acc, rouge_2 = r)
        
    total_loss /= (len(train_X) / batch_size)
    total_accuracy /= (len(train_X) / batch_size)
    total_loss_test /= (len(test_X) / batch_size)
    total_accuracy_test /= (len(test_X) / batch_size)
    rouge_train /= (len(train_X) / batch_size)
    rouge_test /= (len(test_X) / batch_size)
        
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(EPOCH, total_loss, total_accuracy))
    print('epoch: %d, avg loss test: %f, avg accuracy test: %f'%(EPOCH, total_loss_test, total_accuracy_test))
    print('epoch: %d, avg train rouge: %f, avg test rouge: %f'%(EPOCH, rouge_train, rouge_test))

train minibatch loop: 100%|██████████| 1680/1680 [08:48<00:00,  3.43it/s, accuracy=0.213, cost=81.2, rouge_2=0.02]   
test minibatch loop: 100%|██████████| 187/187 [00:25<00:00,  7.28it/s, accuracy=0.346, cost=88.5, rouge_2=0.166] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 0, avg loss: 86.829018, avg accuracy: 0.223640
epoch: 0, avg loss test: 81.603329, avg accuracy test: 0.293185
epoch: 0, avg train rouge: 0.082813, avg test rouge: 0.123742


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.71it/s, accuracy=0.362, cost=74.3, rouge_2=0.141] 
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.10it/s, accuracy=0.374, cost=85, rouge_2=0.176]   
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 1, avg loss: 78.770819, avg accuracy: 0.330681
epoch: 1, avg loss test: 79.692556, avg accuracy test: 0.327089
epoch: 1, avg train rouge: 0.147743, avg test rouge: 0.148852


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.73it/s, accuracy=0.404, cost=69.3, rouge_2=0.187] 
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.18it/s, accuracy=0.383, cost=84.2, rouge_2=0.197] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 2, avg loss: 75.510154, avg accuracy: 0.382300
epoch: 2, avg loss test: 79.365448, avg accuracy test: 0.339094
epoch: 2, avg train rouge: 0.188584, avg test rouge: 0.162160


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.72it/s, accuracy=0.532, cost=64.3, rouge_2=0.285] 
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.21it/s, accuracy=0.355, cost=84.3, rouge_2=0.182] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 3, avg loss: 72.967544, avg accuracy: 0.427336
epoch: 3, avg loss test: 79.573478, avg accuracy test: 0.342755
epoch: 3, avg train rouge: 0.227354, avg test rouge: 0.165154


train minibatch loop: 100%|██████████| 1680/1680 [08:42<00:00,  3.72it/s, accuracy=0.617, cost=61.7, rouge_2=0.453]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  7.86it/s, accuracy=0.43, cost=84.3, rouge_2=0.296]  
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 4, avg loss: 70.829586, avg accuracy: 0.468290
epoch: 4, avg loss test: 80.188209, avg accuracy test: 0.351078
epoch: 4, avg train rouge: 0.265185, avg test rouge: 0.174457


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.69it/s, accuracy=0.617, cost=59.5, rouge_2=0.374]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.42it/s, accuracy=0.402, cost=85.6, rouge_2=0.293] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 5, avg loss: 69.022726, avg accuracy: 0.505547
epoch: 5, avg loss test: 81.830540, avg accuracy test: 0.349842
epoch: 5, avg train rouge: 0.301974, avg test rouge: 0.176156


train minibatch loop: 100%|██████████| 1680/1680 [08:47<00:00,  3.55it/s, accuracy=0.66, cost=58.1, rouge_2=0.567] 
test minibatch loop: 100%|██████████| 187/187 [00:24<00:00,  8.24it/s, accuracy=0.411, cost=86.4, rouge_2=0.3]   
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 6, avg loss: 67.560988, avg accuracy: 0.536813
epoch: 6, avg loss test: 83.682662, avg accuracy test: 0.349611
epoch: 6, avg train rouge: 0.334654, avg test rouge: 0.175757


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.72it/s, accuracy=0.617, cost=58.5, rouge_2=0.54] 
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.27it/s, accuracy=0.411, cost=89, rouge_2=0.264]   
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 7, avg loss: 66.444648, avg accuracy: 0.559103
epoch: 7, avg loss test: 85.841804, avg accuracy test: 0.350088
epoch: 7, avg train rouge: 0.359067, avg test rouge: 0.180485


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.68it/s, accuracy=0.681, cost=56.9, rouge_2=0.575]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.17it/s, accuracy=0.421, cost=89.8, rouge_2=0.256] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 8, avg loss: 65.548218, avg accuracy: 0.577540
epoch: 8, avg loss test: 89.298378, avg accuracy test: 0.347146
epoch: 8, avg train rouge: 0.379051, avg test rouge: 0.178160


train minibatch loop: 100%|██████████| 1680/1680 [08:45<00:00,  3.70it/s, accuracy=0.681, cost=56.6, rouge_2=0.564]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.16it/s, accuracy=0.439, cost=91.7, rouge_2=0.3]   
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 9, avg loss: 64.750835, avg accuracy: 0.597003
epoch: 9, avg loss test: 93.069645, avg accuracy test: 0.342648
epoch: 9, avg train rouge: 0.401103, avg test rouge: 0.174838


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.71it/s, accuracy=0.702, cost=57.3, rouge_2=0.565]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.26it/s, accuracy=0.458, cost=92.2, rouge_2=0.308] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 10, avg loss: 64.143075, avg accuracy: 0.611441
epoch: 10, avg loss test: 93.377828, avg accuracy test: 0.354337
epoch: 10, avg train rouge: 0.418062, avg test rouge: 0.183072


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.71it/s, accuracy=0.702, cost=57.3, rouge_2=0.611]
test minibatch loop: 100%|██████████| 187/187 [00:24<00:00,  8.24it/s, accuracy=0.439, cost=93.4, rouge_2=0.256] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 11, avg loss: 63.765210, avg accuracy: 0.622152
epoch: 11, avg loss test: 93.808084, avg accuracy test: 0.361156
epoch: 11, avg train rouge: 0.431536, avg test rouge: 0.190881


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.70it/s, accuracy=0.66, cost=57, rouge_2=0.526]   
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.21it/s, accuracy=0.411, cost=95.8, rouge_2=0.263] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 12, avg loss: 63.428511, avg accuracy: 0.631011
epoch: 12, avg loss test: 94.839856, avg accuracy test: 0.361556
epoch: 12, avg train rouge: 0.441038, avg test rouge: 0.191395


train minibatch loop: 100%|██████████| 1680/1680 [08:45<00:00,  3.03it/s, accuracy=0.681, cost=57.9, rouge_2=0.539]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.23it/s, accuracy=0.411, cost=96.7, rouge_2=0.22]  
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 13, avg loss: 63.061756, avg accuracy: 0.641151
epoch: 13, avg loss test: 96.560659, avg accuracy test: 0.365448
epoch: 13, avg train rouge: 0.454488, avg test rouge: 0.196725


train minibatch loop: 100%|██████████| 1680/1680 [08:45<00:00,  3.71it/s, accuracy=0.723, cost=56.3, rouge_2=0.589]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.33it/s, accuracy=0.43, cost=98, rouge_2=0.253]    
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 14, avg loss: 62.845083, avg accuracy: 0.647681
epoch: 14, avg loss test: 97.446881, avg accuracy test: 0.368409
epoch: 14, avg train rouge: 0.461730, avg test rouge: 0.197676


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.71it/s, accuracy=0.702, cost=56.5, rouge_2=0.598]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.30it/s, accuracy=0.43, cost=96.1, rouge_2=0.272]  
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 15, avg loss: 62.727321, avg accuracy: 0.651722
epoch: 15, avg loss test: 98.739463, avg accuracy test: 0.370453
epoch: 15, avg train rouge: 0.467604, avg test rouge: 0.198838


train minibatch loop: 100%|██████████| 1680/1680 [08:44<00:00,  3.74it/s, accuracy=0.66, cost=58.7, rouge_2=0.512] 
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.27it/s, accuracy=0.467, cost=98.2, rouge_2=0.314] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 16, avg loss: 62.539400, avg accuracy: 0.656702
epoch: 16, avg loss test: 100.323339, avg accuracy test: 0.368305
epoch: 16, avg train rouge: 0.473780, avg test rouge: 0.197988


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.72it/s, accuracy=0.681, cost=57.1, rouge_2=0.524]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.12it/s, accuracy=0.421, cost=98.3, rouge_2=0.287] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 17, avg loss: 62.456428, avg accuracy: 0.659745
epoch: 17, avg loss test: 101.080136, avg accuracy test: 0.371894
epoch: 17, avg train rouge: 0.478182, avg test rouge: 0.200594


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.70it/s, accuracy=0.702, cost=56.8, rouge_2=0.6]  
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.26it/s, accuracy=0.421, cost=104, rouge_2=0.262] 
train minibatch loop:   0%|          | 0/1680 [00:00<?, ?it/s]

epoch: 18, avg loss: 62.353644, avg accuracy: 0.663655
epoch: 18, avg loss test: 102.636975, avg accuracy test: 0.368670
epoch: 18, avg train rouge: 0.483276, avg test rouge: 0.199139


train minibatch loop: 100%|██████████| 1680/1680 [08:43<00:00,  3.71it/s, accuracy=0.702, cost=55.8, rouge_2=0.552]
test minibatch loop: 100%|██████████| 187/187 [00:23<00:00,  8.27it/s, accuracy=0.477, cost=98.4, rouge_2=0.296] 

epoch: 19, avg loss: 62.269757, avg accuracy: 0.665926
epoch: 19, avg loss test: 103.128115, avg accuracy test: 0.371985
epoch: 19, avg train rouge: 0.486146, avg test rouge: 0.203823



