In [1]:
# Mount Google Drive at /content/drive (uses the Colab-only `google.colab` API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Package untuk skor: Rouge, Bert_Score
!pip install rouge
!pip install bert_score

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0
Collecting bert_score
[?25l  Downloading https://files.pythonhosted.org/packages/38/fb/e63e7e231a79db0489dbf7e7d0ebfb279ccb3d8216aa0d133572f784f3fa/bert_score-0.3.9-py3-none-any.whl (59kB)
[K     |████████████████████████████████| 61kB 4.5MB/s 
Collecting transformers>=3.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 10.7MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading h

In [3]:
import argparse
import collections
import gc
import glob
import os
import queue as Queue
import random
import shutil
import struct
import time
from random import shuffle
from threading import Thread

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.core.example import example_pb2

import torch as T
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import bert_score
from rouge import Rouge

## Vocab

In [5]:
# Vocabulary construction
# Sentence delimiters that appear inside the abstract text of the data files.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

# Special tokens; the last four are always given ids 0..3 by Vocab.
PAD_TOKEN = '[PAD]'
UNKNOWN_TOKEN = '[UNK]'
START_DECODING = '[START]'
STOP_DECODING = '[STOP]'

class Vocab(object):
  """Bidirectional word <-> id mapping loaded from a vocabulary file.

  Ids 0..3 are reserved, in this order, for UNKNOWN_TOKEN, PAD_TOKEN,
  START_DECODING and STOP_DECODING. The remaining ids are assigned to the
  words of the vocab file in file order.
  """

  def __init__(self, vocab_file, max_size):
    """Read the vocabulary.

    Args:
      vocab_file: path to a text file with one "<word> <second field>" pair per
        line; only the first field is used.
      max_size: stop reading once this many entries (special tokens included)
        are stored; 0 means no limit.

    Raises:
      Exception: if the file contains a reserved token or a duplicated word.
    """
    self._word_to_id = {}
    self._id_to_word = {}
    self._count = 0

    special_tokens = (UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING)
    for token in special_tokens:
      self._register(token)

    reserved = (SENTENCE_START, SENTENCE_END) + special_tokens
    with open(vocab_file, 'r') as handle:
      for entry in handle:
        fields = entry.split()
        if len(fields) != 2:
          # Malformed line: report it and keep going.
          print('Format vocab salah: %s\n' % entry)
          continue
        token = fields[0]
        if token in reserved:
          raise Exception('%s tidak boleh berada di dalam file vocab' % token)
        if token in self._word_to_id:
          raise Exception('Duplicated word: %s' % token)
        self._register(token)
        if max_size != 0 and self._count >= max_size:
          print("max_size vocab: %i; %i words. Stopping reading." % (max_size, self._count))
          break
    print("Finish %i total words. Last word: %s" % (self._count, self._id_to_word[self._count-1]))

  def _register(self, token):
    """Assign the next free id to `token` in both lookup tables."""
    new_id = self._count
    self._word_to_id[token] = new_id
    self._id_to_word[new_id] = token
    self._count = new_id + 1

  def word2id(self, word):
    """Return the id of `word`, or the id of UNKNOWN_TOKEN if it is not stored."""
    table = self._word_to_id
    return table[word] if word in table else table[UNKNOWN_TOKEN]

  def id2word(self, word_id):
    """Return the word stored under `word_id`; raise ValueError for unknown ids."""
    if word_id in self._id_to_word:
      return self._id_to_word[word_id]
    raise ValueError('Id not found in vocab: %d' % word_id)

  def size(self):
    """Return the number of stored entries, special tokens included."""
    return self._count

def example_generator(data_path, single_pass):
  """Yield tf.Example protos read from the binary data files matching `data_path`.

  Each file is a sequence of records: an 8-byte length (struct format 'q')
  followed by that many bytes holding a serialized tf.Example.

  Args:
    data_path: glob pattern of the data files.
    single_pass: if True, read the files once in sorted order and stop;
      otherwise reshuffle the file list and keep looping forever.

  Yields:
    example_pb2.Example objects parsed from the records.

  Raises:
    AssertionError: if no file matches `data_path`.
  """
  while True:
    filelist = glob.glob(data_path)
    assert filelist, ('Error: Empty filelist at %s' % data_path)
    if single_pass:
      filelist = sorted(filelist)
    else:
      random.shuffle(filelist)
    for f in filelist:
      # `with` guarantees the handle is closed when the file is exhausted or
      # the generator is closed early (the original never closed it).
      with open(f, 'rb') as reader:
        while True:
          len_bytes = reader.read(8)
          if not len_bytes:
            break # finished reading this file
          str_len = struct.unpack('q', len_bytes)[0]
          example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
          yield example_pb2.Example.FromString(example_str)
    if single_pass:
      print("example_generator completed reading all datafiles. No more data.")
      break

def article2ids(article_words, vocab):
  """Map article words to ids, giving in-article OOV words temporary ids.

  A word whose vocab id equals the id of UNKNOWN_TOKEN is treated as
  out-of-vocabulary (OOV). The k-th distinct OOV word (0-based, in order of
  first appearance) receives the id `vocab.size() + k`.

  Args:
    article_words: list of word strings.
    vocab: object providing `word2id(word)` and `size()`.

  Returns:
    (ids, oovs): the id for every article word, and the distinct OOV words in
    order of first appearance.
  """
  ids = []
  oovs = []
  oov_position = {}  # OOV word -> its index in `oovs`; avoids repeated list scans
  unk_id = vocab.word2id(UNKNOWN_TOKEN)
  n_vocab = vocab.size()
  for w in article_words:
    i = vocab.word2id(w)
    if i == unk_id:
      oov_num = oov_position.get(w)
      if oov_num is None:
        oov_num = len(oovs)
        oov_position[w] = oov_num
        oovs.append(w)
      ids.append(n_vocab + oov_num)
    else:
      ids.append(i)
  return ids, oovs

def abstract2ids(abstract_words, vocab, article_oovs):
  """Map abstract words to ids, reusing the article's temporary OOV ids.

  An OOV abstract word that also occurs in `article_oovs` gets the id
  `vocab.size() + index in article_oovs`; any other OOV word gets the id of
  UNKNOWN_TOKEN.

  Args:
    abstract_words: list of word strings.
    vocab: object providing `word2id(word)` and `size()`.
    article_oovs: list of in-article OOV words, as returned by article2ids.

  Returns:
    List of ids, one per abstract word.
  """
  ids = []
  unk_id = vocab.word2id(UNKNOWN_TOKEN)
  n_vocab = vocab.size()
  # Build the word -> index table once instead of scanning the list per word.
  # setdefault keeps the first occurrence, matching list.index().
  oov_position = {}
  for pos, oov in enumerate(article_oovs or []):
    oov_position.setdefault(oov, pos)
  for w in abstract_words:
    i = vocab.word2id(w)
    if i == unk_id:
      pos = oov_position.get(w)
      ids.append(unk_id if pos is None else n_vocab + pos)
    else:
      ids.append(i)
  return ids


def outputids2words(id_list, vocab, article_oovs):
  """Map output ids back to words, resolving temporary in-article OOV ids.

  Ids known to `vocab` are translated with `vocab.id2word`. Any other id is
  interpreted as `vocab.size() + k` and translated to `article_oovs[k]`
  (pointer-generator mode).

  Args:
    id_list: iterable of integer ids.
    vocab: object providing `id2word(id)` (raising ValueError for unknown ids)
      and `size()`.
    article_oovs: list of in-article OOV words, or None in baseline mode.

  Returns:
    List of word strings.

  Raises:
    AssertionError: if an id is outside the vocab and `article_oovs` is None.
    ValueError: if an id does not correspond to any article OOV.
  """
  words = []
  for i in id_list:
    try:
      w = vocab.id2word(i) # might be [UNK]
    except ValueError: # w is OOV
      assert article_oovs is not None, "Error: model produced a word ID that isn't in the vocabulary. This should not happen in baseline (no pointer-generator) mode"
      article_oov_idx = i - vocab.size()
      # List indexing raises IndexError (never ValueError) and silently wraps
      # negative indices, so validate the range explicitly.
      if not 0 <= article_oov_idx < len(article_oovs):
        raise ValueError('Error: model produced word ID %i which corresponds to article OOV %i but this example only has %i article OOVs' % (i, article_oov_idx, len(article_oovs)))
      w = article_oovs[article_oov_idx]
    words.append(w)
  return words


def abstract2sents(abstract):
  """Split abstract text into the sentences enclosed by SENTENCE_START/SENTENCE_END.

  Scanning stops at the first position where either delimiter can no longer
  be found; the text between each delimiter pair is returned unstripped.
  """
  sentences = []
  open_len = len(SENTENCE_START)
  close_len = len(SENTENCE_END)
  position = 0
  while True:
    opening = abstract.find(SENTENCE_START, position)
    if opening < 0:
      break
    closing = abstract.find(SENTENCE_END, opening + 1)
    if closing < 0:
      break
    sentences.append(abstract[opening + open_len:closing])
    position = closing + close_len
  return sentences

## Batcher

In [7]:
# Batcher settings
# max_batch_queue is read by Batcher as BATCH_QUEUE_MAX (capacity of its batch queue).
max_batch_queue = 1000
# Fixed seed for Python's `random` module.
random.seed(2021)

In [8]:
class Example(object):
  """One article/abstract pair converted into id sequences ready for batching."""

  def __init__(self, article, abstract_sentences, vocab):
    """Tokenize on whitespace, truncate and convert both texts to ids.

    Args:
      article: article text.
      abstract_sentences: list of abstract sentence strings.
      vocab: vocabulary providing `word2id`.
    """
    start_id = vocab.word2id(START_DECODING)
    stop_id = vocab.word2id(STOP_DECODING)

    # Encoder side: keep at most max_enc_steps article words.
    article_words = article.split()[:max_enc_steps]
    self.enc_len = len(article_words)
    self.enc_input = [vocab.word2id(word) for word in article_words]

    # Decoder side: the abstract sentences are joined into one string.
    abstract = ' '.join(abstract_sentences)
    abstract_words = abstract.split()
    plain_ids = [vocab.word2id(word) for word in abstract_words]

    # Decoder input is built from plain vocab ids (OOV words become [UNK]).
    self.dec_input = self.get_dec_inp_targ_seqs(plain_ids, max_dec_steps, start_id, stop_id)[0]
    self.dec_len = len(self.dec_input)

    # Pointer mechanism: encoder ids in which in-article OOVs get temporary ids.
    self.enc_input_extend_vocab, self.article_oovs = article2ids(article_words, vocab)
    extended_ids = abstract2ids(abstract_words, vocab, self.article_oovs)

    # Decoder target is built from the extended-vocab ids.
    self.target = self.get_dec_inp_targ_seqs(extended_ids, max_dec_steps, start_id, stop_id)[1]

    # Keep the original strings.
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences

  def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id):
    """Return (decoder input, target) of equal length built from `sequence`.

    The input is `sequence` prefixed with `start_id`. The target is `sequence`
    followed by `stop_id`, unless the input exceeds `max_len`, in which case
    both are cut to `max_len` and the target carries no stop token.
    """
    inp = [start_id] + sequence[:]
    target = sequence[:]
    if len(inp) <= max_len:
      target.append(stop_id)
    else:
      inp = inp[:max_len]
      target = target[:max_len]
    assert len(inp) == len(target)
    return inp, target

  def pad_decoder_inp_targ(self, max_len, pad_id):
    """Right-pad decoder input and target in place with `pad_id` up to `max_len`."""
    for seq in (self.dec_input, self.target):
      seq.extend([pad_id] * (max_len - len(seq)))

  def pad_encoder_input(self, max_len, pad_id):
    """Right-pad both encoder id sequences in place with `pad_id` up to `max_len`."""
    for seq in (self.enc_input, self.enc_input_extend_vocab):
      seq.extend([pad_id] * (max_len - len(seq)))

In [9]:
class Batch(object):
  """A list of Examples padded and packed into numpy arrays.

  Attributes set by the constructor:
    enc_batch, enc_batch_extend_vocab: int32 arrays (batch_size, longest encoder input).
    enc_lens: int32 array (batch_size,) of unpadded encoder lengths.
    enc_padding_mask: float32 array, 1 on real tokens and 0 on padding.
    max_art_oovs, art_oovs: largest OOV count in the batch and the per-example OOV lists.
    dec_batch, target_batch: int32 arrays (batch_size, max_dec_steps).
    dec_lens: int32 array (batch_size,) of unpadded decoder input lengths.
    original_articles, original_abstracts, original_abstracts_sents: source strings.
  """

  def __init__(self, example_list, vocab, batch_size):
    """Build all arrays from `example_list` (the Examples are padded in place)."""
    self.batch_size = batch_size
    self.pad_id = vocab.word2id(PAD_TOKEN)
    self.init_encoder_seq(example_list)
    self.init_decoder_seq(example_list)
    self.store_orig_strings(example_list)

  def init_encoder_seq(self, example_list):
    """Pad the encoder inputs to the longest one in the batch and pack them."""
    # Determine the maximum length of the encoder input sequence in this batch
    max_enc_seq_len = max(ex.enc_len for ex in example_list)

    # Pad the encoder input sequences up to the length of the longest sequence
    for ex in example_list:
      ex.pad_encoder_input(max_enc_seq_len, self.pad_id)

    # Initialize the numpy arrays
    self.enc_batch = np.zeros((self.batch_size, max_enc_seq_len), dtype=np.int32)
    self.enc_lens = np.zeros((self.batch_size), dtype=np.int32)
    self.enc_padding_mask = np.zeros((self.batch_size, max_enc_seq_len), dtype=np.float32)

    # Fill in the numpy arrays
    for i, ex in enumerate(example_list):
      self.enc_batch[i, :] = ex.enc_input[:]
      self.enc_lens[i] = ex.enc_len
      self.enc_padding_mask[i, :ex.enc_len] = 1

    # Determine the max number of in-article OOVs in this batch
    self.max_art_oovs = max(len(ex.article_oovs) for ex in example_list)
    # Store the in-article OOVs themselves
    self.art_oovs = [ex.article_oovs for ex in example_list]
    # Store the version of the enc_batch that uses the article OOV ids
    self.enc_batch_extend_vocab = np.zeros((self.batch_size, max_enc_seq_len), dtype=np.int32)
    for i, ex in enumerate(example_list):
      self.enc_batch_extend_vocab[i, :] = ex.enc_input_extend_vocab[:]

  def init_decoder_seq(self, example_list):
    """Pad decoder inputs and targets to max_dec_steps and pack them."""
    # dec_len must be read before padding: it is the unpadded input length.
    unpadded_lens = [ex.dec_len for ex in example_list]

    # Pad the inputs and targets
    for ex in example_list:
      ex.pad_decoder_inp_targ(max_dec_steps, self.pad_id)

    # Initialize the numpy arrays.
    self.dec_batch = np.zeros((self.batch_size, max_dec_steps), dtype=np.int32)
    self.target_batch = np.zeros((self.batch_size, max_dec_steps), dtype=np.int32)
    self.dec_lens = np.zeros((self.batch_size), dtype=np.int32)

    # Fill in the numpy arrays
    for i, ex in enumerate(example_list):
      self.dec_batch[i, :] = ex.dec_input[:]
      self.target_batch[i, :] = ex.target[:]
      # Example exposes `dec_len`; the former `ex.dec_lens` raised AttributeError.
      self.dec_lens[i] = unpadded_lens[i]

  def store_orig_strings(self, example_list):
    """Keep the original article/abstract strings of every example."""
    self.original_articles = [ex.original_article for ex in example_list]
    self.original_abstracts = [ex.original_abstract for ex in example_list]
    self.original_abstracts_sents = [ex.original_abstract_sents for ex in example_list]

In [10]:
class Batcher(object):
  """Background loader that turns data files into a queue of Batch objects.

  Example-queue threads read (article, abstract) pairs and enqueue Example
  objects; batch-queue threads group Examples into Batch objects. Consumers
  call next_batch().
  """
  BATCH_QUEUE_MAX = max_batch_queue

  def __init__(self, data_path, vocab, mode, batch_size, single_pass):
    """Create the queues and start the daemon loader threads.

    Args:
      data_path: glob pattern of the data files, passed to example_generator.
      vocab: vocabulary used to build Examples and Batches.
      mode: 'decode' makes every batch batch_size copies of one example; any
        other value builds batches from length-sorted examples.
      batch_size: number of examples per batch.
      single_pass: if True, read the dataset exactly once with one thread per queue.
    """
    self._data_path = data_path
    self._vocab = vocab
    self._single_pass = single_pass
    self.mode = mode
    self.batch_size = batch_size
    # Initialize a queue of Batches waiting to be used, and a queue of Examples waiting to be batched
    self._batch_queue = Queue.Queue(self.BATCH_QUEUE_MAX)
    self._example_queue = Queue.Queue(self.BATCH_QUEUE_MAX * self.batch_size)

    # Different settings depending on whether we're in single_pass mode or not
    if single_pass:
      self._num_example_q_threads = 1 # just one thread, read through the dataset just once
      self._num_batch_q_threads = 1  # just one thread to batch examples
      self._bucketing_cache_size = 1 # only load one batch's worth of examples before bucketing; this essentially means no bucketing
      self._finished_reading = False # this will tell us when we're finished reading the dataset
    else:
      self._num_example_q_threads = 16
      self._num_batch_q_threads = 4
      self._bucketing_cache_size = 100

    # Start the threads that load the queues
    self._example_q_threads = []
    for _ in range(self._num_example_q_threads):
      self._example_q_threads.append(Thread(target=self.fill_example_queue))
      self._example_q_threads[-1].daemon = True
      self._example_q_threads[-1].start()
    self._batch_q_threads = []
    for _ in range(self._num_batch_q_threads):
      self._batch_q_threads.append(Thread(target=self.fill_batch_queue))
      self._batch_q_threads[-1].daemon = True
      self._batch_q_threads[-1].start()

    # Start a thread that watches the other threads and restarts them if they're dead
    if not single_pass:
      self._watch_thread = Thread(target=self.watch_threads)
      self._watch_thread.daemon = True
      self._watch_thread.start()

  def next_batch(self):
    """Return the next Batch, or None once a single-pass dataset is exhausted.

    Blocks until a batch is available in every other case.
    """
    # An empty queue after the reader has finished means there is no more data.
    if self._batch_queue.qsize() == 0:
      if self._single_pass and self._finished_reading:
        tf.compat.v1.logging.info("Finished reading dataset in single_pass mode.")
        return None

    batch = self._batch_queue.get() # get the next Batch
    return batch

  def fill_example_queue(self):
    """Thread body: read (article, abstract) pairs and enqueue Example objects."""
    input_gen = self.text_generator(example_generator(self._data_path, self._single_pass))

    while True:
      try:
        (article, abstract) = next(input_gen) # read the next example from file. article and abstract
      except StopIteration:
        tf.compat.v1.logging.info("The example generator for this example queue filling thread has exhausted data.")
        if self._single_pass:
          tf.compat.v1.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
          self._finished_reading = True
          break
        else:
          raise Exception("single_pass mode is off but the example generator is out of data; error.")

      abstract_sentences = [abstract.strip()]
      example = Example(article, abstract_sentences, self._vocab)
      self._example_queue.put(example)

  def fill_batch_queue(self):
    """Thread body: group queued Examples into Batch objects."""
    while True:
      if self.mode == 'decode':
        # Decode mode: one example repeated batch_size times.
        ex = self._example_queue.get()
        b = [ex for _ in range(self.batch_size)]
        self._batch_queue.put(Batch(b, self._vocab, self.batch_size))
      else:
        # Collect several batches' worth of examples and sort them by encoder
        # length (longest first) so each batch holds similarly sized inputs.
        inputs = []
        for _ in range(self.batch_size * self._bucketing_cache_size):
          inputs.append(self._example_queue.get())
        inputs = sorted(inputs, key=lambda inp: inp.enc_len, reverse=True)

        batches = []
        for i in range(0, len(inputs), self.batch_size):
          batches.append(inputs[i:i + self.batch_size])
        if not self._single_pass:
          shuffle(batches)
        for b in batches:
          self._batch_queue.put(Batch(b, self._vocab, self.batch_size))

  def watch_threads(self):
    """Thread body: every 60 seconds log queue sizes and restart dead loader threads."""
    while True:
      tf.compat.v1.logging.info(
        'Bucket queue size: %i, Input queue size: %i',
        self._batch_queue.qsize(), self._example_queue.qsize())

      time.sleep(60)
      for idx,t in enumerate(self._example_q_threads):
        if not t.is_alive():
          tf.compat.v1.logging.error('Found example queue thread dead. Restarting.')
          new_t = Thread(target=self.fill_example_queue)
          self._example_q_threads[idx] = new_t
          new_t.daemon = True
          new_t.start()
      for idx,t in enumerate(self._batch_q_threads):
        if not t.is_alive():
          tf.compat.v1.logging.error('Found batch queue thread dead. Restarting.')
          new_t = Thread(target=self.fill_batch_queue)
          self._batch_q_threads[idx] = new_t
          new_t.daemon = True
          new_t.start()

  def text_generator(self, example_generator):
    """Yield (article_text, abstract_text) string pairs from tf.Example protos.

    Examples whose features cannot be read or decoded, or whose article is
    empty, are skipped.

    Args:
      example_generator: iterator of objects exposing
        `features.feature[name].bytes_list.value`.
    """
    # Iterate with `for` rather than calling next() in `while True`: since
    # PEP 479 a StopIteration escaping next() inside a generator is turned into
    # RuntimeError, so fill_example_queue's `except StopIteration` would never
    # run and single-pass reading would never be marked finished.
    for e in example_generator:
      try:
        article_text = e.features.feature['article'].bytes_list.value[0]
        abstract_text = e.features.feature['abstract'].bytes_list.value[0]
        article_text = article_text.decode()
        abstract_text = abstract_text.decode()
      except (ValueError, IndexError):
        # IndexError covers a feature with no value; ValueError covers decode errors.
        tf.compat.v1.logging.error('Failed to get article or abstract from example')
        continue
      if len(article_text)==0:
        continue
      yield (article_text, abstract_text)

## Model

In [12]:
def init_lstm_wt(lstm):
    """Re-initialise the parameters of an LSTM module in place.

    Parameters whose name contains 'weight' are drawn uniformly from
    [-rand_unif_init_mag, rand_unif_init_mag]. Parameters whose name contains
    'bias' are zeroed, except the second quarter of the vector (indices
    n//4 .. n//2), which is set to 1.
    """
    for param_name, param in lstm.named_parameters():
        values = param.data
        if 'weight' in param_name:
            values.uniform_(-rand_unif_init_mag, rand_unif_init_mag)
        elif 'bias' in param_name:
            length = values.size(0)
            values.fill_(0.)
            values[length // 4:length // 2].fill_(1.)

def init_linear_wt(linear):
    """Fill a linear layer's weight (and bias, when present) in place with
    zero-mean normal values of standard deviation trunc_norm_init_std."""
    linear.weight.data.normal_(std=trunc_norm_init_std)
    bias = linear.bias
    if bias is not None:
        bias.data.normal_(std=trunc_norm_init_std)

def init_wt_normal(wt):
    """Fill `wt` in place with zero-mean normal values of standard deviation
    trunc_norm_init_std."""
    wt.data.normal_(std=trunc_norm_init_std)

class Encoder(nn.Module):
    """Single-layer bidirectional LSTM encoder.

    Besides the per-token outputs, it returns the final hidden and cell states
    of both directions, concatenated and projected down to hidden_dim.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=1, batch_first=True, bidirectional=True)
        init_lstm_wt(self.lstm)

        # Projections from the concatenated (2 * hidden_dim) final states to hidden_dim.
        self.reduce_h = nn.Linear(2 * hidden_dim, hidden_dim)
        init_linear_wt(self.reduce_h)
        self.reduce_c = nn.Linear(2 * hidden_dim, hidden_dim)
        init_linear_wt(self.reduce_c)

    def forward(self, x, seq_lens):
        """Encode a padded batch.

        Args:
            x: embedded inputs, batch-first.
            seq_lens: unpadded length of every sequence, passed to
                pack_padded_sequence (which by default expects decreasing order).

        Returns:
            (enc_out, (h_reduced, c_reduced)): padded encoder outputs and the
            reduced final hidden/cell states.
        """
        packed_in = pack_padded_sequence(x, seq_lens, batch_first=True)
        packed_out, (h_n, c_n) = self.lstm(packed_in)
        enc_out = pad_packed_sequence(packed_out, batch_first=True)[0].contiguous()

        # Concatenate the per-direction final states along the feature axis.
        h_cat = T.cat(h_n.unbind(0), dim=1)
        c_cat = T.cat(c_n.unbind(0), dim=1)
        reduced_state = (F.relu(self.reduce_h(h_cat)), F.relu(self.reduce_c(c_cat)))
        return enc_out, reduced_state

class encoder_attention(nn.Module):
    """Additive attention over the encoder outputs.

    When the global `intra_encoder` flag is truthy, the exponentiated scores are
    divided by their running sum over previous decoding steps instead of being
    normalised with a softmax.
    """

    def __init__(self):
        super().__init__()
        enc_dim = hidden_dim * 2
        self.W_h = nn.Linear(enc_dim, enc_dim, bias=False)
        self.W_s = nn.Linear(enc_dim, enc_dim)
        self.v = nn.Linear(enc_dim, 1, bias=False)

    def forward(self, st_hat, h, enc_padding_mask, sum_temporal_srcs):
        """Compute the encoder context vector for one decoding step.

        Args:
            st_hat: decoder state features, one row per batch entry.
            h: encoder outputs (batch, source length, 2 * hidden_dim).
            enc_padding_mask: multiplied into the attention weights before
                renormalisation.
            sum_temporal_srcs: running sum of exponentiated scores from previous
                steps, or None on the first step.

        Returns:
            (ct_e, at, sum_temporal_srcs): context vector, attention
            distribution over source positions and the updated running sum
            (returned unchanged when `intra_encoder` is falsy).
        """
        features = self.W_h(h) + self.W_s(st_hat).unsqueeze(1)
        et = self.v(T.tanh(features)).squeeze(2)

        if not intra_encoder:
            et1 = F.softmax(et, dim=1)
        else:
            exp_et = T.exp(et)
            if sum_temporal_srcs is None:
                # First step: nothing to divide by yet; start the running sum.
                et1 = exp_et
                sum_temporal_srcs = get_cuda(T.FloatTensor(et.size()).fill_(1e-10)) + exp_et
            else:
                et1 = exp_et / sum_temporal_srcs
                sum_temporal_srcs = sum_temporal_srcs + exp_et

        # Apply the padding mask and renormalise the remaining weights.
        masked = et1 * enc_padding_mask
        at = masked / masked.sum(1, keepdim=True)

        # Encoder context vector: attention-weighted sum of the encoder outputs.
        ct_e = T.bmm(at.unsqueeze(1), h).squeeze(1)
        return ct_e, at, sum_temporal_srcs

class decoder_attention(nn.Module):
    """Intra-decoder attention over the previously produced decoder hidden states.

    The layers are only created when the global `intra_decoder` flag is truthy.
    """

    def __init__(self):
        super().__init__()
        if intra_decoder:
            self.W_prev = nn.Linear(hidden_dim, hidden_dim, bias=False)
            self.W_s = nn.Linear(hidden_dim, hidden_dim)
            self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, s_t, prev_s):
        """Compute the decoder context vector for one decoding step.

        Args:
            s_t: current decoder hidden state, one row per batch entry.
            prev_s: stacked previous decoder hidden states, or None on the first step.

        Returns:
            (ct_d, prev_s): decoder context vector (zeros when intra-decoder
            attention is disabled or on the first step) and the history of
            hidden states, extended with `s_t` when attention is enabled.
        """
        if intra_decoder is False:
            return get_cuda(T.zeros(s_t.size())), prev_s

        current = s_t.unsqueeze(1)
        if prev_s is None:
            # First step: no history to attend to yet; start it with s_t.
            return get_cuda(T.zeros(s_t.size())), current

        scores = self.v(T.tanh(self.W_prev(prev_s) + self.W_s(s_t).unsqueeze(1))).squeeze(2)
        weights = F.softmax(scores, dim=1).unsqueeze(1)
        ct_d = T.bmm(weights, prev_s).squeeze(1)
        return ct_d, T.cat([prev_s, current], dim=1)

class Decoder(nn.Module):
    """One-step LSTM decoder with encoder attention, optional intra-decoder
    attention and a pointer (copy) mechanism."""

    def __init__(self):
        super().__init__()
        self.enc_attention = encoder_attention()
        self.dec_attention = decoder_attention()
        # Merges the previous encoder context with the input embedding.
        self.x_context = nn.Linear(hidden_dim*2 + emb_dim, emb_dim)

        self.lstm = nn.LSTMCell(emb_dim, hidden_dim)
        init_lstm_wt(self.lstm)

        # Generation-probability gate over [ct_e, ct_d, st_hat, x].
        self.p_gen_linear = nn.Linear(hidden_dim * 5 + emb_dim, 1)

        # Output projection: [dec_h, ct_e, ct_d] -> hidden_dim -> vocab_size.
        self.V = nn.Linear(hidden_dim*4, hidden_dim)
        self.V1 = nn.Linear(hidden_dim, vocab_size)
        init_linear_wt(self.V1)

    def forward(self, x_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab, sum_temporal_srcs, prev_s):
        """Run one decoding step.

        Args:
            x_t: embedded decoder input for this step.
            s_t: (hidden, cell) state of the LSTM cell from the previous step.
            enc_out: encoder outputs.
            enc_padding_mask: mask applied to the encoder attention weights.
            ct_e: encoder context vector from the previous step.
            extra_zeros: zero columns appended to the vocabulary distribution
                for in-article OOV ids, or None.
            enc_batch_extend_vocab: source token ids in the extended vocabulary,
                used as scatter indices for the copy distribution.
            sum_temporal_srcs: running attention statistics for the encoder attention.
            prev_s: previous decoder hidden states for the intra-decoder attention.

        Returns:
            (final_dist, s_t, ct_e, sum_temporal_srcs, prev_s): distribution over
            the extended vocabulary plus the updated recurrent quantities.
        """
        lstm_in = self.x_context(T.cat([x_t, ct_e], dim=1))
        s_t = self.lstm(lstm_in, s_t)

        dec_h, dec_c = s_t
        st_hat = T.cat([dec_h, dec_c], dim=1)
        ct_e, attn_dist, sum_temporal_srcs = self.enc_attention(st_hat, enc_out, enc_padding_mask, sum_temporal_srcs)

        # Intra-decoder attention
        ct_d, prev_s = self.dec_attention(dec_h, prev_s)

        # Probability of generating from the vocabulary rather than copying.
        gate_in = T.cat([ct_e, ct_d, st_hat, lstm_in], 1)
        p_gen = T.sigmoid(self.p_gen_linear(gate_in))

        logits = self.V1(self.V(T.cat([dec_h, ct_e, ct_d], dim=1)))
        vocab_dist = p_gen * F.softmax(logits, dim=1)
        copy_dist = (1 - p_gen) * attn_dist

        # Pointer mechanism: make room for OOV ids, then add the copy
        # probabilities at the ids of the source tokens.
        if extra_zeros is not None:
            vocab_dist = T.cat([vocab_dist, extra_zeros], dim=1)
        final_dist = vocab_dist.scatter_add(1, enc_batch_extend_vocab, copy_dist)

        return final_dist, s_t, ct_e, sum_temporal_srcs, prev_s

# Model
class Model(nn.Module):
    """Container for the encoder, the decoder and the shared word embedding.

    Each sub-module is passed through get_cuda after construction.
    """

    def __init__(self):
        super().__init__()
        encoder = Encoder()
        decoder = Decoder()
        embeds = nn.Embedding(vocab_size, emb_dim)
        init_wt_normal(embeds.weight)

        self.encoder = get_cuda(encoder)
        self.decoder = get_cuda(decoder)
        self.embeds = get_cuda(embeds)

## Helpers

In [13]:
def get_cuda(tensor):
    """Return `tensor.cuda()` when CUDA is available, otherwise `tensor` unchanged."""
    return tensor.cuda() if T.cuda.is_available() else tensor

def get_enc_data(batch):
    """Convert the encoder-side numpy arrays of a batch into tensors.

    Returns:
        (enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab,
        extra_zeros, ct_e) where `enc_lens` is passed through untouched,
        `extra_zeros` is None when the batch has no in-article OOVs, and `ct_e`
        is an all-zero tensor of shape (batch size, 2 * hidden_dim). All
        tensors are passed through get_cuda.
    """
    enc_lens = batch.enc_lens
    n_examples = len(enc_lens)

    enc_batch = get_cuda(T.from_numpy(batch.enc_batch).long())
    enc_padding_mask = get_cuda(T.from_numpy(batch.enc_padding_mask).float())
    ct_e = get_cuda(T.zeros(n_examples, 2 * hidden_dim))

    extended = batch.enc_batch_extend_vocab
    if extended is None:
        enc_batch_extend_vocab = None
    else:
        enc_batch_extend_vocab = get_cuda(T.from_numpy(extended).long())

    if batch.max_art_oovs > 0:
        extra_zeros = get_cuda(T.zeros(n_examples, batch.max_art_oovs))
    else:
        extra_zeros = None

    return enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e


def get_dec_data(batch):
    """Convert the decoder-side numpy arrays of a batch into tensors.

    Returns:
        (dec_batch, max_dec_len, dec_lens, target_batch): decoder inputs as a
        long tensor, the largest value in `batch.dec_lens`, the decoder lengths
        as a float tensor and the targets as a long tensor. All tensors are
        passed through get_cuda.
    """
    lens_np = batch.dec_lens
    max_dec_len = np.max(lens_np)

    dec_batch = get_cuda(T.from_numpy(batch.dec_batch).long())
    dec_lens = get_cuda(T.from_numpy(lens_np).float())
    target_batch = get_cuda(T.from_numpy(batch.target_batch).long())

    return dec_batch, max_dec_len, dec_lens, target_batch

## Beam Search

In [14]:
# Beam search
class Beam(object):
    """The `beam_size` hypotheses kept for one example during beam search."""

    def __init__(self, start_id, end_id, unk_id, hidden_state, context):
        """Start every hypothesis from `start_id` with the same decoder state.

        Only the first hypothesis starts with score 0; the others start at -30
        so that the first expansion is drawn from a single hypothesis.
        """
        h, c = hidden_state
        self.tokens = get_cuda(T.full((beam_size, 1), start_id, dtype=T.long))
        self.scores = get_cuda(T.full((beam_size, 1), -30, dtype=T.float))
        self.scores[0, 0] = 0
        # Replicate the initial state once per hypothesis.
        self.hid_h = h.unsqueeze(0).repeat(beam_size, 1)
        self.hid_c = c.unsqueeze(0).repeat(beam_size, 1)
        self.context = context.unsqueeze(0).repeat(beam_size, 1)
        self.sum_temporal_srcs = None
        self.prev_s = None
        self.done = False
        self.end_id = end_id
        self.unk_id = unk_id

    def get_current_state(self):
        """Return the last token of every hypothesis, with ids >= vocab_size
        (temporary OOV ids) replaced by `unk_id`."""
        last_tokens = self.tokens[:, -1].clone()
        last_tokens[last_tokens >= vocab_size] = self.unk_id
        return last_tokens

    def advance(self, prob_dist, hidden_state, context, sum_temporal_srcs, prev_s):
        """Extend the beam by one token.

        Keeps the `beam_size` best (hypothesis, word) continuations by
        accumulated log-probability and reorders all per-hypothesis state to
        match. Sets `done` once the best continuation ends with `end_id`.
        """
        n_extended_vocab = prob_dist.size(1)
        h, c = hidden_state

        # Accumulated score of every continuation, flattened over (hypothesis, word).
        candidates = (T.log(prob_dist + eps) + self.scores).view(-1, 1)
        top_scores, top_ids = T.topk(input=candidates, k=beam_size, dim=0)
        self.scores = top_scores

        # Recover which hypothesis and which word each flat index refers to.
        beams_order = top_ids.squeeze(1) // n_extended_vocab
        best_words = top_ids % n_extended_vocab

        self.hid_h = h[beams_order]
        self.hid_c = c[beams_order]
        self.context = context[beams_order]
        if sum_temporal_srcs is not None:
            self.sum_temporal_srcs = sum_temporal_srcs[beams_order]
        if prev_s is not None:
            self.prev_s = prev_s[beams_order]
        self.tokens = T.cat([self.tokens[beams_order], best_words], dim=1)

        # End condition is when top-of-beam is EOS.
        if best_words[0][0] == self.end_id:
            self.done = True

    def get_best(self):
        """Return the tokens of the top hypothesis, without the start token and
        cut before the first `end_id` (if any)."""
        top_tokens = self.tokens[0].cpu().numpy().tolist()
        if self.end_id in top_tokens:
            stop_at = top_tokens.index(self.end_id)
        else:
            stop_at = len(top_tokens)
        return top_tokens[1:stop_at]

    def get_all(self):
        """Return the token sequence of every hypothesis as a list of numpy arrays."""
        return [self.tokens[k].cpu().numpy() for k in range(len(self.tokens))]

def beam_search(enc_hid, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab, model, start_id, end_id, unk_id):
    """Decode every example of a batch with beam search.

    One Beam object is created per example. At each of at most `max_dec_steps`
    steps, the hypotheses of all unfinished beams are stacked into one decoder
    call, then each beam is advanced with its slice of the result. Decoding
    stops early once every beam is done.

    Args:
        enc_hid: pair (h, c) of encoder final states; element i seeds beam i.
        enc_out: encoder outputs, indexed by example along dim 0.
        enc_padding_mask: encoder padding mask, indexed by example along dim 0.
        ct_e: initial encoder context vectors; row i seeds beam i.
        extra_zeros: extra zero columns for in-article OOV ids, or None.
        enc_batch_extend_vocab: source ids in the extended vocabulary.
        model: object exposing `embeds` and `decoder`.
        start_id, end_id, unk_id: special token ids forwarded to every Beam.

    Returns:
        List with one entry per example: the token ids from `Beam.get_best()`.
    """
    batch_size = len(enc_hid[0])
    # Indices (into `beams`) of the examples that are still being decoded.
    beam_idx = T.LongTensor(list(range(batch_size)))
    # Create one Beam object per example in the batch.
    beams = [Beam(start_id, end_id, unk_id, (enc_hid[0][i], enc_hid[1][i]), ct_e[i]) for i in range(batch_size)]
    n_rem = batch_size
    sum_temporal_srcs = None
    prev_s = None

    for t in range(max_dec_steps):
        # Gather the last token of every hypothesis of every active beam and embed it.
        x_t = T.stack(
            [beam.get_current_state() for beam in beams if beam.done == False]
        ).contiguous().view(-1)
        x_t = model.embeds(x_t)

        # Flatten the per-beam decoder states into one row per hypothesis.
        dec_h = T.stack(
            [beam.hid_h for beam in beams if beam.done == False]
        ).contiguous().view(-1,hidden_dim)
        dec_c = T.stack(
            [beam.hid_c for beam in beams if beam.done == False]
        ).contiguous().view(-1,hidden_dim)

        ct_e = T.stack(
            [beam.context for beam in beams if beam.done == False]
        ).contiguous().view(-1,2*hidden_dim)

        # The optional attention histories are re-read from the beams because
        # Beam.advance reorders them; they are None until the decoder produces them.
        if sum_temporal_srcs is not None:
            sum_temporal_srcs = T.stack(
                [beam.sum_temporal_srcs for beam in beams if beam.done == False]
            ).contiguous().view(-1, enc_out.size(1))

        if prev_s is not None:
            # At step t the history is viewed as t previous decoder states per hypothesis.
            prev_s = T.stack(
                [beam.prev_s for beam in beams if beam.done == False]
            ).contiguous().view(-1, t, hidden_dim)

        s_t = (dec_h, dec_c)
        # Repeat the encoder-side tensors of each active example beam_size times
        # so that they line up with the flattened hypotheses.
        enc_out_beam = enc_out[beam_idx].view(n_rem,-1).repeat(1, beam_size).view(-1, enc_out.size(1), enc_out.size(2))
        enc_pad_mask_beam = enc_padding_mask[beam_idx].repeat(1, beam_size).view(-1, enc_padding_mask.size(1))

        extra_zeros_beam = None
        if extra_zeros is not None:
            extra_zeros_beam = extra_zeros[beam_idx].repeat(1, beam_size).view(-1, extra_zeros.size(1))
        enc_extend_vocab_beam = enc_batch_extend_vocab[beam_idx].repeat(1, beam_size).view(-1, enc_batch_extend_vocab.size(1))

        final_dist, (dec_h, dec_c), ct_e, sum_temporal_srcs, prev_s = model.decoder(x_t, s_t, enc_out_beam, enc_pad_mask_beam, ct_e, extra_zeros_beam, enc_extend_vocab_beam, sum_temporal_srcs, prev_s)

        # Restore the (active example, hypothesis, ...) layout.
        final_dist = final_dist.view(n_rem, beam_size, -1)
        dec_h = dec_h.view(n_rem, beam_size, -1)
        dec_c = dec_c.view(n_rem, beam_size, -1)
        ct_e = ct_e.view(n_rem, beam_size, -1)

        if sum_temporal_srcs is not None:
            sum_temporal_srcs = sum_temporal_srcs.view(n_rem, beam_size, -1)

        if prev_s is not None:
            prev_s = prev_s.view(n_rem, beam_size, -1, hidden_dim)

        # For all the active beams, perform beam search
        active = []
        for i in range(n_rem):
            b = beam_idx[i].item()
            beam = beams[b]
            if beam.done:
                continue

            sum_temporal_srcs_i = prev_s_i = None
            if sum_temporal_srcs is not None:
                sum_temporal_srcs_i = sum_temporal_srcs[i]
            if prev_s is not None:
                prev_s_i = prev_s[i]
            beam.advance(final_dist[i], (dec_h[i], dec_c[i]), ct_e[i], sum_temporal_srcs_i, prev_s_i)
            if beam.done == False:
                active.append(b)

        # Stop as soon as every beam has finished.
        if len(active) == 0:
            break

        beam_idx = T.LongTensor(active)
        n_rem = len(beam_idx)

    # Collect the best hypothesis of every beam, finished or not.
    predicted_words = []
    for beam in beams:
        predicted_words.append(beam.get_best())

    return predicted_words

## Evaluation - ROUGE

In [15]:
#Evaluation with Rouge
def get_cuda(tensor):
    """Return `tensor.cuda()` when CUDA is available, otherwise `tensor` unchanged.

    NOTE(review): this repeats the `get_cuda` already defined in the Helpers
    cell; the later definition silently shadows the earlier one.
    """
    return tensor.cuda() if T.cuda.is_available() else tensor

class Evaluate_r(object):
    """Decode a dataset with a saved checkpoint and score the output with ROUGE.

    Depends on names defined elsewhere in the notebook: `Vocab`, `Batcher`,
    `Model`, `get_enc_data`, `beam_search`, `outputids2words`, `vocab_path`,
    `vocab_size`, `save_model_path` and `save_example_path`.
    """

    def __init__(self, data_path, opt, batch_size):
        """Build the vocabulary and a single-pass, eval-mode batcher.

        Args:
            data_path: data location handed to `Batcher`.
            opt: options object; `opt.load_model` and `opt.task` are read later.
            batch_size: batch size handed to `Batcher`.
        """
        self.vocab = Vocab(vocab_path, vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='eval',
                               batch_size=batch_size, single_pass=True)
        self.opt = opt
        # NOTE(review): presumably gives the batcher time to pre-fill its queue
        # before the first next_batch() call -- confirm against Batcher.
        time.sleep(5)

    def setup_valid(self):
        """Create the model and restore its weights from `opt.load_model`."""
        self.model = Model()
        self.model = get_cuda(self.model)
        checkpoint_path = os.path.join(save_model_path, self.opt.load_model)
        # Deserialize onto the CPU so that a checkpoint written on a GPU also
        # loads on a CPU-only runtime; load_state_dict() then copies the values
        # into the model's parameters on whatever device they live.
        checkpoint = T.load(checkpoint_path, map_location="cpu")
        self.model.load_state_dict(checkpoint["model_dict"])
        # Inference only: disable training-time behaviour of any submodule.
        self.model.eval()

    def print_original_predicted(self, decoded_sents, ref_sents, article_sents, loadfile):
        """Write article / reference / decoded triples to a text file.

        The file is named ``test_<loadfile up to its first dot>.txt`` and is
        created inside `save_example_path`.
        """
        filename = "test_"+loadfile.split(".")[0]+".txt"

        # Explicit UTF-8 so non-ASCII text is written the same way regardless
        # of the platform's default encoding.
        with open(os.path.join(save_example_path, filename), "w", encoding="utf-8") as f:
            for article, ref, dec in zip(article_sents, ref_sents, decoded_sents):
                f.write("article: "+article + "\n")
                f.write("ref: " + ref + "\n")
                f.write("dec: " + dec + "\n\n")

    def evaluate_batch(self, print_sents = False):
        """Decode every batch with beam search and compute averaged ROUGE scores.

        Args:
            print_sents: when true, also dump the articles, references and
                decoded summaries via `print_original_predicted`.

        Returns:
            Tuple ``(rouge_1, rouge_2, rouge_l)`` holding the ``'f'`` entries of
            the averaged ``rouge-1``, ``rouge-2`` and ``rouge-l`` scores.
        """
        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(START_DECODING)
        end_id = self.vocab.word2id(STOP_DECODING)
        unk_id = self.vocab.word2id(UNKNOWN_TOKEN)
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = get_enc_data(batch)

            # Encoding and beam-search decoding both run without gradient tracking.
            with T.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
                pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab, self.model, start_id, end_id, unk_id)

            for i, ids in enumerate(pred_ids):
                decoded_words = outputids2words(ids, self.vocab, batch.art_oovs[i])
                if len(decoded_words) < 2:
                    # Placeholder for (near-)empty outputs; presumably keeps the
                    # scorer from receiving an empty hypothesis -- TODO confirm.
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                ref_sents.append(batch.original_abstracts[i])
                article_sents.append(batch.original_articles[i])
            batch = self.batcher.next_batch()
        load_file = self.opt.load_model

        if print_sents:
            self.print_original_predicted(decoded_sents, ref_sents, article_sents, load_file)

        scores = rouge.get_scores(decoded_sents, ref_sents, avg = True)
        rouge_1 = scores['rouge-1']['f']
        rouge_2 = scores['rouge-2']['f']
        rouge_l = scores['rouge-l']['f']

        if self.opt.task == "test":
            print(load_file, "scores:", scores)
        else:
            print(load_file, "rouge_l:", "%.4f" % rouge_l)
            print('Scores:', scores)

        return rouge_1, rouge_2, rouge_l

## Evaluation - BERTScore

In [16]:
def get_cuda(tensor):
    """Move `tensor` (or any object with a `.cuda()` method) to the GPU if one is available."""
    if not T.cuda.is_available():
        return tensor
    return tensor.cuda()

class Evaluate_b(object):
    """Decode a dataset with a saved checkpoint and score the output with BERTScore.

    Depends on names defined elsewhere in the notebook: `Vocab`, `Batcher`,
    `Model`, `get_enc_data`, `beam_search`, `outputids2words`, `vocab_path`,
    `vocab_size`, `save_model_path` and `save_example_path`.
    """

    def __init__(self, data_path, opt, batch_size):
        """Build the vocabulary and a single-pass, eval-mode batcher.

        Args:
            data_path: data location handed to `Batcher`.
            opt: options object; `opt.load_model` is read later.
            batch_size: batch size handed to `Batcher`.
        """
        self.vocab = Vocab(vocab_path, vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='eval',
                               batch_size=batch_size, single_pass=True)
        self.opt = opt

        # NOTE(review): presumably gives the batcher time to pre-fill its queue
        # before the first next_batch() call -- confirm against Batcher.
        time.sleep(5)

    def setup_valid(self):
        """Create the model and restore its weights from `opt.load_model`."""
        self.model = Model()
        self.model = get_cuda(self.model)
        checkpoint_path = os.path.join(save_model_path, self.opt.load_model)
        # Deserialize onto the CPU so that a checkpoint written on a GPU also
        # loads on a CPU-only runtime; load_state_dict() then copies the values
        # into the model's parameters on whatever device they live.
        checkpoint = T.load(checkpoint_path, map_location="cpu")
        self.model.load_state_dict(checkpoint["model_dict"])
        # Inference only: disable training-time behaviour of any submodule.
        self.model.eval()

    def print_original_predicted(self, decoded_sents, ref_sents, article_sents, loadfile):
        """Write article / reference / decoded triples to a text file.

        The file is named ``test_<loadfile up to its first dot>.txt`` and is
        created inside `save_example_path`.
        """
        filename = "test_"+loadfile.split(".")[0]+".txt"

        # Explicit UTF-8 so non-ASCII text is written the same way regardless
        # of the platform's default encoding.
        with open(os.path.join(save_example_path, filename), "w", encoding="utf-8") as f:
            for article, ref, dec in zip(article_sents, ref_sents, decoded_sents):
                f.write("article: "+article + "\n")
                f.write("ref: " + ref + "\n")
                f.write("dec: " + dec + "\n\n")

    def evaluate_batch(self, print_sents = False):
        """Decode every batch with beam search and compute the mean BERTScore F value.

        Args:
            print_sents: when true, also dump the articles, references and
                decoded summaries via `print_original_predicted`.

        Returns:
            The mean of the third value returned by `bert_score.score`
            (unpacked here as `f`), exactly as produced by `f.mean()`.
        """
        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(START_DECODING)
        end_id = self.vocab.word2id(STOP_DECODING)
        unk_id = self.vocab.word2id(UNKNOWN_TOKEN)
        decoded_sents = []
        ref_sents = []
        article_sents = []
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = get_enc_data(batch)

            # Encoding and beam-search decoding both run without gradient tracking.
            with T.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
                pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab, self.model, start_id, end_id, unk_id)

            for i, ids in enumerate(pred_ids):
                decoded_words = outputids2words(ids, self.vocab, batch.art_oovs[i])
                if len(decoded_words) < 2:
                    # Placeholder for (near-)empty outputs; presumably keeps the
                    # scorer from receiving an empty candidate -- TODO confirm.
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                ref_sents.append(batch.original_abstracts[i])
                article_sents.append(batch.original_articles[i])

            batch = self.batcher.next_batch()

        load_file = self.opt.load_model
        _, _, f = bert_score.score(decoded_sents, ref_sents, lang='en', verbose = False)
        mean_f = f.mean()  # computed once, reused for the log line and the return value

        print('{}  score:{}'.format(load_file, mean_f))
        gc.collect()

        if print_sents:
            self.print_original_predicted(decoded_sents, ref_sents, article_sents, load_file)
        return mean_f

# EXPERIMENT

In [17]:
# Folder holding the artifacts produced by earlier runs (path is relative to
# the working directory, under the mounted Drive).
model_folder = "drive/My Drive/PML/CNNDM_Test"

In [18]:
# List the contents of the model folder (shell command run through IPython).
!ls 'drive/My Drive/PML/CNNDM_Test'

model  test  vocab


In [19]:
# Data locations inside the model folder.
test_data_path = model_folder + '/test/test_*'  # handed to Batcher as data_path; presumably a glob over test shards
vocab_path = model_folder + '/vocab'            # vocabulary file read by Vocab

# Hyperparameters
hidden_dim = 400      # hidden size; beam search reshapes the decoder states with it
emb_dim = 200         # NOTE(review): presumably the embedding size -- usage not visible in this section
batch_size = 50       # batch size given to the evaluators' Batcher
max_enc_steps = 400   # NOTE(review): presumably the max encoder length -- usage not visible in this section
max_dec_steps = 100   # NOTE(review): presumably the max decoder length -- usage not visible in this section
beam_size = 5         # number of hypotheses kept per example in beam search
min_dec_steps = 3     # NOTE(review): presumably the min decoded length -- usage not visible in this section
vocab_size = 30000    # max_size passed to Vocab

# NOTE(review): the values below look like training / initialisation settings;
# their usage is not visible in this section.
lr = 0.001
rand_unif_init_mag = 0.02
trunc_norm_init_std = 1e-4

eps = 1e-12
max_iterations = 10000
max_batch_queue = 100

# NOTE(review): presumably toggle intra-encoder / intra-decoder attention in the model -- confirm.
intra_encoder = True
intra_decoder = True

## Testing RL Reward Rouge

In [20]:
# Checkpoint of the RL model trained with the ROUGE reward; the same file is
# used for both the ROUGE and the BERTScore evaluation.
save_model_path = f"{model_folder}/model/"
model_name_for_r_testing = model_name_for_b_testing = 'RL(r).tar'

In [21]:
if __name__ == "__main__":
    # Options are declared argparse-style; arguments the parser does not
    # recognise are collected in `unknown` instead of raising.
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default="test", type=str, choices=["validate", "test"])
    parser.add_argument("--start_from", default=None, type=str)
    parser.add_argument("--load_model", default=model_name_for_r_testing, type=str)
    opt, unknown = parser.parse_known_args()

    # ROUGE evaluation with the checkpoint selected for ROUGE testing.
    eval_r = Evaluate_r(test_data_path, opt, batch_size)
    r_rlr_1, r_rlr_2, r_rlr_L = eval_r.evaluate_batch()

    # BERTScore evaluation: reuse the options, pointing them at the BERTScore checkpoint.
    opt.load_model = model_name_for_b_testing
    eval_b = Evaluate_b(test_data_path, opt, batch_size)
    rlr_bert_testing = eval_b.evaluate_batch()

    print(
        'ROUGE_1: {:.4f} ; ROUGE_2: {:.4f}; ROUGE_L: {:.4f}; BERTScore: {:.4f}'.format(
            r_rlr_1, r_rlr_2, r_rlr_L, rlr_bert_testing
        )
    )

max_size of vocab was specified as 30000; we now have 30000 words. Stopping reading.
Finished constructing vocabulary of 30000 total words. Last word added: moles
example_generator completed reading all datafiles. No more data.
INFO:tensorflow:The example generator for this example queue filling thread has exhausted data.
INFO:tensorflow:single_pass mode is on, so we've finished reading dataset. This thread is stopping.

INFO:tensorflow:Finished reading dataset in single_pass mode.
RL(r) testing.tar scores: {'rouge-1': {'f': 0.2792071217548919, 'p': 0.23658517128148124, 'r': 0.35713651610061982}, 'rouge-2': {'f': 0.10220711289029155, 'p': 0.0865442180911029, 'r': 0.12471812648659011}, 'rouge-l': {'f': 0.3392499018214784, 'p': 0.34383582711081823, 'r': 0.3558461987570033}}


max_size of vocab was specified as 30000; we now have 30000 words. Stopping reading.
Finished constructing vocabulary of 30000 total words. Last word added: moles
example_generator completed reading all datafiles. N

## Testing RL Reward BertScore

In [22]:
# Checkpoints of the RL model trained with the BERTScore reward: one file for
# the ROUGE evaluation and one for the BERTScore evaluation.
save_model_path = f"{model_folder}/model"
model_name_for_r_testing = 'RL(b) for ROUGE.tar'
model_name_for_b_testing = 'RL(b) for BERT.tar'

In [24]:
if __name__ == "__main__":
    # Options are declared argparse-style; arguments the parser does not
    # recognise are collected in `unknown` instead of raising.
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default="test", type=str, choices=["validate", "test"])
    parser.add_argument("--start_from", default=None, type=str)
    parser.add_argument("--load_model", default=model_name_for_r_testing, type=str)
    opt, unknown = parser.parse_known_args()

    # ROUGE evaluation with the checkpoint selected for ROUGE testing.
    eval_r = Evaluate_r(test_data_path, opt, batch_size)
    r_rlb_1, r_rlb_2, r_rlb_L = eval_r.evaluate_batch()

    # BERTScore evaluation: reuse the options, pointing them at the BERTScore checkpoint.
    opt.load_model = model_name_for_b_testing
    eval_b = Evaluate_b(test_data_path, opt, batch_size)
    rlb_bert_testing = eval_b.evaluate_batch()

    print(
        'ROUGE_1: {:.4f} ; ROUGE_2: {:.4f}; ROUGE_L: {:.4f}; BERTScore: {:.4f}'.format(
            r_rlb_1, r_rlb_2, r_rlb_L, rlb_bert_testing
        )
    )

max_size of vocab was specified as 30000; we now have 30000 words. Stopping reading.
Finished constructing vocabulary of 30000 total words. Last word added: moles
example_generator completed reading all datafiles. No more data.
INFO:tensorflow:The example generator for this example queue filling thread has exhausted data.
INFO:tensorflow:single_pass mode is on, so we've finished reading dataset. This thread is stopping.

INFO:tensorflow:Finished reading dataset in single_pass mode.
RL(b) for ROUGE testing.tar scores: {'rouge-1': {'f': 0.31452751289110214, 'p': 0.33677212860126349, 'r': 0.31581721909211306}, 'rouge-2': {'f': 0.12831026385390516, 'p': 0.13149961789026291, 'r': 0.12681581928114970}, 'rouge-l': {'f': 0.31713614191827518, 'p': 0.36232148112912141, 'r': 0.33258529182790112}}


max_size of vocab was specified as 30000; we now have 30000 words. Stopping reading.
Finished constructing vocabulary of 30000 total words. Last word added: moles
example_generator completed reading al

## Hasil

In [25]:
# One column per metric, one row per model (RL(r) first, RL(b) second).
data_frame = {
    'ROUGE 1': [r_rlr_1, r_rlb_1],
    'ROUGE 2': [r_rlr_2, r_rlb_2],
    'ROUGE L': [r_rlr_L, r_rlb_L],
    # The BERTScore results are converted to plain floats before tabulating.
    'BERTScore': [float(rlr_bert_testing), float(rlb_bert_testing)],
}
df = pd.DataFrame(data_frame, index=['RL(r)', 'RL(b)']).round(4)
df

Unnamed: 0,ROUGE 1,ROUGE 2,ROUGE L,BERTScore
RL(r),0.2792,0.1022,0.3392,0.8132
RL(b),0.3145,0.1283,0.3171,0.8384
