In [0]:
# NOTE(review): `%tensorflow_version` looks like the Colab-only line magic that
# selects the TensorFlow 2.x runtime — confirm against the target environment.
try:
    %tensorflow_version 2.x
except Exception:   
    # Best-effort: if the magic raises, the error is ignored and the notebook
    # carries on to the plain `import tensorflow` below.
    pass

import tensorflow as tf
import os
import numpy as np

Load the text files

In [0]:
if not os.path.exists('tok.valid.abstract.txt'):
  !wget -O 'tok.valid.abstract.txt' 'https://www.dropbox.com/s/0c6e9yf8yhf9a75/tok.valid.abstract.txt?dl=1'
if not os.path.exists('tok.valid.body.txt'):
  !wget -O 'tok.valid.title.txt' 'https://www.dropbox.com/s/aiy87847kusb7ju/tok.valid.title.txt?dl=1'

In [0]:
# Read each file as raw bytes, decode it as UTF-8 and keep one entry per
# '\n'-separated line (a trailing newline therefore yields a final empty entry).
with open('tok.valid.abstract.txt', mode='rb') as f:
    body_data = str(f.read(), "utf-8").split('\n')

with open('tok.valid.title.txt', mode='rb') as f:
    target_data = str(f.read(), "utf-8").split('\n')

Create vocabulary using keras tokenizer

In [0]:
# Fit the tokenizer on the body text only; words it has not seen map to '<UNK>'.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(body_data)

# Invert the tokenizer's word -> index table into an index -> word lookup.
vocab = {index: word for word, index in tokenizer.word_index.items()}

# Append the sentence markers under the next two unused integer keys
# (assumes word_index keys are the contiguous range 1..N — TODO confirm).
vocab[len(vocab) + 1] = '<s>'
vocab[len(vocab) + 1] = '</s>'

Convert text to numeric indices

In [0]:
# Map every line of both corpora to its list of integer token ids
# (bodies first, then titles, using the same fitted tokenizer).
body_seqs, target_seqs = (
    tokenizer.texts_to_sequences(texts) for texts in (body_data, target_data)
)

Add start and end tokens to all sequences. Start token index = len(vocab) - 1 and end token index = len(vocab), since `<s>` and `</s>` were the last two entries added to the vocabulary.

In [0]:
# Wrap every sequence as [<s>, ...tokens..., </s>].
# '<s>' and '</s>' were the last two entries added to vocab, so their ids are
# len(vocab) - 1 and len(vocab) respectively.
body_seqs = [[len(vocab) - 1, *tokens, len(vocab)] for tokens in body_seqs]
target_seqs = [[len(vocab) - 1, *tokens, len(vocab)] for tokens in target_seqs]

Pad all sequences with zeros up to the maximum sequence length

In [0]:
# Longest sequence on each side determines the padded width.
max_len_body = max(map(len, body_seqs))
max_len_target = max(map(len, target_seqs))

# Right-pad ("post") every sequence up to that width.
body_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    body_seqs, maxlen=max_len_body, padding="post"
)
target_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    target_seqs, maxlen=max_len_target, padding="post"
)

Reserve a held-out set

In [0]:
# The first tenth of the examples becomes the held-out set; the rest is kept
# for training. Note: this cell rebinds body_seqs/target_seqs, so running it
# a second time would split the already-reduced training set again.
m = len(body_seqs) // 10
body_seqs_val, body_seqs = body_seqs[:m], body_seqs[m:]
target_seqs_val, target_seqs = target_seqs[:m], target_seqs[m:]
print(len(body_seqs_val), len(target_seqs_val))

Create a dataset from which batches of a certain size can be extracted

In [0]:
buffer_size = len(body_seqs)
batch_size = 16
batch_size_val = 1

# Training pipeline: shuffle over the whole training set, then emit full
# batches only (drop_remainder=True keeps the batch dimension fixed, which
# train_step relies on when it allocates tensors of size batch_size).
dataset_tensor = (
    tf.data.Dataset.from_tensor_slices((body_seqs, target_seqs))
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

# Validation pipeline: deliberately NOT shuffled. The held-out set is the
# first len(body_seqs_val) examples, and the evaluation cell prints
# body_data[i] / target_data[i] next to the i-th validation example, so the
# original order has to be preserved. The batch size now comes from
# batch_size_val instead of a repeated literal.
validation_tensor = (
    tf.data.Dataset.from_tensor_slices((body_seqs_val, target_seqs_val))
    .batch(batch_size_val, drop_remainder=True)
)

In [0]:
class Encoder(tf.keras.Model):
    """Token embedding followed by a bidirectional LSTM over the source sequence."""

    def __init__(self, vocab_size, embedding_dim, hidden_units):
        """Create the embedding table and the bidirectional LSTM.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_units: number of units of each LSTM direction.
        """
        super().__init__()

        self.hidden_units = hidden_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The wrapped LSTM returns the per-step outputs as well as its final states.
        recurrent_layer = tf.keras.layers.LSTM(
            self.hidden_units,
            return_sequences=True,
            return_state=True,
        )
        self.bi_lstm = tf.keras.layers.Bidirectional(recurrent_layer)

    def call(self, encoder_input, encoder_state):
        """Encode a batch of token-id sequences.

        Args:
            encoder_input: token ids, (batch_size, seq_length).
            encoder_state: initial states handed to the bidirectional LSTM.
                Callers in this notebook pass a list of four
                (batch_size, hidden_units) tensors.

        Returns:
            (encoder_output, [state_h_fwd, state_c_fwd]): the per-step outputs
            and the final hidden/cell state of the forward direction only; the
            backward-direction states are discarded.
        """
        # (batch_size, seq_length) -> (batch_size, seq_length, embedding_dim)
        embedded = self.embedding(encoder_input)

        # NOTE(review): with Bidirectional's default merge mode the last axis of
        # encoder_output is presumably 2 * hidden_units — confirm.
        encoder_output, fwd_h, fwd_c, _bwd_h, _bwd_c = self.bi_lstm(
            embedded, initial_state=encoder_state
        )

        return encoder_output, [fwd_h, fwd_c]

In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(Wh(encoder_output) + Ws(decoder_state)))."""

    def __init__(self, hidden_units):
        """Create the three projection layers used to score encoder positions."""
        super().__init__()

        self.Wh = tf.keras.layers.Dense(hidden_units)  # projects the encoder outputs
        self.Ws = tf.keras.layers.Dense(hidden_units)  # projects the decoder state
        self.V = tf.keras.layers.Dense(1)              # reduces each position to one score

    def call(self, decoder_state, encoder_output):
        """Compute the context vector for one decoder state.

        Args:
            decoder_state: (batch_size, state_dim).
            encoder_output: (batch_size, seq_length, enc_dim).

        Returns:
            context_vector: (batch_size, enc_dim), the attention-weighted sum of
                encoder_output over the sequence axis.
            attention_weights: (batch_size, seq_length, 1), softmax-normalised
                over the sequence axis.
        """
        # Insert a length-1 sequence axis so the decoder projection broadcasts
        # against every encoder position: (batch_size, 1, state_dim).
        query = tf.expand_dims(decoder_state, 1)

        projected_encoder = self.Wh(encoder_output)  # (batch_size, seq_length, hidden_units)
        projected_query = self.Ws(query)             # (batch_size, 1, hidden_units)
        score = self.V(tf.nn.tanh(projected_encoder + projected_query))  # (batch_size, seq_length, 1)

        # Normalise across encoder positions.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weight every encoder output and sum out the sequence axis.
        context_vector = tf.reduce_sum(attention_weights * encoder_output, axis=1)

        return context_vector, attention_weights

In [0]:
class Decoder(tf.keras.Model):
    """LSTM decoder with additive attention over the encoder outputs.

    `call` supports two modes that run exactly the same computation and differ
    only in what they return:
      * "Train": the log-probabilities over the vocabulary.
      * "Test":  the log-probabilities and the updated LSTM states.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units):
        """Create the embedding, LSTM, attention and output projection layers.

        Args:
            vocab_size: number of rows of the embedding table and width of the
                output distribution.
            embedding_dim: size of each token embedding.
            hidden_units: number of LSTM units.
        """
        super().__init__()

        # NOTE(review): this reads the notebook-level `batch_size`, not a
        # constructor argument, and nothing in this class uses it. Kept only so
        # the attribute remains available to existing code.
        self.batch_size = batch_size
        self.hidden_units = hidden_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(
            self.hidden_units,
            return_sequences=True,
            return_state=True,
        )
        self.attention = BahdanauAttention(self.hidden_units)
        self.W1 = tf.keras.layers.Dense(hidden_units)
        self.W2 = tf.keras.layers.Dense(vocab_size)

    def _decode(self, decoder_input, decoder_init_state, encoder_output):
        """Run the decoder once and return (log-probabilities, [state_h, state_c])."""
        # (batch_size, steps) -> (batch_size, steps, embedding_dim)
        embedded = self.embedding(decoder_input)

        # Only the final hidden/cell states are used; the per-step outputs are ignored.
        _, state_h, state_c = self.lstm(embedded, initial_state=decoder_init_state)

        # Attend over the encoder outputs using the new hidden state as the query.
        context_vector, _ = self.attention(state_h, encoder_output)

        # Both tensors are already 2-D (batch_size, features), so the
        # concatenation needs no further reshaping.
        concat_vector = tf.concat([context_vector, state_h], axis=-1)

        # Two stacked linear layers followed by log-softmax over the vocabulary.
        p_vocab = tf.nn.log_softmax(self.W2(self.W1(concat_vector)))
        return p_vocab, [state_h, state_c]

    def call(self, decoder_input, decoder_init_state, encoder_output, mode="Train"):
        """Decode from `decoder_input`, starting at `decoder_init_state`.

        Args:
            decoder_input: token ids, (batch_size, steps); callers in this
                notebook pass a single step, i.e. (batch_size, 1).
            decoder_init_state: [state_h, state_c] the LSTM starts from.
            encoder_output: per-position encoder outputs,
                (batch_size, seq_length, enc_dim).
            mode: "Train" or "Test" (see class docstring).

        Returns:
            "Train": p_vocab, log-probabilities of shape (batch_size, vocab_size).
            "Test":  (p_vocab, [state_h, state_c]).

        Raises:
            ValueError: if `mode` is neither "Train" nor "Test" (previously the
                call fell through and silently returned None).
        """
        if mode not in ("Train", "Test"):
            raise ValueError(f"mode must be 'Train' or 'Test', got {mode!r}")

        p_vocab, decoder_states = self._decode(
            decoder_input, decoder_init_state, encoder_output
        )
        if mode == "Train":
            return p_vocab
        return p_vocab, decoder_states

Initialize encoder and decoder

In [0]:
embedding_dim, hidden_units = 128, 256

# The embedding tables get len(vocab) + 1 rows: ids in the padded sequences run
# from 0 (padding) up to len(vocab) (the end token), inclusive.
encoder, decoder = (
    model_cls(len(vocab) + 1, embedding_dim, hidden_units)
    for model_cls in (Encoder, Decoder)
)

Define optimizer and loss function

In [0]:
optimizer = tf.keras.optimizers.Adam()

def masked_nll(p_vocab, target):
    """Negative log-likelihood with padded positions zeroed out.

    Args:
        p_vocab: log-probability assigned to each target token.
        target: target token ids with the same shape; id 0 marks padding.

    Returns:
        -p_vocab where target != 0, and 0 where target == 0.
    """
    # 1.0 for real tokens, 0.0 for padding, in the dtype of the loss.
    keep = tf.cast(tf.math.not_equal(target, 0), dtype=p_vocab.dtype)
    return -p_vocab * keep

Define a function for performing one training step (one batch)

In [0]:
@tf.function
def train_step(encoder_input, decoder_target):
    """Run one teacher-forced training step on a batch and update the weights.

    Uses the notebook-level `encoder`, `decoder`, `optimizer` and `batch_size`.

    Args:
        encoder_input: source token ids, (batch_size, source_length).
        decoder_target: target token ids, (batch_size, target_length),
            zero-padded on the right.

    Returns:
        Scalar tensor: mean over the batch of the per-example negative
        log-likelihood divided by the number of predicted (non-padding) tokens.
    """
    loss = tf.zeros(batch_size)
    with tf.GradientTape() as tape:
        # Encode the source, starting from all-zero LSTM states
        # (four tensors: hidden/cell state for each direction).
        encoder_init_states = [tf.zeros((batch_size, encoder.hidden_units)) for _ in range(4)]
        encoder_output, encoder_states = encoder(encoder_input, encoder_init_states)
        # The decoder starts from the encoder's forward states.
        decoder_state = encoder_states

        # Teacher forcing: feed target token t, score target token t+1.
        for t in range(decoder_target.shape[1] - 1):
            decoder_input_t = decoder_target[:, t]
            decoder_target_t = decoder_target[:, t + 1]
            # Carry the LSTM state from one step to the next, exactly as the
            # beam search does at inference time. mode="Test" is requested only
            # because it is the mode that also returns the updated states; the
            # computation itself is the same in both modes.
            p_vocab, decoder_state = decoder(
                tf.expand_dims(decoder_input_t, 1),
                decoder_state,
                encoder_output,
                mode="Test",
            )
            # Log-probability of the gold token for every example in one op:
            # (batch_size, vocab_size) gathered with (batch_size,) -> (batch_size,)
            p_vocab_target = tf.gather(p_vocab, decoder_target_t, batch_dims=1)
            # Accumulate the per-example loss; padded targets contribute zero.
            loss += masked_nll(p_vocab_target, decoder_target_t)

        # Number of tokens actually predicted per example: the non-padding
        # entries of the target excluding position 0 (the start token is only
        # ever an input, never a prediction target).
        predicted_mask = tf.cast(tf.math.not_equal(decoder_target[:, 1:], 0), tf.float32)
        batch_seq_len = tf.reduce_sum(predicted_mask, axis=1)

        # Length-normalised loss averaged over the batch. It is computed inside
        # the tape so that the quantity being reported is also the quantity
        # being optimised.
        batch_loss = tf.reduce_mean(loss / batch_seq_len)

    # Update all trainable variables of both models.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

Training loop

In [0]:
# tqdm.auto picks the notebook widget when available and falls back to the
# console bar otherwise; it replaces the deprecated `tqdm_notebook` alias.
from tqdm.auto import tqdm

epochs = 5

# Running mean of the batch losses within the current epoch.
epoch_loss = tf.keras.metrics.Mean()
with tqdm(total=epochs) as epoch_progress:
    for epoch in range(epochs):
        epoch_loss.reset_states()

        with tqdm(total=len(body_seqs) // batch_size) as batch_progress:
            for batch, (encoder_input, decoder_target) in enumerate(dataset_tensor):
                batch_loss = train_step(encoder_input, decoder_target)
                epoch_loss(batch_loss)

                # Refresh the bar text only every 10th batch.
                if (batch % 10) == 0:
                    batch_progress.set_description(f'Epoch {epoch + 1}')
                    batch_progress.set_postfix(Batch=batch, Loss=batch_loss.numpy())
                batch_progress.update()

        epoch_progress.set_description(f'Epoch {epoch + 1}')
        # Convert to a plain number so the bar shows the value rather than the
        # string representation of a tensor.
        epoch_progress.set_postfix(Loss=epoch_loss.result().numpy())
        epoch_progress.update()

In [0]:
def best_k_candidates(prev, k, encoder_output):
    """Expand one beam hypothesis by its k most probable next tokens.

    Uses the notebook-level `decoder`. The hypothesis is decoded with a batch
    size of one.

    Args:
        prev: hypothesis `[token_ids, score, decoder_states]`, where token_ids
            is a list of ints, score the accumulated log-probability and
            decoder_states the LSTM states after the last token.
        k: number of continuations to return; values <= 0 yield an empty list
            and values above the vocabulary size are clamped to it.
        encoder_output: encoder outputs for the source sequence.

    Returns:
        List of up to k new hypotheses in the same 3-element format, ordered by
        descending next-token log-probability. `prev` is not modified.
    """
    if k <= 0:
        return []

    # Feed the last token of the hypothesis as a (1, 1) batch.
    last_token = tf.expand_dims(tf.expand_dims(prev[0][-1], 0), 1)
    p_vocab, decoder_states = decoder(last_token, prev[2], encoder_output, mode="Test")

    # Flatten (1, vocab_size) -> (vocab_size,) and pick the k best tokens
    # directly instead of ranking and scanning the entire vocabulary.
    log_probs = tf.reshape(p_vocab, [-1])
    k = min(k, int(log_probs.shape[0]))
    top_scores, top_tokens = tf.math.top_k(log_probs, k=k)

    return [
        [prev[0] + [int(token)], score + prev[1], decoder_states]
        for token, score in zip(top_tokens.numpy(), top_scores.numpy())
    ]


In [0]:
def best_k_of_k2(best_k, k, completed, encoder_output):
  """Advance the beam by one step.

  Every hypothesis in `best_k` is expanded with `best_k_candidates`, the k
  highest-scoring expansions are kept, and those that end with the end token
  (id `len(vocab)`) are moved to `completed`.

  Args:
    best_k: current open hypotheses, each `[token_ids, score, decoder_states]`.
    k: current beam width.
    completed: list of finished hypotheses; extended in place and also returned.
    encoder_output: encoder outputs for the source sequence.

  Returns:
    (results, k, completed): the hypotheses that are still open, the beam width
    reduced by the number of hypotheses finished in this step, and the
    completed list.
  """
  candidates = []
  for hypo in best_k:
    candidates.extend(best_k_candidates(hypo, k, encoder_output))
  # Highest accumulated log-probability first.
  candidates.sort(key=lambda cand: cand[1], reverse=True)

  # Split the top-k into finished and open hypotheses by building a new list.
  # The earlier version removed items from the list it was iterating over,
  # which skips the element after each removal. max(k, 0) keeps a negative k
  # from slicing from the end of the list.
  results = []
  for cand in candidates[:max(k, 0)]:
    if cand[0][-1] == len(vocab):
      completed.append(cand)
      k -= 1
    else:
      results.append(cand)
  return results, k, completed

In [0]:
def val_step(encoder_input, beam_width, max_steps=10):
    """Decode one source sequence with beam search.

    Uses the notebook-level `encoder`, `vocab`, `best_k_candidates` and
    `best_k_of_k2`.

    Args:
        encoder_input: source token ids with a batch dimension of one.
        beam_width: number of hypotheses kept per step; must be at least 1.
        max_steps: maximum number of beam-advance steps performed after the
            initial expansion of the start token (default 10).

    Returns:
        The token-id list of the hypothesis with the best length-normalised
        score. The list begins with the start-token id.

    Raises:
        ValueError: if beam_width is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")

    # Encode the source, starting from all-zero LSTM states.
    encoder_init_states = [tf.zeros((1, encoder.hidden_units)) for _ in range(4)]
    encoder_output, encoder_states = encoder(encoder_input, encoder_init_states)

    # Initial hypothesis: just the start token, score 0, decoder states taken
    # from the encoder.
    prev = [[len(vocab) - 1], 0, encoder_states]

    k = beam_width
    completed = []
    best_k = best_k_candidates(prev, k, encoder_output)
    for _ in range(max_steps):
        best_k, k, completed = best_k_of_k2(best_k, k, completed, encoder_output)
        # best_k_of_k2 lowers k by one for every finished hypothesis, so the
        # beam is exhausted once k reaches zero. (Comparing len(completed)
        # with the already-reduced k does not express that condition.)
        if k <= 0:
            break

    # Nothing reached the end token: fall back to the open hypotheses.
    if not completed:
        completed = best_k

    # Length-normalise the scores so longer outputs are not penalised simply
    # for accumulating more log-probability terms.
    for hypo in completed:
        hypo[1] /= len(hypo[0])

    return max(completed, key=lambda hypo: hypo[1])[0]

In [0]:
from tqdm.auto import tqdm

k = 5           # beam width
n_examples = 6  # number of validation examples to decode and print

with tqdm(total=n_examples) as batch_progress:
    for i, (encoder_input, decoder_target) in enumerate(validation_tensor.take(n_examples)):
        # Reconstruct the texts from the token ids of the example actually
        # being decoded (padding id 0 skipped), so the printed source/target
        # always match the prediction regardless of dataset ordering.
        source_text = " ".join(vocab[int(t)] for t in encoder_input.numpy()[0] if t != 0)
        target_text = " ".join(vocab[int(t)] for t in decoder_target.numpy()[0] if t != 0)
        print(i, " : ", source_text, "\n", target_text)

        pred = val_step(encoder_input, k)
        pred = [vocab[token] for token in pred]
        print(str(pred))
        batch_progress.update()