<a href="https://colab.research.google.com/github/kazemnejad/addons/blob/seq2seq_example_draft/docs/tutorials/seq2seq_beam_decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2019 The TensorFlow Authors.

In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Seq2seq: Neural Machine Translation with Beam search

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/kazemnejad/addons/blob/seq2seq_example/examples/seq2seq_beam_decoder.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/kazemnejad/addons/blob/seq2seq_example/examples/seq2seq_beam_decoder.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

[Update button links]

## Overview

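This tutorial builds an attention-based sequence-to-sequence (seq2seq) model for neural machine translation with the `tfa.seq2seq` API from TensorFlow Addons. It loads a small slice of the ParaCrawl corpus through TensorFlow Datasets, builds a word-level vocabulary for each language, and trains a multi-layer LSTM encoder–decoder with Luong attention using the built-in Keras training loop. The decoder cell is kept as a separate attribute so that the same cell can be reused by a beam-search decoder at inference time. Familiarity with Keras model subclassing and `tf.data` is assumed.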

## Setup

In [0]:
# Quiet pip install that skips dependency resolution (`--no-deps`).
# NOTE(review): `-e` makes pip treat `tfa-nightly` as a local project path or
# VCS URL rather than a PyPI package name — confirm this is intended.
! pip install -q  --no-deps -e tfa-nightly

In [0]:
# Presumably installs the nightly TensorFlow Addons build that provides the
# `tensorflow_addons` module imported below.
# NOTE(review): verify that `tf-addons-nightly` is the correct distribution
# name and consider pinning a version — TODO confirm.
! pip install tf-addons-nightly

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [0]:
try:
  %tensorflow_version 2.x
except:
  pass

from collections import Counter

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds

from tensorflow.keras.layers import Embedding
from tensorflow.keras import backend as K

from tensorflow_datasets.core.features.text import TokenTextEncoder

In [0]:
# Recursively copy the `tensorflow_addons` directory from the Bazel output
# folder into the repository root.
# NOTE(review): the absolute `/content/addons` paths presumably assume a Colab
# checkout that has already been built with Bazel — confirm, or remove this
# cell if the pip-installed package is sufficient.
! cp -r /content/addons/bazel-bin/tensorflow_addons /content/addons/

In [0]:
# Hyperparameters shared by the data pipeline and the model.
VOCAB_SIZE = 10_000  # most-frequent tokens kept per language
BATCH_SIZE = 64      # examples per padded batch
EMBED_DIM = 100      # output dimension of the embedding layers
HIDDEN_DIM = 128     # units per LSTM cell and depth of the attention mechanism
NUM_LAYERS = 2       # LSTM cells stacked in the encoder and in the decoder

## Prepare the Data

Download and prepare a toy machine translation dataset

In [0]:
# Only a small part of the TRAIN split is used: its first 5% for training and
# its last 1% for testing.
train_split = tfds.Split.TRAIN.subsplit(tfds.percent[:5])
test_split = tfds.Split.TRAIN.subsplit(tfds.percent[-1:])

loaded_splits, info = tfds.load(
    name="para_crawl/enmt_plain_text",
    split=(train_split, test_split),
    with_info=True)
dataset_train, dataset_test = loaded_splits

# Remember the language pair for later use. The dataset's supervised key pair
# is taken in swapped order: its second key becomes the source language and
# its first key the target language.
lang_tgt, lang_src = info.supervised_keys
print(f"lang_src, lang_tgt = {lang_src}, {lang_tgt}")

In [0]:
# Create the tokenizer. The special tokens are registered as reserved tokens
# because this same tokenizer instance is handed to the `TokenTextEncoder`s
# below, which replace words with their vocabulary ids.
tokenizer = tfds.features.text.Tokenizer(reserved_tokens=['<s>', '</s>'])

# Count token frequencies for both languages over the training split.
# Both encoders below are created with `lowercase=True`, so the text is
# lowercased before counting. Otherwise case variants of one word
# (e.g. "The" / "the") would each occupy one of the VOCAB_SIZE slots.
token_counts_src, token_counts_tgt = Counter(), Counter()
for example in dataset_train:
    src_text = tf.compat.as_text(example[lang_src].numpy()).lower()
    tgt_text = tf.compat.as_text(example[lang_tgt].numpy()).lower()
    token_counts_src.update(tokenizer.tokenize(src_text))
    token_counts_tgt.update(tokenizer.tokenize(tgt_text))

# Cut the vocabularies down to the VOCAB_SIZE most frequent tokens. The target
# vocabulary additionally starts with the two special tokens.
vocab_src = [w for w, _ in token_counts_src.most_common(VOCAB_SIZE)]
vocab_tgt = ['<s>', '</s>'] + [
    w for w, _ in token_counts_tgt.most_common(VOCAB_SIZE)]

# To convert words to their corresponding ids, create a separate
# `TokenTextEncoder` instance for each language.
tok_encoder_src = TokenTextEncoder(
    vocab_src, lowercase=True, tokenizer=tokenizer)
tok_encoder_tgt = TokenTextEncoder(
    vocab_tgt, lowercase=True, tokenizer=tokenizer)

As for the preprocessing stage, the following steps should be done:
- Replace each word with its id from the vocabulary table.
- `<s>` and `</s>` should be added to the beginning and end of each target sentence, respectively.
- Pad all sentences within each batch to the same length.
- Convert the labels to the one-hot format.
- Compute the `sample_weights` mask according to the padded batch to zero out the effect of padding values in the loss calculation.

In [0]:
def _encode_str(src, tgt):
    """Convert one (source, target) sentence pair into word-id sequences.

    Args:
        src: eager string tensor holding the source sentence.
        tgt: eager string tensor holding the target sentence.

    Returns:
        A triple `(source_ids, decoder_input_ids, decoder_output_ids)`. The
        decoder input is the target sentence prefixed with `<s>`, and the
        decoder output is the target sentence followed by `</s>`.
    """
    tgt_bytes = tgt.numpy()
    source_ids = tok_encoder_src.encode(src.numpy())
    decoder_input_ids = tok_encoder_tgt.encode(b'<s> ' + tgt_bytes)
    decoder_output_ids = tok_encoder_tgt.encode(tgt_bytes + b' </s>')
    return source_ids, decoder_input_ids, decoder_output_ids


def _encode_map_fn(src, tgt):
    """Wrap `_encode_str` in `tf.py_function` so it can be used in `Dataset.map`.

    Returns three `tf.int64` tensors: the source ids, the decoder input ids
    and the decoder output ids.
    """
    output_dtypes = (tf.int64,) * 3
    return tf.py_function(_encode_str, inp=(src, tgt), Tout=output_dtypes)


def _append_sample_weights(src, tgt_inp, tgt_out):
    """Regroup a batch as `(inputs, labels, sample_weights)`.

    The weights are 1.0 where the label id is non-zero and 0.0 where it is
    zero (the padding value used when batching), so padded positions do not
    contribute to the loss.
    """
    is_not_padding = tf.not_equal(tgt_out, 0)
    sample_weights = tf.cast(is_not_padding, tf.float32)
    model_inputs = (src, tgt_inp)
    return model_inputs, tgt_out, sample_weights


def _convert_labels_to_one_hot(inputs, labels, sample_weights):
    """Replace the integer labels with one-hot vectors over the target vocabulary.

    `inputs` and `sample_weights` are passed through unchanged.
    """
    num_classes = tok_encoder_tgt.vocab_size
    one_hot_labels = tf.one_hot(labels, depth=num_classes)
    return inputs, one_hot_labels, sample_weights


def prepare_dataset(dataset):
    """Turn a raw translation dataset into batches ready for Keras training.

    Args:
        dataset: a `tf.data.Dataset` whose elements are dicts keyed by
            language (indexed here with `lang_src` and `lang_tgt`).

    Returns:
        A dataset of `(inputs, labels, sample_weights)` triples, where
        `inputs` is itself a `(source, shifted_target)` tuple:
            inputs:         ([batch_size, max_seq_len], [batch_size, max_seq_len])
            labels:         [batch_size, max_seq_len, vocab_size]  (one-hot)
            sample_weights: [batch_size, max_seq_len]
    """
    # Every sequence is padded with id 0 up to the longest one in its batch.
    pad_id = tf.constant(0, dtype=tf.int64)

    return (
        dataset
        # Pick the (source, target) sentence pair out of each example.
        .map(lambda example: (example[lang_src], example[lang_tgt]))
        # shape: ([None], [None], [None]), dtype: (tf.int64, tf.int64, tf.int64)
        .map(_encode_map_fn)
        # shape: ([batch_size, None], [batch_size, None], [batch_size, None])
        .padded_batch(
            BATCH_SIZE,
            padded_shapes=([None], [None], [None]),
            padding_values=(pad_id,) * 3)
        # Regroup as (inputs, labels, sample_weights).
        .map(_append_sample_weights)
        # To make the dataset compatible with the Keras built-in training
        # loops, the labels are provided in the one-hot format.
        .map(_convert_labels_to_one_hot)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

And finally, apply those transformations on both splits.

In [0]:
# Run the same preprocessing pipeline over the training and the test split.
ds_train, ds_valid = (
    prepare_dataset(raw_split) for raw_split in (dataset_train, dataset_test))

## Build the Model

We'll use model-subclassing to define the model. 

In [0]:
class Seq2SeqModel(tf.keras.models.Model):
    """Encoder-decoder translation model with Luong attention.

    The constructor only creates the layers. The forward-pass methods are not
    defined in this class body.
    """

    def __init__(self):
        super().__init__()

        def build_stacked_lstm():
            """Return NUM_LAYERS new LSTM cells of HIDDEN_DIM units as one stacked cell."""
            lstm_cells = [
                tf.keras.layers.LSTMCell(HIDDEN_DIM) for _ in range(NUM_LAYERS)
            ]
            return tf.keras.layers.StackedRNNCells(lstm_cells)

        # One embedding table per language. `mask_zero=True` makes the layers
        # produce a mask for id 0, the value used for padding the batches.
        self.src_embedding = tf.keras.layers.Embedding(
            tok_encoder_src.vocab_size, EMBED_DIM, mask_zero=True)
        self.tgt_embedding = tf.keras.layers.Embedding(
            tok_encoder_tgt.vocab_size, EMBED_DIM, mask_zero=True)

        # Multi-layer LSTM encoder that returns both the per-step outputs and
        # its final state.
        self.encoder = tf.keras.layers.RNN(
            build_stacked_lstm(), return_sequences=True, return_state=True)

        # Attention mechanism with HIDDEN_DIM as its depth. Its memory is not
        # set here; that happens when the model is actually invoked.
        self.attn_mch = tfa.seq2seq.LuongAttention(HIDDEN_DIM)

        # The decoder is a multi-layer LSTM as well. A reference to its cell
        # is kept so that the same cell can serve both the training
        # (BasicDecoder) and the inference (BeamSearch) decoders.
        self.decoder_cell = build_stacked_lstm()

        # Wrap the stacked cell with attention.
        self.decoder_cell = tfa.seq2seq.AttentionWrapper(
            cell=self.decoder_cell,
            attention_mechanism=self.attn_mch,
            alignment_history=False)

        # Dense projection from the decoder output to the target vocabulary.
        vocab_proj_layer = tf.keras.layers.Dense(tok_encoder_tgt.vocab_size)

        # Decoder used at training time.
        self.train_decoder = tfa.seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            sampler=tfa.seq2seq.sampler.TrainingSampler(),
            output_layer=vocab_proj_layer)

Use the LSTM encoder to create a representation of the input sentence:

In [0]:
def _encode_input(self, src_words, training):
    embeds = self.src_embedding(src_words)
    mask = self.src_embedding.compute_mask(src_words)
    encoder_outputs, state_h, state_c = self.encoder(embeds, mask=mask, training=training)
    return encoder_outputs, mask, (state_h, state_c)

# Attach the function defined above to the class so it is available as a method.
Seq2SeqModel._encode_input = _encode_input

In [0]:
def _decode(self, encoder_outputs, tgt_input_words, training):
    tgt_input_embeds = self.tgt_embedding(tgt_input_words)
    tgt_input_mask = self.tgt_embedding.compute_mask(tgt_input_words)
    
    enc_hiddens, enc_hiddens_mask, enc_final_state = encoder_outputs
    self.attn_mch(enc_hiddens, memory_mask=enc_hiddens_mask, setup_memory=True)

    decoder_initial_state = self.decoder_cell.get_initial_state(
        tgt_input_embeds)
    decoder_initial_state = decoder_initial_state.clone(
        cell_state=enc_final_state)

    outputs, _, _ = self.train_decoder(
        tgt_input_embeds,
        initial_state=decoder_initial_state,
        training=training,
        mask=tgt_input_mask)

    logits = outputs.rnn_output

    return logits

# Attach the function defined above to the class so it is available as a method.
Seq2SeqModel._decode = _decode

In [0]:
def call(self, inputs, training=None):
    """Forward pass: encode the source words, then decode the target inputs.

    Args:
        self: the model instance.
        inputs: a pair `(src_words, tgt_input_words)`.
        training: forwarded to both the encoding and the decoding step.

    Returns:
        The logits returned by `_decode`.
    """
    source_ids, decoder_input_ids = inputs
    encoded = self._encode_input(source_ids, training)
    return self._decode(encoded, decoder_input_ids, training)

# Install the function defined above as the model's `call` method.
Seq2SeqModel.call = call

## Train the Model

In [0]:
# `ds_train` and `ds_valid` were already built with `prepare_dataset` in the
# data-preparation section, so they are reused here instead of rebuilding
# identical pipelines.
model2 = Seq2SeqModel()
model2.compile(
    optimizer='adam',
    loss=tfa.seq2seq.SequenceLoss(),
    # The dataset supplies one weight per time step, hence "temporal".
    sample_weight_mode="temporal")

# NOTE(review): `steps_per_epoch=100000` is presumably far more batches than
# the 5% training slice provides, and the dataset is not repeated — confirm
# whether the argument should be dropped or the dataset repeated.
model2.fit(ds_train, epochs=1, steps_per_epoch=100000, validation_data=ds_valid)

In [0]:
# Build a small two-layer dense model, adding one layer at a time.
# NOTE(review): `model` is not referenced anywhere else in the visible
# notebook; this looks like a leftover from the notebook template — confirm
# whether the cell can be removed.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(10, activation='relu', input_shape=(None, 5)))
model.add(tf.keras.layers.Dense(3))

## GitHub workflow

* Be consistent about how you save your notebooks, otherwise the JSON diffs are messy.
* This notebook has the "Omit code cell output when saving this notebook" option set. GitHub refuses to diff notebooks with large diffs (inline images).
* [ReviewNB.com](http://reviewnb.com) can help with diffs. This is linked in a comment on a notebook pull request.
* Use the [Open in Colab](https://chrome.google.com/webstore/detail/open-in-colab/iogfkhleblhcpcekbiedikdehleodpjo) extension to open a GitHub notebook in Colab.
* The easiest way to edit a notebook in GitHub is to open it with Colab from the branch you want to edit. Then use File --> Save a copy in GitHub, which will save it back to the branch you opened it from.
* For PRs it's helpful to post a direct Colab link to the PR head: https://colab.research.google.com/github/{USER}/{REPO}/blob/{BRANCH}/{PATH}.ipynb