# FairSeq

Using the fairseq library to help with pre-processing and training Seq2Seq models.

In [1]:
!pip install fairseq



In [2]:
import torch.nn as nn
from fairseq import utils
from fairseq.models import FairseqEncoder

## Encoder

We will use a GRU to encode the input sentence; its final hidden state is the context provided to the decoder.

In [3]:
class SimpleGRUEncoder(FairseqEncoder):
    """Single-layer, unidirectional GRU encoder.

    The source tokens are embedded, passed through dropout and fed to a GRU.
    The GRU's final hidden state is returned as the context for the decoder.

    Args:
        args: parsed arguments; ``args.left_pad_source`` is read in
            ``forward()`` to decide whether the padding must be flipped.
        dictionary: source dictionary; ``len(dictionary)`` sizes the
            embedding table and ``dictionary.pad()`` gives the padding index.
        embed_dim (int): dimensionality of the token embeddings.
        hidden_dim (int): dimensionality of the GRU hidden state.
        dropout (float): dropout probability applied to the embeddings.
    """

    def __init__(
        self, args, dictionary, embed_dim=128, hidden_dim=128, dropout=0.1,
    ):
        super().__init__(dictionary)
        self.args = args

        # Our encoder will embed the inputs before feeding them to the GRU.
        self.embed_tokens = nn.Embedding(
            num_embeddings=len(dictionary),
            embedding_dim=embed_dim,
            padding_idx=dictionary.pad(),
        )
        self.dropout = nn.Dropout(p=dropout)

        # We'll use a single-layer, unidirectional GRU for simplicity.
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
        )

    def forward(self, src_tokens, src_lengths):
        """Encode a batch of source sentences.

        Args:
            src_tokens (LongTensor): source tokens of shape `(batch, src_len)`
            src_lengths (LongTensor): unpadded length of each source sentence,
                of shape `(batch)`

        Returns:
            dict: ``{'final_hidden': Tensor}`` where the tensor is the GRU's
            final hidden state of shape `(batch, hidden_dim)`
        """
        # The inputs to the ``forward()`` function are determined by the
        # Task, and in particular the ``'net_input'`` key in each
        # mini-batch. We discuss Tasks in the next tutorial, but for now just
        # know that *src_tokens* has shape `(batch, src_len)` and *src_lengths*
        # has shape `(batch)`.

        # Note that the source is typically padded on the left. This can be
        # configured by adding the `--left-pad-source "False"` command-line
        # argument, but here we'll make the Encoder handle either kind of
        # padding by converting everything to be right-padded.
        if self.args.left_pad_source:
            # Convert left-padding to right-padding.
            src_tokens = utils.convert_padding_direction(
                src_tokens,
                padding_idx=self.dictionary.pad(),
                left_to_right=True
            )

        # Embed the source.
        x = self.embed_tokens(src_tokens)

        # Apply dropout.
        x = self.dropout(x)

        # ``pack_padded_sequence()`` requires the lengths to live on the CPU,
        # even when the tokens are on the GPU. Anything that is not a tensor
        # (e.g. a plain list of ints) is passed through unchanged.
        lengths = src_lengths.cpu() if hasattr(src_lengths, 'cpu') else src_lengths

        # Pack the sequence into a PackedSequence object to feed to the GRU.
        x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)

        # Get the output from the GRU.
        _outputs, final_hidden = self.gru(x)

        # Return the Encoder's output. This can be any object and will be
        # passed directly to the Decoder.
        return {
            # this will have shape `(bsz, hidden_dim)`
            'final_hidden': final_hidden.squeeze(0),
        }

    # Encoders are required to implement this method so that we can rearrange
    # the order of the batch elements during inference (e.g., beam search).
    def reorder_encoder_out(self, encoder_out, new_order):
        """
        Reorder encoder output according to `new_order`.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            `encoder_out` rearranged according to `new_order`
        """
        final_hidden = encoder_out['final_hidden']
        return {
            'final_hidden': final_hidden.index_select(0, new_order),
        }

## Decoder

We now build the decoder, which takes the previous output tokens together with the encoder's context and generates the output sequence.

In [4]:
import torch
from fairseq.models import FairseqDecoder

class SimpleGRUDecoder(FairseqDecoder):
    """Single-layer GRU decoder conditioned on the encoder's final hidden state.

    The encoder's final hidden state is used twice: it is concatenated to
    every embedded target token, and it is the initial hidden state of the
    GRU. The second use requires ``encoder_hidden_dim == hidden_dim``.

    Args:
        dictionary: target dictionary; ``len(dictionary)`` sizes the embedding
            table and the output projection, ``dictionary.pad()`` gives the
            padding index.
        encoder_hidden_dim (int): dimensionality of the encoder hidden state.
        embed_dim (int): dimensionality of the token embeddings.
        hidden_dim (int): dimensionality of the GRU hidden state.
        dropout (float): dropout probability applied to the embeddings.

    Raises:
        ValueError: if ``encoder_hidden_dim`` differs from ``hidden_dim``.
    """

    def __init__(
        self, dictionary, encoder_hidden_dim=128, embed_dim=128, hidden_dim=128,
        dropout=0.1,
    ):
        super().__init__(dictionary)

        # The encoder state is fed to the GRU as its initial hidden state, so
        # the two sizes have to agree. Fail here with a clear message rather
        # than with a shape error in the first forward pass.
        if encoder_hidden_dim != hidden_dim:
            raise ValueError(
                'encoder_hidden_dim ({}) must equal hidden_dim ({}) because '
                'the encoder state initializes the decoder GRU'.format(
                    encoder_hidden_dim, hidden_dim
                )
            )

        # Our decoder will embed the inputs before feeding them to the GRU.
        self.embed_tokens = nn.Embedding(
            num_embeddings=len(dictionary),
            embedding_dim=embed_dim,
            padding_idx=dictionary.pad(),
        )
        self.dropout = nn.Dropout(p=dropout)

        self.gru = nn.GRU(
            # For the first layer we'll concatenate the Encoder's final hidden
            # state with the embedded target tokens.
            input_size=encoder_hidden_dim + embed_dim,
            hidden_size=hidden_dim,
            num_layers=1,
        )

        # Define the output projection.
        self.output_projection = nn.Linear(hidden_dim, len(dictionary))

    # During training Decoders are expected to take the entire target sequence
    # (shifted right by one position) and produce logits over the vocabulary.
    # The *prev_output_tokens* tensor begins with the end-of-sentence symbol,
    # ``dictionary.eos()``, followed by the target sequence.
    def forward(self, prev_output_tokens, encoder_out):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (dict): output from the encoder; its
                ``'final_hidden'`` entry of shape `(batch, encoder_hidden_dim)`
                is read here

        Returns:
            tuple:
                - the logits over the vocabulary, of shape
                  `(batch, tgt_len, vocab)`
                - ``None``, since this decoder computes no attention weights
        """
        bsz, tgt_len = prev_output_tokens.size()

        # Extract the final hidden state from the Encoder.
        final_encoder_hidden = encoder_out['final_hidden']

        # Embed the target sequence, which has been shifted right by one
        # position and now starts with the end-of-sentence symbol.
        x = self.embed_tokens(prev_output_tokens)

        # Apply dropout.
        x = self.dropout(x)

        # Concatenate the Encoder's final hidden state to *every* embedded
        # target token.
        x = torch.cat(
            [x, final_encoder_hidden.unsqueeze(1).expand(bsz, tgt_len, -1)],
            dim=2,
        )

        # Using PackedSequence objects in the Decoder is harder than in the
        # Encoder, since the targets are not sorted in descending length order,
        # which is a requirement of ``pack_padded_sequence()``. Instead we'll
        # feed nn.GRU directly.
        # Unlike an LSTM, a GRU has no cell state: its initial state is a
        # single tensor of shape `(num_layers, bsz, hidden)`, not a tuple.
        initial_hidden = final_encoder_hidden.unsqueeze(0)
        output, _ = self.gru(
            x.transpose(0, 1),  # convert to shape `(tgt_len, bsz, dim)`
            initial_hidden,
        )
        x = output.transpose(0, 1)  # convert to shape `(bsz, tgt_len, hidden)`

        # Project the outputs to the size of the vocabulary.
        x = self.output_projection(x)

        # Return the logits and ``None`` for the attention weights
        return x, None

## Register The Model

In [5]:
from fairseq.models import FairseqEncoderDecoderModel, register_model

# Note: the register_model "decorator" should immediately precede the
# definition of the Model class.

@register_model('simple_gru')
class SimpleGRUModel(FairseqEncoderDecoderModel):
    """Encoder-decoder model that pairs ``SimpleGRUEncoder`` with
    ``SimpleGRUDecoder``; registered under the name ``'simple_gru'``."""

    @staticmethod
    def add_args(parser):
        """Add the model-specific command-line options to *parser*.

        The options configure dropout and the dimensionality of the
        embeddings and hidden states, for the encoder and the decoder.
        """
        # One (flag, keyword arguments) entry per option, in the order in
        # which the options are registered.
        option_table = (
            ('--encoder-embed-dim', dict(
                type=int, metavar='N',
                help='dimensionality of the encoder embeddings',
            )),
            ('--encoder-hidden-dim', dict(
                type=int, metavar='N',
                help='dimensionality of the encoder hidden state',
            )),
            ('--encoder-dropout', dict(
                type=float, default=0.1,
                help='encoder dropout probability',
            )),
            ('--decoder-embed-dim', dict(
                type=int, metavar='N',
                help='dimensionality of the decoder embeddings',
            )),
            ('--decoder-hidden-dim', dict(
                type=int, metavar='N',
                help='dimensionality of the decoder hidden state',
            )),
            ('--decoder-dropout', dict(
                type=float, default=0.1,
                help='decoder dropout probability',
            )),
        )
        for flag, options in option_table:
            parser.add_argument(flag, **options)

    @classmethod
    def build_model(cls, args, task):
        """Build and return a ``SimpleGRUModel`` from *args* and *task*.

        Fairseq creates models through this hook, so the returned instance
        may be of a different type than the class it was called on. Here a
        ``SimpleGRUModel`` is always returned.

        Args:
            args: parsed arguments holding the ``encoder_*`` and
                ``decoder_*`` settings read below
            task: task providing ``source_dictionary`` and
                ``target_dictionary``

        Returns:
            the assembled model, which is also printed to stdout
        """
        # Source side: embeds and encodes the input sentence.
        encoder_kwargs = dict(
            args=args,
            dictionary=task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            hidden_dim=args.encoder_hidden_dim,
            dropout=args.encoder_dropout,
        )
        source_side = SimpleGRUEncoder(**encoder_kwargs)

        # Target side: consumes the encoder's hidden state.
        decoder_kwargs = dict(
            dictionary=task.target_dictionary,
            encoder_hidden_dim=args.encoder_hidden_dim,
            embed_dim=args.decoder_embed_dim,
            hidden_dim=args.decoder_hidden_dim,
            dropout=args.decoder_dropout,
        )
        target_side = SimpleGRUDecoder(**decoder_kwargs)

        model = SimpleGRUModel(source_side, target_side)

        # Show the assembled architecture.
        print(model)

        return model

## Register The Architecture

In [6]:
from fairseq.models import register_model_architecture

# The first argument to ``register_model_architecture()`` should be the name
# of the model we registered above (i.e., 'simple_gru'). The function we
# register here should take a single argument *args* and modify it in-place
# to match the desired architecture.

@register_model_architecture('simple_gru', 'tutorial_simple_gru')
def tutorial_simple_gru(args):
    """Fill in the ``tutorial_simple_gru`` architecture defaults on *args*.

    *args* is modified in place. A dimension that is already set on *args*
    (for example from the command line) is kept as is; a missing one is set
    to 256.
    """
    default_dim = 256
    dimension_options = (
        'encoder_embed_dim',
        'encoder_hidden_dim',
        'decoder_embed_dim',
        'decoder_hidden_dim',
    )
    for option in dimension_options:
        # ``getattr()`` with a default keeps an existing value and only falls
        # back to the architecture default when the attribute is absent.
        setattr(args, option, getattr(args, option, default_dim))

# Download pre-cleaned datasets

In [7]:
!bash data/prepare-iwslt14.sh

Cloning Moses github repository (for tokenization scripts)...
fatal: destination path 'mosesdecoder' already exists and is not an empty directory.
Cloning Subword NMT repository (for BPE pre-processing)...
fatal: destination path 'subword-nmt' already exists and is not an empty directory.
Downloading data from http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz...
--2022-01-23 16:10:03--  http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19982877 (19M) [application/x-tar]
Saving to: ‘de-en.tgz.1’


2022-01-23 16:10:07 (5,31 MB/s) - ‘de-en.tgz.1’ saved [19982877/19982877]

Data successfully downloaded.
x de-en/
x de-en/IWSLT14.TED.dev2010.de-en.de.xml
x de-en/IWSLT14.TED.dev2010.de-en.en.xml
x de-en/IWSLT14.TED.tst2010.d

In [9]:
!fairseq-preprocess --source-lang de --target-lang en \
    --trainpref iwslt14.tokenized.de-en/train --validpref iwslt14.tokenized.de-en/valid --testpref iwslt14.tokenized.de-en/test \
    --destdir data-bin/iwslt14.tokenized.de-en \
    --workers 8

2022-01-23 16:11:03 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, tensorboard_logdir=None, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, min_loss_scale=0.0001, threshold_loss_scale=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, checkpoint_suffix='', checkpoint_shard_count=1, quantization_config_path=None, profile=False, criterion='cross_entropy', tokenizer=None, bpe=None, optimizer=None, lr_scheduler='fixed', scoring='bleu', task='translation', source_lang='de', target_lang='en', trainpref='iwslt14.tokenized.de-en/train', validpref='iwslt14.tokenized.de-en/valid', testpref='iwslt14.tokenized.de-en/test', align_suffix=None, destdir='data-bin/iwslt14.tokenized.de-en', thresholdtgt=0, thresholdsrc=0, tgtdict=None, srcdict=None

## Train the model

In [11]:
!fairseq-train iwslt14.tokenized.de-en --arch tutorial_simple_gru --optimizer adam --lr 0.005 --max-tokens 1200

usage: fairseq-train [-h] [--no-progress-bar] [--log-interval LOG_INTERVAL]
                     [--log-format LOG_FORMAT]
                     [--tensorboard-logdir TENSORBOARD_LOGDIR] [--seed SEED]
                     [--cpu] [--tpu] [--bf16] [--memory-efficient-bf16]
                     [--fp16] [--memory-efficient-fp16]
                     [--fp16-no-flatten-grads]
                     [--fp16-init-scale FP16_INIT_SCALE]
                     [--fp16-scale-window FP16_SCALE_WINDOW]
                     [--fp16-scale-tolerance FP16_SCALE_TOLERANCE]
                     [--min-loss-scale MIN_LOSS_SCALE]
                     [--threshold-loss-scale THRESHOLD_LOSS_SCALE]
                     [--user-dir USER_DIR]
                     [--empty-cache-freq EMPTY_CACHE_FREQ]
                     [--all-gather-list-size ALL_GATHER_LIST_SIZE]
                     [--model-parallel-size MODEL_PARALLEL_SIZE]
                     [--checkpoint-suffix CHECKPOINT_SUFFIX]
       