### Sequence-to-Sequence for Machine Translation with Attention and Bi-directional LSTM - [notebook](https://github.com/fastai/course-v3/blob/master/nbs/dl2/translation.ipynb), [video](https://course18.fast.ai/lessons/lesson11.html), [notes](https://medium.com/@hiromi_suenaga/deep-learning-2-part-2-lesson-11-61477d24dc34)

**Todos**
*   Import Text Translation Data Bundle from app_lib, and Bleu Metrics
*   Try arch with AWD-LSTM
*   Track results in DTR
*   Print out intermediate results, i.e. show the translated text we are outputting, so we can see what the model is really doing internally
*   One Cycle scheduler etc
*   Cleanup cells with Fastai code
*   Make sure App and Arch are clean and as per standard template




### Import KD Libraries

In [None]:
# Load the IPython 'autoreload' extension and set it to mode 2
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
import IPython.core.debugger as db
from pathlib import Path
import pandas as pd
import torch
#import torch.nn.functional as F
from torch import tensor, nn
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# NOTE(review): the two paths below are relative, so they presumably rely on the
# working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/gdrive')
gd_path = 'gdrive/My Drive/Colab Data/fastai-v3'  #change dir to your project folder
gn_path = 'gdrive/My Drive/Colab Notebooks'  #change dir to your project folder

# Put the notebook folder's 'exp' sub-directory on the module search path
# (presumably where the nb_* modules imported below live - confirm)
import sys
sys.path.insert(1, gn_path + '/exp')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
from nb_data import *
from nb_training import *
from nb_optimiser import *

### Build Seq2Seq Architecture

In [None]:
#----------------------------------------------------
# Encoder Module
#----------------------------------------------------
class Seq_Encoder(nn.Module):
  #----------------------------------------------------
  # Layers: Embedding -> input Dropout -> LSTM (bidirectional when n_dirs == 2).
  # The LSTM's final hidden and cell states then pass through a Dropout and a
  # bias-free Linear layer which resizes them for the Decoder.
  #
  #   n_h      - hidden size of the Encoder LSTM
  #   n_layers - number of stacked LSTM layers
  #   n_dirs   - number of directions (2 means bidirectional)
  #   emb      - pre-built Embedding layer, its second weight dimension is the embedding size
  #   inp_p, enc_p, hid_p - dropout probabilities for the embedded input, between
  #                         LSTM layers and for the final states respectively
  #----------------------------------------------------
  def __init__(self, n_h, n_layers, n_dirs, emb, inp_p=0.15, enc_p=0.25, hid_p=0.05):
    super().__init__()

    self.n_h = n_h
    self.n_layers = n_layers
    self.n_dirs = n_dirs

    emb_sz = emb.weight.size(1)
    is_bidir = (n_dirs == 2)

    self.enc_emb = emb
    self.enc_inp_drop = nn.Dropout(p=inp_p)
    self.enc_lstm = nn.LSTM(emb_sz, n_h, n_layers, batch_first=True, bidirectional=is_bidir, dropout=enc_p)
    self.enc_hid_drop = nn.Dropout(p=hid_p)

    # The Linear layer receives the hidden state (h) and the cell state (c) concatenated,
    # each of width 'n_dirs * n_h', hence an input width of 'n_dirs * (n_h + n_h)'.
    # It emits a new h and a new c side by side, each of width 'emb_sz', which is the
    # hidden size the Decoder LSTM uses. The output is never doubled for a
    # bidirectional Encoder because the Decoder LSTM is always unidirectional.
    self.enc_out = nn.Linear(n_dirs * (n_h + n_h), emb_sz + emb_sz, bias=False)

  # ----------------------------
  # Forward pass
  # 'inp' is either the input batch 'xb', or a tuple (xb, yb) when the Teacher Force
  # Callback has packed the targets in during training.
  # Returns (h, c, LSTM outputs for all timesteps, yb) where yb is None if it was not given.
  # ----------------------------
  def forward(self, inp):
    # Targets are only present when the input arrives as a tuple
    xb, yb = inp if isinstance(inp, tuple) else (inp, None)

    # Input has shape (samples, timestep)
    bs, _ = xb.shape

    # Zeroed starting hidden state and cell state
    h0, c0 = self._init_hc(bs)

    # Embed the word ids and apply dropout
    # Result has shape (samples, timestep, embedding size)
    lstm_inp = self.enc_inp_drop(self.enc_emb(xb))

    # Without this flatten, the LSTM generates lots of warnings on each pass
    self.enc_lstm.flatten_parameters()

    # 'lstm_out' holds the last layer's hidden states for every timestep, with
    # shape (samples, timestep, hidden size * num directions).
    # 'h' and 'c' hold the last timestep's hidden and cell state for every layer, with
    # shape (num layers * num directions, samples, hidden size)
    lstm_out, (h, c) = self.enc_lstm(lstm_inp, (h0, c0))

    # Fold the direction axis into the feature axis for both states, then join
    # them along the feature axis so a single Linear layer can resize both
    joined = torch.cat([self._merge_dirs(h, bs), self._merge_dirs(c, bs)], dim=2)
    resized = self.enc_out(self.enc_hid_drop(joined))

    # First half of the features is the new hidden state, second half the new cell state
    h, c = (half.contiguous() for half in resized.chunk(2, dim=2))

    return h, c, lstm_out, yb

  # ----------------------------
  # Reshape an LSTM state from (num layers * num directions, samples, hidden size)
  # to (num layers, samples, num directions * hidden size)
  # ----------------------------
  def _merge_dirs(self, state, bs):
    # Separate layers from directions, as per the layout in the Pytorch LSTM docs
    state = state.view(self.n_layers, self.n_dirs, bs, self.n_h)
    # Move the direction axis next to the hidden axis
    state = state.permute(0, 2, 1, 3).contiguous()
    # Multiply the direction into the last dimension, which is where the Linear
    # layer expects its input features
    return state.view(self.n_layers, bs, self.n_dirs * self.n_h)

  # ----------------------------
  # Initialise the hidden (h) and cell state (c) to zeros
  # ----------------------------
  def _init_hc(self, bs):
    # Borrow the data type and device from the model's first parameter
    ref = next(self.parameters())

    # Both states have shape (num layers * num directions, samples, hidden size)
    shape = (self.n_dirs * self.n_layers, bs, self.n_h)
    return ref.new_zeros(*shape), ref.new_zeros(*shape)

#----------------------------------------------------
# Decoder Module
#----------------------------------------------------
class Seq_Decoder(nn.Module):
  #----------------------------------------------------
  # Build the architecture with an Embedding and LSTM layer, an attention module over
  # the Encoder outputs, and a Dropout followed by a Linear layer that produces the
  # output predictions.
  #
  #   n_layers   - number of stacked Decoder LSTM layers
  #   pad_idx    - vocab index of the padding token, predicting it for every sample ends decoding
  #   emb        - pre-built Embedding layer for the target language
  #   enc_n_h, enc_n_dirs - hidden size and number of directions of the Encoder LSTM
  #   dec_p, drop_p       - dropout between LSTM layers and on the LSTM output
  #   tok_begin_idx       - vocab index of the token fed in at the first timestep
  #   tok_end_idx         - stored but not used by this module
  #   max_len             - maximum number of timesteps to generate
  #
  # !!!!!!!!!!! Drop probs and tok_idx defaults are hardcoded
  #----------------------------------------------------
  def __init__(self, n_layers, pad_idx, emb, enc_n_h, enc_n_dirs, dec_p=0.1, drop_p=0.35, tok_begin_idx=2, tok_end_idx=1, max_len=30):
    super().__init__()

    self.pad_idx, self.tok_begin_idx, self.tok_end_idx = pad_idx, tok_begin_idx, tok_end_idx
    self.max_len = max_len
    # Probability of feeding the target word (rather than the predicted word) as the
    # next input. It is set from outside, by TeacherForceCB.
    self.teacher_force = 0

    # Embedding layer
    self.dec_emb = emb
    vocab_sz, emb_sz = emb.weight.size()

    # LSTM layer
    # We make the hidden size the same as the embedding size. The LSTM input is the
    # embedded word concatenated with the attention context vector.
    att_ctx_sz = enc_n_dirs * enc_n_h
    n_h = emb_sz
    self.dec_lstm = nn.LSTM(emb_sz + att_ctx_sz, n_h, n_layers, batch_first=True, dropout=dec_p)
    self.dec_out_drop = nn.Dropout(p=drop_p)

    # Output Linear layer, its weights are tied to the embedding weights
    self.dec_logit = nn.Linear(n_h, vocab_sz)
    self.dec_logit.weight.data = self.dec_emb.weight.data

    # Attention module
    self.enc_aln = nn.Linear(enc_n_dirs * enc_n_h, n_h, bias=False)
    self.dec_hid_aln = nn.Linear(n_h, n_h)
    self.param_aln = self.init_aln(n_h)

  # ----------------------------
  # Create a randomly initialised parameter of the given size, scaled down by the
  # square root of its first dimension
  # ----------------------------
  def init_aln(self, *sz):
    return nn.Parameter(torch.randn(sz)/math.sqrt(sz[0]))

  # ----------------------------
  # Build the Decoder LSTM input for one timestep: the embedded input word
  # concatenated with an attention context vector over the Encoder outputs.
  #
  #   enc_out - Encoder outputs, shape (samples, timestep, enc hidden size * enc directions)
  #   h       - current Decoder hidden state, only the last layer is used
  #   dec_inp - word ids for this timestep, shape (samples, )
  #
  # Returns a tensor of shape (samples, 1, embedding size + context size)
  # ----------------------------
  def calc_att(self, enc_out, h, dec_inp):
    # Calculate Alignment Scores - Encoder outputs and Decoder Hidden state are
    # put through a Linear layer. Then add them and do a tanh. Then dot product
    # with an alignment vector 'param_aln'.
    enc_aln = self.enc_aln(enc_out)
    hid_aln = self.dec_hid_aln(h[-1])
    u = torch.tanh(enc_aln + hid_aln.unsqueeze(1))
    aln_scores = u @ self.param_aln

    # Softmax alignment scores over the Encoder timesteps to get Attention weights
    att_wgts = F.softmax(aln_scores, dim=1)

    # Multiply the Attention weights with encoder outputs to get the context vector
    att_ctx = (att_wgts.unsqueeze(-1) * enc_out).sum(1)

    # Concatenate context vector with embedded input word, then insert a timestep
    # dimension of 1 since we always process a single timestep
    emb_val = self.dec_emb(dec_inp)
    att_dec_inp = torch.cat([emb_val, att_ctx], 1)
    return att_dec_inp.unsqueeze(1)

  # ----------------------------
  # Process the forward pass
  #
  # 'inp' is the tuple (h, c, enc_out, yb) produced by the Encoder. 'yb' holds the
  # targets during training (used for teacher forcing) and is None otherwise.
  #
  # Returns the vocab scores for every generated timestep, stacked into a tensor of
  # shape (samples, timesteps, vocab size)
  # ----------------------------
  def forward(self, inp):
    h, c, enc_out, yb = inp
    _, bs, _ = h.size()

    # Use the device from any (we take the first one) parameter of our model to
    # initialise the LSTM's input word 'dec_inp'. It starts as a single word
    # (viz. the starting token) per sample and has shape (samples, )
    first_param = next(self.parameters())
    dec_inp = first_param.new_full((bs, ), self.tok_begin_idx, dtype=torch.long)

    # We explicitly go through a loop one word at a time (ie. single LSTM timestep at
    # a time), rather than use the LSTM's built-in loop. We do this because we want
    # to feed the output word of one timestep as the input word of the next timestep
    sentence = []
    for i in range(self.max_len):
      att_dec_inp = self.calc_att(enc_out, h, dec_inp)

      # Without this flatten, the LSTM generates lots of warnings on each pass
      self.dec_lstm.flatten_parameters()

      # 'out' has shape (samples, 1, hidden size)
      # 'h' and 'c' have shape (num layers, samples, hidden size)
      out, (h, c) = self.dec_lstm(att_dec_inp, (h, c))

      # Take the only timestep of the output, giving shape (samples, hidden size)
      drop_val = self.dec_out_drop(out[:, 0, ...])

      # Linear layer generates logit scores for each word in the vocab
      # 'word_logits' shape is (samples, vocab size)
      word_logits = self.dec_logit(drop_val)

      # Keep the scores for the full vocab at this timestep, not just the best word
      sentence.append(word_logits)

      # Get the vocab index of the word with the highest score. This is the
      # predicted word at this position in the sequence
      word_idx = word_logits.argmax(dim=1)

      if (word_idx == self.pad_idx).all():
        # Every sample just predicted a padding token, which means we've
        # reached the end of the sequence
        break

      if ((yb is not None) and (random.random() < self.teacher_force)):
        if (i >= yb.size(1)):
          # We have run out of target words
          break
        # With teacher forcing, we feed the correct target word as the input word
        # for the next timestep.
        dec_inp = yb[:, i]
      else:
        # Feed the word we just predicted as the input word for the next timestep.
        # (This used to be assigned to an unused variable, so the Decoder kept
        # receiving the previous input word instead of its own prediction.)
        dec_inp = word_idx

    # Stack up all the timesteps from our generated sentence into one tensor
    # of shape (samples, timesteps, vocab size)
    return torch.stack(sentence, dim=1)

#----------------------------------------------------
# Callback to aid with Teacher Forcing
#----------------------------------------------------
class TeacherForceCB(Callback):
  # 'max_epoch' is the number of epochs over which the teacher forcing decays
  def __init__(self, max_epoch):
    self.max_epoch = max_epoch

  # ----------------------------
  # Replace the batch input with the tuple (xb, yb) so that the model receives
  # the 'y' targets alongside the 'x' inputs
  # ----------------------------
  def begin_tr_batch(self, ctx):
    ctx.xb = (ctx.xb, ctx.yb)

  # ----------------------------
  # Set the Decoder's teacher forcing hyper parameter from the current epoch
  # ----------------------------
  def begin_tr(self, ctx):
    # Collect the Decoder modules among the model's direct children. If there
    # is more than one, the last one is used.
    decoders = [child for child in ctx.model.children() if isinstance(child, Seq_Decoder)]
    assert (len(decoders) > 0)
    dec = decoders[-1]

    # Teacher forcing starts at 1 and decays linearly with the epoch number, so that
    # we rely on it less as the model becomes better trained
    decay = 0.5 * ctx.i_epoch/self.max_epoch
    dec.teacher_force = 1 - decay

#----------------------------------------------------
# Create the Translation Encoder-Decoder architecture
#----------------------------------------------------
class ArchTextTranslation():
  #----------------------------------------------------
  # Start with no embeddings loaded, so that create_model() can detect a missing
  # load_emb() call through its assertion
  #----------------------------------------------------
  def __init__(self):
    self.emb_x = None
    self.emb_y = None

  #----------------------------------------------------
  # Create the Encoder ('x') and Decoder ('y') models and wrap them in a Sequential,
  # which is stored in 'self.model'.
  #
  #   vocab_y    - currently unused
  #   enc_n_h    - hidden size of the Encoder LSTM
  #   n_layers   - number of LSTM layers, shared by Encoder and Decoder
  #   enc_n_dirs - number of directions of the Encoder LSTM
  #   pad_idx_y  - index of the padding token in the 'y' vocab
  #----------------------------------------------------
  def create_model(self, vocab_y, enc_n_h, n_layers, enc_n_dirs, pad_idx_y):
    # Embedding layers should be pre-loaded
    assert ((self.emb_x is not None) and (self.emb_y is not None))

    # Create the Encoder and Decoder
    self.enc = Seq_Encoder(enc_n_h, n_layers, enc_n_dirs, self.emb_x)
    self.dec = Seq_Decoder(n_layers, pad_idx_y, self.emb_y, enc_n_h, enc_n_dirs)

    # And wrap them in a Sequential
    self.model = nn.Sequential(self.enc, self.dec)

  #----------------------------------------------------
  # Set the embedding layers for the Encoder and Decoder
  # These must be done before creating the model
  #----------------------------------------------------
  def load_emb(self, emb_x, emb_y):
    # Encoder's embedding
    self.emb_x = emb_x

    # Decoder's embedding
    self.emb_y = emb_y

  #----------------------------------------------------
  # Pre-create the Embedding layer and initialise its weights using
  # the pre-trained embedding vectors.
  #
  #   vocab            - iterable of words, position 'i' maps to row 'i' of the Embedding
  #   pre_trained_wgts - object exposing get_word_vector(word)
  #   emb_sz           - embedding size
  #   pad_idx          - index of the padding token
  #
  # Returns the initialised nn.Embedding
  #----------------------------------------------------
  @staticmethod
  def pre_process_embedding(vocab, pre_trained_wgts, emb_sz, pad_idx):
    vocab_sz = len(vocab)

    # Create the Embedding layer of shape [vocab, embedding size]
    emb = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_idx)

    # Go through each word in our vocab, and look up its embedding
    # vector in the pre-trained vectors package. If we find it there
    # we use it to initialise the Embedding layer weight. Words without a
    # vector keep their default initialisation.
    emb_data = emb.weight.data
    for i, word in enumerate(vocab):
      word_wgt = pre_trained_wgts.get_word_vector(word)
      if (word_wgt is not None):
        emb_data[i] = tensor(word_wgt)

    return emb

  #----------------------------------------------------
  # Utility method to pad the sequences in the predicted output
  # and the target output so that they are the same length
  # Used during loss and accuracy calculations to compare predicted output with target
  #
  # Returns the (possibly padded) output and target
  #----------------------------------------------------
  @staticmethod
  def _tt_pad_pred(out, targ, pad_idx=1):
    # Target has shape (samples, timestep) ie. (samples, sequence length)
    bs, targ_seq_len = targ.size()
    # Predicted output has shape (samples, timestep, vocab size)
    out_bs, out_seq_len, out_vocab_sz = out.size()

    if (targ_seq_len > out_seq_len):
      # Pad the output if target is longer
      # F.pad arguments start from the last dimension and go backwards
      # ie. F.pad (3rd dim start, 3rd dim end, 2nd dim start, 2nd dim end, 1st dim start, 1st dim end)
      # ie. we pad the end of the output's 2nd dimension viz. sequence length
      out = F.pad(out, (0, 0, 0, targ_seq_len - out_seq_len, 0, 0), value=pad_idx)

    elif (out_seq_len > targ_seq_len):
      # Pad the target if output is longer
      # ie. F.pad (2nd dim start, 2nd dim end, 1st dim start, 1st dim end)
      # ie. we pad the end of the target's 2nd dimension viz. sequence length
      targ = F.pad(targ, (0, out_seq_len - targ_seq_len, 0, 0), value=pad_idx)
    return out, targ

  #----------------------------------------------------
  # Loss function uses cross_entropy loss between the predicted output and
  # targets. Note the cross_entropy internally calls softmax of the output
  # and then computes loss
  #----------------------------------------------------
  @classmethod
  def tt_loss(cls, out, targ, pad_idx=1):
    # First make output and target be the same length
    pad_out, pad_targ = cls._tt_pad_pred(out, targ, pad_idx)

    # Flatten the output and targets and then calculate cross_entropy
    return F.cross_entropy(pad_out.view(-1, out.size(2)), pad_targ.view(-1))

  #----------------------------------------------------
  # Accuracy metric: the fraction of timesteps (after padding output and target to
  # the same length) where the highest scoring predicted word equals the target word
  #----------------------------------------------------
  @classmethod
  def tt_acc(cls, out, targ, pad_idx=1):
    # First make output and target be the same length
    pad_out, pad_targ = cls._tt_pad_pred(out, targ, pad_idx)

    # Target has shape (samples, timestep)
    # Predicted output has shape (samples, timestep, vocab size)

    # Get the word with the highest score at each timestep in the sequence
    pred_out = pad_out.argmax(2)

    # Compare the word at each timestep with the corresponding word in the target
    res = (pred_out == pad_targ).float().mean()
    return res

### Define Text Translation Data Bundle

In [None]:
#----------------------------------------------------
# Text Translation from CSV data preparation pipeline
#----------------------------------------------------
class TextTranslationCSVDataBundle(DataBundle):
  # ----------------------------
  # Assemble the parameter sets for each stage of the pipeline and hand them to
  # the parent DataBundle:
  #   load    - all rows of the given CSV file
  #   split   - randomly, 80% training and 20% validation
  #   extract - 'x' sentences from the 'fr' column and 'y' sentences from the 'en' column
  #   convert - both 'x' and 'y' go from Sentences to Words to Word Ids
  # ----------------------------
  def __init__(self, csv_path, bs):
    print ('--------- Text Translation DataBundle init', csv_path)

    # Each call builds a fresh copy of the Sentence -> Words -> Word Ids conversion steps
    def sentence_to_word_ids():
      return [
          {'target_cls': SentenceWordItemList, 'convert_procedure': 'SentenceToWord'},
          {'target_cls': SentenceWordIdItemList, 'convert_procedure': 'WordToWordId'}
      ]

    # Sort key for the samplers: the length of item 'i'. The 'data' argument gets
    # pre-bound with a partial in get_sampler().
    def len_key_fn(i, data):
      return len(data[i])

    load_params = dict(source=CSVItemContainer, target_cls=DfItemList, csv_path=csv_path)
    split_params = dict(split_procedure='split_random', train_ratio=0.8, valid_ratio=0.2)
    extract_x_params = dict(extract_procedure='extract_colval', target_cls=SentenceItemList, col='fr', lang='fr')
    extract_y_params = dict(extract_procedure='extract_colval', target_cls=SentenceItemList, col='en')
    convert_x_params = sentence_to_word_ids()
    convert_y_params = sentence_to_word_ids()

    # Training and validation use different samplers, both sorting by sentence length
    train_dl_params = {'bs': bs, 'sampler_fn': SortishSampler, 'key_fn': len_key_fn, 'collate_fn': seq_collate}
    valid_dl_params = {'bs': bs, 'sampler_fn': SortSampler, 'key_fn': len_key_fn, 'collate_fn': seq_collate}

    # Alternative loader settings without custom samplers. Not passed to the parent.
    WAIT_dl_params = (
        {'bs': bs, 'shuffle': False, 'collate_fn': seq_collate},    # for training
        {'bs': bs, 'shuffle': False, 'collate_fn': seq_collate}     # for valid/test
    )

    super().__init__(
        load_params, split_params,
        extract_x_params, extract_y_params,
        convert_x_params, convert_y_params,
        dl_params=(train_dl_params, valid_dl_params))

  # ----------------------------
  # Custom get_sampler() overriding the parent DataBundle, needed because the Sorting
  # Samplers require a key function with the data pre-bound through a partial.
  #
  # The two samplers take different arguments so they are named explicitly here,
  # the 'sampler_fn' that is passed in is ignored.
  # ----------------------------
  def get_sampler(self, ds, in_train, bs, sampler_fn, key_fn, **kwargs):
    bound_key = partial(key_fn, data=ds.x)
    if (not in_train):
      return SortSampler(ds.x, key=bound_key)
    return SortishSampler(ds.x, key=bound_key, bs=bs)

In [None]:
# ----------------------------
# Collate function to convert a list of item tuples into a pair of tensors which is fed
# as input to the model
#
# 'samples' is a list of tuples ie. [(x1, y1), (x2, y2), ...] where 'xn' and 'yn' are
# both lists of word ids in the sentence. All the sentences could have different lengths, so
# after each 'xn' and 'yn' are converted to tensors, they are padded with 'pad_idx' to
# make them the same length.
#
# 'pad_first' selects where the padding goes: at the end of each sentence when False
# (the default), at the start when True. It used to be silently ignored.
#
# Returns (px, py), each of shape (samples, max sequence length)
# ----------------------------
def seq_collate(samples, pad_idx=1, pad_first=False):
  # Convert each sample sentence into a tensor and then create two lists of
  # tensors ie. [tensor x1, tensor x2, ...] and [tensor y1, tensor y2, ...]
  tx = [torch.tensor(s[0]) for s in samples]
  ty = [torch.tensor(s[1]) for s in samples]

  def _pad(seqs):
    if pad_first:
      # pad_sequence pads at the end, so reverse each sentence, pad, and
      # then reverse the padded batch along the sequence dimension
      reversed_seqs = [t.flip(0) for t in seqs]
      return pad_sequence(reversed_seqs, batch_first=True, padding_value=pad_idx).flip(1)
    return pad_sequence(seqs, batch_first=True, padding_value=pad_idx)

  return _pad(tx), _pad(ty)

### Define Text Translation application class 

In [None]:
#----------------------------------------------------
# Text Translation Application. It translates text sentences from an 'x' language into
# a 'y' language, using a Sequence-to-Sequence Encoder-Decoder architecture.
#
# Here, we translate from French to English
#
# To use it, the steps are:
#   1. Download and pre-process the data, saving it in a CSV file
#   2. Download embedding vectors for both languages and pre-process them to
#        initialise two Embedding layers and save them
#   3. Now, load the pre-processed data
#   4. Load the saved initialised Embedding layers
#   5. Create the architecture and train it
#   6. Run predictions
#----------------------------------------------------
class AppTextTranslation():

  def __init__(self):
    self._arch = None
    self.db = None
    # Not read by any method of this class, kept for backward compatibility
    self.vocab = None
    # The per-language vocabs are filled in by load_data(). They start as None so
    # that the 'is not None' assertions in other methods work before data is loaded.
    self.vocab_x = None
    self.vocab_y = None

  #----------------------------------------------------
  # Pre-process the text data for both languages. The dataset is huge, so we filter it
  # by selecting only sentences which are questions. The filtered data is then saved
  # into a CSV file.
  #
  # The two input files are read in lockstep, one sentence pair per line.
  # Returns the filtered data as a Pandas dataframe with columns 'en' and 'fr'.
  #----------------------------------------------------
  def pre_process_data(self, data_x_path, data_y_path, out_data_path):
    # Regular expressions to select questions. For the 'y' (English) language we
    # search for lines beginning with 'Wh' and take the text up to and including the
    # first '?'. For the 'x' language the line can begin with anything. In both cases
    # there must be no '.' or '!' before that '?'.
    re_yq = re.compile(r'^(Wh[^?.!]+\?)')
    re_xq = re.compile(r'^([^?.!]+\?)')

    # Find all matching sentences from 'x' and 'y'. The generator is lazy, so it has
    # to be consumed while both files are still open.
    with open(data_x_path, encoding='utf-8') as xf, open(data_y_path, encoding='utf-8') as yf:
      lines = ((re_yq.search(yq), re_xq.search(xq)) for yq, xq in zip(yf, xf))

      # Pick out the sentence pairs where both sides matched
      qs = [(y.group(), x.group()) for y,x in lines if y and x]

    # Prepare a Pandas dataframe and save it to a CSV file
    df = pd.DataFrame({'fr': [q[1] for q in qs], 'en': [q[0] for q in qs]}, columns = ['en', 'fr'])
    #df['en'] = df['en'].apply(lambda x:x.lower())
    #df['fr'] = df['fr'].apply(lambda x:x.lower())
    df.to_csv(out_data_path, index=False)

    # Free memory as the dataset is huge
    del lines, qs
    return df

  #----------------------------------------------------
  # Pre-create the Embedding layer and initialise its weights using
  # the pre-trained embedding vectors.
  #
  # Save the initialised Embedding layer to 'emb_wgts_path'. 'is_emb_x' selects
  # the 'x' vocab when True, and the 'y' vocab otherwise.
  # Requires load_data() to have been called so that the vocabs are available.
  #----------------------------------------------------
  def pre_process_embedding(self, pre_trained_wgts_path, emb_wgts_path, is_emb_x):
    assert ((self.vocab_x is not None) and (self.vocab_y is not None))

    # Load the pre-trained embedding vectors
    word_vecs = ft.load_model(str(pre_trained_wgts_path))

    # Get the appropriate vocab
    emb_vocab = self.vocab_x if is_emb_x else self.vocab_y
    emb_sz, pad_idx = 300, emb_vocab.index(PAD)

    # Create the Embedding layer and initialise its weights
    emb = ArchTextTranslation.pre_process_embedding(emb_vocab, word_vecs, emb_sz, pad_idx)

    # Save the initialised Embedding layer
    torch.save(emb, emb_wgts_path)

    # Free some memory as the pre-trained vectors are huge
    del word_vecs

  #----------------------------------------------------
  # Load the data using the Text Translation Data Bundle and pick up the
  # index-to-word vocabs for both languages
  #----------------------------------------------------
  def load_data(self, file_path, bs):
    self.db = TextTranslationCSVDataBundle(file_path, bs)
    self.db.do()
    self.vocab_x = self.db.convert_state_x['vocab_i2w']
    self.vocab_y = self.db.convert_state_y['vocab_i2w']

  #----------------------------------------------------
  # Load the initialised Embedding layers that we saved during pre-processing
  # and put them into a new architecture object
  #----------------------------------------------------
  def load_emb(self, emb_x_wgts_path, emb_y_wgts_path):
    # Load the saved 'x' and 'y' embedding data
    emb_x = torch.load(emb_x_wgts_path)
    emb_y = torch.load(emb_y_wgts_path)

    # Add them into our architecture
    self._arch = ArchTextTranslation()
    self._arch.load_emb(emb_x, emb_y)

  #----------------------------------------------------
  # Create the architecture
  # This assumes that the Embedding layers and the data have already been loaded
  #----------------------------------------------------
  def create_arch(self):
    assert(self._arch is not None)
    assert ((self.vocab_x is not None) and (self.vocab_y is not None))
    enc_n_h, n_layers, n_dirs, pad_idx_y = 256, 2, 2, self.vocab_y.index(PAD)
    self._arch.create_model(self.vocab_y, enc_n_h, n_layers, n_dirs, pad_idx_y)
    return self._arch

  #----------------------------------------------------
  # Replace the architecture's model with a Seq2SeqGRU built on the loaded embeddings
  #----------------------------------------------------
  def create_fastai_gru(self):
    emb_x, emb_y = self._arch.emb_x, self._arch.emb_y
    self._arch.model = Seq2SeqGRU(emb_x, emb_y, 256, 30, n_layers=2, bos_idx=2)

  #----------------------------------------------------
  # Train the model for 'num_epochs' epochs and return the Trainer.
  # The Bleu metric is tracked in addition to accuracy when 'bleu' is True.
  #----------------------------------------------------
  def run_train(self, num_epochs=1, bleu=False):
    train_dl = self.db.train_dl
    valid_dl = self.db.valid_dl

    # Loss function
    loss_func = self._arch.tt_loss

    # Compute accuracy and optionally, Bleu scores
    metrics_dict = {"acc": self._arch.tt_acc}
    if (bleu):
      metrics_dict['bleu'] = corpus_bleu

    callbs=[CudaCB(device = torch.device('cuda',0)), ProgressCallback(), MetricsCB(metrics_dict), TeacherForceCB(max_epoch=num_epochs)]

    # One Cycle scheduling is switched off for now
    # NOTE(review): 'split_lr' is not defined in the visible code - confirm before enabling
    one_cycle = False
    if (one_cycle):
      one_cycle_callbs = create_OneCycleCB(split_lr, phases=[0.5, 0.5], mom_start=0.8, mom_mid=0.7, mom_end=0.8)
      callbs = callbs + one_cycle_callbs

    # Model
    model = self._arch.model

    # Optimiser
    opt_func=adam_opt_func
    lr = 1e-2
    opt_groups = None
    opt = get_optimiser(model, lr, opt_func, opt_groups)

    # Debug Tracker, switched off for now
    dtr = None
    if (False):
      dtr = DebugTracker(disp=(True, False))
      debug_cbs = [dtr, DebugYhatLossCB()]
      callbs = callbs + debug_cbs

    loop = Trainer(train_dl, valid_dl, model, opt, loss_func, callbs, dtr=dtr)
    loop.fit(num_epochs=num_epochs)
    return loop

  #----------------------------------------------------
  # Run the model over the validation data and return three lists of word lists:
  # the inputs, the predicted outputs and the targets, one entry per sample
  #----------------------------------------------------
  def run_predict(self):
    valid_dl = self.db.valid_dl
    self._arch.model.eval()
    # Use the device of the model's first parameter
    device = next(self._arch.model.parameters()).device

    inps, outs, targs = [], [], []
    with torch.no_grad():
      for xb, yb in valid_dl:
        xb = xb.to(device)
        yhat = self._arch.model(xb)
        # Keep the highest scoring word at each timestep
        pred = yhat.argmax(dim=2)
        for x, y, p in zip (xb, yb, pred):
          # Convert word ids back to words using the vocabs
          inps.append([self.vocab_x[w] for w in x])
          outs.append([self.vocab_y[w] for w in p])
          targs.append([self.vocab_y[w] for w in y])
    return inps, outs, targs

### Define Data File Paths

In [None]:
# All data files live in the 'giga-fren' folder under the current working directory
root_path = Path.cwd()
data_path = root_path.joinpath('giga-fren')

# Filtered question pairs, written during pre-processing
data_file_path = data_path.joinpath('questions_easy.csv')

# Saved Embedding layers for the 'x' (French) and 'y' (English) languages
emb_x_wgts_path, emb_y_wgts_path = (data_path.joinpath(fname) for fname in ('fr_emb.pth', 'en_emb.pth'))

### Fetch and Pre-Process Data

In [None]:
#----------------------------------------------------
# Download and unzip the data set. It consists of two huge
# files, one for English sentences and one for French sentences
#
# The archive is downloaded into 'root_path' and extracted there.
#----------------------------------------------------


! wget https://s3.amazonaws.com/fast-ai-nlp/giga-fren.tgz -P {root_path}
! tar xf {root_path}/giga-fren.tgz -C {root_path} 

# List the data folder.
# NOTE(review): 'ls' is not a standard pathlib.Path method, presumably it is
# added by one of the nb_* imports - confirm.
data_path.ls()

In [None]:
#----------------------------------------------------
# Pre-process the data, filtering it to a manageable size and save it to
# a CSV file
#----------------------------------------------------

ttpp_app = AppTextTranslation()
data_x_path = data_path/'giga-fren.release2.fixed.fr'
data_y_path = data_path/'giga-fren.release2.fixed.en'
ppdf = ttpp_app.pre_process_data(data_x_path, data_y_path, data_file_path)

# Delete the two raw corpus files now that the filtered CSV has been written
!rm {data_x_path}
!rm {data_y_path}

# List the data folder and show the first 10 filtered sentence pairs
data_path.ls()
ppdf[:10]

In [None]:
#----------------------------------------------------
# Load the data bundle, because we need the vocabs when we
# pre-process embeddings in the next step
#
# Uses a batch size of 32
#----------------------------------------------------

ttpp_app.load_data(data_file_path, bs=32)

### Fetch FastText word embeddings

In [None]:
#----------------------------------------------------
# Download and install the Fasttext embedding library
#   https://fasttext.cc/docs/en/crawl-vectors.html
#
# The library is cloned from GitHub and installed with pip from the local checkout
#----------------------------------------------------

! git clone https://github.com/facebookresearch/fastText.git
! cd fastText; pip install .
# The alias 'ft' is what AppTextTranslation.pre_process_embedding uses to load the vectors
import fasttext as ft

In [None]:
#----------------------------------------------------
# Now get fastText pre-trained word embedding vectors for French
# NB: Each downloaded gzipped file is 4.2G and takes 6 minutes to download. 
# After unzipping, each file is about 7GB.
#
# Then, initialise an Embedding layer with those pre-trained weights
# Since this step consumes a lot of disk and memory, we can only pre-process one
# language at a time, then immediately free the disk space and memory.
#----------------------------------------------------

! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz -P {data_path}
! gzip -d {data_path}/cc.fr.300.bin.gz 
data_path.ls()

ttpp_app.pre_process_embedding(data_path/'cc.fr.300.bin', emb_x_wgts_path, True)

!rm {data_path}/cc.fr.300.bin

In [None]:
#----------------------------------------------------
# Now get the fastText pre-trained word embedding vectors for English
#----------------------------------------------------

! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz -P {data_path}
! gzip -d {data_path}/cc.en.300.bin.gz
data_path.ls()

ttpp_app.pre_process_embedding(data_path/'cc.en.300.bin', emb_y_wgts_path, False)

!rm {data_path}/cc.en.300.bin

### BLEU Score Metric

In [None]:
from math import exp

class NGram():
    """Hashable wrapper around one n-gram so it can be used as a `Counter`/dict key.

    Args:
        ngram: sequence of integer token ids (a list or a 1-D array slice).
        max_n: base of the positional hash; it should be at least the vocabulary
            size so that distinct n-grams of the same length get distinct hashes.
    """
    def __init__(self, ngram, max_n=5000):
        self.ngram,self.max_n = ngram,max_n

    def __eq__(self, other):
        # Let Python fall back to its default comparison for foreign types
        # instead of failing on a missing `.ngram` attribute.
        if not isinstance(other, NGram): return NotImplemented
        if len(self.ngram) != len(other.ngram): return False
        # Element-wise comparison that returns a plain bool and needs no numpy.
        return all(a == b for a, b in zip(self.ngram, other.ngram))

    def __hash__(self):
        # Read the ids as digits of a base-`max_n` number. Converting each id to a
        # Python int first keeps the arithmetic exact: fixed-width numpy integers
        # would silently wrap around for large vocabularies.
        return sum(int(o) * self.max_n**i for i,o in enumerate(self.ngram))

def get_grams(x, n, max_n=5000):
    """Return the n-grams of token sequence `x`.

    For `n == 1` the sequence itself is returned unchanged. Otherwise every
    window of `n` consecutive tokens is wrapped in an `NGram` (hashed with base
    `max_n`); a sequence shorter than `n` yields an empty list.
    """
    # Unigrams need no wrapper: the tokens themselves serve as keys.
    if n == 1:
        return x
    windows = []
    for start in range(len(x) - n + 1):
        windows.append(NGram(x[start:start + n], max_n=max_n))
    return windows

def get_correct_ngrams(pred, targ, n, max_n=5000):
    """Count the clipped n-gram matches between a prediction and its target.

    Returns:
        (matches, total): `matches` sums, over each distinct predicted n-gram,
        the smaller of its count in `pred` and its count in `targ`; `total` is
        the number of n-grams in `pred`.
    """
    candidates = get_grams(pred, n, max_n=max_n)
    references = get_grams(targ, n, max_n=max_n)
    cand_counts = Counter(candidates)
    ref_counts = Counter(references)
    clipped = 0
    for gram, seen in cand_counts.items():
        # A Counter returns 0 for n-grams that never occur in the target.
        clipped += min(seen, ref_counts[gram])
    return clipped, len(candidates)

def corpus_bleu(preds, targs, max_n=5000):
    """Compute a corpus-level BLEU score over one batch of predictions.

    Args:
        preds: tensor of per-token scores; the arg-max over its last dimension
            is taken as the predicted token id.
        targs: tensor of target token ids, iterated row by row alongside `preds`.
        max_n: hash base forwarded to the n-gram helpers.

    Returns:
        The brevity penalty times the geometric mean of the 1- to 4-gram
        precisions. The result is 0 as soon as one of the precisions is 0.

    NOTE(review): every position of every row is counted; nothing here masks
    padding tokens.
    """
    pred_len,targ_len,n_precs,counts = 0,0,[0]*4,[0]*4
    pred_ids = preds.argmax(dim=-1).cpu().numpy()
    targ_ids = targs.cpu().numpy()
    for pred,targ in zip(pred_ids, targ_ids):
        pred_len += len(pred)
        targ_len += len(targ)
        for i in range(4):
            c,t = get_correct_ngrams(pred, targ, i+1, max_n=max_n)
            n_precs[i] += c
            counts[i] += t
    # An n-gram order with no candidates at all contributes a precision of 0.
    n_precs = [c/t if (t > 0) else 0 for c,t in zip(n_precs,counts)]
    # Brevity penalty only applies to predictions shorter than the targets. The
    # `0 <` guard avoids dividing by zero for zero-length predictions, whose
    # precisions are all 0 anyway.
    len_penalty = exp(1 - targ_len/pred_len) if 0 < pred_len < targ_len else 1
    return len_penalty * ((n_precs[0]*n_precs[1]*n_precs[2]*n_precs[3]) ** 0.25)

class CorpusBLEU(Callback):
    """Callback that accumulates n-gram statistics per batch and reports BLEU per epoch.

    Args:
        vocab_sz: vocabulary size, used as the hash base of the n-gram helpers.
    """
    def __init__(self, vocab_sz):
        self.vocab_sz = vocab_sz
        self.name = 'bleu'

    def on_epoch_begin(self, **kwargs):
        "Reset the running lengths, matched n-gram counts and total n-gram counts."
        self.pred_len,self.targ_len,self.n_precs,self.counts = 0,0,[0]*4,[0]*4

    def on_batch_end(self, last_output, last_target, **kwargs):
        "Add this batch's 1- to 4-gram matches and sequence lengths to the running totals."
        # The arg-max over the last dimension turns per-token scores into token ids.
        last_output = last_output.argmax(dim=-1)
        for pred,targ in zip(last_output.cpu().numpy(),last_target.cpu().numpy()):
            self.pred_len += len(pred)
            self.targ_len += len(targ)
            for i in range(4):
                c,t = get_correct_ngrams(pred, targ, i+1, max_n=self.vocab_sz)
                self.n_precs[i] += c
                self.counts[i] += t

    def on_epoch_end(self, last_metrics, **kwargs):
        "Turn the accumulated counts into a BLEU score and append it to `last_metrics`."
        # The accumulators live on the instance, so they must be read through `self`.
        # An n-gram order with no candidates contributes a precision of 0, matching
        # `corpus_bleu`.
        n_precs = [c/t if t > 0 else 0 for c,t in zip(self.n_precs,self.counts)]
        # Brevity penalty only for predictions shorter than the targets; `0 <`
        # avoids a division by zero when nothing was predicted.
        if 0 < self.pred_len < self.targ_len:
            len_penalty = exp(1 - self.targ_len/self.pred_len)
        else:
            len_penalty = 1
        bleu = len_penalty * ((n_precs[0]*n_precs[1]*n_precs[2]*n_precs[3]) ** 0.25)
        return add_metrics(last_metrics, bleu)

### Test run the model

#### KD Model and Data

In [None]:
# Seed torch's RNG so the run is repeatable.
torch.manual_seed(0)
tt_app = AppTextTranslation()
# KD data pipeline: load the pre-processed CSV with a batch size of 64.
tt_app.load_data(data_file_path, bs=64)
# Embedding weights saved by the pre_process_embedding() cells above.
tt_app.load_emb(emb_x_wgts_path, emb_y_wgts_path)
tt_app.create_arch()
# Keep run_train()'s return value around for later inspection.
loop = tt_app.run_train(num_epochs=8)

In [None]:
tt_app.run_train(num_epochs=2, bleu=True)

33 {'momentum': 0.9, 'sqr_momentum': 0.99, 'eps': 1e-05, 'weight_decay': 0.0, 'lr': 0.01}


epoch,train_loss,train_acc,train_bleu,valid_loss,valid_acc,valid_bleu,time
0,2.263853,0.655897,0.474985,4.947572,0.159318,0.067105,02:35
1,2.501406,0.625128,0.452113,3.404715,0.349272,0.217035,02:32


<nb_training.Trainer at 0x7ff48003c438>

In [None]:
inps, outs, targs = tt_app.run_predict()
inps[70], outs[70], targs[70]

#### KD model, Fastai data

In [None]:
# Seed torch's RNG so the run is repeatable.
torch.manual_seed(0)
tt_app = AppTextTranslation()
# Instead of calling load_data(), plug in the fastai DataBunch and its vocabularies directly.
# NOTE: `data` is only created in the "Fastai Data load" section further down, so that
# section has to be run before this cell.
tt_app.db = data
tt_app.vocab_x = data.x.vocab.itos
tt_app.vocab_y = data.y.vocab.itos
tt_app.load_emb(emb_x_wgts_path, emb_y_wgts_path)
tt_app.create_arch()
loop = tt_app.run_train(num_epochs=8)

33 {'momentum': 0.9, 'sqr_momentum': 0.99, 'eps': 1e-05, 'weight_decay': 0.0, 'lr': 0.01}


epoch,train_loss,train_acc,valid_loss,valid_acc,time
0,4.509823,0.427305,6.036685,0.096519,01:11
1,3.15415,0.555705,3.871483,0.141544,01:19
2,3.085337,0.558915,4.096089,0.129306,01:18
3,3.119189,0.558114,4.306132,0.122286,01:20
4,3.161039,0.555018,4.191979,0.298654,01:19
5,3.189976,0.550393,3.801612,0.440781,01:17
6,3.241537,0.544031,5.011482,0.325765,01:14
7,3.378329,0.528495,5.470052,0.284828,01:13


#### KD Trainer, Fastai data and model

In [None]:
import warnings
#warnings.filterwarnings('ignore')
warnings.filterwarnings(action='once')

# Seed torch's RNG so the run is repeatable.
torch.manual_seed(0)
tt_app = AppTextTranslation()
# Instead of calling load_data(), plug in the fastai DataBunch and its vocabularies directly.
# NOTE: `data` is only created in the "Fastai Data load" section further down, so that
# section has to be run before this cell.
tt_app.db = data
tt_app.vocab_x = data.x.vocab.itos
tt_app.vocab_y = data.y.vocab.itos
tt_app.load_emb(emb_x_wgts_path, emb_y_wgts_path)
# Same steps as the KD-model run, except the architecture comes from create_fastai_gru().
tt_app.create_fastai_gru()
loop = tt_app.run_train(num_epochs=8)

#### KD Trainer, KD data, Fastai GRU model

In [None]:
import warnings
#warnings.filterwarnings('ignore')
warnings.filterwarnings(action='once')

def one_param(mdl):
  "Return the first item yielded by `mdl.parameters()`."
  return next(mdl.parameters())

# Seed torch's RNG so the run is repeatable.
torch.manual_seed(0)
tt_app = AppTextTranslation()
# KD data pipeline: load the pre-processed CSV with a batch size of 64.
tt_app.load_data(data_file_path, bs=64)
tt_app.load_emb(emb_x_wgts_path, emb_y_wgts_path)
# Same steps as the KD-model run, except the architecture comes from create_fastai_gru().
tt_app.create_fastai_gru()
loop = tt_app.run_train(num_epochs=8)

In [None]:
tt_app.create_fastai_gru()
loop = tt_app.run_train(num_epochs=8)

### Fastai Data load

In [None]:
from fastai.text import *

In [None]:
path = Path.cwd()/'giga-fren'

In [None]:
df = pd.read_csv(path/'questions_easy.csv')
df.head()

Unnamed: 0,en,fr
0,What is light ?,Qu’est-ce que la lumière?
1,Who are we?,Où sommes-nous?
2,Where did we come from?,D'où venons-nous?
3,What would we do without it?,Que ferions-nous sans elle ?
4,What is the absolute location (latitude and lo...,Quelle sont les coordonnées (latitude et longi...


In [None]:
df['en'] = df['en'].apply(lambda x:x.lower())
df['fr'] = df['fr'].apply(lambda x:x.lower())

In [None]:
def seq2seq_collate(samples:BatchSamples, pad_idx:int=1, pad_first:bool=True, backwards:bool=False) -> Tuple[LongTensor, LongTensor]:
    "Pad the x and y sequences of `samples` to their batch maxima and stack them; reverse token order if `backwards`."
    samples = to_data(samples)
    n_rows = len(samples)
    width_x = max([len(pair[0]) for pair in samples])
    width_y = max([len(pair[1]) for pair in samples])
    # Start from tensors filled with the padding index, then overwrite the real tokens.
    res_x = torch.zeros(n_rows, width_x).long() + pad_idx
    res_y = torch.zeros(n_rows, width_y).long() + pad_idx
    # A batch that is flipped at the end has to be padded on the opposite side first.
    if backwards:
        pad_first = not pad_first
    for row, pair in enumerate(samples):
        src, tgt = LongTensor(pair[0]), LongTensor(pair[1])
        if pad_first:
            # Padding in front: the tokens occupy the tail of the row.
            res_x[row, -len(pair[0]):] = src
            res_y[row, -len(pair[1]):] = tgt
        else:
            # Padding behind: the tokens occupy the head of the row.
            res_x[row, :len(pair[0])] = src
            res_y[row, :len(pair[1])] = tgt
    if backwards:
        res_x = res_x.flip(1)
        res_y = res_y.flip(1)
    return res_x, res_y

In [None]:
import IPython.core.debugger as db
class Seq2SeqDataBunch(TextDataBunch):
    "Create a `TextDataBunch` for sequence-to-sequence data, batching with `seq2seq_collate`."
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=32, val_bs:int=None, pad_idx=1,
               pad_first=False, device:torch.device=None, no_check:bool=False, backwards:bool=False, **dl_kwargs) -> DataBunch:
        """Turn the datasets into a `DataBunch` whose batches are padded by `seq2seq_collate`.

        The training loader uses a `SortishSampler` keyed on input length and drops the
        last incomplete batch; every other loader uses a `SortSampler` keyed on input
        length with batch size `val_bs` (defaulting to `bs`). The remaining `**dl_kwargs`
        are passed on to `DataLoader()`.
        """
        #db.set_trace()
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        # The collate function is handed to the DataBunch constructor at the end, not to
        # the individual DataLoaders.
        collate_fn = partial(seq2seq_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs//2)
        # `dl_tfms` is removed before the kwargs are forwarded to DataLoader.
        # NOTE: this raises KeyError when the caller did not supply `dl_tfms`.
        del dl_kwargs['dl_tfms']
        # Override any caller-supplied worker count with 0.
        dl_kwargs['num_workers']=0
        # Disabled experiments, kept for reference:
        #train_sampler=None
        #collate_fn = partial(seq_collate, pad_idx=pad_idx)
        #dl_kwargs['collate_fn'] = collate_fn

        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            # Pre-compute the input lengths once so the sampler key is a plain list lookup.
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)

            # Disabled experiment, kept for reference:
            #sampler=None

            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, collate_fn=collate_fn, no_check=no_check)

In [None]:
class Seq2SeqTextList(TextList):
    "`TextList` whose `_bunch` is `Seq2SeqDataBunch` and whose `_label_cls` is itself a `TextList`."
    _bunch = Seq2SeqDataBunch
    _label_cls = TextList

In [None]:
# The French ('fr') column provides the inputs and the English ('en') column the labels;
# split_by_rand_pct() is called with its default validation fraction.
src = Seq2SeqTextList.from_df(df, path = path, cols='fr').split_by_rand_pct().label_from_df(cols='en', label_cls=TextList)

In [None]:
len(src.train) + len(src.valid)

52331

In [None]:
data = src.databunch()

In [None]:
del src, df

In [None]:
data.save()

In [None]:
del data

In [None]:
data = load_data(path)

In [None]:
data.show_batch()

text,target
"xxbos xxmaj dans un tel cas , où il s’agit d ’ xxunk si un nom commercial a un fondement juridique antérieur à celui d ’ une marque aux fins de l’article 16 , paragraphe 1 , troisième phrase , de l’accord xxup adpic , peut - on considérer comme décisif : i ) le fait que , dans l’état où la marque est enregistrée et sa protection réclamée ,","xxbos xxmaj when assessing , in such a case , whether a trade name has a legal basis prior to a trade mark for the purposes of the third sentence of xxmaj article 16(1 ) of the trips xxmaj agreement , may it thus be considered as decisive : ( i ) whether the trade name was well known at least to some extent among the relevant trade xxunk in the xxmaj state in which the trade mark is registered and in which protection is sought for it , before the point in time at which registration of the trade mark was applied for in the xxmaj state in question ; or whether the trade name was used in commerce directed to the xxmaj state in which the trade mark is registered and in which protection is sought for it , before the point in time at which registration of the trade mark was applied for in the xxmaj state in question ; or what other factor may decide whether the trade name is to be regarded as an existing prior right within the meaning of the third sentence of xxmaj article 16(1 ) of the trips xxmaj agreement ?"
"xxbos xxmaj et selon vous , quels sont actuellement les principaux obstacles au bien - être en xxmaj europe : le manque d'argent , la pénurie d'emplois satisfaisants , la pression excessive sur la vie familiale et les loisirs , la pauvreté et xxunk , xxunk des services publics comme ceux de la santé et l'éducation , l'importance de la xxunk et de la criminalité , xxunk , les problèmes",xxbos xxmaj what else ?
"xxbos xxmaj qu’est - ce qui xxunk les fournisseurs de soins de santé à utiliser le système et qu’est - ce qui pourrait servir en tant xxunk non xxunk ou en tant que barrière à la xxup xxunk ; la nécessité de répondre serait - elle perçue comme une perte de temps si , là où les soins sont prodigués , on recherche un accès rapide à des informations en","xxbos xxmaj what would motivate health care providers to use the system , and what would serve as a xxunk or barrier to xxup kt – would the need to respond be perceived of as a waste of time if , at the point of care , they are looking for quick access to xxunk - sized information ?"
"xxbos xxmaj pourquoi , compte tenu de tout ce que nous savons au sujet des causes de xxunk , du fait de l’existence de techniques de diagnostic simples et précises et du fait que nous avons des interventions préventives et des traitements efficaces , xxunk demeure - t - elle encore aujourd’hui non xxunk , non traitée et contrôlée de façon inefficace pour tant de xxmaj canadiens et de xxmaj","xxbos xxmaj why , given how much we know about the causes of hypertension , the fact that simple and accurate techniques for diagnosis exist , and that we have effective preventive and treatment interventions , does hypertension remain xxunk , xxunk or not effectively controlled for so many xxmaj canadians ?"
"xxbos xxmaj qui peut nier les exigences et les inquiétudes de ceux qui se préoccupent de la reddition des comptes , qui soutiennent que cette nouvelle ère de mondialisation a créé un « déficit démocratique » , les gouvernements xxunk une partie de leur pouvoir et de leur influence , tandis que les regroupements xxunk – et non démocratiques – de tous genres voient xxunk leur pouvoir et leur influence","xxbos xxmaj who can deny the claims and concerns of those xxunk with accountability , who maintain that this new era of globalization has brought a "" democratic deficit , "" with governments losing power and influence while horizontal – and non - democratic – bodies of all types see their power and influence grow ?"


### Fastai model

In [None]:
class Seq2SeqGRU(nn.Module):
    """GRU encoder-decoder that greedily decodes at most `max_len` target tokens.

    Args:
        emb_enc: embedding layer for the source tokens.
        emb_dec: embedding layer for the target tokens; its weight data is also
            used as the weight data of the output projection.
        n_hid: hidden size of the encoder GRU.
        max_len: maximum number of decoding steps.
        n_layers: number of layers of both the encoder and the decoder GRU.
        p_inp, p_enc, p_dec, p_out, p_hid: dropout probabilities for the encoder
            embeddings, the encoder GRU, the decoder GRU, the decoder output and
            the encoder's final hidden state respectively.
        bos_idx: token id fed to the decoder at the first step.
        pad_idx: token id that stops decoding once every row predicts it.
    """
    def __init__(self, emb_enc, emb_dec, n_hid, max_len, n_layers=2, p_inp:float=0.15, p_enc:float=0.25,
                 p_dec:float=0.1, p_out:float=0.35, p_hid:float=0.05, bos_idx:int=0, pad_idx:int=1):
        super().__init__()
        self.n_layers,self.n_hid,self.max_len,self.bos_idx,self.pad_idx = n_layers,n_hid,max_len,bos_idx,pad_idx
        self.emb_enc = emb_enc
        self.emb_enc_drop = nn.Dropout(p_inp)
        self.encoder = nn.GRU(emb_enc.weight.size(1), n_hid, n_layers, batch_first=True, dropout=p_enc)
        # Projects the encoder's hidden state to the width used as the decoder's hidden size.
        # NOTE(review): that width is taken from emb_enc here but from emb_dec for the
        # decoder below, so both embeddings presumably share one width — confirm.
        self.out_enc = nn.Linear(n_hid, emb_enc.weight.size(1), bias=False)
        self.hid_dp  = nn.Dropout(p_hid)
        self.emb_dec = emb_dec
        self.decoder = nn.GRU(emb_dec.weight.size(1), emb_dec.weight.size(1), n_layers, batch_first=True, dropout=p_dec)
        self.out_drop = nn.Dropout(p_out)
        self.out = nn.Linear(emb_dec.weight.size(1), emb_dec.weight.size(0))
        # Weight tying: the output projection reuses the decoder embedding's weight data.
        self.out.weight.data = self.emb_dec.weight.data

    def forward(self, inp):
        """Encode `inp` (a 2-D batch of source token ids) and greedily decode.

        Returns the per-step outputs of `self.out` stacked along dim 1, i.e. one
        slice per decoding step actually performed.
        """
        bs,_ = inp.size()
        hid = self.initHidden(bs)
        emb = self.emb_enc_drop(self.emb_enc(inp))
        # Only the final hidden state of the encoder is passed on to the decoder.
        _, hid = self.encoder(emb, hid)
        hid = self.out_enc(self.hid_dp(hid))

        # Every row starts decoding from the beginning-of-sequence token.
        dec_inp = inp.new_zeros(bs).long() + self.bos_idx
        outs = []
        for _ in range(self.max_len):
            # The decoder is fed one time step at a time, hence the length-1 sequence axis.
            emb = self.emb_dec(dec_inp).unsqueeze(1)
            out, hid = self.decoder(emb, hid)
            out = self.out(self.out_drop(out[:,0]))
            outs.append(out)
            # Greedy decoding: the highest-scoring token becomes the next input.
            dec_inp = out.max(1)[1]
            # Stop early once every row in the batch has predicted padding.
            if (dec_inp==self.pad_idx).all(): break
        return torch.stack(outs, dim=1)

    def initHidden(self, bs):
        "Return an all-zero hidden state of shape (n_layers, bs, n_hid) for the encoder."
        # `new_zeros` on the model's first parameter gives a tensor with that parameter's
        # dtype and device, without relying on a helper defined in another notebook cell.
        return next(self.parameters()).new_zeros(self.n_layers, bs, self.n_hid)

In [None]:
def seq2seq_loss(out, targ, pad_idx=1):
    """Flattened cross-entropy between `out` and `targ` after equalising their lengths.

    Whichever of the two is shorter along the sequence axis is padded at its end
    with the value `pad_idx` before `CrossEntropyFlat` is applied.
    """
    _, n_targ = targ.size()
    _, n_out, _ = out.size()
    shortfall = n_targ - n_out
    if shortfall > 0:
        # Output is shorter: append `shortfall` steps along its sequence axis.
        out = F.pad(out, (0, 0, 0, shortfall, 0, 0), value=pad_idx)
    elif shortfall < 0:
        # Target is shorter: append the missing positions at its end.
        targ = F.pad(targ, (0, -shortfall, 0, 0), value=pad_idx)
    criterion = CrossEntropyFlat()
    return criterion(out, targ)

In [None]:
def seq2seq_acc(out, targ, pad_idx=1):
    """Fraction of positions where the arg-max of `out` equals `targ`.

    Whichever of the two is shorter along the sequence axis is first padded at
    its end with the value `pad_idx`, so both cover the same number of positions.
    """
    _, n_targ = targ.size()
    _, n_out, _ = out.size()
    shortfall = n_targ - n_out
    if shortfall > 0:
        # Output is shorter: append `shortfall` steps along its sequence axis.
        out = F.pad(out, (0, 0, 0, shortfall, 0, 0), value=pad_idx)
    elif shortfall < 0:
        # Target is shorter: append the missing positions at its end.
        targ = F.pad(targ, (0, -shortfall, 0, 0), value=pad_idx)
    hits = out.argmax(2) == targ
    return hits.float().mean()

In [None]:
emb_enc = torch.load(path/'fr_emb.pth')
emb_dec = torch.load(path/'en_emb.pth')

In [None]:
# The positional arguments 256 and 30 are n_hid and max_len in Seq2SeqGRU's signature.
model = Seq2SeqGRU(emb_enc, emb_dec, 256, 30, n_layers=2)
# seq2seq_loss and seq2seq_acc both pad output and target to a common length first.
learn = Learner(data, model, loss_func=seq2seq_loss, metrics=[seq2seq_acc])

In [None]:
learn.fit_one_cycle(8, 1e-2)

epoch,train_loss,valid_loss,seq2seq_acc,time
0,6.523319,6.444037,0.198191,00:33
1,6.386384,6.0448,0.239319,00:31
2,5.841321,5.97684,0.240092,00:34
3,5.475042,5.4424,0.28955,00:37
4,5.054673,5.665747,0.267344,00:38
5,4.912147,5.174346,0.313426,00:39
6,4.52946,4.900515,0.33944,00:41
7,4.320996,4.802443,0.349645,00:42


### Results

**Fastai notebook QRNN**

In [None]:
epoch	train_loss	valid_loss	seq2seq_acc	bleu	time
0	6.548347	6.254890	0.202597	0.089246	01:01
1	5.807833	5.634194	0.261195	0.209837	01:00
2	4.971908	5.273254	0.294139	0.227867	01:09
3	4.652781	4.496987	0.370694	0.282690	01:13
4	4.132740	4.727255	0.343164	0.272475	01:15
5	3.622683	4.128991	0.403503	0.314859	01:20
6	3.116825	3.976531	0.422250	0.332448	01:24
7	2.673768	3.869482	0.434256	0.340629	01:27

epoch	train_loss	valid_loss	seq2seq_acc	bleu	time
0	6.628302	6.688766	0.171176	0.049567	01:01
1	5.611732	5.369258	0.288863	0.202848	01:03
2	5.017492	5.475257	0.274479	0.208970	01:11
3	4.563599	4.983734	0.320529	0.251945	01:19
4	4.277009	4.489471	0.370596	0.298212	01:19
5	3.556834	4.304761	0.387561	0.310245	01:21
6	3.186942	4.047068	0.415123	0.328579	01:27
7	2.954225	3.975845	0.423042	0.333376	01:27

**Fastai notebook GRU**

In [None]:
epoch	train_loss	valid_loss	seq2seq_acc	bleu	time
0	5.858428	5.708938	0.252170	0.125969	00:52
1	5.357081	5.876677	0.238593	0.182234	00:51
2	4.927831	5.268726	0.293074	0.235639	00:54
3	4.610146	5.358156	0.283638	0.224705	00:53
4	4.505199	4.985784	0.318191	0.242152	00:53
5	4.107853	4.537851	0.364606	0.295078	00:58
6	3.589562	4.352857	0.383214	0.300911	01:00
7	3.393849	4.046872	0.415622	0.327335	01:02

epoch	train_loss	valid_loss	seq2seq_acc	bleu	time
0	6.015546	6.040129	0.225212	0.148938	00:53
1	5.336372	5.852052	0.243333	0.169163	00:52
2	5.237682	4.871711	0.336286	0.244732	00:54
3	5.080400	4.824254	0.339621	0.263153	00:55
4	4.967921	5.072886	0.313548	0.233096	00:54
5	4.413047	4.835425	0.336780	0.254155	00:56
6	4.215590	4.685118	0.352372	0.267846	00:57
7	4.235335	4.585082	0.362711	0.275471	00:55

**KD notebook, Fastai GRU model, Fastai data**

In [None]:
epoch	train_loss	valid_loss	seq2seq_acc	time
0	6.572704	6.686001	0.185240	00:33
1	6.240807	6.094747	0.233083	00:33
2	5.142201	5.782446	0.257133	00:36
3	5.209909	5.562961	0.276281	00:40
4	4.950634	4.924454	0.337162	00:41
5	4.485311	4.694997	0.358365	00:41
6	4.078117	4.806749	0.347632	00:42
7	3.657755	4.505972	0.378077	00:43

epoch	train_loss	valid_loss	seq2seq_acc	time
0	6.623886	6.648812	0.184976	00:32
1	6.121196	6.895894	0.173997	00:32
2	5.800183	5.728360	0.262508	00:35
3	5.212308	6.200798	0.222457	00:37
4	4.995849	5.994400	0.235996	00:39
5	4.877837	5.750614	0.258163	00:39
6	4.419581	5.157444	0.314034	00:41
7	4.353944	5.026139	0.327703	00:41

epoch	train_loss	valid_loss	seq2seq_acc	time
0	6.523319	6.444037	0.198191	00:33
1	6.386384	6.044800	0.239319	00:31
2	5.841321	5.976840	0.240092	00:34
3	5.475042	5.442400	0.289550	00:37
4	5.054673	5.665747	0.267344	00:38
5	4.912147	5.174346	0.313426	00:39
6	4.529460	4.900515	0.339440	00:41
7	4.320996	4.802443	0.349645	00:42

**KD notebook, KD Trainer, KD data, Fastai GRU model**

In [None]:
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.303920	0.219989	4.949887	0.334334	00:35
1	5.550987	0.271474	5.841299	0.237362	00:38
2	5.202934	0.296545	4.933283	0.320496	00:39
3	4.997673	0.313595	5.334086	0.276459	00:40
4	4.803404	0.328898	4.728201	0.335890	00:41
5	5.031213	0.364444	6.475720	0.266065	00:44
6	5.580777	0.361593	4.434995	0.336225	00:47
7	5.672724	0.336017	4.458024	0.428072	00:45

epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.584579	0.200409	5.604022	0.286170	00:33
1	6.219455	0.261237	3.918784	0.506129	00:39
2	6.089532	0.285646	4.404509	0.444510	00:41
3	5.939508	0.299954	4.831132	0.156349	00:42
4	5.846473	0.309220	4.352937	0.377930	00:43
5	5.821789	0.311438	4.186678	0.342489	00:43
6	5.794986	0.315718	4.859937	0.422939	00:42
7	5.747003	0.319018	4.219995	0.325707	00:44

**KD notebook, KD Trainer, Fastai data, Fastai GRU model**

In [None]:
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.365257	0.235857	6.005346	0.232613	00:38
1	5.213785	0.332450	4.885948	0.337639	00:46
2	4.809270	0.369385	6.621243	0.199825	00:46
3	5.111947	0.357351	3.876555	0.380233	00:49
4	4.923338	0.364496	3.482557	0.499148	00:48
5	4.982679	0.359597	3.757394	0.466488	00:48
6	4.945480	0.363537	3.863272	0.362346	00:49
7	4.993021	0.362597	4.038907	0.471316	00:50

epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	7.317437	0.148277	7.418691	0.137883	00:32
1	6.872702	0.235538	7.856993	0.119390	00:36
2	6.667437	0.254984	4.610329	0.102703	00:42
3	6.731552	0.249281	4.564480	0.451309	00:42
4	6.749056	0.249370	5.770372	0.349853	00:40
5	6.352491	0.282954	7.820367	0.128445	00:41
6	6.003761	0.319668	7.120711	0.186599	00:45
7	6.067753	0.310924	5.295717	0.377609	00:46

**KD notebook, KD model, Fastai data**

In [None]:
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	7.095515	0.140976	7.180361	0.138115	00:28
1	6.954101	0.150736	7.077180	0.140501	00:29
2	6.509344	0.184172	6.831913	0.150081	00:33
3	6.512704	0.182783	6.799678	0.149093	00:33
4	6.481158	0.184784	6.855497	0.148470	00:33
5	6.296695	0.202436	6.656479	0.157314	00:34
6	6.042625	0.227045	6.396843	0.179056	00:37
7	5.818643	0.251454	6.193115	0.195729	00:38

epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	7.144305	0.144120	7.115142	0.139994	00:31
1	7.200583	0.132423	7.085549	0.141792	00:28
2	7.234757	0.127989	6.967595	0.145785	00:27
3	7.228754	0.128988	7.073987	0.142612	00:27
4	7.204069	0.130177	7.017547	0.143231	00:26
5	7.135895	0.136813	7.021854	0.151495	00:27
6	6.947525	0.154177	6.898959	0.159648	00:28
7	6.428139	0.198624	6.427719	0.183806	00:33

# ---- After tying decoder weights to the embedding
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	7.388091	0.137258	6.970419	0.176699	00:30
1	6.929803	0.174572	6.884960	0.168512	00:32
2	5.901417	0.264275	5.856327	0.245887	00:41
3	5.683479	0.282867	5.919020	0.240537	00:42
4	5.632364	0.283608	5.929564	0.239965	00:42
5	5.354821	0.315040	5.572811	0.274488	00:44
6	5.304469	0.315708	5.532499	0.277871	00:43
7	5.095870	0.337317	5.445774	0.285496	00:44

# ---- After linear layer to encoder
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	7.329444	0.136586	7.392494	0.137778	00:29
1	7.256671	0.140552	6.821441	0.160257	00:30
2	6.483342	0.199900	7.003404	0.153175	00:36
3	6.385310	0.207547	6.501233	0.181700	00:37
4	6.088735	0.233530	6.362590	0.191171	00:39
5	5.880104	0.251751	6.183613	0.206138	00:40
6	5.775515	0.260812	5.982253	0.228642	00:40
7	5.646072	0.276575	5.791481	0.248229	00:41

# ---- After bidirectional
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	7.324201	0.144511	7.050848	0.155531	00:36
1	7.234653	0.145592	7.012827	0.159076	00:35
2	6.178216	0.232374	6.090143	0.223361	00:44
3	5.653195	0.279862	5.989206	0.235150	00:47
4	5.496807	0.295095	5.844776	0.247888	00:47
5	5.202925	0.328509	5.687572	0.260018	00:51
6	5.071029	0.341414	5.250099	0.303111	00:51
7	5.036983	0.341353	5.198389	0.310094	00:49

# ---- After teacher forcing
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	3.204214	0.560256	7.267510	0.253753	00:57
1	2.933727	0.586231	5.940575	0.315763	00:58
2	2.915251	0.584706	4.612234	0.493744	00:59
3	2.980927	0.577221	4.132407	0.512331	00:59
4	3.058429	0.568003	3.860979	0.512101	00:58
5	3.219140	0.550020	3.861038	0.461460	00:58
6	3.301164	0.540253	5.424986	0.292765	00:56
7	3.484237	0.519709	6.199779	0.221711	00:56

# ---- After attention
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	4.509823	0.427305	6.036685	0.096519	01:11
1	3.154150	0.555705	3.871483	0.141544	01:19
2	3.085337	0.558915	4.096089	0.129306	01:18
3	3.119189	0.558114	4.306132	0.122286	01:20
4	3.161039	0.555018	4.191979	0.298654	01:19
5	3.189976	0.550393	3.801612	0.440781	01:17
6	3.241537	0.544031	5.011482	0.325765	01:14
7	3.378329	0.528495	5.470052	0.284828	01:13

**KD notebook, KD model, KD data**

In [None]:
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.442913	0.188435	6.430400	0.174685	00:35
1	6.223257	0.206093	6.504838	0.170128	00:35
2	5.900857	0.230201	6.116207	0.205968	00:36
3	5.791644	0.241393	5.771810	0.240969	00:36
4	5.724015	0.250458	5.722351	0.246915	00:36
5	5.600645	0.261821	5.902134	0.230649	00:36
6	5.470247	0.274700	5.717124	0.247558	00:36
7	5.330138	0.287736	5.445066	0.272561	00:37

epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.606551	0.184464	6.415832	0.181752	00:33
1	6.082189	0.216943	6.462260	0.184262	00:33
2	5.850541	0.240375	5.907950	0.231549	00:35
3	5.657823	0.258734	5.456781	0.274527	00:36
4	5.476200	0.274681	5.639965	0.256353	00:35
5	5.347548	0.285866	5.469547	0.273026	00:35
6	5.222646	0.297312	5.450500	0.274432	00:36
7	5.111280	0.307473	5.185486	0.300422	00:36

epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.427548	0.190131	6.196699	0.198735	00:34
1	5.895729	0.233361	6.493158	0.185520	00:35
2	5.713185	0.254113	5.520290	0.268809	00:36
3	5.550801	0.269458	5.710424	0.252127	00:36
4	5.427881	0.279963	5.456533	0.275197	00:36
5	5.304070	0.289925	5.427710	0.277752	00:36
6	5.254131	0.294898	5.284402	0.290690	00:37
7	5.139466	0.304394	5.261429	0.291899	00:37

# ---- After tying decoder weights to the embedding
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.336802	0.191463	6.325757	0.190538	00:35
1	6.079963	0.218550	5.971920	0.225640	00:35
2	5.712619	0.253319	5.692638	0.252419	00:37
3	5.525562	0.269920	5.607968	0.260689	00:37
4	5.336722	0.286840	5.378273	0.281291	00:38
5	5.208682	0.299469	5.162830	0.301583	00:38
6	5.081091	0.311044	5.352839	0.284253	00:39
7	4.934848	0.324427	5.264331	0.291669	00:39

# ---- After linear layer to encoder
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.111099	0.218719	5.632525	0.247189	00:38
1	5.847243	0.236785	6.023604	0.218529	00:38
2	5.626212	0.260971	6.067066	0.214797	00:38
3	5.417837	0.281631	5.644879	0.254899	00:39
4	5.328231	0.291000	5.320411	0.285774	00:39
5	5.136227	0.308858	5.074691	0.307235	00:39
6	5.039217	0.317713	5.244615	0.292767	00:39
7	4.949056	0.326647	4.955501	0.318856	00:41

# ---- After bidirectional
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	6.667135	0.175773	6.637607	0.172441	00:36
1	6.357167	0.191023	6.621116	0.167819	00:37
2	6.140676	0.209039	5.976311	0.220839	00:38
3	5.926323	0.230477	5.918930	0.229221	00:39
4	5.702494	0.254005	5.926701	0.229961	00:40
5	5.502676	0.273406	6.267332	0.204831	00:40
6	5.295349	0.294311	5.349190	0.284071	00:42
7	5.163258	0.307266	5.279955	0.291542	00:42

# ---- After teacher forcing
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	3.136916	0.565318	6.485387	0.428519	00:53
1	2.812268	0.600114	6.470203	0.313464	00:53
2	2.768636	0.602410	6.242881	0.302663	00:53
3	2.792644	0.598249	4.832201	0.427251	00:53
4	2.869761	0.588713	4.197283	0.439709	00:54
5	2.951091	0.578210	5.040192	0.380652	00:53
6	3.061481	0.564932	5.360363	0.300452	00:52
7	3.168579	0.551636	4.226570	0.363828	00:52

# ---- After attention
epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	3.329652	0.546453	7.376829	0.097947	01:09
1	2.818685	0.596728	6.224333	0.108751	01:11
2	2.749254	0.601359	5.560364	0.149290	01:10
3	2.757350	0.598105	4.307948	0.210291	01:10
4	2.773471	0.594558	3.415860	0.496982	01:09
5	2.818466	0.588552	3.518212	0.512504	01:09
6	2.902072	0.579083	5.071352	0.313786	01:08
7	3.015603	0.564599	4.657800	0.347938	01:07

epoch	train_loss	train_acc	valid_loss	valid_acc	time
0	3.275162	0.553106	5.916556	0.496572	01:09
1	2.745907	0.608469	3.327456	0.500166	01:09
2	2.668602	0.615200	4.880656	0.173311	01:09
3	2.630811	0.617174	3.602422	0.420391	01:09
4	2.650112	0.613514	3.356412	0.484790	01:09
5	2.710541	0.604049	3.387840	0.488266	01:08
6	2.760253	0.596044	3.960105	0.426867	01:07
7	2.862263	0.583537	4.520898	0.363927	01:07

# ---- Continue for epoch 9-10, with Bleu
epoch	train_loss	train_acc	train_bleu	valid_loss	valid_acc	valid_bleu	time
0	2.332421	0.649901	0.470472	3.477965	0.401052	0.207105	02:34
1	2.536170	0.621716	0.449206	3.123379	0.507017	0.298915	02:32