https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb

In [3]:
from google.colab import drive
drive.mount('/content/drive')

from __future__ import unicode_literals, print_function, division
import string
import random

Mounted at /content/drive


**Reading the dataset**
==================



In [5]:
data_path = "/content/drive/MyDrive/001 My Skills/002 CS Engineering   Automated Math (BPHC)/004 Data Science (DS)   Artificial Intelligence (AI)/005 Textual Data (Unstructured Data) (Sequential Data)/004 NLP Tasks/2. NLG Tasks (or Seq2Seq Tasks)/0. Machine Translation (MT)/1. MonoLingual Machine Translation (MT)/2. NMT/english_french_dataset.txt"
# you can download other sentence pair dataset from here - http://www.manythings.org/anki/
# or use WMT dataset - http://www.statmt.org/wmt16/

In [6]:
from io import open
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().strip().split('\n')

lines

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)',
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)',
 'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)',
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)',
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)',
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)',
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)',
 'Run!\tPrenez vos jambes à vos cous !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)',
 'Run!\tFile !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077454 (sacredceltic)',
 'Run!\tFilez !\tCC-BY 2.0 (France) Attribution: tatoeba

**Data Preprocessing**
==================



In [7]:
# The files are all in Unicode, to simplify we will turn Unicode characters to ASCII
import unicodedata
def unicodeToAscii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)if unicodedata.category(c) != 'Mn')

# Make everything in lowercase and trim all the punctuations, and remove non-letter characters
import re
def normalizeString(s):
  s = unicodeToAscii(s.lower().strip())
  s = re.sub(r"([.!?])", r" \1", s)
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  return s

# Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
pairs

[['go .', 'va !', 'cc by . france attribution tatoeba .org cm wittydev '],
 ['go .',
  'marche .',
  'cc by . france attribution tatoeba .org cm micsmithel '],
 ['go .', 'bouge !', 'cc by . france attribution tatoeba .org cm micsmithel '],
 ['hi .', 'salut !', 'cc by . france attribution tatoeba .org cm aiji '],
 ['hi .', 'salut .', 'cc by . france attribution tatoeba .org cm gillux '],
 ['run !',
  'cours !',
  'cc by . france attribution tatoeba .org papabear sacredceltic '],
 ['run !',
  'courez !',
  'cc by . france attribution tatoeba .org papabear sacredceltic '],
 ['run !',
  'prenez vos jambes a vos cous !',
  'cc by . france attribution tatoeba .org papabear sacredceltic '],
 ['run !',
  'file !',
  'cc by . france attribution tatoeba .org papabear sacredceltic '],
 ['run !',
  'filez !',
  'cc by . france attribution tatoeba .org papabear sacredceltic '],
 ['run !',
  'cours !',
  'cc by . france attribution tatoeba .org papabear franlexcois '],
 ['run !',
  'fuyez !',
  'cc 

In [8]:
print("Read %s sentence pairs" % len(pairs))

Read 190206 sentence pairs


In [9]:
# now since the original dataset is english to french but we need french to english language pair, we need to reverse it
pairs = [list(reversed(p)) for p in pairs]
pairs

[['cc by . france attribution tatoeba .org cm wittydev ', 'va !', 'go .'],
 ['cc by . france attribution tatoeba .org cm micsmithel ',
  'marche .',
  'go .'],
 ['cc by . france attribution tatoeba .org cm micsmithel ', 'bouge !', 'go .'],
 ['cc by . france attribution tatoeba .org cm aiji ', 'salut !', 'hi .'],
 ['cc by . france attribution tatoeba .org cm gillux ', 'salut .', 'hi .'],
 ['cc by . france attribution tatoeba .org papabear sacredceltic ',
  'cours !',
  'run !'],
 ['cc by . france attribution tatoeba .org papabear sacredceltic ',
  'courez !',
  'run !'],
 ['cc by . france attribution tatoeba .org papabear sacredceltic ',
  'prenez vos jambes a vos cous !',
  'run !'],
 ['cc by . france attribution tatoeba .org papabear sacredceltic ',
  'file !',
  'run !'],
 ['cc by . france attribution tatoeba .org papabear sacredceltic ',
  'filez !',
  'run !'],
 ['cc by . france attribution tatoeba .org papabear franlexcois ',
  'cours !',
  'run !'],
 ['cc by . france attribution 

In [10]:
'''
Since there are a lot of example sentences and we want to train something quickly, we'll trim the data set to only relatively short and
simple sentences. Here the maximum length is 10 words (that includes ending punctuation) and we're filtering to sentences that translate to
the form "I am" or "He is" etc. (accounting for apostrophes replaced earlier).
'''

MAX_LENGTH = 10
eng_prefixes = ("i am ", "i m ","he is", "he s ","she is", "she s ","you are", "you re ","we are", "we re ","they are", "they re ")

def filterPair(p):
  return len(p[1].split(' ')) < MAX_LENGTH and len(p[2].split(' ')) < MAX_LENGTH and p[2].startswith(eng_prefixes)

def filterPairs(pairs):
  return [pair for pair in pairs if filterPair(pair)]

pairs = filterPairs(pairs)
pairs

[['cc by . france attribution tatoeba .org sacredceltic sacredceltic ',
  'j ai ans .',
  'i m .'],
 ['cc by . france attribution tatoeba .org ck anthaus ',
  'je vais bien .',
  'i m ok .'],
 ['cc by . france attribution tatoeba .org ck christiane ',
  'ca va .',
  'i m ok .'],
 ['cc by . france attribution tatoeba .org ck aiji ',
  'je suis tom .',
  'i m tom .'],
 ['cc by . france attribution tatoeba .org cntrational sacredceltic ',
  'je suis gras .',
  'i m fat .'],
 ['cc by . france attribution tatoeba .org cntrational sacredceltic ',
  'je suis gros .',
  'i m fat .'],
 ['cc by . france attribution tatoeba .org ck nimfeo ',
  'je suis en forme .',
  'i m fit .'],
 ['cc by . france attribution tatoeba .org spamster sacredceltic ',
  'je suis touche !',
  'i m hit !'],
 ['cc by . france attribution tatoeba .org spamster sacredceltic ',
  'je suis touchee !',
  'i m hit !'],
 ['cc by . france attribution tatoeba .org ck sacredceltic ',
  'je suis malade .',
  'i m ill .'],
 ['cc by

In [11]:
print("Trimmed to %s sentence pairs" % len(pairs))

Trimmed to 13996 sentence pairs


In [12]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {} # Dictionary to store a unique index per word which will indicate the index where 1 will be present in the OHE of that word
        self.word2count = {} # dictionary to store count of each word in the corpus which will be used to replace rare words with the most frequent words later
        self.index2word = {0: "SOS", 1: "EOS"} # inverse dictionary which already has two words SOS and EOS 
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

input_lang = Lang('eng')
output_lang = Lang('fra')

In [13]:
for pair in pairs:
  input_lang.addSentence(pair[0])
  output_lang.addSentence(pair[1])

In [14]:
print(input_lang.name)
print(input_lang.word2index)
print(input_lang.word2count)
print(input_lang.index2word)
print(input_lang.n_words)

eng
{'cc': 2, 'by': 3, '.': 4, 'france': 5, 'attribution': 6, 'tatoeba': 7, '.org': 8, 'sacredceltic': 9, '': 10, 'ck': 11, 'anthaus': 12, 'christiane': 13, 'aiji': 14, 'cntrational': 15, 'nimfeo': 16, 'spamster': 17, 'sysko': 18, 'micsmithel': 19, 'cm': 20, 'darinmex': 21, 'qdii': 22, 'dreamk': 23, 'gillux': 24, 'joseph': 25, 'lukaszpp': 26, 'mamat': 27, 'yemana': 28, 'eldad': 29, 'zmoo': 30, 'pjer': 31, 'blabla': 32, 'vgigregg': 33, 'stevegrant': 34, 'saeb': 35, 'hennebert': 36, 'theachaean': 37, 'callmath': 38, 'rene': 39, 'languageexpert': 40, 'nomadsoul': 41, 'shishir': 42, 'adjusting': 43, 'sbgodin': 44, 'algideamygdale': 45, 'brauliobezerra': 46, 'le': 47, 'petit': 48, 'ane': 49, 'gris': 50, 'dj': 51, 'saidez': 52, 'papabear': 53, 'kebukebu': 54, 'feudrenais': 55, 'tras': 56, 'mouseneb': 57, 'vest': 58, 'dominiko': 59, 'amikema': 60, 'belgavox': 61, 'rovo': 62, 'firez': 63, 'julien': 64, 'pdc': 65, 'vortarulo': 66, 'baisong': 67, 'trang': 68, 'zifre': 69, 'lucasmg': 70, 'boscowi

In [15]:
print(output_lang.name)
print(output_lang.word2index)
print(output_lang.word2count)
print(output_lang.index2word)
print(output_lang.n_words)

fra
{'j': 2, 'ai': 3, 'ans': 4, '.': 5, 'je': 6, 'vais': 7, 'bien': 8, 'ca': 9, 'va': 10, 'suis': 11, 'tom': 12, 'gras': 13, 'gros': 14, 'en': 15, 'forme': 16, 'touche': 17, '!': 18, 'touchee': 19, 'malade': 20, 'triste': 21, 'un': 22, 'coup': 23, 'de': 24, 'cafard': 25, 'malheureux': 26, 'timide': 27, 'mouille': 28, 'mouillee': 29, 'il': 30, 'est': 31, 'revenu': 32, 'me': 33, 'revoila': 34, 'chauve': 35, 'occupe': 36, 'occupee': 37, 'calme': 38, 'froid': 39, 'detendu': 40, 'detendue': 41, 'sourd': 42, 'sourde': 43, 'fini': 44, 'juste': 45, 'la': 46, 'peau': 47, 'claire': 48, 'le': 49, 'teint': 50, 'clair': 51, 'rapide': 52, 'tout': 53, 'libre': 54, 'disponible': 55, 'repu': 56, 'rassasie': 57, 'partie': 58, 'content': 59, 'chez': 60, 'moi': 61, 'retard': 62, 'paresseux': 63, 'faineant': 64, 'paresseuse': 65, 'faineante': 66, 'paume': 67, 'perdue': 68, 'porte': 69, 'riche': 70, 'securite': 71, 'certain': 72, 'sur': 73, 'sure': 74, 'grande': 75, 'mince': 76, 'ordonne': 77, 'ordonnee': 7

In [16]:
'''
 -----------------------------------------------Preparing Training Data----------------------------------------
To train, for each pair we will need an input tensor (indexes of the words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we will append the EOS token to both sequences.
'''

def indexesFromSentence(lang, sentence):
  return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
  indexes = indexesFromSentence(lang, sentence)
  indexes.append(EOS_token)
  return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
  input_tensor = tensorFromSentence(input_lang, pair[0])
  target_tensor = tensorFromSentence(output_lang, pair[1])
  return (input_tensor, target_tensor)

**Modelling, Training and Prediction**
=================




## **Defining Seq2Seq Model**

In [17]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Encoder







In [18]:
class EncoderRNN(nn.Module):
  def __init__(self, input_size, hidden_size):
    super(EncoderRNN, self).__init__()

    #initializing parameters
    self.input_size = input_size
    self.hidden_size = hidden_size

    #initializing layers

    #input layer
    self.embedding = nn.Embedding(input_size, hidden_size)

    #single hidden layer of Gated RNN
    self.gru = nn.GRU(hidden_size, hidden_size)
    # use this is you want to use stacked Gated RNN instead of just a single Gated RNN layer
    #self.gru = nn.GRU(hidden_size, hidden_size, n_layers) 

    # no output layer

  def forward(self, input, hidden):
    embedded = self.embedding(input).view(1, 1, -1)
    output = embedded
    output, hidden = self.gru(output, hidden)
    return output, hidden

  def initHidden(self):
    return torch.zeros(1, 1, self.hidden_size, device=device)

### Decoder


#### M1 - Decoder without attention

In [19]:
'''
----------------------------------------- Simple Decoder ------------------------------------
In this decoder we use only last output of the encoder as the context vector.
This context vector is used as the initial hidden state of the decoder.

At every step of decoding, the decoder is given an input token and
hidden state. The initial input token is the start-of-string ``<SOS>``
token, and the first hidden state is the context vector (the encoder's
last hidden state).
'''

class DecoderRNN(nn.Module):
  def __init__(self, hidden_size, output_size):
    super(DecoderRNN, self).__init__()
    self.hidden_size = hidden_size

    #input layer
    self.embedding = nn.Embedding(output_size, hidden_size)

    # Single hidden layer of Gated RNN
    self.gru = nn.GRU(hidden_size, hidden_size)
        
    #output layer
    self.out = nn.Linear(hidden_size, output_size)  # adding the optional linear layer - refer RNN in timeseries notes for more details
    self.softmax = nn.LogSoftmax(dim=1) # adding the optional softmax layer - refer RNN in timeseries notes for more details

  def forward(self, input, hidden):
    output = self.embedding(input).view(1, 1, -1)
    output = F.relu(output)
    output, hidden = self.gru(output, hidden)
    output = self.softmax(self.out(output[0]))
    return output, hidden

  def initHidden(self):
    return torch.zeros(1, 1, self.hidden_size, device=device)

#### M2 - Decoder with attention

In [21]:
'''
------------------------------------------------------Attention Decoder---------------------------------------

Instead of using a simple decoder as defined above, we want to use the attention mechanism 

If only the context vector is passed between the encoder and decoder,that single vector carries the burden of encoding the entire sentence.
Attention allows the decoder network to "focus" on a different part of the encoder's outputs for every step of the decoder's own outputs. First
we calculate a set of *attention weights*. These will be multiplied by the encoder output vectors to create a weighted combination. The result
(called ``attn_applied`` in the code) should contain information about that specific part of the input sequence, and thus help the decoder
choose the right output words.

Calculating the attention weights is done with another feed-forward layer ``attn``, using the decoder's input and hidden state as inputs.
Because there are sentences of all sizes in the training data, to actually create and train this layer we have to choose a maximum
sentence length (input length, for encoder outputs) that it can apply to. Sentences of the maximum length will use all the attention weights,
while shorter sentences will only use the first few.
'''

class AttnDecoderRNN(nn.Module):
  def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
    super(AttnDecoderRNN, self).__init__()
        
    # Define parameters
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.dropout_p = dropout_p
    self.max_length = max_length

    # Define layers

    #input layer
    self.embedding = nn.Embedding(self.output_size, self.hidden_size)

    # attention layer
    self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
    self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
    self.dropout = nn.Dropout(self.dropout_p)

    # single hidden layer of Gated RNN
    self.gru = nn.GRU(self.hidden_size, self.hidden_size)

    # output layer
    self.out = nn.Linear(self.hidden_size, self.output_size)

  def forward(self, input, hidden, encoder_outputs):
    # Note: we run this one step at a time

    # Get the embedding of the current input word (last output word)
    embedded = self.embedding(input).view(1, 1, -1)
    embedded = self.dropout(embedded)
        
    # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
    attn_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
    attn_applied = torch.bmm(attn_weights.unsqueeze(0),encoder_outputs.unsqueeze(0))
        
    # Combine embedded input word and last context, run through RNN
    output = torch.cat((embedded[0], attn_applied[0]), 1)
    output = self.attn_combine(output).unsqueeze(0)

    output = F.relu(output)
    output, hidden = self.gru(output, hidden)

    output = F.log_softmax(self.out(output[0]), dim=1)
    return output, hidden, attn_weights

  def initHidden(self):
    return torch.zeros(1, 1, self.hidden_size, device=device)

## **Training Seq2Seq Model**




In [27]:
"""
"Teacher forcing" is the concept of using the real target outputs as
each next input, instead of using the decoder's guess as the next input.
Using teacher forcing causes it to converge faster but `when the trained
network is exploited, it may exhibit
instability <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.378.4095&rep=rep1&type=pdf>`__.

You can observe outputs of teacher-forced networks that read with
coherent grammar but wander far from the correct translation -
intuitively it has learned to represent the output grammar and can "pick
up" the meaning once the teacher tells it the first few words, but it
has not properly learned how to create the sentence from the translation
in the first place.

Because of the freedom PyTorch's autograd gives us, we can randomly
choose to use teacher forcing or not with a simple if statement. Turn
``teacher_forcing_ratio`` up to use more of it.
"""
teacher_forcing_ratio = 0.5

In [28]:
'''
To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.
'''

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
  encoder_hidden = encoder.initHidden()

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  input_length = input_tensor.size(0)
  target_length = target_tensor.size(0)

  encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

  loss = 0

  for ei in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
    encoder_outputs[ei] = encoder_output[0, 0]

  decoder_input = torch.tensor([[SOS_token]], device=device)

  decoder_hidden = encoder_hidden

  use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

  if use_teacher_forcing:
    # Teacher forcing: Feed the target as the next input
    for di in range(target_length):
      decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
      loss += criterion(decoder_output, target_tensor[di])
      decoder_input = target_tensor[di]  # Teacher forcing

  else:
    # Without teacher forcing: use its own predictions as the next input
    for di in range(target_length):
      decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
      topv, topi = decoder_output.topk(1)
      decoder_input = topi.squeeze().detach()  # detach from history as input

      loss += criterion(decoder_output, target_tensor[di])
      if decoder_input.item() == EOS_token:
        break

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length

In [29]:
'''
Defining some helper functions to print time elapsed and estimate time remaining given the current time and progress %
'''

import time
import math

def asMinutes(s):
  m = math.floor(s / 60)
  s -= m * 60
  return '%dm %ds' % (m, s)


def timeSince(since, percent):
  now = time.time()
  s = now - since
  es = s / (percent)
  rs = es - s
  return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

'''
Plotting results using matplotlib
using the array of loss values``plot_losses`` saved while training.
'''
import matplotlib.pyplot as plt
%matplotlib inline

plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
  plt.figure()
  fig, ax = plt.subplots()
  # this locator puts ticks at regular intervals
  loc = ticker.MultipleLocator(base=0.2)
  ax.yaxis.set_major_locator(loc)
  plt.plot(points)

In [33]:
'''
The whole training process looks like this:
-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

Then we call ``train`` many times and occasionally print the progress (%
of examples, time so far, estimated time) and average loss.
'''

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
      training_pair = training_pairs[iter - 1]
      input_tensor = training_pair[0]
      target_tensor = training_pair[1]

      loss = train(input_tensor, target_tensor, encoder,decoder, encoder_optimizer, decoder_optimizer, criterion)
      print_loss_total += loss
      plot_loss_total += loss

      if iter % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),iter, iter / n_iters * 100, print_loss_avg))

      if iter % plot_every == 0:
        plot_loss_avg = plot_loss_total / plot_every
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
'''
With all these helper functions in place (it looks like extra work, but
it makes it easier to run multiple experiments) we can actually
initialize a network and start training.

Remember that the input sentences were heavily filtered. For this small
dataset we can use relatively small networks of 256 hidden nodes and a
single GRU layer. After about 40 minutes on a MacBook CPU we'll get some
reasonable results.

.. Note::
   If you run this notebook you can train, interrupt the kernel,
   evaluate, and continue training later. Comment out the lines where the
   encoder and decoder are initialized and run ``trainIters`` again.
'''

hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

## **Evaluation**






Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
'''
We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:
'''
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
evaluateRandomly(encoder1, attn_decoder1)

## **Visualizing Attention**






A useful property of the attention mechanism is its highly interpretable
outputs. Because it is used to weight specific encoder outputs of the
input sequence, we can imagine looking where the network is focused most
at each time step.

You could simply run ``plt.matshow(attentions)`` to see attention output
displayed as a matrix, with the columns being input steps and rows being
output steps:

In [None]:
output_words, attentions = evaluate(encoder1, attn_decoder1, "je suis trop froid .")
plt.matshow(attentions.numpy())

For a better viewing experience we will do the extra work of adding axes
and labels:




In [None]:
def showAttention(input_sentence, output_words, attentions):
  # Set up figure with colorbar
  fig = plt.figure()
  ax = fig.add_subplot(111)
  cax = ax.matshow(attentions.numpy(), cmap='bone')
  fig.colorbar(cax)

  # Set up axes
  ax.set_xticklabels([''] + input_sentence.split(' ') +['<EOS>'], rotation=90)
  ax.set_yticklabels([''] + output_words)

  # Show label at every tick
  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
  plt.show()

In [None]:
def evaluateAndShowAttention(input_sentence):
  output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
  print('input =', input_sentence)
  print('output =', ' '.join(output_words))
  showAttention(input_sentence, output_words, attentions)

In [None]:
evaluateAndShowAttention("elle a cinq ans de moins que moi .")


In [None]:
evaluateAndShowAttention("elle est trop petit .")


In [None]:
evaluateAndShowAttention("je ne crains pas de mourir .")


In [None]:
evaluateAndShowAttention("c est un jeune directeur plein de talent .")