## Imports

In [1]:
import re
import string
import unicodedata
import random
import io

import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F

In [2]:
SOS_token = 0
EOS_token = 1

In [3]:
class Lang:
    # Helper class
    def __init__(self, name):
        self.name = name
        self.word2idx = {}
        self.idx2word = {}
        self.word2cnt = {}
        
        self.idx2word = {0:'<SOS>', 1:'<EOS>'}
        self.word2idx = {v:k for k,v in self.idx2word.items()}
        
        self.cnt = 2
        
    def word2idx(self):
        return self.word2idx
        
    def idx2word(self):
        return self.idx2word
    
    def word2count(self):
        return self.word2cnt
    
    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.cnt
            self.idx2word[self.cnt] = word
            self.word2cnt[word] = 1
            self.cnt += 1
        else:
            self.word2cnt[word] += 1
    
    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):                                      #? why convert 2 ascii
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(' +', ' ', s)
    return s

In [5]:
def read_langs(lang1, lang2, reverse = False):
    print('Reading lines ....')
    
    # read the file and split into lines
    lines = io.open('data/%s-%s.txt' % (lang1, lang2), encoding = 'utf-8').read().strip().split('\n')
    
    pairs = [[normalize_string(s) for s in line.split('\t')] for line in lines]
    
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
        
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
        
    return input_lang, output_lang, pairs

#### To train something quickly, we’ll trim the data set to only relatively short and simple sentences. Here the maximum length is 10 words (that includes ending punctuation) and we’re filtering to sentences that translate to the form “I am” or “He is” etc. (accounting for apostrophes replaced earlier).

In [6]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filter_pair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH and p[1].startswith(eng_prefixes)            
    
def filter_pairs(pairs):
    return [pair for pair in pairs if filter_pair(pair)]

In [7]:
def prepare_data(lang1, lang2, reverse = False):
    input_lang, output_lang, pairs = read_langs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print(random.choice(pairs))
    print(random.choice(pairs))
    pairs = filter_pairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.add_sentence(pair[0])
        output_lang.add_sentence(pair[1])
    
    print("Counted words:")
    print(input_lang.name, input_lang.cnt)
    print(output_lang.name, output_lang.cnt)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepare_data('eng', 'fra', True)
print(random.choice(pairs))

Reading lines ....
Read 135842 sentence pairs
['j ai prepare le petit dejeuner .', 'i made breakfast .']
['tom remarqua plusieurs differences .', 'tom noticed several differences .']
Trimmed to 10853 sentence pairs
Counting words...
Counted words:
fra 4489
eng 2925
['vous m evitez .', 'you re avoiding me .']


## seq2seq model

![](encoder.png)

In [8]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, x, hidden):
        embedded = self.embedding(x).view(1, 1, -1)
        output = embedded
        output = self.gru(output, hidden)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

### Simple Decoder
In the simplest seq2seq decoder we use only last output of the encoder. This last output is sometimes called the **context vector** as it encodes context from the entire sequence. This **context vector** is used as the initial hidden state of the decoder.

At every step of decoding, the decoder is given an $input\ token$ and $hidden\ state$. The initial input token is the start-of-string $<SOS>$ token, and the first hidden state is the context vector (the encoder’s last hidden state).

![](SimpleDecoder.png)

In [9]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(in_features = hidden_size, out_features = output_size)
        self.softmax = nn.LogSoftmax(dim = 1)
        
    def forward(self, x, hidden):
        embedded = self.embedding(x).view(1, 1, -1)
        output = F.relu(embedded)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

### Attention Decoder
If only the context vector is passed betweeen the encoder and decoder, that single vector carries the burden of encoding the entire sentence.

Attention allows the decoder network to **focus** on a different part of the encoder’s outputs for every step of the decoder’s own outputs. First we calculate a set of attention weights. These will be multiplied by the encoder output vectors to create a weighted combination. The result (called `attn_applied` in the code) should contain information about that specific part of the input sequence, and thus help the decoder choose the right output words.

NOTE :: Because there are sentences of all sizes in the training data, to actually create and train this layer we have to choose a **maximum sentence length (input length, for encoder outputs)** that it can apply to. Sentences of the maximum length will use all the attention weights, while shorter sentences will only use the first few.

![](AttnDecoderRNN.png)

In [10]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p = 0.1, max_length = MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(in_features = hidden_size * 2, out_features = max_length)
        self.attn_combine = nn.Linear(in_features = hidden_size * 2, out_features = hidden_size)
        self.dropout = nn.Dropout(p = dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(in_features = self.hidden_size, out_features = self.output_size)
        
    def forward(self, x, hidden, encoder_outputs):
        embedded = self.embedding(x).view(1, 1, -1)
        embedded = self.dropout(embedded)
        
        att_weights = F.softmax(self.attn(torch.cat((embedded[0], hidden[0]), dim = 1)), dim = 1)
        att_applied = torch.bmm(att_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))              # batch-wise matrix multiplication
        
        output = self.attn_combine(torch.cat((embedded[0], att_applied[0]), dim = 1)).unsqueeze(0)
        
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = F.log_softmax(self.out(output[0]), dim = 1)
        
        return output, hidden, att_weights
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

### Preparing training data

In [11]:
def indexes_from_sentence(lang, sentence):
    return [lang.word2idx[word] for word in sentence.split(' ')]

def tensor_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype = torch.long).view(-1, 1)

def tensors_from_pair(pair):
    input_tensor = tensor_from_sentence(input_lang, pair[0])
    target_tensor = tensor_from_sentence(output_lang, pair[1])
    
    return input_tensor, target_tensor

**Teacher forcing** is the concept of using the real target outputs as each next input, instead of using the decoder’s guess as the next input. Using teacher forcing causes it to converge faster but when the trained network is exploited, it may exhibit instability.

In [19]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length = MAX_LENGTH):
    encodder_hidden = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)
    
    loss = 0
    
    for ei in range(input_length):
        print(ei)
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encodder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
    
    decoder_input = torch.Tensor([[SOS_token]])
    decoder_hidden = encoder_hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_input = target_tensor[di]       # Teacher Forcing
            loss += criterion(decoder_output, target_tensor[di])
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()             # detach from history as input
            
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break
    
    loss.backward()
        
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.item()/target_length

#### This is a helper function to print time elapsed and estimated time remaining given the current time and progress %.

In [13]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

#### The whole training process looks like this:
* Start a timer
* Initialize optimizers and criterion
* Create set of training pairs
* Start empty losses array for plotting

In [14]:
def train_iters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensors_from_pair(random.choice(pairs))
                      for _ in range(n_iters)]
    
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    show_plot(plot_losses)

In [15]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def show_plot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [16]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensor_from_sentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]])  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.idx2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [17]:
def evaluate_randomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [20]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.cnt, hidden_size)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.cnt, dropout_p=0.1)

train_iters(encoder1, attn_decoder1, 75000, print_every=5000)

0


TypeError: tuple indices must be integers or slices, not tuple

In [None]:
evaluateRandomly(encoder1, attn_decoder1)

In [None]:
output_words, attentions = evaluate(
    encoder1, attn_decoder1, "je suis trop froid .")
plt.matshow(attentions.numpy())

In [None]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)


evaluateAndShowAttention("elle a cinq ans de moins que moi .")

evaluateAndShowAttention("elle est trop petit .")

evaluateAndShowAttention("je ne crains pas de mourir .")

evaluateAndShowAttention("c est un jeune directeur plein de talent .")