# Translation with a sequence to sequence network and attention

We will create a model to perform translation from French to English, using a sequence to sequence network.

Note that we're going to be doing this from scratch. *torchtext* can handle much of the preprocessing in this tutorial.

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
from pathlib import Path
from tqdm import tqdm
from collections import namedtuple

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cpu')

In [4]:
# Location of the sentence-pair text file that read_langs() loads.
data_path = Path('data/seq-to-seq/eng-fra.txt')

We will represent each **word** (instead of each letter) in a language as a one-hot vector. We will cheat and trim the data to only use a few thousand words per language.

We'll make a helper class with `word2index` and `index2word` dictionaries.

In [5]:
SOS_token = 0  # index reserved for the start-of-sentence marker
EOS_token = 1  # index reserved for the end-of-sentence marker


class Lang:
    """Vocabulary of one language: word <-> index maps plus word counts.

    Indexes 0 and 1 are reserved for the "SOS" and "EOS" markers, so the
    first real word registered receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        # The two reserved markers are already part of the vocabulary size.
        self.n_words = len(self.index2word)

    def add_sentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.add_word(token)

    def add_word(self, word):
        """Count ``word``, giving it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

In [6]:
def unicode_to_ascii(s):
    """Return ``s`` with combining accent marks removed.

    The string is decomposed with NFD so that an accented letter becomes a
    base letter followed by a combining mark (Unicode category ``Mn``); the
    marks are then dropped. Characters of any other category are kept.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')


def normalize_string(s):
    """Lowercase, trim, and remove non-letter characters.

    Steps: lowercase and strip the input, drop accents, put a space in front
    of each ``.``, ``!`` or ``?`` so punctuation becomes its own token, and
    collapse every run of other non-letter characters into one space.

    Returns:
        The normalized sentence with no leading or trailing whitespace.
    """
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # The substitutions above can leave a space at either end (e.g. when the
    # sentence starts or ends with a quote or digit); strip it so that a later
    # split(' ') does not produce empty-string tokens.
    return s.strip()

def read_langs(lang1, lang2, reverse=False):
    """Load the sentence pairs from ``data_path`` and create empty vocabularies.

    Each line of the file is split on tabs and every field is passed through
    ``normalize_string``.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: when true, each pair is reversed and ``lang2`` becomes the
            input language.

    Returns:
        ``(input_lang, output_lang, pairs)`` where the two ``Lang`` objects
        are still empty and ``pairs`` is a list of lists of normalized
        sentences.
    """
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere and would garble accented characters before they reach
    # normalize_string. NOTE(review): assumes the data file is UTF-8 encoded.
    lines = data_path.read_text(encoding='utf-8').strip().split('\n')

    # split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in line.split('\t')] for line in lines]

    # reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs

Trim the data to short and simple sentences -- this is just a tutorial.

In [7]:
# Sentences are kept only when they split into fewer than this many tokens.
MAX_LENGTH = 10

# A pair is kept only when its second sentence starts with one of these
# prefixes (apostrophes have already been replaced by spaces at this point).
eng_prefixes = (
    "i am ",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
)

def filter_pair(p):
    """Tell whether the sentence pair ``p`` is short and simple enough to keep.

    Both sentences must split (on single spaces) into fewer than
    ``MAX_LENGTH`` tokens, and the second one must start with one of
    ``eng_prefixes``.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)

def filter_pairs(pairs):
    """Return a new list holding only the pairs accepted by ``filter_pair``."""
    return list(filter(filter_pair, pairs))

def prepare_data(lang1, lang2, reverse=False):
    """Read the corpus, keep the simple pairs and build both vocabularies.

    Returns:
        ``(input_lang, output_lang, pairs)``: the two populated ``Lang``
        objects and the filtered list of sentence pairs.
    """
    input_lang, output_lang, all_pairs = read_langs(lang1, lang2, reverse)
    kept_pairs = filter_pairs(all_pairs)

    # Feed the first sentence of each pair to the input vocabulary and the
    # second one to the output vocabulary, with a progress bar.
    for sentence_pair in tqdm(kept_pairs):
        input_lang.add_sentence(sentence_pair[0])
        output_lang.add_sentence(sentence_pair[1])

    return input_lang, output_lang, kept_pairs

In [8]:
# reverse=True swaps every pair, so 'fra' becomes the input language and
# 'eng' the output language.
input_lang, output_lang, pairs = prepare_data('eng', 'fra', True)

100%|██████████████████████████████████████████████████████████████████████| 10599/10599 [00:00<00:00, 240394.47it/s]


In [9]:
input_lang.name, input_lang.n_words

('fra', 4345)

In [10]:
output_lang.name, output_lang.n_words

('eng', 2803)

In [11]:
len(pairs)

10599

In [12]:
random.choice(pairs)

['nous partons ce soir .', 'we re leaving tonight .']

In [13]:
def indexes_from_sentence(lang, sentence):
    """Map each single-space-separated token of ``sentence`` to its index.

    Raises:
        KeyError: if a token is missing from ``lang.word2index``.
    """
    lookup = lang.word2index
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(lookup[token])
    return token_ids

def tensor_from_sentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indexes ending in EOS.

    Returns:
        A ``torch.long`` tensor of shape ``(number_of_tokens + 1, 1)`` created
        on the module-level ``device``.
    """
    token_ids = indexes_from_sentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def tensors_from_pairs(pair):
    """Turn one sentence pair into an ``(input_tensor, target_tensor)`` tuple.

    ``pair[0]`` is encoded with the module-level ``input_lang`` and
    ``pair[1]`` with the module-level ``output_lang``.
    """
    return (
        tensor_from_sentence(input_lang, pair[0]),
        tensor_from_sentence(output_lang, pair[1]),
    )

## The Seq2Seq model

A seq2seq network, also known as an Encoder Decoder network, consists of two RNNs called the encoder and decoder. The encoder reads an input sequence and outputs a single vector. The decoder reads that vector to produce an output sequence.

<img src="../figures/encoder-decoder.png">

### The encoder

The encoder of a seq2seq network is an RNN that outputs some value for every word in the input sentence, and a hidden state. It encodes the input in an embedding before passing the embedding to the RNN.

<img src="../figures/encoder-seq2seq.png">

In [14]:
class EncoderRNN(nn.Module):
    """Encoder: an embedding layer followed by a single-layer GRU.

    Args:
        in_vocab_size: number of entries in the embedding table.
        hidden_size: width of the embeddings and of the GRU hidden state.
    """

    def __init__(self, in_vocab_size, hidden_size):
        super().__init__()

        self.embedding = nn.Embedding(in_vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

        # Kept so that forward() can sanity-check tensor shapes.
        Sizes = namedtuple('Size', ['hidden', 'in_vocab'])
        self.sizes = Sizes(hidden=hidden_size, in_vocab=in_vocab_size)

    def forward(self, x_in, hidden):
        """Encode a batch of index sequences.

        Args:
            x_in: integer tensor of shape ``(sequence_length, batch_size)``.
            hidden: initial GRU state of shape ``(1, batch_size, hidden)``.

        Returns:
            ``(output, hidden)`` with shapes
            ``(sequence_length, batch_size, hidden)`` and
            ``(1, batch_size, hidden)``.
        """
        sequence_length, batch_size = x_in.size()

        embedded = self.embedding(x_in)
        assert embedded.size() == (sequence_length, batch_size, self.sizes.hidden)

        output, hidden = self.gru(embedded, hidden)

        assert output.size() == (sequence_length, batch_size, self.sizes.hidden)
        assert hidden.size() == (1, batch_size, self.sizes.hidden)

        return output, hidden

    def init_hidden(self, batch_size=1):
        """Return an all-zero initial hidden state.

        The tensor is created with the dtype and on the device of the
        module's own weights, so it stays usable after ``.to(...)`` instead
        of depending on a module-level global.

        Args:
            batch_size: size of the batch dimension (defaults to 1).

        Returns:
            A zero tensor of shape ``(1, batch_size, hidden)``.
        """
        reference = self.embedding.weight
        return torch.zeros(
            (1, batch_size, self.sizes.hidden),
            dtype=reference.dtype,
            device=reference.device,
        )

In [15]:
n_hidden = 128
# Move the encoder to `device`: the input tensor built by
# tensor_from_sentence() is created there, and an encoder left on the CPU
# would fail with a device mismatch on a CUDA machine.
encoder = EncoderRNN(input_lang.n_words, n_hidden).to(device)
a_sentence = tensor_from_sentence(input_lang, pairs[0][0])

h0 = encoder.init_hidden()
output, h_n = encoder(a_sentence, h0)
output.size(), h_n.size()

(torch.Size([5, 1, 128]), torch.Size([1, 1, 128]))

### The decoder

The decoder is another RNN that takes the encoder output vector and outputs a sequence of words to create a translation.

<img src="../figures/decoder-seq2seq.png">

The encoder's final hidden state is given to the decoder as the first hidden state. This is called a **context vector**.

In [16]:
class DecoderRNN(nn.Module):
    """Decoder: embedding -> ReLU -> single-layer GRU -> linear -> log-softmax.

    Args:
        hidden_size: width of the embeddings and of the GRU hidden state.
        out_vocab_size: number of entries in the embedding table and number
            of scores produced per position.

    Note that ``hidden_size`` comes first, unlike ``EncoderRNN`` where the
    vocabulary size is the first argument.
    """

    def __init__(self, hidden_size, out_vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(out_vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, out_vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

        # Kept so that forward() can sanity-check tensor shapes.
        Sizes = namedtuple('Size', ['hidden', 'out_vocab'])
        self.sizes = Sizes(hidden=hidden_size, out_vocab=out_vocab_size)

    def forward(self, x_in, hidden):
        """Score the output vocabulary for every position of ``x_in``.

        Args:
            x_in: integer tensor of shape ``(sequence_length, batch_size)``.
                Only ``batch_size == 1`` passes the final shape assertion,
                because the batch axis is squeezed away before the output
                layer.
            hidden: initial GRU state of shape ``(1, batch_size, hidden)``.

        Returns:
            ``(output, hidden)`` where ``output`` holds log-probabilities of
            shape ``(sequence_length, out_vocab)`` and ``hidden`` is the final
            GRU state.
        """
        sequence_length, batch_size = x_in.size()

        output = self.embedding(x_in)
        assert output.size() == (sequence_length, batch_size, self.sizes.hidden)

        output = F.relu(output)
        assert output.size() == (sequence_length, batch_size, self.sizes.hidden)

        output, hidden = self.gru(output, hidden)
        assert output.size() == (sequence_length, batch_size, self.sizes.hidden)

        # Drop the (size-1) batch axis so LogSoftmax(dim=1) normalizes over
        # the vocabulary axis.
        output = self.softmax(self.out(torch.squeeze(output, 1)))

        assert output.size() == (sequence_length, self.sizes.out_vocab)
        return output, hidden

    def init_hidden(self, batch_size=1):
        """Return an all-zero initial hidden state.

        The tensor is created with the dtype and on the device of the
        module's own weights, so it stays usable after ``.to(...)`` instead
        of depending on a module-level global.

        Args:
            batch_size: size of the batch dimension (defaults to 1).

        Returns:
            A zero tensor of shape ``(1, batch_size, hidden)``.
        """
        reference = self.embedding.weight
        return torch.zeros(
            (1, batch_size, self.sizes.hidden),
            dtype=reference.dtype,
            device=reference.device,
        )
        

In [17]:
n_hidden = 128
# DecoderRNN takes (hidden_size, out_vocab_size): the hidden size comes first.
# Passing the vocabulary size first silently builds a decoder whose hidden
# state is as wide as the vocabulary. The decoder is also moved to `device`
# so it sits with the tensors created by tensor_from_sentence().
decoder = DecoderRNN(n_hidden, output_lang.n_words).to(device)
a_sentence = tensor_from_sentence(output_lang, pairs[0][1])

h0 = decoder.init_hidden()
output, h_n = decoder(a_sentence, h0)
output.size(), h_n.size()

(torch.Size([4, 2803]), torch.Size([1, 1, 128]))

## Train the seq2seq model

This is left as an exercise, to do once I understand how to hook these up.

## Attention decoder

<img src="../figures/attn-diag-seq2seq.png">

<img src="../figures/attn-seq2seq.png">

In [None]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, out_vocab_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """Create the layers of the attention decoder.

        Args:
            hidden_size: width of the embeddings and of the GRU hidden state.
            out_vocab_size: number of entries in the embedding table and
                number of outputs of the final linear layer.
            dropout_p: probability passed to ``nn.Dropout``.
            max_length: output width of the ``attn`` layer; defaults to the
                module-level ``MAX_LENGTH``.
        """
        super().__init__()
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(out_vocab_size, hidden_size)
        # Input width is 2 * hidden_size because forward() concatenates the
        # embedded input with hidden[0] along dim=1 before calling this layer
        # (presumably each is hidden_size wide -- TODO confirm).
        self.attn = nn.Linear(hidden_size * 2, max_length)
        # NOTE(review): the code that calls this layer is not visible in this
        # part of the file; presumably it also receives two hidden_size-wide
        # tensors concatenated together -- confirm.
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, out_vocab_size)                
        
        # Kept so that forward() can sanity-check tensor shapes.
        Sizes = namedtuple('Size', ['hidden','out_vocab', 'max_seq_length'])
        self.sizes = Sizes(hidden=hidden_size, out_vocab=out_vocab_size, max_seq_length=max_length)
        
    def forward(self, x_in, hidden, encoder_outputs):
        sequence_length, batch_size = x_in.size()
        
        embedded = self.embedding(x_in)
        assert embedded.size() == (sequence_length, batch_size, self.sizes.hidden)
        
        embedded = self.dropout(embedded)
        
        # todo: to understand this!
        attn_weights = F.softmax(
            self.attn(torch.cat((torch.squeeze(embedded, 1), hidden[0]), dim=1)), dim=1)
        assert attn_weights.size() == (sequence_length, 2*self.sizes.hidden)
        