# SEQUENCE TO SEQUENCE NETWORK

In [1]:
import os
import time
import random
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
import torchtext
from torchtext import data
from torchtext.data import Field, BucketIterator, TabularDataset
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
pd.options.display.max_colwidth = 300

In [3]:
europarl_en = open('data/europarl-v7.fr-en.en', encoding='utf-8').read().split('\n')
europarl_fr = open('data/europarl-v7.fr-en.fr', encoding='utf-8').read().split('\n')

In [4]:
print(europarl_en[:2])
print()
print(europarl_fr[:2])

['Resumption of the session', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.']

['Reprise de la session', 'Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.']


In [5]:
en = spacy.load('en')
fr = spacy.load('fr')

In [6]:
def tokenize_en(sentence):
    return [tok.text for tok in en.tokenizer(sentence)]

def tokenize_fr(sentence):
    return [tok.text for tok in fr.tokenizer(sentence)]

In [7]:
EN_TEXT = Field(tokenize=tokenize_en, pad_token="<pad>")
FR_TEXT = Field(tokenize=tokenize_fr, init_token = "<sos>", eos_token = "<eos>", pad_token="<pad>")

In [8]:
raw_data = {'English' : [line for line in europarl_en], 'French': [line for line in europarl_fr]}
df = pd.DataFrame(raw_data, columns=["English", "French"])

In [9]:
df.head()

Unnamed: 0,English,French
0,Resumption of the session,Reprise de la session
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.",Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.
2,"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.","Comme vous avez pu le constater, le grand ""bogue de l'an 2000"" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles."
3,"You have requested a debate on this subject in the course of the next few days, during this part-session.","Vous avez souhaité un débat à ce sujet dans les prochains jours, au cours de cette période de session."
4,"In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.","En attendant, je souhaiterais, comme un certain nombre de collègues me l'ont demandé, que nous observions une minute de silence pour toutes les victimes, des tempêtes notamment, dans les différents pays de l'Union européenne qui ont été touchés."


In [10]:
# remove very long sentences and sentences where translations are 
# not of roughly equal length
df['eng_len'] = df['English'].str.count(' ')
df['fr_len'] = df['French'].str.count(' ')
df = df.query('fr_len < 80 & eng_len < 80')
df = df.query('fr_len < eng_len * 1.5 & fr_len * 1.5 > eng_len')

In [11]:
df.head()

Unnamed: 0,English,French,eng_len,fr_len
0,Resumption of the session,Reprise de la session,3,3
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.",Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.,37,32
2,"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.","Comme vous avez pu le constater, le grand ""bogue de l'an 2000"" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles.",30,36
3,"You have requested a debate on this subject in the course of the next few days, during this part-session.","Vous avez souhaité un débat à ce sujet dans les prochains jours, au cours de cette période de session.",18,18
4,"In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.","En attendant, je souhaiterais, comme un certain nombre de collègues me l'ont demandé, que nous observions une minute de silence pour toutes les victimes, des tempêtes notamment, dans les différents pays de l'Union européenne qui ont été touchés.",39,37


In [12]:
# create train and validation set 
train, val = train_test_split(df.drop(["eng_len","fr_len"], axis=1).head(1000), test_size=0.1, random_state=12345)
train.to_csv("data/train.csv", index=False)
val.to_csv("data/val.csv", index=False)

In [13]:
# associate the text in the 'English' column with the EN_TEXT field, # and 'French' with FR_TEXT
data_fields = [('English', EN_TEXT), ('French', FR_TEXT)]
train,val = torchtext.data.TabularDataset.splits(path='data/', train='train.csv', 
                                       validation='val.csv', format='csv', 
                                       fields=data_fields)

In [14]:
min_freq = 2
FR_TEXT.build_vocab(train, val, min_freq=min_freq)
EN_TEXT.build_vocab(train, val, min_freq=min_freq)

In [15]:
seed = 12345
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [16]:
global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.English))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new.French) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)

class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
            
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

In [17]:
hidden_size = 512
embed_size = 256
lr = 0.0001
grad_clip = 10
epochs = 100
batch_size = 32

In [18]:
train_iter = MyIterator(train, batch_size=batch_size, device=0,
                        repeat=False, sort_key= lambda x:
                        (len(x.English), len(x.French)),
                        batch_size_fn=batch_size_fn, train=True)

In [19]:
val_iter = MyIterator(val, batch_size=batch_size, device=0,
                        repeat=False, sort_key= lambda x:
                        (len(x.English), len(x.French)),
                        batch_size_fn=batch_size_fn, train=False)

## Encoder

In [20]:
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, n_layers=1, dropout=0.2):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.embed = nn.Embedding(input_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, n_layers,
                          dropout=dropout, bidirectional=True)

    def forward(self, src, hidden=None):
        embedded = self.embed(src)
        outputs, hidden = self.gru(embedded, hidden)
        # sum bidirectional outputs
        outputs = (outputs[:, :, :self.hidden_size] +
                   outputs[:, :, self.hidden_size:])        
        return outputs, hidden

## Attention

In [21]:
class Attention(nn.Module):
    def __init__(self, hidden_size, method):
        super(Attention, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.rand(hidden_size))        

    def forward(self, hidden, encoder_outputs):
        seq_len = encoder_outputs.size(0)
        h = hidden.repeat(seq_len, 1, 1).transpose(0, 1) # [B*seq_len*H]
        encoder_outputs = encoder_outputs.transpose(0, 1)  # [B*seq_len*H]
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1).unsqueeze(1) # [B*1*seq_len]
                            
    def score(self, hidden, encoder_output):
        
        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.dot(energy)
            return energy
        
        elif self.method == 'concat':                    
            energy = F.tanh(self.attn(torch.cat([hidden, encoder_output], 2))) # [B*seq_len*2H]->[B*seq_len*H]
            energy = energy.transpose(1, 2)  # [B*H*seq_len]
            v = self.v.repeat(encoder_output.size(0), 1).unsqueeze(1)  # [B*1*H]
            energy = torch.bmm(v, energy)  # [B*1*seq_len]
            return energy.squeeze(1)  # [B*seq_len]        

## Decoder

In [22]:
class Decoder(nn.Module):
    def __init__(self, embed_size, hidden_size, output_size, method, n_layers=1):
        super(Decoder, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.method = method
        self.n_layers = n_layers

        self.embed = nn.Embedding(output_size, embed_size)
        self.attention = Attention(hidden_size, method)
        self.gru = nn.GRU(hidden_size + embed_size, hidden_size,n_layers)
        self.linear = nn.Linear(hidden_size * 2, hidden_size)
        self.linear_out = nn.Linear(hidden_size, output_size)

    def forward(self, decoder_input, encoder_hidden, encoder_outputs):
        
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attention(encoder_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (B,1,N)
        context = context.transpose(0, 1)  # (1,B,N)
        
        # Get the embedding of the current input word (previous timestep word output from the decoder)
        embedded = self.embed(decoder_input).unsqueeze(0)  # (1,B,N)        
        
        # Combine embedded input word and attended context, run through RNN
        decoder_input = torch.cat([embedded, context], 2)
        output, hidden = self.gru(decoder_input, encoder_hidden)
        output = output.squeeze(0)  # (1,B,N) -> (B,N)
        context = context.squeeze(0)
        attn_hidden_state = F.tanh(self.linear(torch.cat([output, context], 1)))        
        output = F.log_softmax(self.linear_out(attn_hidden_state), dim=1)
        return output, hidden, attn_weights

### Encoder/Decoder

In [80]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(1)
        max_len = trg.size(0)
        vocab_size = self.decoder.output_size
        outputs = torch.zeros(max_len, batch_size, vocab_size).cuda()

        encoder_output, hidden = self.encoder(src)
        hidden = hidden[:self.decoder.n_layers]
        output = trg.data[0, :]  # sos
        for t in range(1, max_len):
            output, hidden, attn_weights = self.decoder(
                    output, hidden, encoder_output)
            outputs[t] = output
            is_teacher = random.random() < teacher_forcing_ratio
            top1 = output.data.max(1)[1]
            output = trg.data[t] if is_teacher else top1.cuda()
        return outputs

### Training Loop

In [81]:
def train(data_iter, model, optimizer, vocab_size, grad_clip):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    pad = EN_TEXT.vocab.stoi['<pad>']
    for i, batch in enumerate(data_iter):
        src = batch.English
        trg = batch.French
        #src = torch.FloatTensor(src)
        #trg = torch.FloatTensor(trg)
        src, trg = src.cuda(), trg.cuda()        
        optimizer.zero_grad()
        out = model(src, trg)
        loss = F.nll_loss(out[1:].view(-1, vocab_size),
                               trg[1:].contiguous().view(-1),
                               ignore_index=pad)
        loss.backward()
        clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        total_loss += loss
        ntokens = (trg[:, 1:] != pad).sum().cpu().item()
        total_tokens += ntokens                
        
        tokens += ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                    (i, loss / ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

### Training the system

In [82]:
hidden_size = 512
embed_size = 256
lr = 0.0001
grad_clip = 10
epochs = 100
batch_size = 32
src_vocab_size = len(EN_TEXT.vocab)
trg_vocab_size = len(FR_TEXT.vocab)


encoder = Encoder(src_vocab_size, embed_size, hidden_size, n_layers=2, dropout=0.2)
decoder = Decoder(embed_size, hidden_size, trg_vocab_size, "concat", n_layers=1)
model = Seq2Seq(encoder, decoder).cuda()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [83]:
for epoch in range(10):
    model.train()
    train(train_iter, model, optimizer, trg_vocab_size, grad_clip)
    
    model.eval()
    loss = train(val_iter, model, optimizer, trg_vocab_size, grad_clip)
    print(loss)

Epoch Step: 1 Loss: 0.333090 Tokens per Sec: 252.163103
Epoch Step: 51 Loss: 0.273615 Tokens per Sec: 289.334309
Epoch Step: 101 Loss: 0.239273 Tokens per Sec: 280.089986
Epoch Step: 151 Loss: 0.178753 Tokens per Sec: 272.433656
Epoch Step: 201 Loss: 0.244191 Tokens per Sec: 280.130982
Epoch Step: 251 Loss: 0.234778 Tokens per Sec: 275.392779
Epoch Step: 301 Loss: 0.135119 Tokens per Sec: 284.841811
Epoch Step: 351 Loss: 0.129239 Tokens per Sec: 288.012579
Epoch Step: 401 Loss: 0.074798 Tokens per Sec: 282.449520
Epoch Step: 451 Loss: 0.195032 Tokens per Sec: 273.567843
Epoch Step: 501 Loss: 0.093449 Tokens per Sec: 291.157303
Epoch Step: 551 Loss: 0.186416 Tokens per Sec: 282.007829
Epoch Step: 601 Loss: 0.077658 Tokens per Sec: 280.029900
Epoch Step: 651 Loss: 0.242201 Tokens per Sec: 282.394647
Epoch Step: 701 Loss: 0.240077 Tokens per Sec: 288.684005
Epoch Step: 751 Loss: 0.187683 Tokens per Sec: 290.066902
Epoch Step: 801 Loss: 0.198470 Tokens per Sec: 294.163255


  return Variable(arr, volatile=not train)


RuntimeError: backward_input can only be called in training mode