In [1]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
import torchtext
from torchtext import data
from torchtext.data import Field, BucketIterator, TabularDataset
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
pd.options.display.max_colwidth = 300

In [3]:
en = spacy.load('en')
fr = spacy.load('fr')

  return f(*args, **kwds)
  return f(*args, **kwds)


In [4]:
europarl_en = open('data/europarl-v7.fr-en.en', encoding='utf-8').read().split('\n')
europarl_fr = open('data/europarl-v7.fr-en.fr', encoding='utf-8').read().split('\n')

In [5]:
europarl_en[:2]

['Resumption of the session',
 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.']

In [6]:
europarl_fr[:2]

['Reprise de la session',
 'Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.']

In [7]:
def tokenize_en(sentence):
    return [tok.text for tok in en.tokenizer(sentence)]

def tokenize_fr(sentence):
    return [tok.text for tok in fr.tokenizer(sentence)]

In [8]:
EN_TEXT = Field(tokenize=tokenize_en, init_token = "<sos>", eos_token = "<eos>")
FR_TEXT = Field(tokenize=tokenize_fr, init_token = "<sos>", eos_token = "<eos>")

In [9]:
raw_data = {'English' : [line for line in europarl_en], 'French': [line for line in europarl_fr]}
df = pd.DataFrame(raw_data, columns=["English", "French"])

In [10]:
df.head()

Unnamed: 0,English,French
0,Resumption of the session,Reprise de la session
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.",Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.
2,"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.","Comme vous avez pu le constater, le grand ""bogue de l'an 2000"" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles."
3,"You have requested a debate on this subject in the course of the next few days, during this part-session.","Vous avez souhaité un débat à ce sujet dans les prochains jours, au cours de cette période de session."
4,"In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.","En attendant, je souhaiterais, comme un certain nombre de collègues me l'ont demandé, que nous observions une minute de silence pour toutes les victimes, des tempêtes notamment, dans les différents pays de l'Union européenne qui ont été touchés."


In [11]:
# remove very long sentences and sentences where translations are 
# not of roughly equal length
df['eng_len'] = df['English'].str.count(' ')
df['fr_len'] = df['French'].str.count(' ')
df = df.query('fr_len < 80 & eng_len < 80')
df = df.query('fr_len < eng_len * 1.5 & fr_len * 1.5 > eng_len')

In [12]:
df.head()

Unnamed: 0,English,French,eng_len,fr_len
0,Resumption of the session,Reprise de la session,3,3
1,"I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.",Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.,37,32
2,"Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.","Comme vous avez pu le constater, le grand ""bogue de l'an 2000"" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles.",30,36
3,"You have requested a debate on this subject in the course of the next few days, during this part-session.","Vous avez souhaité un débat à ce sujet dans les prochains jours, au cours de cette période de session.",18,18
4,"In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.","En attendant, je souhaiterais, comme un certain nombre de collègues me l'ont demandé, que nous observions une minute de silence pour toutes les victimes, des tempêtes notamment, dans les différents pays de l'Union européenne qui ont été touchés.",39,37


In [13]:
# create train and validation set 
train, val = train_test_split(df.drop(["eng_len","fr_len"], axis=1).head(100), test_size=0.1, random_state=12345)
train.to_csv("data/train.csv", index=False)
val.to_csv("data/val.csv", index=False)

In [14]:
# associate the text in the 'English' column with the EN_TEXT field, # and 'French' with FR_TEXT
data_fields = [('English', EN_TEXT), ('French', FR_TEXT)]
train,val = torchtext.data.TabularDataset.splits(path='data/', train='train.csv', 
                                       validation='val.csv', format='csv', 
                                       fields=data_fields)

In [15]:
FR_TEXT.build_vocab(train, val)
EN_TEXT.build_vocab(train, val)

In [16]:
print(EN_TEXT.vocab.stoi['the'])
print(EN_TEXT.vocab.itos[2])

4
<sos>


In [17]:
seed = 12345
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [18]:
global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.English))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new.French) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)

class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
            
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

In [19]:
train_iter = MyIterator(train, batch_size=2, device=0,
                        repeat=False, sort_key= lambda x:
                        (len(x.English), len(x.French)),
                        batch_size_fn=batch_size_fn, train=True)

In [20]:
val_iter = MyIterator(val, batch_size=2, device=0,
                        repeat=False, sort_key= lambda x:
                        (len(x.English), len(x.French)),
                        batch_size_fn=batch_size_fn, train=False)

train_iter = BucketIterator(train, batch_size=128, \
sort_key=lambda x:(len(x.English), len(x.French)), shuffle=True)

val_iter = BucketIterator(train, batch_size=128, \
sort_key=lambda x:(len(x.English), len(x.French)))

In [30]:
a = torch.Tensor([1, 5, 10]).unsqueeze(0)
print(a.shape)
a

torch.Size([1, 3])


tensor([[  1.,   5.,  10.]])

In [None]:
torch.nn.GRU()

In [38]:
layer = torch.nn.Linear(3,1)

In [39]:
layer(a)

tensor([[-4.2819]])

In [21]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(input_dim, emb_dim)        
        self.gru = nn.GRU(emb_dim, hid_dim)
        
    def forward(self, src):
        
        #src = [batch size, sent len]
        embedded = self.embedding(src) # shape ---> [sent len, batch size, emb dim]        
        outputs, hidden = self.gru(embedded)
        print("Printing output [sent len, batch size, hid dim ]")
        print(outputs)
        print("Printing hidden [n directions, batch size, hid dim]")
        print(hidden)
        #outputs = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        
        return outputs, hidden

In [22]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.gru = nn.GRU(emb_dim, hid_dim)        
        self.out = nn.Linear(hid_dim, output_dim)        
        
    def forward(self, input, hidden):
        
        #input = [batch size],  #hidden = [n layers * n directions, batch size, hid dim]        
        input = input.unsqueeze(0) # ---> shape [1, batch size]
        
        embedded = self.embedding(input) # ---> shape [1, batch size, emb dim]
            
        output, hidden = self.gru(embedded, hidden)
        #sent len and n directions will always be 1 in the decoder, therefore:
        #output = output = [1, batch size, hid dim]
        #hidden = [n layers * n directions, batch size, hid dim]
        
        prediction = self.out(output.squeeze(0)) # shape ---> [batch size, output dim]
        
        return prediction, hidden

In [23]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, "Hidden dimensions of encoder and decoder must be equal!"
        #assert encoder.n_layers == decoder.n_layers, "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        
        #src AND trg = [sent len, batch size]
        #if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        enc_output, hidden = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens ---> trg[0,:] ---> trg shape is (1, batch size), therefore, we 
        # are getting the first word for all batches, which is represented by the <sos> token
        input = trg[0,:]
        
        for t in range(1, max_len):
            
            pred, hidden = self.decoder(input, hidden)
            outputs[t] = pred
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = pred.max(1)[1] # ---> get indexes of highest probability for each batch element (sentence) -> shape [batch size]
            input = (trg[t] if teacher_force else top1)
        
        return outputs

In [24]:
INPUT_DIM = len(EN_TEXT.vocab)
OUTPUT_DIM = len(FR_TEXT.vocab)
ENC_EMB_DIM = 6
DEC_EMB_DIM = 6
HID_DIM = 3

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [25]:
optimizer = optim.Adam(model.parameters())

In [26]:
pad_idx = FR_TEXT.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [27]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):        
        src = batch.English
        trg = batch.French
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        # The loss function only works on 2d inputs with 1d targets we need to flatten each of them with .view
        #trg = [sent len, batch size] ---> trg = [(sent len - 1) * batch size]
        #output = [sent len, batch size, output dim] ---> output = [(sent len - 1) * batch size, output dim]        
        loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))
        
        loss.backward()        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss

In [28]:
def evaluate(model, iterator, criterion):
    
    model.eval()    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            src = batch.English
            trg = batch.French
            
            output = model(src, trg, 0) #turn off teacher forcing
            loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))
            epoch_loss += loss.item()
        
    return epoch_loss

In [30]:
N_EPOCHS = 25
CLIP = 10
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'tut2_model.pt')

best_valid_loss = float('inf')

if not os.path.isdir(f'{SAVE_DIR}'):
    os.makedirs(f'{SAVE_DIR}')

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_iter, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    
    print(f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} |')

Printing output [sent len, batch size, hid dim ]
tensor([[[-0.4545,  0.0378, -0.2322]],

        [[-0.4312, -0.8369, -0.2170]],

        [[-0.4015, -0.2130,  0.0116]],

        [[-0.4069, -0.3455,  0.1777]],

        [[-0.4382, -0.8469,  0.1333]],

        [[-0.3464, -0.1845,  0.1933]],

        [[-0.2087, -0.2794,  0.2760]],

        [[-0.5654,  0.0503,  0.1576]],

        [[-0.6577, -0.2223, -0.2886]],

        [[-0.6520, -0.1277, -0.3060]],

        [[-0.6827,  0.0527, -0.4788]],

        [[-0.7434,  0.1399, -0.5923]],

        [[-0.1906, -0.2596, -0.1383]],

        [[-0.0751, -0.1763,  0.2885]],

        [[ 0.0621, -0.1654,  0.5018]],

        [[-0.0240, -0.1134, -0.2499]],

        [[-0.2217, -0.3353, -0.2937]],

        [[-0.5268, -0.1196, -0.0073]],

        [[-0.6189,  0.3964, -0.1435]],

        [[-0.7463,  0.3994, -0.0619]],

        [[ 0.2396, -0.0295,  0.2441]],

        [[-0.1125, -0.2082,  0.3476]],

        [[-0.0020, -0.1158,  0.5304]],

        [[ 0.4116, -0.1917,  0.

ValueError: max() arg is an empty sequence

In [72]:
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} |')

RuntimeError: Error(s) in loading state_dict for Seq2Seq:
	While copying the parameter named "encoder.embedding.weight", whose dimensions in the model are torch.Size([3709, 6]) and whose dimensions in the checkpoint are torch.Size([3709, 256]).
	While copying the parameter named "encoder.gru.weight_ih_l0", whose dimensions in the model are torch.Size([9, 6]) and whose dimensions in the checkpoint are torch.Size([1536, 256]).
	While copying the parameter named "encoder.gru.weight_hh_l0", whose dimensions in the model are torch.Size([9, 3]) and whose dimensions in the checkpoint are torch.Size([1536, 512]).
	While copying the parameter named "encoder.gru.bias_ih_l0", whose dimensions in the model are torch.Size([9]) and whose dimensions in the checkpoint are torch.Size([1536]).
	While copying the parameter named "encoder.gru.bias_hh_l0", whose dimensions in the model are torch.Size([9]) and whose dimensions in the checkpoint are torch.Size([1536]).
	While copying the parameter named "decoder.embedding.weight", whose dimensions in the model are torch.Size([4420, 6]) and whose dimensions in the checkpoint are torch.Size([4420, 256]).
	While copying the parameter named "decoder.gru.weight_ih_l0", whose dimensions in the model are torch.Size([9, 6]) and whose dimensions in the checkpoint are torch.Size([1536, 256]).
	While copying the parameter named "decoder.gru.weight_hh_l0", whose dimensions in the model are torch.Size([9, 3]) and whose dimensions in the checkpoint are torch.Size([1536, 512]).
	While copying the parameter named "decoder.gru.bias_ih_l0", whose dimensions in the model are torch.Size([9]) and whose dimensions in the checkpoint are torch.Size([1536]).
	While copying the parameter named "decoder.gru.bias_hh_l0", whose dimensions in the model are torch.Size([9]) and whose dimensions in the checkpoint are torch.Size([1536]).
	While copying the parameter named "decoder.out.weight", whose dimensions in the model are torch.Size([4420, 3]) and whose dimensions in the checkpoint are torch.Size([4420, 512]).