<a href="https://colab.research.google.com/github/ldselvera/Transformer-NMT/blob/main/Transformer_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library installation (run these cells if the packages are missing)

In [None]:
#Install torchtext version 0.6.0 for Bleu metrics
#Otherwise, from torchtext.data.metrics import bleu_score will create an error

!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 3.3MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 7.1MB/s 
Installing collected packages: sentencepiece, torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed sentencepiece-0.1.94 torchtext-0.6.0


In [None]:
# To install spacy languages:
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 661kB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=9963e8a85ff7948541fb4a5ae68d7fcc9b4a06755b68144ad739b5b9c65d5564
  Stored in directory: /tmp/pip-ephem-wheel-cache-d9jk2_e_/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

In [None]:
import torch
import spacy
import sys
import math
import time
import torch.nn as nn
import torch.optim as optim
from torchtext import datasets
from torchtext.data import Field, BucketIterator

##Detect device, for faster training use GPU, for instance with Google Colab.

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


##BLEU (Bilingual Evaluation Understudy) compares the machine-written translation to one or several human-written translations

In [None]:
def bleu(data, model, german, english, device, max_length=50, trans=None):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Args:
        data: Iterable of examples exposing ``src`` and ``trg`` token lists
            as instance attributes.
        model: Translation model forwarded to ``translate_sentence``.
        german: Field for the German side.
        english: Field for the English side.
        device: Device forwarded to ``translate_sentence``.
        max_length: Maximum number of tokens to decode per sentence.
        trans: ``"en_de"`` or ``"de_en"``. When omitted, the notebook-level
            ``trans`` is looked up at call time (``"en_de"`` if it is unset).

    Returns:
        Whatever ``bleu_score(predictions, references)`` returns.
    """
    if trans is None:
        # Resolved lazily: a `trans=trans` default is evaluated when the cell
        # defining this function runs, which is before `trans` is assigned.
        trans = globals().get("trans", "en_de")

    # The decoded sentence is in the target language, so use that field's <eos>
    eos_token = (german if trans == "en_de" else english).eos_token

    targets = []
    preds = []

    for instance in data:
        example = vars(instance)
        src = example["src"]
        trg = example["trg"]

        # Honour the caller's max_length instead of a hard-coded value
        prediction = translate_sentence(model, src, german, english, device, max_length=max_length, trans=trans)

        # Decoding may stop at max_length without emitting <eos>; only strip
        # the final token when it really is the end marker.
        if prediction and prediction[-1] == eos_token:
            prediction = prediction[:-1]

        # bleu_score expects a list of reference translations per candidate
        targets.append([trg])
        preds.append(prediction)

    return bleu_score(preds, targets)

##Save or load model

In [None]:
#Save model to pth.tar file
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Write ``state`` to ``filename`` using ``torch.save``.

    Args:
        state: Object to persist (the training cells pass a dict with
            ``"state_dict"`` and ``"optimizer"`` entries).
        filename: Destination path of the checkpoint file.
    """
    print("Saving checkpoint")
    torch.save(obj=state, f=filename)

In [None]:
#Load model and optimizer state from a loaded pth.tar checkpoint
def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` and ``optimizer`` from an already-loaded checkpoint.

    Args:
        checkpoint: Mapping with a ``"state_dict"`` entry for the model and an
            ``"optimizer"`` entry for the optimizer.
        model: Object whose ``load_state_dict`` receives ``checkpoint["state_dict"]``.
        optimizer: Object whose ``load_state_dict`` receives ``checkpoint["optimizer"]``.
    """
    print("Loading checkpoint")
    # Model first, then optimizer
    for receiver, key in ((model, "state_dict"), (optimizer, "optimizer")):
        receiver.load_state_dict(checkpoint[key])

##Tokenize english and german

In [None]:
def tokenize_eng(text):
    """Return the token strings produced by ``spacy_eng.tokenizer`` for ``text``."""
    pieces = []
    for tok in spacy_eng.tokenizer(text):
        pieces.append(tok.text)
    return pieces
def tokenize_ger(text):
    """Return the token strings produced by ``spacy_ger.tokenizer`` for ``text``."""
    pieces = []
    for tok in spacy_ger.tokenizer(text):
        pieces.append(tok.text)
    return pieces

##Model class

In [None]:
class Transformer(nn.Module):
    """Sequence-to-sequence translation model wrapping ``nn.Transformer``.

    Word and learned position embeddings are summed (with dropout) for the
    source and target sequences, passed through ``nn.Transformer``, and
    projected onto the target vocabulary. Inputs are sequence-first index
    tensors of shape ``(seq_len, batch)``.

    Args:
        embedding_size: Embedding / model dimension.
        src_vocab_size: Number of rows in the source word-embedding table.
        trg_vocab_size: Number of rows in the target word-embedding table and
            size of the output projection.
        src_pad_idx: Source index treated as padding by the key-padding mask.
        num_heads: Number of attention heads.
        num_encoder: Number of encoder layers.
        num_decoder: Number of decoder layers.
        feedforward: Passed to ``nn.Transformer`` as ``dim_feedforward``.
        dropout: Dropout probability for the embeddings and ``nn.Transformer``.
        max_len: Number of rows in the position-embedding tables, i.e. the
            longest sequence the model can embed.
        device: Stored on ``self.device`` for backward compatibility; tensors
            created in ``forward`` follow the device of the inputs.
    """

    def __init__(self, embedding_size, src_vocab_size, trg_vocab_size, src_pad_idx, num_heads, num_encoder, num_decoder, feedforward, dropout, max_len, device,):
        super(Transformer, self).__init__()

        # Source-side word and position embeddings
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)

        # Target-side word and position embeddings
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device

        # Keyword arguments make the positional meaning explicit
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder,
            num_decoder_layers=num_decoder,
            dim_feedforward=feedforward,
            dropout=dropout,
        )
        # Final projection from model dimension to target-vocabulary logits
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        # Dropout applied to the summed embeddings
        self.dropout = nn.Dropout(dropout)
        # Index that marks padding in the source sequence
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a ``(batch, src_len)`` boolean mask that is True at padding.

        This is a key-padding mask (it hides ``<pad>`` positions), not a
        look-ahead mask. The result lives on the same device as ``src``.
        """
        return src.transpose(0, 1) == self.src_pad_idx

    def forward(self, src, trg):
        """Return target-vocabulary logits of shape ``(trg_len, batch, trg_vocab_size)``.

        Args:
            src: Source token indices, shape ``(src_len, batch)``.
            trg: Target token indices, shape ``(trg_len, batch)``.

        Raises:
            IndexError: If either sequence is longer than the position table.
        """
        src_length, N = src.shape
        trg_length, N = trg.shape

        # Fail early with a readable message; an out-of-range embedding lookup
        # is much harder to diagnose, especially on CUDA.
        src_limit = self.src_position_embedding.num_embeddings
        trg_limit = self.trg_position_embedding.num_embeddings
        if src_length > src_limit or trg_length > trg_limit:
            raise IndexError(
                f"sequence length (src={src_length}, trg={trg_length}) exceeds "
                f"the position-embedding size (src={src_limit}, trg={trg_limit})"
            )

        # Position indices 0..len-1 repeated for every batch column, created
        # directly on the input's device (no host-to-device copy, and no
        # dependence on a possibly stale self.device).
        src_positions = torch.arange(src_length, device=src.device).unsqueeze(1).expand(src_length, N)
        trg_positions = torch.arange(trg_length, device=trg.device).unsqueeze(1).expand(trg_length, N)

        # Sum word and position embeddings, then apply dropout
        embed_src = self.dropout(self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        embed_trg = self.dropout(self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))

        # Padding mask for the source; causal mask so a target position
        # cannot attend to later target positions
        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_length).to(trg.device)

        out = self.transformer(embed_src, embed_trg, src_key_padding_mask=src_padding_mask, tgt_mask=trg_mask)

        # Project decoder states onto the target vocabulary
        return self.fc_out(out)

## To translate from English to German, set `trans = "en_de"`
## To translate from German to English, set `trans = "de_en"` by uncommenting that line in the next cell

In [None]:
#Translation direction read by the later cells, written as "<source>_<target>"
#Translate from english to german
trans = "en_de"

#Translate from german to english (uncomment to switch direction)
# trans = "de_en"

##Data preprocessing:
###Tokenize each of the sentences in the Translation Dataset based on the tokenizer defined in the Field

In [None]:
#Load the English and German spaCy pipelines used by the tokenizer functions
spacy_eng = spacy.load("en")
spacy_ger = spacy.load("de")

#Define how raw text is converted to tensors for each language:
#the tokenization function, the token that is prepended (<sos>),
#the token that is appended (<eos>), and lowercasing of every sentence
english = Field(tokenize=tokenize_eng, init_token="<sos>", eos_token="<eos>", lower=True, )
german = Field(tokenize=tokenize_ger, init_token="<sos>", eos_token="<eos>", lower=True)

#Create dataset objects for the splits of the Multi30k dataset
#exts gives the (source, target) file extensions
#fields gives the matching (source, target) fields defined above
if trans == "en_de":
  train_data, valid_data, test_data = datasets.Multi30k.splits(exts = (".en", ".de"), fields=(english, german))
elif trans == "de_en":
  train_data, valid_data, test_data = datasets.Multi30k.splits(exts = (".de", ".en"), fields=(german, english))
else:
  #Stop here: printing and continuing would only fail later with a NameError
  raise ValueError(f"Unknown translation direction {trans!r}. Please go to previous cell and choose between en_de or de_en")


###The build_vocab method now allows us to create the vocabulary associated with each language

In [None]:
#Build the vocabularies so tokens/words can be converted into integers
english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

#Get the source and target vocabulary sizes for the chosen direction
if trans == "en_de":
  src_vocab_size = len(english.vocab)
  trg_vocab_size = len(german.vocab)
elif trans == "de_en":
  src_vocab_size = len(german.vocab)
  trg_vocab_size = len(english.vocab)
else:
  #Stop here: without the sizes the model cell would fail with a NameError
  raise ValueError(f"Unknown translation direction {trans!r}. Please go to previous cell and choose between en_de or de_en")

### `english.vocab.stoi` / `german.vocab.stoi` map each vocabulary token to its integer index
### `english.vocab.itos` / `german.vocab.itos` give the reverse lookup, from index back to token

##Split data

###Torchtext feature BucketIterator takes a TranslationDataset as its first argument. It defines an iterator that batches examples of similar lengths together. Minimizes amount of padding needed while producing freshly shuffled batches for each new epoch. 

In [None]:
batch_size = 128

# One iterator per split; examples are sorted by source length inside a batch
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

##Model hyperparameters

In [None]:
#Embedding (model) dimension
embedding_size = 512

#Number of attention heads
num_heads = 8

#Number of encoder and decoder layers
num_encoder = 3
num_decoder = 3
dropout = 0.10

#Size of the position-embedding tables, i.e. the longest supported sentence
max_len = 100
#Passed to nn.Transformer as its feed-forward dimension.
#NOTE(review): 4 looks like an expansion factor rather than a layer width;
#changing it would make previously saved checkpoints unloadable - confirm intent
feedforward = 4
learning_rate = 3e-4

#Padding index of the SOURCE vocabulary (used for the source padding mask):
#English is the source for en_de, German is the source for de_en
if trans == "en_de":
  src_pad_idx = english.vocab.stoi["<pad>"]
elif trans == "de_en":
  src_pad_idx = german.vocab.stoi["<pad>"]

#Create model

In [None]:
#Build the network from the hyperparameters above, then place it on `device`
model = Transformer(
    embedding_size, src_vocab_size, trg_vocab_size, src_pad_idx,
    num_heads, num_encoder, num_decoder, feedforward, dropout, max_len, device,
)
model = model.to(device)
#Adam optimizer over all model parameters
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#Padding index of the target-language vocabulary
if trans == "de_en":
  pad_idx = english.vocab.stoi["<pad>"]
elif trans == "en_de":
  pad_idx = german.vocab.stoi["<pad>"]

#Positions whose target is padding do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

def count_parameters(model: nn.Module):
    """Return the total element count of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,671,687 trainable parameters


##Inference

In [None]:
# spaCy pipelines already loaded by translate_sentence, keyed by language code
_SPACY_MODELS = {}


def _get_spacy_model(lang):
    """Return the spaCy pipeline for ``lang``, loading it at most once."""
    if lang not in _SPACY_MODELS:
        _SPACY_MODELS[lang] = spacy.load(lang)
    return _SPACY_MODELS[lang]


def translate_sentence(model, sentence, german, english, device, max_length, trans):
    """Greedily decode a translation of ``sentence``.

    Args:
        model: Callable ``model(src, trg)`` returning logits of shape
            ``(trg_len, 1, vocab)`` for sequence-first index tensors.
        sentence: Raw string (tokenized with spaCy) or a list of tokens.
        german: Field for the German side (provides vocab and special tokens).
        english: Field for the English side.
        device: Device the index tensors are moved to.
        max_length: Maximum number of tokens to generate.
        trans: ``"en_de"`` (English source) or ``"de_en"`` (German source).

    Returns:
        List of target-language tokens without the leading ``<sos>``; the
        trailing ``<eos>`` is included when the model produced it.

    Raises:
        ValueError: If ``trans`` is not one of the two supported directions.
    """
    # Resolve the direction once instead of branching at every step
    if trans == "en_de":
        src_field, trg_field, src_lang = english, german, "en"
    elif trans == "de_en":
        src_field, trg_field, src_lang = german, english, "de"
    else:
        raise ValueError(f"trans must be 'en_de' or 'de_en', got {trans!r}")

    # Lowercase everything because the vocabulary is lowercase. The spaCy
    # pipeline is only needed (and only loaded) for raw strings, and it is
    # cached so repeated calls do not reload it.
    if isinstance(sentence, str):
        nlp = _get_spacy_model(src_lang)
        tokens = [token.text.lower() for token in nlp.tokenizer(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap the source in <sos> ... <eos> and map every token to its index
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    text_to_indices = [src_field.vocab.stoi[token] for token in tokens]

    # Shape (src_len, 1): a batch holding one sentence
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = trg_field.vocab.stoi["<eos>"]
    outputs = [trg_field.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        # Greedy choice: most likely token at the last decoded position
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == eos_idx:
            break

    # Skip the start token when converting indices back to words
    return [trg_field.vocab.itos[idx] for idx in outputs[1:]]

##Load model or save model 
##IMPORTANT: Set the path to the model file (which was provided to you)

In [None]:
load_model = True
save_model = False

#Modify these paths to point at your checkpoint files
if load_model:
    #map_location moves the stored tensors onto the active device, so a
    #checkpoint saved on a GPU can still be loaded on a CPU-only runtime
    if trans == "en_de":
      load_checkpoint(torch.load("en_de.pth.tar", map_location=device), model, optimizer)
    elif trans == "de_en":
      load_checkpoint(torch.load("de_en.pth.tar", map_location=device), model, optimizer)

Loading checkpoint


##Record time

In [None]:
def epoch_time(start_time: int, end_time: int):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)``, both truncated to ``int``.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

##Training

In [None]:
def train(mode: nn.Module, iterator: "BucketIterator", optimizer: optim.Optimizer, criterio: nn.Module, clip: float):
    """Run one training epoch and return the mean batch loss.

    Args:
        mode: The model to train. The misspelled name is kept so existing
            calls stay valid.
        iterator: Iterable of batches exposing ``src`` and ``trg`` tensors;
            must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterio: Loss function (misspelled name kept for compatibility).
        clip: Maximum gradient norm used for gradient clipping.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    # Use the objects that were passed in rather than notebook-level globals
    model, criterion = mode, criterio

    model.train()
    epoch_loss = 0

    # BucketIterators can be iterated just like a DataLoader
    for batch in iterator:
        # Each batch has src and trg attributes; move them to the active device
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Teacher forcing: feed the target without its last token
        output = model(inp_data, target[:-1])

        # Flatten to (positions, vocab) vs (positions,), with the target
        # shifted by one so each position predicts the next token
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)

        # Back propagation
        loss.backward()

        # Clip to avoid exploding gradients, honouring the caller's threshold
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        # Gradient descent step
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


##Testing

In [None]:
def evaluate(model: nn.Module, iterator: BucketIterator, criterion: nn.Module):
    """Return the mean batch loss of ``model`` over ``iterator`` without updating it.

    The model is put in eval mode and gradients are disabled. Batches are
    moved to the notebook-level ``device``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in iterator:
            source = batch.src.to(device)
            target = batch.trg.to(device)

            # Feed the target without its last token; compare against the
            # target shifted by one position
            logits = model(source, target[:-1])
            vocab_size = logits.shape[2]
            loss = criterion(logits.reshape(-1, vocab_size), target[1:].reshape(-1))

            batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)

##Choose number of epochs:
###For 5 epochs it takes ~2-minutes
###For 10 epochs it takes ~4-minutes
###For 50 epochs it takes ~20-minutes with GPU
###For 100 epochs it takes ~1-hour with GPU

In [None]:
# Number of epochs the training loop in the next cell runs for
num_epochs = 5

In [None]:
CLIP = 1
start_time = time.time()
for epoch in range(num_epochs):

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    # Checkpoint AFTER the update so the file holds the epoch just trained
    # (saving first would never persist the final epoch)
    if save_model:
      checkpoint = { "state_dict": model.state_dict(), "optimizer": optimizer.state_dict(),}
      save_checkpoint(checkpoint)

    # Short runs report every epoch; longer runs report every fifth epoch
    if num_epochs < 11 or epoch % 5 == 0:
      print(f'Epoch: {epoch+1:02}')
      print(f'\tTrain Loss: {train_loss:.3f}')
      print(f'\t Val. Loss: {valid_loss:.3f}')

end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)

print(f'Training Time: {epoch_mins}m {epoch_secs}s')

test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f}')

# Specify a path
PATH = trans + "_model.pt"

# Save the whole model object (not only its state_dict)
torch.save(model, PATH)

Epoch: 01
	Train Loss: 0.073
	 Val. Loss: 2.404
Epoch: 02
	Train Loss: 0.073
	 Val. Loss: 2.422
Epoch: 03
	Train Loss: 0.071
	 Val. Loss: 2.444
Epoch: 04
	Train Loss: 0.071
	 Val. Loss: 2.433
Epoch: 05
	Train Loss: 0.071
	 Val. Loss: 2.430
Training Time: 1m 31s
| Test Loss: 2.599


In [None]:
# Number of epochs for the longer training run in the next cell
num_epochs = 50

In [None]:
CLIP = 1
start_time = time.time()
for epoch in range(num_epochs):

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    # Checkpoint AFTER the update so the file holds the epoch just trained
    # (saving first would never persist the final epoch)
    if save_model:
      checkpoint = { "state_dict": model.state_dict(), "optimizer": optimizer.state_dict(),}
      save_checkpoint(checkpoint)

    # Report every fifth epoch
    if(epoch % 5 == 0):
      print(f'Epoch: {epoch+1:02}')
      print(f'\tTrain Loss: {train_loss:.3f}')
      print(f'\t Val. Loss: {valid_loss:.3f}')

end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)

print(f'Training Time: {epoch_mins}m {epoch_secs}s')

test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f}')

# Specify a path
PATH = trans + "_model.pt"

# Save the whole model object (not only its state_dict)
torch.save(model, PATH)

Epoch: 01
	Train Loss: 0.068
	 Val. Loss: 2.465
Epoch: 11
	Train Loss: 0.063
	 Val. Loss: 2.536
Epoch: 21
	Train Loss: 0.058
	 Val. Loss: 2.600
Epoch: 31
	Train Loss: 0.054
	 Val. Loss: 2.654
Epoch: 41
	Train Loss: 0.051
	 Val. Loss: 2.704
Training Time: 23m 43s
| Test Loss: 2.933


##Translation performance metric BLEU
## WARNING: Execute only if you have installed torchtext 0.6.0

In [None]:
from torchtext.data.metrics import bleu_score

# Put the model in evaluation mode before scoring
model.eval()

# Score only test examples 1..99; running on the entire test set takes a while
score = bleu(
    test_data[1:100], model, german, english, device,
    max_length=50, trans=trans,
)
print("Bleu score %.2f" % (score * 100))

##Sample runs: English to German

###Only execute if current translation is set to "en_de"

In [None]:
#check what current translation is set to
trans

'en_de'

In [None]:
example = vars(test_data[0])
sentence = example["src"]
translated_sentence = translate_sentence( model, sentence, german, english, device, max_length=50, trans=trans)
# Label the source from the active direction so the printout is never
# mislabeled when the notebook runs with trans = "de_en"
source_label = "English" if trans == "en_de" else "German"
print(f"{source_label}: ", sentence)
print("Model translation: ", translated_sentence)
print("Actual translation: ", example["trg"])

English:  ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.']
Model translation:  ['ein', 'mann', 'mit', 'orangefarbener', 'mütze', 'schaut', 'irgendetwas', 'an', '.', '<eos>']
Actual translation:  ['ein', 'mann', 'mit', 'einem', 'orangefarbenen', 'hut', ',', 'der', 'etwas', 'anstarrt', '.']


In [None]:
example = vars(test_data[1])
sentence = example["src"]
translated_sentence = translate_sentence( model, sentence, german, english, device, max_length=50, trans=trans)
# Label the source from the active direction so the printout is never
# mislabeled when the notebook runs with trans = "de_en"
source_label = "English" if trans == "en_de" else "German"
print(f"{source_label}: ", sentence)
print("Model translation: ", translated_sentence)
print("Actual translation: ", example["trg"])

English:  ['a', 'boston', 'terrier', 'is', 'running', 'on', 'lush', 'green', 'grass', 'in', 'front', 'of', 'a', 'white', 'fence', '.']
Model translation:  ['ein', 'fan', 'rennt', 'auf', 'einer', '<unk>', 'grünen', 'wiese', 'vor', 'einem', 'weißen', 'zaun', '.', '<eos>']
Actual translation:  ['ein', 'boston', 'terrier', 'läuft', 'über', 'saftig-grünes', 'gras', 'vor', 'einem', 'weißen', 'zaun', '.']


##Sample runs: German to English

###Only execute if current translation is set to "de_en"

In [None]:
#check what current translation is set to
trans

'en_de'

In [None]:
example = vars(test_data[0])
sentence = example["src"]
translated_sentence = translate_sentence( model, sentence, german, english, device, max_length=50, trans=trans)
# Label the source from the active direction so the printout is never
# mislabeled when the notebook runs with trans = "en_de"
source_label = "German" if trans == "de_en" else "English"
print(f"{source_label}: ", sentence)
print("Model translation: ", translated_sentence)
print("Actual translation: ", example["trg"])

German:  ['ein', 'mann', 'mit', 'einem', 'orangefarbenen', 'hut', ',', 'der', 'etwas', 'anstarrt', '.']
Model translation:  ['a', 'man', 'in', 'an', 'orange', 'hat', 'welding', 'something', '.', '<eos>']
Actual translation:  ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.']


In [None]:
example = vars(test_data[1])
sentence = example["src"]
translated_sentence = translate_sentence( model, sentence, german, english, device, max_length=50, trans=trans)
# Label the source from the active direction so the printout is never
# mislabeled when the notebook runs with trans = "en_de"
source_label = "German" if trans == "de_en" else "English"
print(f"{source_label}: ", sentence)
print("Model translation: ", translated_sentence)
print("Actual translation: ", example["trg"])

German:  ['ein', 'boston', 'terrier', 'läuft', 'über', 'saftig-grünes', 'gras', 'vor', 'einem', 'weißen', 'zaun', '.']
Model translation:  ['a', 'boston', '<unk>', 'terrier', 'runs', 'over', 'grass', 'in', 'front', 'of', 'a', 'white', 'fence', '.', '<eos>']
Actual translation:  ['a', 'boston', 'terrier', 'is', 'running', 'on', 'lush', 'green', 'grass', 'in', 'front', 'of', 'a', 'white', 'fence', '.']


In [None]:
sentence = "ein pferd geht unter einer brücke neben einem boot."
# This sample is German text, so it only makes sense for the German -> English
# direction; otherwise it would be tokenized and indexed as English.
if trans == "de_en":
    translated_sentence = translate_sentence( model, sentence, german, english, device, max_length=50, trans=trans)
    print("German: ", sentence)
    print("Model translation: ", translated_sentence)
else:
    print(f"Skipping German sample: trans is {trans!r}, expected 'de_en'")

German:  ein pferd geht unter einer brücke neben einem boot.
Model translation:  ['a', 'horse', 'is', 'walking', 'beside', 'a', 'boat', 'under', 'a', 'bridge', '.', '<eos>']
