In [1]:
import pandas as pd

In [549]:
portuguese_df_ted_talk = pd.read_csv('../data/pt_br_tedtalk.txt', sep='\t', header=None, names=['example_portuguese_sentence'])
portuguese_df_neulab = pd.read_csv('../data/pt_br_neulab.txt', sep='\t', header=None, names=['example_portuguese_sentence'])

In [550]:
portuguese_df_ted_talk.head()

Unnamed: 0,example_portuguese_sentence
0,"Em meados do século 16, os italianos ficavam e..."
1,"Porém, esse talento tinha um preço alto."
2,Para evitar que a voz desses cantores se quebr...
3,"Conhecidos como ""castrati"", a voz leve e angel..."
4,O interrompimento do desenvolvimento vocal pod...


In [551]:
portuguese_df_neulab.head()

Unnamed: 0,example_portuguese_sentence
0,Pensei em ler os meus poemas que tem relação c...
1,"Eu fiquei meio surpreso ao descobrir quantos, ..."
2,"O primeiro é dedicado a Spencer, e sua avó, qu..."
3,"Meu poema se chama """" Sujeira """"."
4,Minha avó está lavando minha boca com sabão; m...


In [552]:
# Load data with manual corrections
data_cleaned = pd.read_csv('../data/libras_dictionary.csv')
data_cleaned["source"] = "ines"

In [554]:
data_cleaned.head()

Unnamed: 0,word,subject,interpretation,example_portuguese_sentence,example_libras_sentence,grammar_class,word_origin,video_link,image_link,hand_image_link,source
0,A,NENHUM,Primeira letra do alfabeto da língua portugues...,Invente qualquer palavra que comece com a letr...,VOCÊ INVENTAR QUALQUER PALAVRA COMEÇAR A.,SUBSTANTIVO,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,ines
1,ABACATE,FRUTA,"O fruto do abacateiro. Comestível, tem a polpa...",Você gosta de abacate com leite?,VOCÊ GOSTAR ABACATE LEITE JUNTO?,SUBSTANTIVO,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,ines
2,ABACAXI,FRUTA,Fruta de casca grossa e áspera. Sua polpa pode...,"Hoje tomei suco de abacaxi, ele estava ácido.",HOJE S-U-C-O ABACAXI BEBER ÁCID@.,SUBSTANTIVO,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,ines
3,ABAFAR,NENHUM,"Cobrir ou fechar, para manter o calor.","Se você quer abafar seu quarto, é melhor fecha...",S-I VOCÊ QUERER QUARTO SE@ ABAFAR A-R? MELHOR ...,VERBO,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,ines
4,ABAIXO,NENHUM,"Lugar, posição ou situação inferior, em relaçã...","Não é no primeiro apartamento abaixo, é no seg...",APARTAMENTO PRIMEIR@ NÃO SEGUND@ ABAIXO.,ADV.,Nacional,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,https://www.ines.gov.br/dicionario-de-libras/p...,ines


In [556]:
# Get the mean size of the sentences

print(f"Mean words: {data_cleaned['example_portuguese_sentence'].str.split().str.len().mean()}")
print(f"Mean characters: {data_cleaned['example_portuguese_sentence'].str.len().mean()}")

Mean words: 8.660837950138504
Mean characters: 47.44719529085872


In [557]:
# Keep only the sentences with less than 50 characters

portuguese_df_ted_talk = portuguese_df_ted_talk[(portuguese_df_ted_talk['example_portuguese_sentence'].str.len() <= 50) & (portuguese_df_ted_talk['example_portuguese_sentence'].str.split().str.len() >= 8)]
portuguese_df_ted_talk["source"] = "opus_nlpl"

In [558]:
# Keep only the sentences with less than 50 characters

portuguese_df_neulab = portuguese_df_neulab[(portuguese_df_neulab['example_portuguese_sentence'].str.len() <= 50) & (portuguese_df_neulab['example_portuguese_sentence'].str.split().str.len() >= 8)]
portuguese_df_neulab["source"] = "opus_nlpl"

In [575]:
data = pd.concat([
    data_cleaned[["example_portuguese_sentence", "source"]], 
    portuguese_df_neulab[["example_portuguese_sentence", "source"]],
    portuguese_df_ted_talk[["example_portuguese_sentence", "source"]]],
    ignore_index=True
)

In [576]:
data.head(100)

Unnamed: 0,example_portuguese_sentence,source
0,Invente qualquer palavra que comece com a letr...,ines
1,Você gosta de abacate com leite?,ines
2,"Hoje tomei suco de abacaxi, ele estava ácido.",ines
3,"Se você quer abafar seu quarto, é melhor fecha...",ines
4,"Não é no primeiro apartamento abaixo, é no seg...",ines
...,...,...
95,"Se o freio estiver com problema, é perigoso o ...",ines
96,Aquele homem tem muita acne no rosto.,ines
97,"A janela da minha casa é de aço, difícil de qu...",ines
98,Aquele homem açoitou o cavalo para que acelera...,ines


In [577]:
data.shape

(54581, 2)

In [562]:
import torch
import torch.nn as nn
import torch.optim as optmi

from torchtext.data import Field, BucketIterator, TabularDataset
from torch.utils.data import random_split, TensorDataset, DataLoader, Dataset
import numpy as np
import pandas as pd
import spacy
import random
from torchtext.data.metrics import bleu_score
from pprint import pprint
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
import spacy

# Seeding for reproducible results everytime
SEED = 777

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### Data Preparation and Preprocessing

Let's see some of the process it can do,

* Train/ Valid/ Test Split: partition your data into a specified train/ valid/ test set.

* File Loading: load the text corpus of various formats (.txt,.json,.csv).

* Tokenization: breaking sentences into list of words.

* Vocab: Generate a list of vocabulary from the text corpus.

* Words to Integer Mapper: Map words into integer numbers for the entire corpus and vice versa.

* Word Vector: Convert a word from higher dimension to lower dimension (Word Embedding).

* Batching: Generate batches of sample.

In [563]:
# spacy portuguese tokenizer
nlp = spacy.load('pt_core_news_sm')

In [572]:
doc = nlp(clean_text(data['example_portuguese_sentence'][15]))

In [573]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_)

o o DET det
médico médico NOUN nsubj
me eu PRON obj
mandou mandar VERB ROOT
tirar tirar VERB xcomp
raio raio NOUN obj
x x VERB obj
do de o ADP case
abdômen Abdômen NOUN iobj
. . PUNCT punct


In [569]:
for token in doc:
    if token.pos_ == 'VERB'\
        or token.pos_ == 'NOUN' \
        or token.pos_ == 'ADJ' \
        or token.pos_ == 'ADV' \
        or token.pos_ == 'NUM' \
        or token.pos_ == 'PROPN' \
        or token.pos_ == 'PRON' \
        or token.pos_ == 'SCONJ':
        print(token.lemma_, token.pos_, token.dep_)


médico NOUN nsubj
eu PRON obj
mandar VERB ROOT
tirar VERB xcomp
raio NOUN obj
x VERB obj
Abdômen NOUN iobj


In [578]:
# Organize sentences in subject, verb, object

def get_subjects(doc):

    subjects = []
    for token in doc:
        if token.dep_ == 'nsubj':
            subjects.append(token.lemma_)
    return subjects

def get_verbs(doc):
    
    verbs = []
    for token in doc:
        if token.pos_ == 'VERB':
            verbs.append(token.lemma_)
    return verbs

def get_objects(doc):

    objects = []
    for token in doc:
        if token.dep_ == 'obj' or token.dep_ == 'dobj' or token.dep_ == 'iobj':
            objects.append(token.lemma_)
    return objects


def get_nouns(doc):

    nouns = []
    for token in doc:
        if token.pos_ == 'NOUN':
            nouns.append(token.lemma_)
    return nouns

def get_adjectives(doc):

    adjectives = []
    for token in doc:
        if token.pos_ == 'ADJ':
            adjectives.append(token.lemma_)
    return adjectives

def get_adverbs(doc):

    adverbs = []
    for token in doc:
        if token.pos_ == 'ADV':
            adverbs.append(token.lemma_)
    return adverbs

def get_numbers(doc):

    numbers = []
    for token in doc:
        if token.pos_ == 'NUM':
            numbers.append(token.lemma_)
    return numbers


def get_conjunctions(doc):

    conjunctions = []
    for token in doc:
        if token.pos_ == 'SCONJ':
            conjunctions.append(token.lemma_)
    return conjunctions

def get_pronouns(doc):

    pronouns = []
    for token in doc:
        if token.pos_ == 'PRON':
            pronouns.append(token.lemma_)
    return pronouns

def get_proper_nouns(doc):

    proper_nouns = []
    for token in doc:
        if token.pos_ == 'PROPN':
            proper_nouns.append(token.lemma_)
    return proper_nouns


def organize_sentence_svo(doc):
    sentence = []
    sentence.append(get_subjects(doc))
    sentence.append(get_verbs(doc))
    sentence.append(get_objects(doc))
    sentence.append(get_nouns(doc))
    sentence.append(get_adjectives(doc))
    sentence.append(get_adverbs(doc))
    sentence.append(get_numbers(doc))
    #sentence.append(get_conjunctions(doc))
    #sentence.append(get_pronouns(doc))
    #sentence.append(get_proper_nouns(doc))

    # remove empty lists

    sentence = [list(set(x)) for x in sentence if x != []]

    # Join the lists

    sentence = [item for sublist in sentence for item in sublist]

    # Rmove duplicate words and join the sentence, keeping the order

    sentence = ' '.join(list(dict.fromkeys(sentence)))

    return sentence

In [580]:
organize_sentence_svo(nlp(data['example_portuguese_sentence'][30]))

'aborrecer valer pena não'

In [581]:
# Apply the function to the dataset

data['example_libras_sentence_svo'] = data['example_portuguese_sentence'].apply(lambda x: organize_sentence_svo(nlp(x)))

In [582]:
data[["example_portuguese_sentence", "example_libras_sentence_svo"]].to_csv('../data/data_svo.csv', index=False)

In [583]:
import re

def clean_text(text):

    # Remove characters that are not letters
    text = re.sub(r"[^a-zA-ZÀ-ú.!?]+", ' ', text)
    
    # Remove extra spaces
    text = re.sub(r" +", ' ', text)

    # Remove extra spaces at the beginning and end of the sentence
    text = text.strip()

    # Convert to lower case
    text = text.lower()

    return text

In [584]:
def spacy_tokenizer(text):

    text = clean_text(text)

    return [tok.text for tok in nlp.tokenizer(text)]

In [585]:
libras_sentences_source = Field(
    tokenize = spacy_tokenizer,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

portuguese_sentences_target = Field(
    tokenize = spacy_tokenizer,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

In [586]:
dataset = TabularDataset(path='../data/data_svo.csv', format='csv', fields=[('trg', portuguese_sentences_target), ('src', libras_sentences_source)], skip_header=False)

In [587]:
train_data, val_data, test_data = dataset.split(split_ratio=[0.8, 0.1, 0.1], random_state=random.seed(SEED))

In [588]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(val_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 43666
Number of validation examples: 5458
Number of testing examples: 5458


In [589]:
print("Tain sentence example - Libras ", train_data.examples[3].src)
print("Tain sentence example - Portuguese ", train_data.examples[3].trg)

Tain sentence example - Libras  ['que', 'resposta', 'faltar']
Tain sentence example - Portuguese  ['e', 'a', 'resposta', 'é', 'o', 'que', 'está', 'faltando', '?']


In [590]:
libras_sentences_source.build_vocab(train_data, min_freq=2)
portuguese_sentences_target.build_vocab(train_data, min_freq=2)

In [591]:
print(f"Unique tokens in source (libras) vocabulary: {len(libras_sentences_source.vocab)}")
print(f"Unique tokens in target (portuguse) vocabulary: {len(portuguese_sentences_target.vocab)}")

Unique tokens in source (libras) vocabulary: 8504
Unique tokens in target (portuguse) vocabulary: 12778


In [592]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [593]:
BATCH_SIZE = 32

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    shuffle=True
)

In [500]:
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

In [501]:
class Encoder(nn.Module):
    """
    Input :
        - source batch
    Layer : 
        source batch -> Embedding -> LSTM
    Output :
        - LSTM hidden state
        - LSTM cell state

    Parmeters
    ---------
    input_dim : int
        Input dimension, should equal to the source vocab size.
    
    emb_dim : int
        Embedding layer's dimension.
        
    hid_dim : int
        LSTM Hidden/Cell state's dimension.
        
    n_layers : int
        Number of LSTM layers.
        
    dropout : float
        Dropout for the LSTM layer.
    """

    def __init__(self, input_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float):
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

    def forward(self, src_batch: torch.LongTensor):
        """

        Parameters
        ----------
        src_batch : 2d torch.LongTensor
            Batched tokenized source sentence of shape [sent len, batch size].

        Returns
        -------
        hidden, cell : 3d torch.LongTensor
            Hidden and cell state of the LSTM layer. Each state's shape
            [n layers * n directions, batch size, hidden dim]
        """
        embedded = self.embedding(src_batch) # [sent len, batch size, emb dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs -> [sent len, batch size, hidden dim * n directions]
        return hidden, cell

In [502]:
class Decoder(nn.Module):
    """
    Input :
        - first token in the target batch
        - LSTM hidden state from the encoder
        - LSTM cell state from the encoder
    Layer :
        target batch -> Embedding -- 
                                   |
        encoder hidden state ------|--> LSTM -> Linear
                                   |
        encoder cell state   -------
        
    Output :
        - prediction
        - LSTM hidden state
        - LSTM cell state

    Parmeters
    ---------
    output : int
        Output dimension, should equal to the target vocab size.
    
    emb_dim : int
        Embedding layer's dimension.
        
    hid_dim : int
        LSTM Hidden/Cell state's dimension.
        
    n_layers : int
        Number of LSTM layers.
        
    dropout : float
        Dropout for the LSTM layer.
    """

    def __init__(self, output_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float):
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.out = nn.Linear(hid_dim, output_dim)

    def forward(self, trg: torch.LongTensor, hidden: torch.FloatTensor, cell: torch.FloatTensor):
        """

        Parameters
        ----------
        trg : 1d torch.LongTensor
            Batched tokenized source sentence of shape [batch size].
            
        hidden, cell : 3d torch.FloatTensor
            Hidden and cell state of the LSTM layer. Each state's shape
            [n layers * n directions, batch size, hidden dim]

        Returns
        -------
        prediction : 2d torch.LongTensor
            For each token in the batch, the predicted target vobulary.
            Shape [batch size, output dim]

        hidden, cell : 3d torch.FloatTensor
            Hidden and cell state of the LSTM layer. Each state's shape
            [n layers * n directions, batch size, hidden dim]
        """
        # [1, batch size, emb dim], the 1 serves as sent len
        embedded = self.embedding(trg.unsqueeze(0))
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.out(outputs.squeeze(0))
        return prediction, hidden, cell

In [503]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, device: torch.device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            'Hidden dimensions of encoder and decoder must be equal!'
        assert encoder.n_layers == decoder.n_layers, \
            'Encoder and decoder must have equal number of layers!'

    def forward(self, src_batch: torch.LongTensor, trg_batch: torch.LongTensor,
                teacher_forcing_ratio: float=0.5):

        max_len, batch_size = trg_batch.shape
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder's output
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        # last hidden & cell state of the encoder is used as the decoder's initial hidden state
        hidden, cell = self.encoder(src_batch)

        trg = trg_batch[0]
        for i in range(1, max_len):
            prediction, hidden, cell = self.decoder(trg, hidden, cell)
            outputs[i] = prediction

            if random.random() < teacher_forcing_ratio:
                trg = trg_batch[i]
            else:
                trg = prediction.argmax(1)

        return outputs

In [504]:
for batch in train_iterator:
    print(batch.src.shape)
    print(batch.trg.shape)

torch.Size([7, 32])
torch.Size([14, 32])
torch.Size([7, 32])
torch.Size([15, 32])
torch.Size([4, 32])
torch.Size([14, 32])
torch.Size([8, 32])
torch.Size([17, 32])
torch.Size([7, 32])
torch.Size([14, 32])
torch.Size([7, 32])
torch.Size([14, 32])
torch.Size([8, 32])
torch.Size([14, 32])
torch.Size([7, 32])
torch.Size([13, 32])
torch.Size([7, 32])
torch.Size([13, 32])
torch.Size([9, 32])
torch.Size([15, 32])
torch.Size([7, 32])
torch.Size([12, 32])
torch.Size([7, 32])
torch.Size([14, 32])
torch.Size([8, 32])
torch.Size([16, 32])
torch.Size([5, 32])
torch.Size([13, 32])
torch.Size([5, 32])
torch.Size([13, 32])
torch.Size([8, 32])
torch.Size([14, 32])
torch.Size([8, 32])
torch.Size([14, 32])
torch.Size([9, 32])
torch.Size([15, 32])
torch.Size([4, 32])
torch.Size([15, 32])
torch.Size([7, 32])
torch.Size([13, 32])
torch.Size([5, 32])
torch.Size([14, 32])
torch.Size([7, 32])
torch.Size([14, 32])
torch.Size([7, 32])
torch.Size([13, 32])
torch.Size([10, 32])
torch.Size([20, 32])
torch.Size([10,

In [505]:
INPUT_DIM = len(libras_sentences_source.vocab)
OUTPUT_DIM = len(portuguese_sentences_target.vocab)
ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)

dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT).to(device)

In [506]:
SRC_PAD_IDX = libras_sentences_source.vocab.stoi[libras_sentences_source.pad_token]
TRG_PAD_IDX = portuguese_sentences_target.vocab.stoi[portuguese_sentences_target.pad_token]

model = Seq2Seq(enc, dec, device).to(device)

In [507]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,192,822 trainable parameters


In [508]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [509]:
model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(6207, 32)
    (rnn): LSTM(32, 64, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(9046, 32)
    (rnn): LSTM(32, 64, num_layers=2, dropout=0.5)
    (out): Linear(in_features=64, out_features=9046, bias=True)
  )
)

In [510]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [511]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [512]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()

        output = model(src, trg)

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [513]:
def train(seq2seq, iterator, optimizer, criterion):
    seq2seq.train()

    epoch_loss = 0
    for batch in iterator:
        optimizer.zero_grad()
        outputs = seq2seq(batch.src, batch.trg)

        # 1. as mentioned in the seq2seq section, we will
        # cut off the first element when performing the evaluation
        # 2. the loss function only works on 2d inputs
        # with 1d targets we need to flatten each of them
        outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
        trg_flatten = batch.trg[1:].view(-1)
        loss = criterion(outputs_flatten, trg_flatten)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [514]:
def evaluate(seq2seq, iterator, criterion):
    seq2seq.eval()

    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            # turn off teacher forcing
            outputs = seq2seq(batch.src, batch.trg, teacher_forcing_ratio=0) 

            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]
            outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
            trg_flatten = batch.trg[1:].view(-1)
            loss = criterion(outputs_flatten, trg_flatten)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [515]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [537]:
import math
import time


N_EPOCHS = 2
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'slt_libras.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 11s
	Train Loss: 4.695 | Train PPL: 109.423
	 Val. Loss: 4.899 |  Val. PPL: 134.123
Epoch: 02 | Time: 0m 11s
	Train Loss: 4.641 | Train PPL: 103.672
	 Val. Loss: 4.873 |  Val. PPL: 130.759


In [538]:
model.load_state_dict(torch.load('slt_libras.pt'))

test_loss = evaluate(model, test_iterator, criterion)

In [539]:
test_loss

4.873590140216119

In [540]:
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 4.874 | Test PPL: 130.790 |


In [541]:
def translate_sentence(model, sentence, libras, portuguese, device, max_length=50):

    if type(sentence) == str:
        tokens = tokenizer.tokenize(sentence)
    else:
        tokens = [token.lower() for token in sentence]
    tokens.insert(0, libras.init_token)
    tokens.append(libras.eos_token)
    text_to_indices = [libras.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [portuguese.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == portuguese.vocab.stoi["<eos>"]:
            break

    translated_sentence = [portuguese.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]

In [542]:
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(6207, 32)
    (rnn): LSTM(32, 64, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(9046, 32)
    (rnn): LSTM(32, 64, num_layers=2, dropout=0.5)
    (out): Linear(in_features=64, out_features=9046, bias=True)
  )
)

In [543]:
print("Test sentence: ", test_data[1].src)
print("Correct translation: ", test_data[1].trg)

Test sentence:  ['ter', 'frente', 'caminho', 'longo']
Correct translation:  ['e', 'temos', 'um', 'longo', 'caminho', 'a', 'nossa', 'frente']


In [544]:
test_sentence = ["eu férias julho planejar viajar europa."]
correct_sentence = ["eu tenho férias em julho e vou planejar uma viagem `a europa."]

In [545]:
print("Translated sentence: ", translate_sentence(model, test_sentence, libras_sentences_source, portuguese_sentences_target, device))

Translated sentence:  ['e', 'o', '<unk>', '<unk>', '<unk>', '<unk>', '.', '<eos>']


In [546]:
example_idx = 5
example = val_data.examples[example_idx]
print('source sentence: ', ' '.join(example.src))
print('target sentence: ', ' '.join(example.trg))

source sentence:  médico ter paciência bom
target sentence:  meu médico é bom tem paciência .


In [547]:
src_tensor = libras_sentences_source.process([example.src]).to(device)
trg_tensor = portuguese_sentences_target.process([example.trg]).to(device)
print(trg_tensor.shape)

model.eval()
with torch.no_grad():
    outputs = model(src_tensor, trg_tensor, teacher_forcing_ratio=0)

outputs.shape

torch.Size([9, 1])


torch.Size([9, 1, 9046])

In [548]:
output_idx = outputs[1:].squeeze(1).argmax(1)
' '.join([portuguese_sentences_target.vocab.itos[idx] for idx in output_idx])

'o <unk> é o que o <unk> .'