In [None]:
!wget https://www.ivan-sipiran.com/downloads/nmt_data.zip

In [None]:
!unzip nmt_data.zip

In [None]:
!mkdir data
!mv *.txt data/

In [None]:
from typing import List, Dict, Tuple, Sequence, Any
from collections import Counter
from itertools import chain
import random

from collections import defaultdict
import numpy as np
import datetime


import torch

# Convert a raw (source, target) sentence pair into sequences of vocabulary indices.
def preprocess(
    raw_src_sentence: List[str],  # original source sentence (list of words)
    raw_trg_sentence: List[str],  # original target sentence (list of words)
    src_word2idx: Dict[str, int],  # vocabulary of the source language
    trg_word2idx: Dict[str, int],  # vocabulary of the target language
    max_len: int  # maximum sequence length
) -> Tuple[List[int], List[int]]:  # the two sentences, encoded as index lists
    """ Sentence preprocessor for Seq2Seq with Attention

    Preprocess rules:
    1. Every word is converted into its index with the word2idx of its own language.
       Words missing from the dictionary are replaced by the <UNK> index.
    2. <SOS> and <EOS> are added around the target sentence only.
    3. The source sentence keeps at most max_len words.
       The target sentence keeps at most max_len - 2 words so that, once <SOS> and <EOS>
       are added, it does not exceed max_len either. Truncation drops the END of the sentence.

    Note: when max_len < 2 the target sentence is reduced to [<SOS>, <EOS>]
    (a negative slice bound would otherwise silently drop words from the end
    instead of truncating to nothing).

    Arguments:
    raw_src_sentence -- raw source sentence without any modification
    raw_trg_sentence -- raw target sentence without any modification
    src_word2idx -- dictionary for source language which maps words to their unique numbers
    trg_word2idx -- dictionary for target language which maps words to their unique numbers
    max_len -- maximum length of sentences

    Return:
    src_sentence -- preprocessed source sentence
    trg_sentence -- preprocessed target sentence
    """
    # Indices of the special tokens
    UNK = Language.UNK_TOKEN_IDX  # unknown word (not in the dictionary)
    SOS = Language.SOS_TOKEN_IDX  # start of sequence
    EOS = Language.EOS_TOKEN_IDX  # end of sequence

    # Clamp the budgets at zero: a negative bound would turn the slice into
    # "drop the last k words" rather than "keep the first k words".
    src_budget = max(max_len, 0)
    trg_budget = max(max_len - 2, 0)  # two slots are reserved for <SOS> and <EOS>

    # Encode the source words, falling back to <UNK> for out-of-vocabulary words.
    src_sentence: List[int] = [
        src_word2idx.get(word, UNK) for word in raw_src_sentence[:src_budget]
    ]

    # Encode the target words and wrap them with <SOS> ... <EOS>.
    trg_sentence: List[int] = [SOS]
    trg_sentence.extend(
        trg_word2idx.get(word, UNK) for word in raw_trg_sentence[:trg_budget]
    )
    trg_sentence.append(EOS)

    return src_sentence, trg_sentence


def bucketed_batch_indices(
        sentence_length: List[Tuple[int, int]],
        batch_size: int,
        max_pad_len: int
) -> List[List[int]]:
    """ Group sample indices into batches that need little padding

    <PAD> tokens are ignored by the loss but still cost memory and compute, so
    samples are bucketed by length before batching. Samples are binned on a
    two-dimensional grid (source length x target length) whose cells are
    max_pad_len + 1 lengths wide. Inside a cell, source lengths differ by at most
    max_pad_len and so do target lengths, hence no sentence of a batch needs more
    than max_pad_len <PAD> tokens. Each cell is then cut into chunks of at most
    batch_size samples (the last chunk of a cell may be smaller).

    The result can be passed to torch.utils.data.DataLoader as batch_sampler.

    Arguments:
    sentence_length -- list of (length of source_sentence, length of target_sentence) pairs.
    batch_size -- maximum number of samples per batch (must be positive)
    max_pad_len -- maximum number of <PAD> tokens any sentence of a batch may need
                   (must be non-negative; 0 groups only identical length pairs)

    Return:
    batch_indices_list -- list of batches, each batch being a list of indices into
                          sentence_length. Every index appears exactly once.
                          The order of the batches is shuffled with the `random` module.
                          An empty input yields an empty list.

    Raises:
    ValueError -- if batch_size <= 0 or max_pad_len < 0

    Example (one dimension shown for brevity):
    with lengths [7, 4, 9, 2, 5, 10], batch_size = 3 and max_pad_len = 3, the cells
    cover lengths 2-5, 6-9 and 10-13, so the batches are [1, 3, 4], [0, 2] and [5]
    (in shuffled order).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if max_pad_len < 0:
        raise ValueError("max_pad_len must be non-negative")

    # int() also accepts 0-dim tensors / numpy integers, not just Python ints.
    lengths = [(int(src_len), int(trg_len)) for src_len, trg_len in sentence_length]
    if not lengths:
        return []

    # Anchor the grid at the shortest sentences so the first cell is fully used.
    min_src = min(src_len for src_len, _ in lengths)
    min_trg = min(trg_len for _, trg_len in lengths)
    # A cell spanning max_pad_len + 1 consecutive lengths has a spread of max_pad_len.
    cell_width = max_pad_len + 1

    cells = defaultdict(list)
    for sample_idx, (src_len, trg_len) in enumerate(lengths):
        cell = ((src_len - min_src) // cell_width, (trg_len - min_trg) // cell_width)
        cells[cell].append(sample_idx)

    batch_indices_list: List[List[int]] = []
    for members in cells.values():
        for start in range(0, len(members), batch_size):
            batch_indices_list.append(members[start:start + batch_size])

    # Shuffle the batches: consecutive batches would otherwise share similar lengths.
    random.shuffle(batch_indices_list)

    return batch_indices_list


def collate_fn(
    batched_samples: List[Tuple[List[int], List[int]]]
) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Collate function

    Sentences have variable length, so they are right-padded with <PAD> to the
    longest sentence of the batch (separately for source and target) and stacked
    into batch-first tensors. The rows are then reordered by decreasing source
    sentence length; source and target rows are reordered together.

    This function can be passed to torch.utils.data.DataLoader as collate_fn.

    Arguments:
    batched_samples -- list of (source_sentence, target_sentence) pairs of index lists

    Return:
    src_sentences -- batched source sentences
                        in shape (batch_size, max_src_sentence_length), dtype torch.long
    trg_sentences -- batched target sentences
                        in shape (batch_size, max_trg_sentence_length), dtype torch.long
    """
    PAD = Language.PAD_TOKEN_IDX
    batch_size = len(batched_samples)

    # One 1-D long tensor per sentence; pad_sequence does the padding in a single
    # call (building a tensor from a list of numpy arrays is slow and warns).
    src = [torch.as_tensor(sample[0], dtype=torch.long) for sample in batched_samples]
    trg = [torch.as_tensor(sample[1], dtype=torch.long) for sample in batched_samples]
    src_lengths = torch.tensor([sentence.size(0) for sentence in src], dtype=torch.long)

    src_sentences = torch.nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=PAD)
    trg_sentences = torch.nn.utils.rnn.pad_sequence(trg, batch_first=True, padding_value=PAD)

    # Longest source sentence first, as pack_padded_sequence(enforce_sorted=True) expects.
    _, sorted_indices = torch.sort(src_lengths, dim=0, descending=True)
    src_sentences = src_sentences.index_select(0, sorted_indices)
    trg_sentences = trg_sentences.index_select(0, sorted_indices)

    assert src_sentences.shape[0] == batch_size and trg_sentences.shape[0] == batch_size
    assert src_sentences.dtype == torch.long and trg_sentences.dtype == torch.long
    return src_sentences, trg_sentences

# A language corpus: the tokenised sentences of a text file plus the
# dictionaries that map words to indices and indices back to words.
class Language(Sequence[List[str]]):
    # Special tokens and their fixed positions at the start of the vocabulary
    PAD_TOKEN = '<PAD>'
    PAD_TOKEN_IDX = 0
    UNK_TOKEN = '<UNK>'
    UNK_TOKEN_IDX = 1
    SOS_TOKEN = '<SOS>'
    SOS_TOKEN_IDX = 2
    EOS_TOKEN = '<EOS>'
    EOS_TOKEN_IDX = 3

    def __init__(self, path: str) -> None:
        """Read the UTF-8 text file at `path`; each line becomes one whitespace-split sentence.

        The vocabulary is left unset (None) until build_vocab or set_vocab is called.
        """
        sentences: List[List[str]] = []
        with open(path, mode='r', encoding='utf-8') as corpus:
            for line in corpus:
                sentences.append(line.split())
        self._sentences: List[List[str]] = sentences

        self.word2idx: Dict[str, int] = None
        self.idx2word: List[str] = None

    def build_vocab(self, min_freq: int=2) -> None:
        """Build the vocabulary from this corpus.

        The four special tokens come first, followed by every word that occurs
        at least `min_freq` times, in order of first appearance.
        """
        frequencies = Counter(chain.from_iterable(self._sentences))
        vocabulary: List[str] = [
            Language.PAD_TOKEN,
            Language.UNK_TOKEN,
            Language.SOS_TOKEN,
            Language.EOS_TOKEN,
        ]
        for word, count in frequencies.items():
            if count >= min_freq:
                vocabulary.append(word)

        self.idx2word = vocabulary
        self.word2idx = dict(zip(vocabulary, range(len(vocabulary))))

    def set_vocab(self, word2idx: Dict[str, int], idx2word: List[str]) -> None:
        """Use an externally built vocabulary (e.g. the one of the training corpus)."""
        self.word2idx, self.idx2word = word2idx, idx2word

    def __getitem__(self, index: int) -> List[str]:
        """Return the sentence stored at position `index`."""
        return self._sentences[index]

    def __len__(self) -> int:
        """Return the number of sentences in the corpus."""
        return len(self._sentences)

# Translation dataset built from two parallel languages (source and target).
class NmtDataset(Sequence[Tuple[List[int], List[int]]]):
    def __init__(self, src: Language, trg: Language, max_len: int=30) -> None:
        """Pair up two parallel corpora.

        Arguments:
        src -- source language; its vocabulary must already be set
        trg -- target language; its vocabulary must already be set
        max_len -- maximum sentence length forwarded to preprocess

        Raises:
        ValueError -- if the corpora differ in length or a vocabulary is missing
        """
        # Explicit exceptions instead of assert: assertions vanish under `python -O`.
        if len(src) != len(trg):
            raise ValueError(
                'source and target corpora must have the same number of sentences '
                '({} != {})'.format(len(src), len(trg))
            )
        if src.word2idx is None or trg.word2idx is None:
            raise ValueError('both languages need a vocabulary (call build_vocab or set_vocab first)')

        self._src = src
        self._trg = trg
        self._max_len = max_len

    # Preprocess the sentence pair at `index` and return it as two index lists
    def __getitem__(self, index: int) -> Tuple[List[int], List[int]]:
        return preprocess(self._src[index], self._trg[index], self._src.word2idx, self._trg.word2idx, self._max_len)

    def __len__(self) -> int:
        """Return the number of sentence pairs."""
        return len(self._src)

In [None]:
from abc import ABC, abstractmethod
import random

import torch 
import torch.nn as nn

class AttentionBase(nn.Module, ABC):
    """ Common interface of the attention modules

    Concrete attention layers subclass this and implement forward.
    """
    @abstractmethod
    def forward(self, encoder_hidden: torch.Tensor, encoder_mask: torch.Tensor, decoder_hidden: torch.Tensor, decoder_mask: torch.Tensor):
        """ Abstract attention forward function

        Implementations must accept the arguments below and return both the
        context vector and the attention distribution (the latter is used for plotting).

        Parameters:
        encoder_hidden -- encoder hidden states (h^enc in the handout)
                            in shape (batch_size, encoder_sequence_length, hidden_dim)
                            The hidden_dim values are all zero at <PAD> positions.
        encoder_mask -- <PAD> mask of the encoder
                            in shape (batch_size, sequence_length) with torch.bool type
                            True for <PAD> and False for non-<PAD>
                            Equal to (encoder_hidden == 0.).all(-1)
        decoder_hidden -- current decoder hidden state (h^dec_t in the handout)
                            in shape (batch_size, hidden_dim)
                            The hidden_dim values are all zero at <PAD> positions.
        decoder_mask -- <PAD> mask of the decoder
                            in shape (batch_size, ) with torch.bool type
                            True for <PAD> and False for non-<PAD>
                            Equal to (decoder_hidden == 0.).all(-1)

        Return:
        attention (context) -- result of the attention (a_t in the handout)
                            in shape (batch_size, hidden_dim)
        distribution (attention weights) -- attention distribution (alpha_t in the handout)
                            in shape (batch_size, encoder_sequence_length)
        """

# Attention based on the dot product
class DotAttention(AttentionBase):
    def forward(self,
        encoder_hidden: torch.Tensor,  # all hidden states of the encoder
        encoder_mask: torch.Tensor,  # marks the positions that hold <PAD> tokens
        decoder_hidden: torch.Tensor,
        decoder_mask: torch.Tensor
    ):
        """ Dot product attention

        Scores every encoder state by its dot product with the decoder state,
        turns the scores into a distribution over the non-<PAD> encoder positions
        and returns the distribution-weighted sum of the encoder states.

        Parameters / Returns: same as forward function in Attention base class
        """
        batch_size, sequence_length, hidden_dim = encoder_hidden.shape

        # The masks must agree with the all-zero rows of the hidden states.
        assert (encoder_mask == (encoder_hidden == 0.).all(-1)).all()
        assert (decoder_mask == (decoder_hidden == 0.).all(-1)).all()

        # Dot product between each encoder state and the decoder state
        query = decoder_hidden.unsqueeze(-1)
        scores = torch.bmm(encoder_hidden, query).squeeze(2)
        # -inf scores become zero probability after the softmax
        scores[encoder_mask] = float('-inf')
        distribution: torch.Tensor = torch.softmax(scores, dim=1)

        # Context vector: encoder states weighted by the attention distribution
        weights = distribution.unsqueeze(-1)
        attention: torch.Tensor = torch.bmm(encoder_hidden.transpose(1, 2), weights).squeeze(2)

        assert attention.shape == torch.Size([batch_size, hidden_dim])
        assert distribution.shape == torch.Size([batch_size, sequence_length])

        # Rows whose decoder position is <PAD> get a zero context vector.
        attention[decoder_mask, :] = 0.

        return attention, distribution

# Additive (concat) attention
class ConcatAttention(AttentionBase):
    def __init__(self, hidden_dim):
        """ Concat attention initializer

        Parameters:
        hidden_dim -- base hidden size. forward concatenates an encoder state with the
                      decoder state and multiplies the result by W_a^T, so the two states
                      together must have 4 * hidden_dim features (2 * hidden_dim each).

        Attributes:
        W_a -- Attention weight in the handout
                in shape (hidden_dim, 4 * hidden_dim)
        v_a -- Attention vector in the handout
                in shape (hidden_dim, )
        """
        super().__init__()

        self.W_a = nn.Parameter(torch.empty([hidden_dim, 4 * hidden_dim]))
        self.v_a = nn.Parameter(torch.empty([hidden_dim]))

        self.init_weights()

    def init_weights(self):
        """Draw W_a and v_a from a standard normal distribution."""
        # nn.init works in-place without tracking gradients; going through .data is unnecessary.
        nn.init.normal_(self.W_a)
        nn.init.normal_(self.v_a)

    def forward(self,
        encoder_hidden: torch.Tensor,
        encoder_mask: torch.Tensor,
        decoder_hidden: torch.Tensor,
        decoder_mask: torch.Tensor
    ):
        """ Concat attention forward function

        Computes the score v_a . tanh(W_a [h_enc; h_dec]) for every encoder position,
        normalises the scores over the non-<PAD> positions with a softmax and returns
        the distribution-weighted sum of the encoder states.

        Parameters / Returns: same as forward function in Attention base class
        """
        # NOTE: this local hidden_dim is the feature size of encoder_hidden,
        # not the hidden_dim that was passed to the constructor.
        batch_size, sequence_length, hidden_dim = encoder_hidden.shape

        # The masks must agree with the all-zero rows of the hidden states.
        assert (encoder_mask == (encoder_hidden == 0.).all(-1)).all()
        assert (decoder_mask == (decoder_hidden == 0.).all(-1)).all()

        # Repeat the decoder state along the sequence axis and concatenate it to every encoder state
        decoder_hidden_expand = decoder_hidden.unsqueeze(dim=1).expand(batch_size, sequence_length, hidden_dim)
        concat_hidden = torch.cat((encoder_hidden, decoder_hidden_expand), dim=-1)

        # score = v_a . tanh(W_a [h_enc; h_dec])
        projected = torch.tanh(concat_hidden.matmul(self.W_a.transpose(0, 1)))
        attention_energy = projected.matmul(self.v_a)

        # -inf scores become zero probability after the softmax
        attention_energy[encoder_mask] = float('-inf')
        distribution: torch.Tensor = torch.nn.functional.softmax(attention_energy, dim=1)

        # Context vector: encoder states weighted by the attention distribution
        attention: torch.Tensor = encoder_hidden.transpose(1, 2).bmm(distribution.unsqueeze(-1)).squeeze(-1)

        # The original check was `if shape != ...: assert True`, which can never fail.
        assert attention.shape == torch.Size([batch_size, hidden_dim])
        assert distribution.shape == torch.Size([batch_size, sequence_length])

        # Rows whose decoder position is <PAD> get a zero context vector.
        attention[decoder_mask, :] = 0.

        return attention, distribution

In [None]:
import random

import torch 
import torch.nn as nn


import numpy as np

# Neural network for Neural Machine Translation.
# It is built from the two languages and the type of attention to use.
class Seq2Seq(torch.nn.Module):
    def __init__(self, src: Language, trg: Language, attention_type: str, embedding_dim: int=128, hidden_dim: int=64):
        """ Seq2Seq with Attention model

        Parameters:
        src -- source language vocabs
        trg -- target language vocabs
        attention_type -- internal attention type: 'dot' or 'concat'
        embedding_dim -- embedding dimension
        hidden_dim -- hidden dimension of each direction of the encoder

        Raises:
        ValueError -- if attention_type is neither 'dot' nor 'concat'
        """
        super().__init__()
        PAD = Language.PAD_TOKEN_IDX
        SRC_TOKEN_NUM = len(src.idx2word) # Including <PAD>
        TRG_TOKEN_NUM = len(trg.idx2word) # Including <PAD>

        # Embedding layers; padding_idx makes <PAD> embed to an all-zero vector.
        self.src_embedding: nn.Embedding = nn.Embedding(SRC_TOKEN_NUM, embedding_dim=embedding_dim, padding_idx=PAD)
        self.trg_embedding: nn.Embedding = nn.Embedding(TRG_TOKEN_NUM, embedding_dim=embedding_dim, padding_idx=PAD)

        # The encoder is a single-layer bidirectional LSTM (batch-first tensors).
        self.encoder: nn.LSTM = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bias=True, batch_first=True, bidirectional=True)
        # The decoder is an LSTM cell that processes the target one step at a time.
        # Its state has 2 * hidden_dim features so it can be initialised with the
        # concatenated forward/backward final states of the encoder.
        self.decoder: nn.LSTMCell = nn.LSTMCell(embedding_dim, hidden_dim*2, bias=True)

        # Attention layer
        if attention_type == 'dot':
            self.attention: AttentionBase = DotAttention()
        elif attention_type == 'concat':
            self.attention: AttentionBase = ConcatAttention(hidden_dim)
        else:
            # Fail here instead of with an AttributeError on the first forward pass.
            raise ValueError("attention_type must be 'dot' or 'concat', got {!r}".format(attention_type))

        # Output MLP producing the combined-output vector o_t of the handout.
        # Its input is [decoder state; attention context] (4 * hidden_dim features) and
        # its last layer has one unit per target token except <PAD> (TRG_TOKEN_NUM - 1),
        # i.e. it is a classifier over the target words.
        self.output: nn.Sequential = nn.Sequential(
            nn.Linear(hidden_dim * 4, hidden_dim, bias=False),
            nn.Tanh(),
            nn.Linear(hidden_dim, TRG_TOKEN_NUM - 1, bias=False)
        )

    def forward(self, src_sentences: torch.Tensor, trg_sentences: torch.Tensor, teacher_force: float=.5):
        """ Seq2Seq forward function

        Parameters:
        src_sentences -- batched source sentences
                            in shape (batch_size, sentence_length),
                            right-padded with <PAD> and sorted by decreasing length
                            (required by pack_padded_sequence with enforce_sorted=True)
        trg_sentences -- batched target sentences
                            in shape (batch_size, sentence_length)
        teacher_force -- the probability of teacher forcing

        Return:
        loss -- average loss per a non-<PAD> word
        """
        batch_size = src_sentences.shape[0]
        PAD = Language.PAD_TOKEN_IDX
        SOS = Language.SOS_TOKEN_IDX

        # The encoder mask is True wherever the source holds <PAD>
        encoder_masks = src_sentences == PAD

        ### Encoder part
        # encoder_hidden -- encoder hidden states, in shape (batch_size, sequence_length, hidden_dim * 2),
        #                   all zeros at <PAD> positions
        # hidden_state / cell_state -- final states of the two directions,
        #                   each in shape (2, batch_size, hidden_dim)
        src_embedding_seq = self.src_embedding(src_sentences)
        src_lengths = (~encoder_masks).sum(dim=1).cpu()  # lengths must live on the CPU
        packed_seq = torch.nn.utils.rnn.pack_padded_sequence(src_embedding_seq, src_lengths, batch_first=True, enforce_sorted=True)

        encoder_hidden_states, (hidden_state, cell_state) = self.encoder(packed_seq)
        # total_length keeps the time axis aligned with src_sentences / encoder_masks
        # even when the batch is padded beyond its longest sentence.
        encoder_hidden, _ = torch.nn.utils.rnn.pad_packed_sequence(
            encoder_hidden_states, batch_first=True, padding_value=0., total_length=src_sentences.shape[1])
        encoder_hidden[encoder_masks] = 0.

        ### Decoder part
        # The first decoder input is <SOS>
        decoder_out = trg_sentences.new_full([batch_size], fill_value=SOS)
        # Initial decoder state: forward and backward final encoder states, concatenated
        decoder_c0 = torch.cat((cell_state[0], cell_state[1]), dim=1)
        decoder_h0 = torch.cat((hidden_state[0], hidden_state[1]), dim=1)
        sum_of_loss = 0.
        # The cross entropy is summed and skips the targets equal to <PAD>
        ce_loss = nn.CrossEntropyLoss(ignore_index=PAD, reduction='sum')
        teacher_forcing = torch.distributions.bernoulli.Bernoulli(teacher_force)
        # One iteration per target word to predict
        for trg_word_idx in range(trg_sentences.shape[1] - 1):
            # Teacher forcing: with probability teacher_force feed the ground-truth word,
            # otherwise feed the previous prediction. The draw is shared by the whole batch.
            decoder_input = trg_sentences[:, trg_word_idx] if teacher_forcing.sample() else decoder_out
            decoder_input_embedding = self.trg_embedding(decoder_input)
            decoder_h0, decoder_c0 = self.decoder(decoder_input_embedding, (decoder_h0, decoder_c0))

            # The word that should be produced at this step
            decoder_target = trg_sentences[:, trg_word_idx + 1]
            # Zero the state of the rows whose target is already <PAD>
            decoder_mask = decoder_target == PAD
            decoder_h0[decoder_mask] = 0.

            # Attention between all encoder states and the current decoder state
            attention_output, distribution = self.attention(encoder_hidden, encoder_masks, decoder_h0, decoder_mask)

            # The output MLP reads [decoder state; attention context]
            output_layer_input = torch.cat((decoder_h0, attention_output), dim=1)
            output_logit = self.output(output_layer_input)

            # The logits exclude <PAD> (index 0), hence the +1 to get vocabulary indices
            decoder_out = torch.argmax(output_logit, dim=1) + 1
            # Re-insert a -inf column for <PAD> so that column k scores vocabulary index k
            pad_logit = output_logit.new_full((batch_size, 1), fill_value=float('-inf'))
            full_logit = torch.cat((pad_logit, output_logit), dim=1)
            sum_of_loss += ce_loss(full_logit, decoder_target)

        loss = sum_of_loss
        assert loss.shape == torch.Size([])
        return loss / (trg_sentences[:, 1:] != PAD).sum() # Return average loss per a non-<PAD> word

    def translate(self, sentence: torch.Tensor, max_len: int=30):
        """ Greedy translation of a single sentence (runs without gradient tracking)

        Parameters:
        sentence -- sentence to be translated
                        in shape (sentence_length, )
        max_len -- maximum word length of the translated sentence

        Return:
        translated -- translated sentence
                        in shape (translated_length, ) of which translated_length <= max_len
                        with torch.long type. Decoding stops after <EOS>, which is included.
        distributions -- stacked attention distribution
                        in shape (translated_length, sentence_length)
                        This is used for plotting
        """
        PAD = Language.PAD_TOKEN_IDX
        SOS = Language.SOS_TOKEN_IDX
        EOS = Language.EOS_TOKEN_IDX
        sentence_length = sentence.size(0)

        translated = []
        distributions = []

        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            # Encode the sentence as a batch of size one
            src_embedding_seq = self.src_embedding(sentence).unsqueeze(0)
            encoder_hidden, (hidden_state, cell_state) = self.encoder(src_embedding_seq)

            decoder_out = sentence.new_full([1], fill_value=SOS)
            decoder_c0 = torch.cat((cell_state[0], cell_state[1]), dim=1)
            decoder_h0 = torch.cat((hidden_state[0], hidden_state[1]), dim=1)
            encoder_masks = (sentence == PAD).unsqueeze(0)
            # No decoder position is ever <PAD> while translating; the attention
            # interface expects a bool tensor of shape (batch_size, ).
            decoder_mask = torch.zeros(1, dtype=torch.bool, device=sentence.device)

            for _ in range(max_len):
                # Always feed back the previous prediction (argmax decoding)
                decoder_input_embedding = self.trg_embedding(decoder_out)
                decoder_h0, decoder_c0 = self.decoder(decoder_input_embedding, (decoder_h0, decoder_c0))

                attention_output, distribution = self.attention(encoder_hidden, encoder_masks, decoder_h0, decoder_mask)

                output_layer_input = torch.cat((decoder_h0, attention_output), dim=1)
                output_logit = self.output(output_layer_input)

                # The logits exclude <PAD> (index 0), hence the +1
                decoder_out = torch.argmax(output_logit, dim=1) + 1
                translated.append(decoder_out)
                distributions.append(distribution.squeeze(0))
                if decoder_out.item() == EOS:
                    break

            translated = torch.stack(translated).squeeze(1)
            distributions = torch.stack(distributions)

        assert translated.dim() == 1 and distributions.shape == torch.Size([translated.size(0), sentence_length])
        return translated, distributions

In [None]:
from typing import List

import torch
import datetime
import torch.utils
import random
from tqdm import tqdm, trange

import matplotlib
import matplotlib.pyplot as plt

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Global experiment settings read by load_model(), train() and translate().
attention_type = 'concat' # 'dot' or 'concat'; also part of the checkpoint file name
embedding_dim = 128 # passed to Seq2Seq as embedding_dim
hidden_dim = 64 # passed to Seq2Seq as hidden_dim
bucketing = True # train() batches with bucketed_batch_indices when True

def plot_attention(attention: torch.Tensor, trg_text: List[str], src_text: List[str], name: str):
    """Save an attention heat map to 'attention_<name>.png' in the working directory.

    Parameters:
    attention -- attention weights in shape (len(trg_text), len(src_text))
    trg_text -- words labelling the rows (y axis), first word at the top
    src_text -- words labelling the columns (x axis), written along the top edge
    name -- suffix of the output file name
    """
    assert attention.shape[0] == len(trg_text) and attention.shape[1] == len(src_text)
    fig, ax = plt.subplots()
    try:
        ax.pcolor(attention)

        # Put each tick in the middle of its cell
        ax.set_xticks([tick + .5 for tick in range(len(src_text))], minor=False)
        ax.set_yticks([tick + .5 for tick in range(len(trg_text))], minor=False)

        ax.invert_yaxis()
        ax.xaxis.tick_top()
        ax.set_xticklabels(src_text, rotation=90, minor=False)
        ax.set_yticklabels(trg_text, minor=False)
        fig.savefig('attention_' + name + '.png')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

def load_model(checkpoint_path: str = None):
    """Rebuild the Seq2Seq model and load trained weights into it.

    The vocabularies are rebuilt from the training corpora so that the layer
    sizes match the ones used when the checkpoint was written.

    Parameters:
    checkpoint_path -- state-dict file to load. Defaults to
                       'seq2seq_<attention_type>.pth', the file written by train().

    Return:
    model -- the model on `device`, in evaluation mode
    """
    french = Language(path='data/train.fr.txt')
    english = Language(path='data/train.en.txt')
    french.build_vocab()
    english.build_vocab()
    model = Seq2Seq(french, english, attention_type=attention_type,
                    embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device)

    if checkpoint_path is None:
        checkpoint_path = "seq2seq_" + attention_type + ".pth"
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    return model

def train(max_epoch: int = 200, batch_size: int = 256, max_pad_len: int = 5):
    """Train a French -> English Seq2Seq model and save its weights.

    The weights are written to 'seq2seq_<attention_type>.pth' after the last epoch.

    Parameters:
    max_epoch -- number of passes over the training set
    batch_size -- (maximum) number of sentence pairs per batch
    max_pad_len -- maximum padding per sentence allowed inside a bucketed batch;
                   only used when the global `bucketing` flag is set
    """
    french = Language(path='data/train.fr.txt')
    english = Language(path='data/train.en.txt')
    french.build_vocab()
    english.build_vocab()
    dataset = NmtDataset(src=french, trg=english)

    batch_sampler = None
    if bucketing:
        # The batches are computed once, so every epoch sees the same batches in the same order.
        sentence_length = [(len(src), len(trg)) for src, trg in dataset]
        batch_sampler = bucketed_batch_indices(sentence_length, batch_size=batch_size, max_pad_len=max_pad_len)

    model = Seq2Seq(french, english, attention_type=attention_type, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    if bucketing:
        # batch_sampler replaces batch_size / shuffle, so those are left at their defaults.
        dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn, num_workers=2, batch_sampler=batch_sampler)
    else:
        dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn, num_workers=2, batch_size=batch_size, shuffle=True)

    # Status line that only shows the most recent loss
    loss_log = tqdm(total=0, bar_format='{desc}', position=0)
    for epoch in trange(max_epoch, desc="Epoch", position=0):
        for src_sentence, trg_sentence in tqdm(dataloader, desc="Iteration", position=0):
            optimizer.zero_grad()
            src_sentence, trg_sentence = src_sentence.to(device), trg_sentence.to(device)
            loss = model(src_sentence, trg_sentence, teacher_force=0.5)
            loss.backward()
            optimizer.step()

            des = 'Loss per a non-<PAD> Word: {:06.4f}'.format(loss.item())
            loss_log.set_description_str(des)
    loss_log.close()

    torch.save(model.state_dict(), "seq2seq_" + attention_type + ".pth")

def translate():
    """Translate the test set with the trained model.

    Loads 'seq2seq_<attention_type>.pth', plots the attention of three sample
    sentences and writes two files in the working directory:
    'translated.txt' (source, reference and translation of every test pair) and
    'pred.en.txt' (one translation per line, for BLEU scoring).
    """
    SOS = Language.SOS_TOKEN_IDX
    EOS = Language.EOS_TOKEN_IDX

    # The model must be built with the training vocabularies it was trained with.
    french_train = Language(path='data/train.fr.txt')
    english_train = Language(path='data/train.en.txt')
    french_train.build_vocab()
    english_train.build_vocab()
    model = Seq2Seq(french_train, english_train, attention_type=attention_type,
                    embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device)
    model.load_state_dict(torch.load("seq2seq_" + attention_type + ".pth", map_location=device))
    model.eval()

    # The test corpora reuse the training vocabularies.
    french_test = Language(path='data/test.fr.txt')
    english_test = Language(path='data/test.en.txt')
    french_test.set_vocab(french_train.word2idx, french_train.idx2word)
    english_test.set_vocab(english_train.word2idx, english_train.idx2word)
    dataset = NmtDataset(src=french_test, trg=english_test)

    samples = [dataset[16][0], dataset[1][0], dataset[2][0]] # You may choose your own samples to plot

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for i, french in enumerate(samples):
            # Build the index tensor directly as long (no detour through float32).
            sentence = torch.tensor(french, dtype=torch.long, device=device)
            translated, attention = model.translate(sentence)
            source_text = [french_train.idx2word[idx] for idx in french]
            # <EOS> is kept here so the labels match the rows of the attention matrix.
            translated_text = [english_train.idx2word[idx] for idx in translated.tolist()]
            plot_attention(attention.cpu().detach(), translated_text, source_text, name=attention_type + '_' + str(i))

        # `with` closes both files even if a translation fails half-way.
        with open('translated.txt', mode='w', encoding='utf-8') as f, \
                open('pred.en.txt', mode='w', encoding='utf-8') as f_bleu:
            for french, english in tqdm(dataset, desc='Translated'):
                sentence = torch.tensor(french, dtype=torch.long, device=device)
                translated, attention = model.translate(sentence)
                source_text = [french_train.idx2word[idx] for idx in french]
                target_text = [english_train.idx2word[idx] for idx in english if idx != SOS and idx != EOS]
                translated_text = [english_train.idx2word[idx] for idx in translated.tolist() if idx != EOS]

                f.write('French    : ' + ' '.join(source_text) + '\n')
                f.write('English   : ' + ' '.join(target_text) + '\n')
                f.write('Translated: ' + ' '.join(translated_text) + '\n\n')
                f_bleu.write(' '.join(translated_text) + '\n')

In [None]:
# Driver cell: seed the random generators, train the model, then translate the test set.
torch.set_printoptions(precision=8)
# Fixed seeds for Python's `random` (used to shuffle the bucketed batches)
# and for PyTorch.
random.seed(4321)
torch.manual_seed(4321)
# Print the wall-clock time before and after training.
print(datetime.datetime.now())
train()
print(datetime.datetime.now())
translate()