# Seq2Seq #1
## LSTM Encoder-Decoder

In [1]:
from datasets import load_dataset

# Load the "en-fr" (English-French) configuration of the opus_books dataset.
dataset = load_dataset("opus_books", "en-fr")

Found cached dataset opus_books (/home/bluesun/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
# Display the loaded DatasetDict (available splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 127085
    })
})

In [3]:
# Hold out 20% of the original 'train' split as a test set; the fixed seed keeps
# the split reproducible.
# NOTE: this rebinds `dataset`, so re-running this cell splits the already-reduced
# 'train' portion again -- reload the dataset first if a re-run is needed.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
dataset

Loading cached split indices for dataset at /home/bluesun/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf/cache-4db7d380261ba335.arrow and /home/bluesun/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf/cache-575547be2463d3f6.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 101668
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 25417
    })
})

In [4]:
from pprint import pprint

# Pretty-print one raw training example to inspect the nested 'translation' field.
pprint(dataset['train'][100])

{'id': '58398',
 'translation': {'en': 'All this only increased the derision and hooting.',
                 'fr': 'De tout cela, les dérisions et les huées s’accrurent.'}}


### Preprocessing & Tokenization

In [5]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# One spaCy-backed tokenizer per language.
# NOTE(review): presumably requires the `en_core_web_sm` and `fr_core_news_sm`
# spaCy models to be installed beforehand -- confirm in the environment setup.
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
fr_tokenizer = get_tokenizer('spacy', language='fr_core_news_sm')

2023-03-29 15:33:52.787590: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-29 15:33:52.856548: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-29 15:33:53.172137: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/bluesun/devel/lib:/opt/ros/noetic/lib:/opt/ros/noetic/lib/x86_64-linux-gnu:/home/

In [6]:
# Sanity-check both tokenizers on a punctuation-heavy string and on the raw
# (not yet lower-cased / cleaned) sample sentence pair.
print(en_tokenizer("The, the! they're ~"))
print(en_tokenizer('All this only increased the derision and hooting.'))
print(fr_tokenizer('De tout cela, les dérisions et les huées s’accrurent.'))

['The', ',', 'the', '!', 'they', "'re", '~']
['All', 'this', 'only', 'increased', 'the', 'derision', 'and', 'hooting', '.']
['De', 'tout', 'cela', ',', 'les', 'dérisions', 'et', 'les', 'huées', 's’', 'accrurent', '.']


In [7]:
import re, string

def preprocess_text(examples):
    """
    Lower-case and clean the English and French texts of one example.

    Every character that is not a word character, whitespace or ASCII
    punctuation (``string.punctuation``) is replaced by a space; runs of
    whitespace are then collapsed into a single space. Leading and trailing
    spaces are not stripped.

    Parameters:
    -----------
    examples: dict
        One dataset row holding the texts under ``examples['translation']['en']``
        and ``examples['translation']['fr']``.

    Returns:
    --------
    dict
        ``{'en': cleaned_english_text, 'fr': cleaned_french_text}``
    """
    # Raw f-string: keeps \w, \d and \s as regex escapes instead of invalid
    # Python string escapes (which emit "invalid escape sequence" warnings).
    disallowed = rf'[^\w\d\s{re.escape(string.punctuation)}]'

    def _clean(text):
        # Same two-step cleaning for both languages.
        text = re.sub(disallowed, ' ', text.lower())
        return re.sub(r'\s+', ' ', text)

    translation = examples['translation']
    return {'en': _clean(translation['en']), 'fr': _clean(translation['fr'])}

In [8]:
# Apply `preprocess_text` to every example of each split and drop the nested
# 'translation' column, leaving flat 'en' / 'fr' text columns next to 'id'.
dataset = dataset.map(preprocess_text, remove_columns=['translation'])

Loading cached processed dataset at /home/bluesun/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf/cache-d39203dc4f653b77.arrow
Loading cached processed dataset at /home/bluesun/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf/cache-dc02ff54ee4d7ea6.arrow


In [9]:
# Display the preprocessed DatasetDict to confirm the new column layout.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'en', 'fr'],
        num_rows: 101668
    })
    test: Dataset({
        features: ['id', 'en', 'fr'],
        num_rows: 25417
    })
})

In [10]:
# NOTE(review): `pprint` was already imported in an earlier cell; this import is redundant.
from pprint import pprint
# Same example as before, now lower-cased with non-ASCII punctuation replaced by spaces.
pprint(dataset['train'][100])

{'en': 'all this only increased the derision and hooting.',
 'fr': 'de tout cela, les dérisions et les huées s accrurent.',
 'id': '58398'}


In [11]:
# Special tokens shared by both vocabularies:
#   <unk> : unknown word
#   <pad> : padding
#   <bos> : begin of sentence
#   <eos> : end of sentence

# One vocabulary per language, built from the training split only; tokens that
# occur fewer than 3 times are left out of the vocabulary.
vocabs = {
    lang: build_vocab_from_iterator(
        map(tokenizer, dataset['train'][lang]),
        specials=['<unk>', '<pad>', '<bos>', '<eos>'],
        min_freq=3,
    )
    for lang, tokenizer in (('en', en_tokenizer), ('fr', fr_tokenizer))
}

# Out-of-vocabulary tokens are looked up as <unk>.
vocabs['en'].set_default_index(vocabs['en']['<unk>'])
vocabs['fr'].set_default_index(vocabs['fr']['<unk>'])


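If the vocabulary is too large, the model becomes difficult to train, because the embedding and output layers grow with the vocabulary size. This is why tokens that occur fewer than 3 times in the training split are excluded (`min_freq=3`) and mapped to `<unk>`.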

In [12]:
# Vocabulary sizes as (English, French); both counts include the special tokens.
len(vocabs['en']), len(vocabs['fr'])

(24596, 30224)

In [13]:
# tokenize and integer encode the text
import torch as th

# Translation direction: French is the source language, English the target.
SRC_LANG = 'fr'
TGT_LANG = 'en'

def encode_data(examples):
    """
    Tokenize and integer-encode the source and target text of one example.

    Each text is tokenized with the tokenizer of its own language (the same
    tokenizer that was used to build that language's vocabulary), mapped to
    vocabulary indices, and wrapped in <bos> ... <eos>.

    Parameters:
    -----------
    examples: dict
        One dataset row holding the cleaned texts under the keys ``SRC_LANG``
        and ``TGT_LANG``.

    Returns:
    --------
    dict
        ``{SRC_LANG: [int, ...], TGT_LANG: [int, ...]}``
    """
    # Pick the tokenizer by language rather than by role, so the source text is
    # never tokenized with the target-language tokenizer (or vice versa).
    tokenizers = {'en': en_tokenizer, 'fr': fr_tokenizer}

    def _encode(lang):
        vocab = vocabs[lang]
        tokens = tokenizers[lang](examples[lang])
        return [vocab['<bos>']] + [vocab[token] for token in tokens] + [vocab['<eos>']]

    return {SRC_LANG: _encode(SRC_LANG), TGT_LANG: _encode(TGT_LANG)}

In [14]:
# Replace the 'en' / 'fr' text columns with lists of vocabulary indices; 'id' is kept.
tokenized_dataset = dataset.map(encode_data)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [15]:
# Inspect the integer-encoded version of the sample example.
print(tokenized_dataset['train'][100])

{'id': '58398', 'en': [2, 46, 39, 93, 1140, 5, 11926, 9, 11360, 6, 3], 'fr': [2, 6, 58, 116, 4, 13, 26751, 7, 13, 10947, 73, 21754, 5, 3]}


In [16]:
# Round-trip check: map the indices back to tokens with each vocabulary's
# index-to-string table to confirm the encoding.
print([vocabs['en'].get_itos()[i] for i in tokenized_dataset['train'][100]['en']])
print([vocabs['fr'].get_itos()[i] for i in tokenized_dataset['train'][100]['fr']])

['<bos>', 'all', 'this', 'only', 'increased', 'the', 'derision', 'and', 'hooting', '.', '<eos>']
['<bos>', 'de', 'tout', 'cela', ',', 'les', 'dérisions', 'et', 'les', 'huées', 's', 'accrurent', '.', '<eos>']


In [17]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, batch_first=False):
    """
    Turn a list of encoded examples into padded source / target tensors.

    Parameters:
    -----------
    batch: list of dict
        Encoded examples holding lists of token indices under the keys
        ``SRC_LANG`` and ``TGT_LANG``.
    batch_first: bool
        Layout of the returned tensors. ``False`` (default, the original
        behaviour) gives ``[max_len, bs]``; ``True`` gives ``[bs, max_len]``.
        NOTE(review): the model classes in this notebook document their inputs
        as ``[bs, seq_len]``, so the data loaders probably want
        ``functools.partial(collate_fn, batch_first=True)`` -- confirm against
        the training loop.

    Returns:
    --------
    (th.Tensor, th.Tensor)
        Source and target batches, each padded with its own vocabulary's
        <pad> index.
    """
    src_batch = [th.tensor(sample[SRC_LANG]) for sample in batch]
    tgt_batch = [th.tensor(sample[TGT_LANG]) for sample in batch]
    src_batch = pad_sequence(src_batch, batch_first=batch_first,
                             padding_value=vocabs[SRC_LANG]['<pad>'])
    tgt_batch = pad_sequence(tgt_batch, batch_first=batch_first,
                             padding_value=vocabs[TGT_LANG]['<pad>'])
    return src_batch, tgt_batch
        

In [18]:
from torch.utils import data

BATCH_SIZE = 64

# Reshuffle the training examples at every epoch so batches are not always
# composed of the same examples in the same order; the test loader keeps a
# fixed order.
train_dataloader = data.DataLoader(tokenized_dataset['train'], batch_size=BATCH_SIZE,
                                   shuffle=True, collate_fn=collate_fn)
test_dataloader = data.DataLoader(tokenized_dataset['test'], batch_size=BATCH_SIZE, collate_fn=collate_fn)

## Modeling

In [1]:
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

In [7]:
class TokenEmbedding(nn.Module):
    """
    Map integer-encoded tokens to embedding vectors scaled by sqrt(emb_dim).

    Parameters:
    -----------
    vocab_size: int
        Number of entries in the vocabulary (= size of the dictionary)
    emb_dim: int
        Size of each embedding vector
    """
    def __init__(self, vocab_size: int, emb_dim: int):
        super().__init__()
        self.emb_dim = emb_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim)

    def forward(self, x):
        # x = [bs, seq_len] or [seq_len, bs]; the result has one extra trailing
        # dimension of size emb_dim.
        token_ids = x.long()
        scale = np.sqrt(self.emb_dim)
        return self.embedding(token_ids) * scale

In [8]:
class RNNEncoder(nn.Module):
    """
    GRU encoder: embeds the source tokens and runs them through a
    unidirectional, batch-first GRU.

    Parameters:
    -----------
    vocab_size: int
        Number of the vocabulary (= size of the dictionary)
    emb_dim: int
        Size of the embedding
    hidden_dim: int
        Size of the hidden dimension
    n_layers: int
        Number of the layers
    dropout: float
        Dropout rate (applied between GRU layers)
    """
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_layers: int,
                 dropout: float):
        super().__init__()
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = TokenEmbedding(vocab_size, emb_dim)
        # Inter-layer dropout has no effect on a single-layer GRU and only
        # triggers a warning there, so it is disabled in that case.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=rnn_dropout, batch_first=True)

    def forward(self, x):
        """
        Parameters:
        -----------
        x: th.Tensor: [bs, seq_len]
            Integer-encoded source tokens

        Returns:
        --------
        outputs: th.Tensor: [bs, seq_len, hidden_dim]
            Top-layer GRU output at every source position
        hidden: th.Tensor: [n_layers, bs, hidden_dim]
            Final hidden state of every layer
        """
        embedded = self.embedding(x)    # [bs, seq_len, emb_dim]
        # Create the initial state on the same device / dtype as the embedded
        # input; a CPU tensor here would fail once the model lives on a GPU.
        hidden = self._init_hidden(x.shape[0],
                                   device=embedded.device,
                                   dtype=embedded.dtype)    # [n_layers, bs, hidden_dim]
        outputs, hidden = self.rnn(embedded, hidden)
        # outputs = [bs, seq_len, hidden_dim]
        # hidden = [n_layers, bs, hidden_dim]  (the GRU is not bidirectional)
        return outputs, hidden

    def _init_hidden(self, batch_size: int, device=None, dtype=None):
        """
        Initialize the hidden state of the RNN to be zeros

        `device` and `dtype` default to PyTorch's defaults, which matches the
        previous behaviour of this helper when they are omitted.
        """
        return th.zeros(self.n_layers, batch_size, self.hidden_dim,
                        device=device, dtype=dtype)

In [9]:
class RNNDecoder(nn.Module):
    """
    Single-step GRU decoder with additive attention over the encoder outputs.

    At every step the top-layer hidden state is used as the attention query,
    the attention-weighted sum of the encoder outputs (the context) is
    concatenated to the embedded input token, and the result is fed to the GRU.

    Parameters:
    -----------
    vocab_size: int
        Size of the target vocabulary
    emb_dim: int
        Size of the embedding
    hidden_dim: int
        Size of the hidden dimension (must match the encoder's hidden size,
        because the encoder outputs are added to the decoder state)
    n_layers: int
        Number of the layers
    dropout: float
        Dropout rate (applied between GRU layers)
    """
    def __init__(self,
                 vocab_size: int,
                 emb_dim: int,
                 hidden_dim: int,
                 n_layers: int,
                 dropout: float):
        super().__init__()
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = TokenEmbedding(vocab_size, emb_dim)

        # The GRU consumes [embedded token ; context], i.e. emb_dim + hidden_dim
        # features (not emb_dim * 2, which is only right when both sizes match).
        # Inter-layer dropout is meaningless for a single layer and would only warn.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.GRU(emb_dim + hidden_dim, hidden_dim, n_layers,
                          dropout=rnn_dropout, batch_first=True)
        self.attn = nn.Linear(hidden_dim, 1)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs, hidden, encoder_outputs):
        """
        Run one decoding step.

        Parameters:
        -----------
        inputs: th.Tensor: [bs, 1]
            Current input token for every sequence in the batch
        hidden: th.Tensor: [n_layers, bs, hidden_dim]
            Previous decoder hidden state
        encoder_outputs: th.Tensor: [bs, seq_len, hidden_dim]
            Encoder outputs for every source position

        Returns:
        --------
        outputs: th.Tensor: [bs, vocab_size]
            Unnormalized scores over the target vocabulary
        hidden: th.Tensor: [n_layers, bs, hidden_dim]
            Updated hidden state
        attn_weights: th.Tensor: [bs, 1, seq_len]
            Attention distribution over the source positions
        """
        embedded = self.embedding(inputs)    # [bs, 1, emb_dim]
        # Query = hidden state of the top layer.
        query = hidden[-1].unsqueeze(1)      # [bs, 1, hidden_dim]
        attn_weights = self._compute_attention(query, encoder_outputs).unsqueeze(1)    # [bs, 1, seq_len]
        # Weighted sum of the encoder outputs (not of the input embedding).
        context = attn_weights.bmm(encoder_outputs)    # [bs, 1, hidden_dim]
        rnn_input = th.cat((embedded, context), dim=2)    # [bs, 1, emb_dim + hidden_dim]
        outputs, hidden = self.rnn(rnn_input, hidden)
        # outputs = [bs, 1, hidden_dim]
        # hidden = [n_layers, bs, hidden_dim]
        outputs = outputs.squeeze(1)    # [bs, hidden_dim]
        outputs = self.fc(outputs)    # [bs, vocab_size]
        return outputs, hidden, attn_weights

    def _compute_attention(self, outputs: th.Tensor, encoder_outputs: th.Tensor):
        """
        Compute (simplified) Bahdanau-style additive attention weights:
        score = attn(tanh(query + key)), normalized with a softmax over the
        source positions.

        Parameters:
        -----------
        outputs: th.Tensor: [bs, 1, hidden_dim]
            The outputs of the (decoder) RNN -> Query
        encoder_outputs: th.Tensor: [bs, seq_len, hidden_dim]
            The outputs of the encoder RNN -> Keys and Values

        Returns:
        --------
        th.Tensor: [bs, seq_len]
            Attention weights; each row sums to 1
        """
        # Broadcasting the query over the sequence dimension gives the same
        # result as repeating it seq_len times.
        energy = th.tanh(outputs + encoder_outputs)    # [bs, seq_len, hidden_dim]
        attn = self.attn(energy).squeeze(2)            # [bs, seq_len]
        return F.softmax(attn, dim=1)


In [10]:
class Seq2Seq(nn.Module):
    """
    Encoder-decoder wrapper that decodes the target sequence step by step.

    Parameters:
    -----------
    encoder: RNNEncoder
        Module called as ``encoder(src)`` returning ``(encoder_outputs, hidden)``
    decoder: RNNDecoder
        Module called as ``decoder(inputs, hidden, encoder_outputs)`` returning
        ``(scores, hidden, attn_weights)``; its ``fc.out_features`` gives the
        target vocabulary size
    """
    def __init__(self,
                 encoder: "RNNEncoder",
                 decoder: "RNNDecoder"):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing=0.5):
        """
        Parameters:
        -----------
        src: th.Tensor: [bs, seq_len]
            The source sentence
        tgt: th.Tensor: [bs, seq_len]
            The target sentence
        teacher_forcing: float
            The probability of using teacher forcing

        Returns:
        --------
        th.Tensor: [bs, tgt_len, tgt_vocab_size]
            Decoder scores for every target position. Position 0 is never
            predicted and stays all zeros.
        """
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.fc.out_features
        outputs = th.zeros(batch_size, tgt_len, tgt_vocab_size, device=src.device)
        encoder_outputs, hidden = self.encoder(src)
        # encoder_outputs = [bs, seq_len, hidden_dim]
        # hidden = [n_layers, bs, hidden_dim]

        # First input to the decoder is the first target token (<bos>)
        inputs = tgt[:, 0].unsqueeze(1)
        for t in range(1, tgt_len):
            output, hidden, _ = self.decoder(inputs, hidden, encoder_outputs)
            outputs[:, t] = output
            # One uniform draw in [0, 1) per step decides, for the whole batch,
            # between feeding the ground-truth token and the model's own
            # prediction. (`th.random` is a module, not a callable.)
            teacher_force = th.rand(1).item() < teacher_forcing
            top1 = output.argmax(1, keepdim=True)
            inputs = tgt[:, t].unsqueeze(1) if teacher_force else top1
        return outputs