In [3]:
import pandas as pd
import numpy as np

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
! pip install torchtext

# Data Preprocessing

We use the English-French translation data downloaded last week.

In [None]:
# Load the cleaned English / French data pickled under ../wk14/data.
# NOTE(review): read_pickle unpickles arbitrary objects - only load files you trust.
cleaned_en = pd.read_pickle('../wk14/data/cleaned_en.pkl')
cleaned_fr = pd.read_pickle('../wk14/data/cleaned_fr.pkl')

# Put both sides next to each other as the columns 'en' and 'fr'.
en_fr = pd.DataFrame({'en': cleaned_en, 'fr': cleaned_fr})
en_fr

In [None]:
# Count the rows in which at least one side of the pair has length 0.
en_is_empty = en_fr['en'].apply(len) < 1
fr_is_empty = en_fr['fr'].apply(len) < 1
(en_is_empty | fr_is_empty).value_counts()

In [None]:
# Remove every row in which at least one side of the pair has length 0 ...
has_empty_side = (en_fr['en'].apply(len) < 1) | (en_fr['fr'].apply(len) < 1)
en_fr.drop(en_fr.index[has_empty_side.to_numpy()], inplace=True)

# ... and re-run the count to confirm that none are left.
still_empty = (en_fr['en'].apply(len) < 1) | (en_fr['fr'].apply(len) < 1)
still_empty.value_counts()

In [None]:
# spaCy-backed tokenizers, one per language (str -> list of tokens).
# NOTE(review): presumably requires the named spaCy models to be installed - confirm.
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
fr_tokenizer = get_tokenizer('spacy', language='fr_core_news_sm')

In [None]:
import pickle
from sklearn.model_selection import train_test_split

def train_valid_test_split(df, train_size=0.8, valid_size=0.1, test_size=0.1, random_state=None, shuffle=True):
    """
    Split the dataset into train, valid, and test set.

    The test set is carved off first; the validation set is then taken from
    the remainder, with its share rescaled to ``valid_size / (train_size + valid_size)``
    so that it ends up being ``valid_size`` of the whole dataset.

    :param df: dataset to split (must support ``.sample`` when ``shuffle`` is true).
    :param train_size: fraction of the data intended for training.
    :param valid_size: fraction of the data intended for validation.
    :param test_size: fraction of the data intended for testing.
    :param random_state: seed forwarded to every shuffling step.
    :param shuffle: when false the rows keep their order and the splits are
        contiguous slices (train first, then valid, then test).
    :return: ``(train_df, valid_df, test_df)``
    """
    if shuffle:
        df = df.sample(frac=1, random_state=random_state)
    # train_test_split shuffles by default, so the flag has to be forwarded
    # explicitly; otherwise shuffle=False would silently still shuffle.
    train_df, test_df = train_test_split(df, test_size=test_size,
                                         random_state=random_state, shuffle=shuffle)
    train_df, valid_df = train_test_split(train_df, test_size=valid_size/(train_size+valid_size),
                                          random_state=random_state, shuffle=shuffle)
    return train_df, valid_df, test_df


def build_vocab(df, tokenizer, specials=['<unk>', '<pad>', '<bos>', '<eos>'], min_freq=2):
    """
    Create a vocabulary from an iterable of raw texts.

    Every element of ``df`` is tokenized with ``tokenizer`` and the resulting
    token lists are handed to ``build_vocab_from_iterator``. The index of
    ``'<unk>'`` is then installed as the vocabulary's default index.

    :param df: iterable of texts (e.g. one DataFrame column).
    :param tokenizer: callable applied to each text to obtain its tokens.
    :param specials: special tokens to add to the vocabulary:
        - <unk>: unknown token
        - <pad>: padding token
        - <bos>: beginning of sentence token
        - <eos>: end of sentence token
    :param min_freq: minimum frequency of the token to be included in the vocabulary.
    :return: the vocabulary, with its default index set to the ``'<unk>'`` index.
    """
    token_lists = (tokenizer(text) for text in df)
    vocab = build_vocab_from_iterator(token_lists, specials=specials, min_freq=min_freq)
    unk_index = vocab['<unk>']
    vocab.set_default_index(unk_index)
    return vocab


def save_data(data, path):
    """
    Pickle ``data`` to ``path`` and print a confirmation.

    The parent directory of ``path`` is created when it does not exist yet,
    so saving into e.g. ``data/...`` also works in a fresh checkout.

    :param data: any picklable object.
    :param path: destination file path.
    """
    import os  # local import keeps this helper self-contained within its cell

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(data, f)
    print(f'Data saved to {path}')

### Data sampling

Using all of this data is impractical because of memory constraints.
Therefore, we sample a subset of the pairs and keep only sentences that are not too long.

In [None]:
# Number of whitespace-separated words per sentence, for each language.
en_num_words = en_fr['en'].map(lambda sentence: len(sentence.split()))
fr_num_words = en_fr['fr'].map(lambda sentence: len(sentence.split()))

In [None]:
# Inspect the distribution of the number of words in the English sentences:
# summary statistics first, then a 100-bin histogram.
print(en_num_words.describe())
en_num_words.plot(kind='hist', bins=100)

In [None]:
# Same inspection for the French sentences: summary statistics and a 100-bin histogram.
print(fr_num_words.describe())
fr_num_words.plot(kind='hist', bins=100)

In [None]:
# Frequency of each English word count below 10.
en_num_words[en_num_words < 10].value_counts()

In [None]:
# Frequency of each English word count strictly between 65 and 75.
en_num_words[(en_num_words > 65) & (en_num_words < 75)].value_counts()

In [None]:
# Peek at a few pairs whose English side is a single word.
en_fr[en_num_words == 1].head()

In [None]:
# Longest sentence (in whitespace-separated words) kept on either side.
MAX_WORDS = 72

# Keep a random half of the pairs.
# NOTE: this cell rebinds en_fr, so running it twice halves the data again.
en_fr = en_fr.sample(frac=0.5, random_state=42)

# en_num_words / fr_num_words were computed on the un-sampled frame. Pick the
# entries of the sampled rows explicitly instead of relying on pandas'
# implicit reindexing of a boolean mask whose index does not match the frame.
is_short = (en_num_words <= MAX_WORDS) & (fr_num_words <= MAX_WORDS)
en_fr = en_fr[is_short.loc[en_fr.index]]
en_fr

In [None]:
# Sanity check after sampling: a pair is unusable as soon as EITHER side is
# empty, so combine the two conditions with "|" (same check as before the sampling).
((en_fr['en'].apply(len) < 1) | (en_fr['fr'].apply(len) < 1)).value_counts()

In [None]:
# 80 / 10 / 10 split of the sampled pairs, seeded for reproducibility.
train_df, valid_df, test_df = train_valid_test_split(en_fr, train_size=0.8, valid_size=0.1, test_size=0.1, random_state=42, shuffle=True)

In [None]:
# Build one vocabulary per language from the training split only;
# tokens seen fewer than 5 times are left out of the vocabulary.
en_vocab = build_vocab(train_df['en'], en_tokenizer, min_freq=5)
fr_vocab = build_vocab(train_df['fr'], fr_tokenizer, min_freq=5)
len(en_vocab), len(fr_vocab)

In [None]:
# Alternative output names (currently disabled):
# save_data((train_df, valid_df, test_df), 'data/train_valid_test.pkl')
# save_data((en_vocab, fr_vocab), 'data/vocab.pkl')

# Persist the sampled ("small") splits and their vocabularies for the training section.
save_data((train_df, valid_df, test_df), 'data/small_train_valid_test.pkl')
save_data((en_vocab, fr_vocab), 'data/small_vocab.pkl')

# Train

In [None]:
import numpy as np
import pandas as pd

import torch as th
import torch.nn as nn
import torch.nn.functional as F

import re, string
from unicodedata import normalize

from typing import Optional, List, Tuple, Dict, Iterable, Callable

def clean_lines(lines):
    """
    Normalize a sentence (or a list of sentences) to lowercase ASCII words.

    For a single string: accents are removed by NFD-decomposing and dropping
    every non-ASCII character, the text is lowercased, punctuation and
    non-printable characters are removed, and only purely alphabetic words
    are kept. If the input ended with '?', a standalone '?' token is appended.

    :param lines: one string, or a (possibly nested) list of strings.
    :return: the cleaned string, or a list of cleaned items mirroring the input list.
    """
    if isinstance(lines, list):
        return list(map(clean_lines, lines))

    ascii_text = normalize('NFD', lines).encode('ascii', 'ignore').decode('UTF-8')
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punct = ascii_text.lower().translate(punctuation_table)
    printable_text = re.sub(rf'[^{re.escape(string.printable)}]', '', without_punct)

    words = list(filter(str.isalpha, printable_text.split()))
    # The question mark was stripped with the rest of the punctuation; restore
    # it as its own token when the raw input ended with one.
    if lines.endswith('?'):
        words.append('?')
    return ' '.join(words)

In [None]:
from torchtext.data.utils import get_tokenizer

# Reload the splits and vocabularies saved by the preprocessing section, so
# the training section can start from the pickled artifacts.
# NOTE(review): read_pickle unpickles arbitrary objects - only load files you trust.
train_df, valid_df, test_df = pd.read_pickle('data/small_train_valid_test.pkl')
en_vocab, fr_vocab = pd.read_pickle('data/small_vocab.pkl')
# Recreate the spaCy tokenizers.
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
fr_tokenizer = get_tokenizer('spacy', language='fr_core_news_sm')

In [None]:
# Translation direction: French (source) -> English (target).
SRC_LANG = 'fr'
TGT_LANG = 'en'

# Per-language lookup tables, keyed by the language codes above.
vocab_transform = {
    # List[str] -> List[int]
    'fr': fr_vocab,
    'en': en_vocab
}

tokenizer_transform = {
    # str -> List[str]
    'fr': fr_tokenizer,
    'en': en_tokenizer
}

In [None]:
from torch.nn.utils.rnn import pad_sequence

# Indices of the special tokens, looked up in the TARGET vocabulary.
# NOTE(review): the same indices are also used for source sequences further
# down; that presumably holds because both vocabularies were built with the
# same specials list - confirm if the vocabularies are ever built differently.
BOS_IDX = vocab_transform[TGT_LANG]['<bos>']
EOS_IDX = vocab_transform[TGT_LANG]['<eos>']
PAD_IDX = vocab_transform[TGT_LANG]['<pad>']

def sequential_transforms(*transforms):
    """
    Chain several single-argument callables into one.

    :param transforms: callables applied left to right; the output of one is
        the input of the next.
    :return: a function that runs its argument through every transform in
        order (and returns it unchanged when no transform was given).
    """
    def apply_all(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result

    return apply_all

def append_special(token_ids: List[int]):
    """
    Wrap a sequence of token ids with the BOS and EOS indices.

    :param token_ids: token indices of one sentence (may be empty).
    :return: 1-D int64 tensor ``[BOS_IDX, *token_ids, EOS_IDX]``.
    """
    # The dtype is given explicitly: th.tensor([]) would default to float32,
    # which used to turn the whole sequence into floats for an empty sentence.
    return th.tensor([BOS_IDX, *token_ids, EOS_IDX], dtype=th.long)

# Per-language pipeline from a raw sentence to a tensor of token indices:
# tokenize -> map tokens to indices -> add the BOS / EOS indices.
text_transform = {lang: sequential_transforms(tokenizer_transform[lang],
                                               vocab_transform[lang],
                                               append_special)
                    for lang in [SRC_LANG, TGT_LANG]}

In [None]:
def collate_fn(batch):
    """
    Turn a list of (source sentence, target sentence) string pairs into two padded index tensors.

    Trailing newlines are stripped from every sentence before it goes through
    the language's ``text_transform``. The resulting sequences are padded with
    ``PAD_IDX`` to the longest one in the batch via ``pad_sequence``.

    :param batch: iterable of ``(src_sample, tgt_sample)`` string pairs.
    :return: ``(src_batch, tgt_batch)``, the padded source and target tensors.
    """
    to_src = text_transform[SRC_LANG]
    to_tgt = text_transform[TGT_LANG]
    src_sequences = [to_src(src_sample.rstrip("\n")) for src_sample, _ in batch]
    tgt_sequences = [to_tgt(tgt_sample.rstrip("\n")) for _, tgt_sample in batch]
    return (pad_sequence(src_sequences, padding_value=PAD_IDX),
            pad_sequence(tgt_sequences, padding_value=PAD_IDX))

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 64
# Training batches are reshuffled on every pass over the data.
train_iter = DataLoader(list(zip(train_df[SRC_LANG], train_df[TGT_LANG])),
                        batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Evaluation data keeps a fixed order: the reported loss is an average of
# per-batch losses, so shuffling would make it depend on the batch composition.
valid_iter = DataLoader(list(zip(valid_df[SRC_LANG], valid_df[TGT_LANG])),
                        batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

test_iter = DataLoader(list(zip(test_df[SRC_LANG], test_df[TGT_LANG])),
                       batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [None]:
# Pull one batch from the training loader to inspect what collate_fn produces.
src_data, tgt_data = next(iter(train_iter))

In [None]:
# Shapes of the padded source and target batch tensors.
src_data.shape, tgt_data.shape

In [None]:
# Decode the first source sentence of the batch back into tokens (padding included).
# get_itos() builds the complete index -> token list, so call it once instead of once per token.
src_itos = vocab_transform[SRC_LANG].get_itos()
' '.join(src_itos[i] for i in src_data[:, 0].tolist())

In [None]:
# Decode the first target sentence of the batch back into tokens (padding included).
# get_itos() builds the complete index -> token list, so call it once instead of once per token.
tgt_itos = vocab_transform[TGT_LANG].get_itos()
' '.join(tgt_itos[i] for i in tgt_data[:, 0].tolist())

In [None]:
class PositionalEncoding(nn.Module):
    """
    Add fixed sinusoidal position information to embeddings, then apply dropout.

    Even embedding columns hold ``sin(pos * density)`` and odd columns hold
    ``cos(pos * density)``, where ``density`` decays geometrically from 1
    towards 1/10000 across the embedding dimension. The table is precomputed
    for ``max_len`` positions and stored as a buffer.

    :param emb_dim: embedding size (even or odd).
    :param dropout: dropout probability applied to the sum.
    :param max_len: number of positions precomputed.
    :param batch_first: whether inputs are ``[batch_size, seq_len, emb_dim]``
        instead of ``[seq_len, batch_size, emb_dim]``.
    """
    def __init__(self,
                 emb_dim: int,
                 dropout: float = 0.1,
                 max_len: int = 5000,
                 batch_first: bool = False):

        super(PositionalEncoding, self).__init__()
        self.batch_first = batch_first
        density = th.exp(-th.arange(0, emb_dim, 2) * np.log(10000) / emb_dim)
        pos = th.arange(0, max_len).unsqueeze(1)
        angles = pos * density  # [max_len, ceil(emb_dim / 2)]
        pos_embedding = th.zeros((max_len, emb_dim))
        pos_embedding[:, 0::2] = th.sin(angles)
        # For an odd emb_dim there is one cosine column fewer than sine
        # columns, so only the first emb_dim // 2 frequencies are used here.
        pos_embedding[:, 1::2] = th.cos(angles[:, :emb_dim // 2])
        pos_embedding = pos_embedding.unsqueeze(-2)  # [max_len, 1, emb_dim]

        self.dropout = nn.Dropout(p=dropout)
        # Registered as a buffer: it moves with the module but is not a parameter.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, x):
        """
        :param x: ``[seq_len, batch_size, emb_dim]`` or ``[batch_size, seq_len, emb_dim]``
        :return: tensor of the same shape with the positional table added and dropout applied.
        """
        if self.batch_first:
            # [seq_len, 1, emb_dim] -> [1, seq_len, emb_dim] so it broadcasts over the batch.
            return self.dropout(x + self.pos_embedding[:x.size(1), :].permute(1, 0, 2))
        else:
            return self.dropout(x + self.pos_embedding[:x.size(0), :])

class TokenEmbedding(nn.Module):
    """
    Embedding lookup whose output is scaled by ``sqrt(emb_dim)``.

    :param vocab_size: number of rows in the embedding table.
    :param emb_dim: size of each embedding vector.
    """
    def __init__(self, vocab_size: int, emb_dim: int):
        super(TokenEmbedding, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.emb_dim = emb_dim

    def forward(self, x):
        """
        :param x: tensor of token indices, e.g. ``[seq_len, batch_size]``.
        :return: embeddings of shape ``x.shape + (emb_dim,)`` multiplied by ``sqrt(emb_dim)``.
        """
        # .long() because nn.Embedding needs integer indices.
        return self.emb(x.long()) * np.sqrt(self.emb_dim)

In [None]:
import matplotlib.pyplot as plt

# Build a small positional-encoding module (48 positions, 512 dimensions)
# and grab its precomputed table for plotting.
pos = PositionalEncoding(512, dropout=0.1, max_len=48, batch_first=False)
positional_vector = pos.pos_embedding
positional_vector.shape

In [None]:
# Heat map of the table: [:, 0] drops the singleton middle axis, leaving
# one row per position and one column per embedding dimension.
plt.pcolormesh(positional_vector[:, 0], cmap='RdBu')
plt.xlim(0, 512)
plt.colorbar()

In [None]:
class TransformerSeq2Seq(nn.Module):
    """
    Sequence-to-sequence model built around ``nn.Transformer``.

    Source and target token indices are embedded (token embedding followed by
    positional encoding), passed through ``nn.Transformer``, and the decoder
    output is projected onto the target vocabulary by a linear layer.

    :param num_enc_layers: number of encoder layers.
    :param num_dec_layers: number of decoder layers.
    :param emb_dim: embedding size, used as the transformer's ``d_model``.
    :param n_heads: number of attention heads.
    :param src_vocab_size: size of the source vocabulary.
    :param tgt_vocab_size: size of the target vocabulary.
    :param dim_ff: hidden size of the feed-forward sub-layers.
    :param dropout: dropout probability used in the embeddings and the transformer.
    :param batch_first: whether tensors are laid out batch-first.
    """
    def __init__(self,
                 num_enc_layers: int,
                 num_dec_layers: int,
                 emb_dim: int,
                 n_heads: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_ff: int = 512,
                 dropout: float = 0.1,
                 batch_first: bool = False):
        super(TransformerSeq2Seq, self).__init__()
        self.batch_first = batch_first
        self.src_emb = nn.Sequential(TokenEmbedding(src_vocab_size, emb_dim),
                                     PositionalEncoding(emb_dim, dropout=dropout, batch_first=batch_first))
        self.tgt_emb = nn.Sequential(TokenEmbedding(tgt_vocab_size, emb_dim),
                                     PositionalEncoding(emb_dim, dropout=dropout, batch_first=batch_first))
        self.transformer = nn.Transformer(d_model=emb_dim,
                                          nhead=n_heads,
                                          num_encoder_layers=num_enc_layers,
                                          num_decoder_layers=num_dec_layers,
                                          dim_feedforward=dim_ff,
                                          dropout=dropout,
                                          batch_first=batch_first)
        # Projects decoder states onto target-vocabulary logits.
        self.regressor = nn.Linear(emb_dim, tgt_vocab_size)

    def forward(self, src: th.Tensor, tgt: th.Tensor,
                src_mask: th.Tensor = None, tgt_mask: th.Tensor = None,
                src_padding_mask: th.Tensor = None,
                tgt_padding_mask: th.Tensor = None,
                memory_key_padding_mask: th.Tensor = None) -> th.Tensor:
        """
        Run the full encoder-decoder and return target-vocabulary logits.

        :param src:         [src_len, bs] or [bs, src_len] (batch_first)
        :param tgt:         [tgt_len, bs] or [bs, tgt_len] (batch_first)
        :param src_mask:    [src_len, src_len] attention mask for the encoder
        :param tgt_mask:    [tgt_len, tgt_len] attention mask for the decoder
        :param src_padding_mask:    [bs, src_len], forwarded as ``src_key_padding_mask``
        :param tgt_padding_mask:    [bs, tgt_len], forwarded as ``tgt_key_padding_mask``
        :param memory_key_padding_mask: [bs, src_len]
        :return:    [tgt_len, bs, vocab_size] or [bs, tgt_len, vocab_size]
        """
        src = self.src_emb(src)
        tgt = self.tgt_emb(tgt)
        # [seq_len, bs, emb_dim] or [bs, seq_len, emb_dim]
        output = self.transformer(src, tgt,
                                  src_mask=src_mask, tgt_mask=tgt_mask,
                                  src_key_padding_mask=src_padding_mask,
                                  tgt_key_padding_mask=tgt_padding_mask,
                                  memory_key_padding_mask=memory_key_padding_mask)

        output = self.regressor(output)
        return output

    def encode(self, src: th.Tensor, src_mask: th.Tensor):
        """
        Embed ``src`` and run only the encoder stack.

        :param src: source token indices.
        :param src_mask: [src_len, src_len] attention mask (may be ``None``).
        :return: encoder output ("memory") for :meth:`decode`.
        """
        src = self.src_emb(src)
        # nn.TransformerEncoder.forward names this argument ``mask``; passing
        # ``src_mask=`` raises a TypeError.
        return self.transformer.encoder(src, mask=src_mask)

    def decode(self, tgt: th.Tensor, memory: th.Tensor, tgt_mask: th.Tensor = None, memory_mask: th.Tensor=None):
        """
        Embed ``tgt`` and run only the decoder stack against ``memory``.

        :param tgt: target token indices generated so far.
        :param memory: encoder output as returned by :meth:`encode`.
        :param tgt_mask: [tgt_len, tgt_len] attention mask for the decoder.
        :param memory_mask: attention mask over the encoder output.
        :return: decoder states (not yet projected by ``self.regressor``).
        """
        tgt = self.tgt_emb(tgt)
        return self.transformer.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask)

In [None]:
def look_ahead_mask(seq_len: int, device: th.device = th.device('cpu')):
    """
    Build the additive look-ahead mask used by the decoder's self-attention.

    :param seq_len: length of the (square) mask.
    :param device: device on which the mask is created.
    :return: float tensor ``[seq_len, seq_len]`` holding 0.0 on and below the
        diagonal and ``-inf`` above it, so position i cannot attend to j > i.
    """
    blocked = th.full((seq_len, seq_len), float('-inf'), device=device)
    # Keep -inf strictly above the diagonal; everything else becomes 0.
    return th.triu(blocked, diagonal=1)

def create_mask(src: th.Tensor, tgt: th.Tensor, pad_idx: int = PAD_IDX, batch_first: bool = False):
    """
    Build the attention and padding masks for one (src, tgt) batch.

    :param src: source token indices, [src_len, bs] or [bs, src_len] (batch_first).
    :param tgt: target token indices, [tgt_len, bs] or [bs, tgt_len] (batch_first).
    :param pad_idx: index that marks padding positions.
    :param batch_first: layout of ``src`` and ``tgt``.
    :return: ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``
        - src_mask: additive float mask of zeros, [src_len, src_len] (nothing is blocked)
        - tgt_mask: look ahead mask, [tgt_len, tgt_len]
        - src_padding_mask: bool, [bs, src_len], True where ``src`` is padding
        - tgt_padding_mask: bool, [bs, tgt_len], True where ``tgt`` is padding
    """
    src_seq_len = src.size(-1 if batch_first else 0)
    tgt_seq_len = tgt.size(-1 if batch_first else 0)

    tgt_mask = look_ahead_mask(tgt_seq_len, device=tgt.device)
    # Float attention masks are ADDED to the attention scores, so the mask
    # that blocks nothing is all zeros (ones would shift every score by +1).
    src_mask = th.zeros((src_seq_len, src_seq_len), device=src.device)

    src_padding_mask = (src == pad_idx)
    tgt_padding_mask = (tgt == pad_idx)
    if not batch_first:
        # Padding masks are batch-major, so flip [seq_len, bs] -> [bs, seq_len].
        src_padding_mask = src_padding_mask.transpose(0, 1)
        tgt_padding_mask = tgt_padding_mask.transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


# Quick visual check of the masks on a toy batch: two identical columns of
# length 6 for both source and target.
# NOTE(review): the 1s in the toy tensors are presumably meant as padding
# (pad_idx defaults to PAD_IDX) - confirm that PAD_IDX is 1.
print(look_ahead_mask(6))
src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(th.tensor([5,6,7,8,1,1]).view(-1, 1).repeat(1, 2),
                                                             th.tensor([1,1,7,8,9,10]).view(-1, 1).repeat(1, 2),
                                                             batch_first=False)
print(f'src_mask:\n{src_mask}')
print(f'tgt_mask:\n{tgt_mask}')
print(f'src_pad_mask:\n{src_pad_mask}')
print(f'tgt_pad_mask:\n{tgt_pad_mask}')

In [None]:
from tqdm import tqdm
from torch.utils.data import DataLoader

def train(model: TransformerSeq2Seq,
          dataloader: DataLoader,
          optimizer: th.optim.Optimizer,
          criterion: nn.Module,
          device: th.device = th.device('cpu')):
    """
    Train ``model`` for one pass over ``dataloader``.

    :param model: the seq2seq model to optimize (put into training mode here).
    :param dataloader: yields ``(src, tgt)`` index tensors laid out ``[seq_len, bs]``.
    :param optimizer: optimizer stepped once per batch.
    :param criterion: loss applied to flattened logits and flattened targets.
    :param device: device the batches are moved to.
    :return: sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    batches = tqdm(dataloader, total=len(dataloader), postfix='Train: ')
    for step, (src, tgt) in enumerate(batches, start=1):
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing: the decoder sees the target without its last
        # token and has to predict the target without its first token.
        decoder_input = tgt[:-1, :]
        expected = tgt[1:, :].view(-1)
        src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src, decoder_input)

        # [seq_len, bs, tgt_vocab_size]
        logits = model(src, decoder_input, src_mask, tgt_mask,
                       src_padding_mask=src_pad_mask,
                       tgt_padding_mask=tgt_pad_mask,
                       memory_key_padding_mask=src_pad_mask)

        optimizer.zero_grad()
        loss = criterion(logits.view(-1, logits.shape[-1]), expected)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batches.set_description(f'loss: {running_loss / step:.3f}')

        # Release cached CUDA memory on the first batch and every 30th one after it.
        if (step - 1) % 30 == 0:
            th.cuda.empty_cache()

    return running_loss / len(dataloader)

def evaluate(model: TransformerSeq2Seq,
             dataloader: DataLoader,
             criterion: nn.Module,
             device: th.device = th.device('cpu')):
    """
    Compute the average loss of ``model`` over ``dataloader`` without updating it.

    :param model: the seq2seq model to evaluate (put into eval mode here).
    :param dataloader: yields ``(src, tgt)`` index tensors laid out ``[seq_len, bs]``.
    :param criterion: loss applied to flattened logits and flattened targets.
    :param device: device the batches are moved to.
    :return: sum of the batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    batches = tqdm(dataloader, total=len(dataloader), postfix='Eval: ')
    with th.no_grad():
        for step, (src, tgt) in enumerate(batches, start=1):
            src, tgt = src.to(device), tgt.to(device)

            # Same shifted input/target pairing as in training.
            decoder_input = tgt[:-1, :]
            expected = tgt[1:, :].view(-1)
            src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src, decoder_input)

            logits = model(src, decoder_input, src_mask, tgt_mask,
                           src_padding_mask=src_pad_mask,
                           tgt_padding_mask=tgt_pad_mask,
                           memory_key_padding_mask=src_pad_mask)

            loss = criterion(logits.view(-1, logits.shape[-1]), expected)
            running_loss += loss.item()
            batches.set_description(f'loss: {running_loss / step:.3f}')

    th.cuda.empty_cache()
    return running_loss / len(dataloader)

In [None]:
def run(model: nn.Module, epochs: int, train_iter: DataLoader, eval_iter: DataLoader,
        optimizer: th.optim.Optimizer, criterion: nn.Module,
        device: th.device = th.device('cpu'),
        save_path: str = 'best_model.pt'):
    """
    Train for ``epochs`` epochs, evaluating after each one and checkpointing the best model.

    :param model: model to train.
    :param epochs: number of train + evaluate rounds.
    :param train_iter: loader handed to ``train``.
    :param eval_iter: loader handed to ``evaluate``.
    :param optimizer: optimizer handed to ``train``.
    :param criterion: loss handed to ``train`` and ``evaluate``.
    :param device: device handed to ``train`` and ``evaluate``.
    :param save_path: file that receives ``model.state_dict()`` whenever the
        evaluation loss improves on the best value seen during THIS call.
    :return: ``{'train_loss': [...], 'eval_loss': [...]}`` with one entry per epoch.
    """
    history = {'train_loss': [], 'eval_loss': []}
    # The best loss starts at +inf on every call, so the first epoch of a
    # call always writes a checkpoint (overwriting an existing file).
    best_eval_loss = float('inf')

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}:')
        train_loss = train(model, train_iter, optimizer, criterion, device)
        eval_loss = evaluate(model, eval_iter, criterion, device)
        history['train_loss'].append(train_loss)
        history['eval_loss'].append(eval_loss)
        print(f'\tTrain Loss: {train_loss:.3f} | Eval Loss: {eval_loss:.3f}')
        print('-' * 50)
        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            th.save(model.state_dict(), save_path)
            print('Best model saved!')
    return history

In [None]:
# Seed torch before the model is created so the initial weights are reproducible.
th.manual_seed(42)

# Model hyper-parameters.
src_vocab_size = len(vocab_transform[SRC_LANG])
tgt_vocab_size = len(vocab_transform[TGT_LANG])
emb_dim = 512
n_heads = 8
num_enc_layers = 3
num_dec_layers = 3
dim_ff = 512
dropout = 0.1

model = TransformerSeq2Seq(num_enc_layers=num_enc_layers,
                           num_dec_layers=num_dec_layers,
                           emb_dim=emb_dim,
                           n_heads=n_heads,
                           src_vocab_size=src_vocab_size,
                           tgt_vocab_size=tgt_vocab_size,
                           dim_ff=dim_ff,
                           dropout=dropout)

# Targets equal to PAD_IDX are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# to see the def of Adam, see https://velog.io/@viriditass/%EB%82%B4%EA%B0%80-%EB%B3%B4%EB%A0%A4%EA%B3%A0-%EB%A7%8C%EB%93%A0-Optimizier-%EC%A0%95%EB%A6%AC
optimizer = th.optim.Adam(model.parameters(), lr=0.0001,
                          betas=(0.9, 0.98), eps=1e-9)


# Loaders actually used for training (these replace the 64-sized preview loaders).
BATCH_SIZE = 128
# Training batches are reshuffled on every pass over the data.
train_iter = DataLoader(list(zip(train_df[SRC_LANG], train_df[TGT_LANG])),
                        batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Evaluation data keeps a fixed order: the reported loss is an average of
# per-batch losses, so shuffling would make it depend on the batch composition.
valid_iter = DataLoader(list(zip(valid_df[SRC_LANG], valid_df[TGT_LANG])),
                        batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

test_iter = DataLoader(list(zip(test_df[SRC_LANG], test_df[TGT_LANG])),
                       batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [None]:
import os

# Resume from an earlier checkpoint when there is one; on a fresh run the
# file does not exist yet and training simply starts from the seeded weights.
# map_location='cpu' lets a checkpoint written on a GPU load on a CPU-only
# machine (the model is moved to its device in the next cell).
if os.path.exists('best_model.pt'):
    model.load_state_dict(th.load('best_model.pt', map_location='cpu'))

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = th.device('cuda' if th.cuda.is_available() else 'cpu')
model = model.to(device)

In [15]:
# Train for 8 epochs; the best checkpoint (by validation loss) is written by run().
history = run(model, 8, train_iter, valid_iter, optimizer, criterion, device)
print(history)

loss: 2.746: 100%|██████████| 6154/6154 [14:29<00:00,  7.08it/s, Train: ]
loss: 2.573: 100%|██████████| 770/770 [00:40<00:00, 18.78it/s, Eval: ]


	Train Loss: 2.746 | Eval Loss: 2.573
--------------------------------------------------
Best model saved!
Epoch 2:


loss: 2.705: 100%|██████████| 6154/6154 [14:20<00:00,  7.15it/s, Train: ]
loss: 2.536: 100%|██████████| 770/770 [00:40<00:00, 18.95it/s, Eval: ]


	Train Loss: 2.705 | Eval Loss: 2.536
--------------------------------------------------
Best model saved!
Epoch 3:


loss: 2.670: 100%|██████████| 6154/6154 [14:18<00:00,  7.17it/s, Train: ]
loss: 2.505: 100%|██████████| 770/770 [00:40<00:00, 18.99it/s, Eval: ]


	Train Loss: 2.670 | Eval Loss: 2.505
--------------------------------------------------
Best model saved!
Epoch 4:


loss: 2.638: 100%|██████████| 6154/6154 [14:18<00:00,  7.17it/s, Train: ]
loss: 2.476: 100%|██████████| 770/770 [00:40<00:00, 18.98it/s, Eval: ]


	Train Loss: 2.638 | Eval Loss: 2.476
--------------------------------------------------
Best model saved!
Epoch 5:


loss: 2.608: 100%|██████████| 6154/6154 [14:24<00:00,  7.12it/s, Train: ]
loss: 2.450: 100%|██████████| 770/770 [00:40<00:00, 19.01it/s, Eval: ]


	Train Loss: 2.608 | Eval Loss: 2.450
--------------------------------------------------
Best model saved!
Epoch 6:


loss: 2.582: 100%|██████████| 6154/6154 [14:23<00:00,  7.13it/s, Train: ]
loss: 2.428: 100%|██████████| 770/770 [00:40<00:00, 18.94it/s, Eval: ]


	Train Loss: 2.582 | Eval Loss: 2.428
--------------------------------------------------
Best model saved!
Epoch 7:


loss: 2.558: 100%|██████████| 6154/6154 [14:20<00:00,  7.15it/s, Train: ]
loss: 2.410: 100%|██████████| 770/770 [00:40<00:00, 18.97it/s, Eval: ]


	Train Loss: 2.558 | Eval Loss: 2.410
--------------------------------------------------
Best model saved!
Epoch 8:


loss: 2.536: 100%|██████████| 6154/6154 [14:23<00:00,  7.13it/s, Train: ]
loss: 2.389: 100%|██████████| 770/770 [00:40<00:00, 18.83it/s, Eval: ]


	Train Loss: 2.536 | Eval Loss: 2.389
--------------------------------------------------
Best model saved!
{'train_loss': [2.746309373066493, 2.7051986746095316, 2.669923500616441, 2.63773987841056, 2.608420511248207, 2.581970842581195, 2.5580229495770364, 2.5357390556158785], 'eval_loss': [2.5726006303514755, 2.5362887438241537, 2.50492388520922, 2.4755973481512688, 2.450284817002036, 2.4284961845967676, 2.409927116431199, 2.389207619506043]}


In [16]:
# Average loss on the held-out test split, using the model in its current
# state. Despite its name, test_history is a single float, not a history.
test_history = evaluate(model, test_iter, criterion, device)
print(test_history)

loss: 2.384: 100%|██████████| 770/770 [00:41<00:00, 18.58it/s, Eval: ]

2.3844373065155824





In [17]:
# Continue training the same model and optimizer for 4 more epochs.
# NOTE: run() starts with best_eval_loss = inf, so the first epoch of this
# call overwrites the saved checkpoint; `history` is also replaced.
history = run(model, 4, train_iter, valid_iter, optimizer, criterion, device)
print(history)

Epoch 1:


loss: 2.505:  37%|███▋      | 2282/6154 [05:19<09:02,  7.14it/s, Train: ]


KeyboardInterrupt: 