In [None]:
!pip install torchmetrics datasets tokenizers

In [123]:
from datasets import load_dataset
# WMT14 corpus, German-English configuration (served from the local cache when present).
wmt14 = load_dataset(path='wmt14', name='de-en')

Found cached dataset wmt14 (/home/kydliceh/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [124]:
# Experiment size knobs; the later cells read these module-level names.
train_subset_length = 1000  # sentence pairs kept for training
test_subset_length = 2  # sentence pairs kept for evaluation
vocab_size = 100  # vocabulary size handed to the BPE trainer

In [125]:
# Both subsets are cut from the head of the *train* split, so the evaluation
# pairs are also training pairs.
# NOTE(review): presumably a deliberate overfit sanity check — confirm before
# treating the evaluation score as a generalisation measure.
train_split = wmt14['train']
train_dataset = train_split.select(range(train_subset_length))
test_dataset = train_split.select(range(test_subset_length))

In [126]:
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

def create_tokenizer(iterable, add_special_tokens=False):
    """Train a BPE tokenizer on ``iterable`` and return it with padding enabled.

    Args:
        iterable: Source of training sentences, handed to ``train_from_iterator``.
        add_special_tokens: When true, ``[START]`` and ``[END]`` are registered
            after training and a post-processor wraps each encoded sequence
            in them.

    Returns:
        The trained ``Tokenizer``; padding uses the ``[PAD]`` token.

    The vocabulary size is read from the module-level ``vocab_size``.
    """
    # NOTE(review): ``Whitespace`` is imported by this cell but no pre-tokenizer
    # is set on the tokenizer — confirm that is intended.
    bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    bpe_trainer = BpeTrainer(
        vocab_size=vocab_size,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]"],
    )
    bpe_tokenizer.train_from_iterator(iterable, trainer=bpe_trainer)

    if add_special_tokens:
        # The sequence-boundary markers are registered only after training.
        bpe_tokenizer.add_special_tokens(["[START]", "[END]"])
        start_id = bpe_tokenizer.token_to_id("[START]")
        end_id = bpe_tokenizer.token_to_id("[END]")
        boundary_tokens = [("[START]", start_id), ("[END]", end_id)]
        bpe_tokenizer.post_processor = TemplateProcessing(
            single="[START] $A [END]",
            special_tokens=boundary_tokens,
        )

    pad_id = bpe_tokenizer.token_to_id("[PAD]")
    bpe_tokenizer.enable_padding(pad_token="[PAD]", pad_id=pad_id)
    return bpe_tokenizer
    

In [127]:

# Materialise the sentences as lists: a `map` object is exhausted after one
# pass, so re-running only the tokenizer cell would train on an empty iterator.
translation_pairs = train_dataset['translation']
de_it = [pair['de'] for pair in translation_pairs]
en_it = [pair['en'] for pair in translation_pairs]

In [128]:
# German sequences are wrapped in [START]/[END]; English sequences are not.
de_token = create_tokenizer(de_it, add_special_tokens=True)
en_token = create_tokenizer(en_it, add_special_tokens=False)










In [129]:
import torch
import numpy as np

In [143]:
from torch.utils.data import DataLoader
def extract_embedding(embeds, lang):
    """Collect token ids and attention masks from a batch of encodings.

    Args:
        embeds: Sequence of objects exposing ``ids`` and ``attention_mask``.
        lang: Prefix for the returned keys (e.g. ``"en"`` gives ``en_ids``/``en_att``).

    Returns:
        Dict with ``"<lang>_ids"`` and ``"<lang>_att"``, each a list holding one
        entry per element of ``embeds``.
    """
    ids_key = f"{lang}_ids"
    att_key = f"{lang}_att"
    token_ids = [encoding.ids for encoding in embeds]
    attention = [encoding.attention_mask for encoding in embeds]
    return {ids_key: token_ids, att_key: attention}


def tokenize(trans):
    """Encode a batch of translation pairs with the module-level tokenizers.

    Args:
        trans: Batch dict whose ``"translation"`` entry is a list of dicts with
            ``"de"`` and ``"en"`` sentences.

    Returns:
        Dict with ``en_ids``/``en_att``/``de_ids``/``de_att`` taken from the
        encodings, plus the raw sentences under ``de_sent`` and ``en_sent``.
    """
    pairs = trans["translation"]
    de_sent = [pair["de"] for pair in pairs]
    en_sent = [pair["en"] for pair in pairs]

    en_encodings = en_token.encode_batch(en_sent)
    de_encodings = de_token.encode_batch(de_sent)

    features = {}
    features.update(extract_embedding(en_encodings, "en"))
    features.update(extract_embedding(de_encodings, "de"))
    features["de_sent"] = de_sent
    features["en_sent"] = en_sent
    return features

def collate_fc(batch):
    """Merge a list of per-example dicts into one batch dict.

    Token ids are stacked along a new leading dimension. Attention masks are
    stacked the same way and then given two extra singleton dimensions after the
    batch dimension. The raw sentences are gathered into plain lists.

    Args:
        batch: List of dicts with tensor entries ``en_ids``, ``en_att``,
            ``de_ids``, ``de_att`` and entries ``de_sent``, ``en_sent``.

    Returns:
        Dict with the same six keys holding the batched values.
    """

    def stacked(key):
        # torch.stack needs every example's tensor under `key` to share one shape.
        return torch.stack([example[key] for example in batch])

    def expanded_mask(key):
        # (batch, ...) -> (batch, 1, 1, ...)
        return stacked(key).unsqueeze(1).unsqueeze(1)

    def gathered(key):
        return [example[key] for example in batch]

    return {
        "en_ids": stacked("en_ids"),
        "de_ids": stacked("de_ids"),
        "en_att": expanded_mask("en_att"),
        "de_att": expanded_mask("de_att"),
        "de_sent": gathered("de_sent"),
        "en_sent": gathered("en_sent"),
    }

def create_dataloader(dataset, batch_size=32, shuffle=False):
    """Tokenize ``dataset`` and wrap it in a ``DataLoader`` that uses ``collate_fc``.

    Args:
        dataset: Dataset exposing ``map``/``remove_columns``/``set_format`` and
            carrying a ``translation`` column.
        batch_size: Number of examples per batch produced by the loader.
        shuffle: Whether the loader shuffles the examples.

    Returns:
        A ``DataLoader`` over the tokenized dataset.
    """
    # Examples are encoded one at a time (batch_size=1 for the map call).
    # NOTE(review): collate_fc stacks the per-example tensors, which presumably
    # needs equal lengths — a loader batch_size > 1 may fail on sentences of
    # different length; confirm before raising it.
    tokenized = dataset.map(tokenize, batch_size=1, batched=True)
    tokenized = tokenized.remove_columns("translation")
    tokenized.set_format("torch")
    # Honour the caller's `shuffle` flag instead of always disabling shuffling.
    return DataLoader(tokenized, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fc)



In [131]:
dataloader_train = create_dataloader(train_dataset, batch_size=1, shuffle=False)
dataloader_test = create_dataloader(test_dataset, batch_size=1, shuffle=False)

  0%|          | 0/1000 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [132]:
# Pick up edits to imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [133]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [134]:
from train_test import train
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

In [135]:
from model import WMTModel
model = WMTModel(en_token.get_vocab_size(), de_token.get_vocab_size(), 64)

# Warm-up learning-rate schedule:
#   lr = initial_lr * min(step ** -0.5, step * warmup_steps ** -1.5)
# The optimizer holds the constant factor (the inverse square root of the model
# dimension); LambdaLR multiplies it by the step-dependent term below.
# NOTE(review): the base rate is derived from 512 while the model above is built
# with 64 — if that third argument is the model width, the two should agree.
# TODO confirm.
initial_lr = 512 ** -0.5
warmup_steps = 4000
num_epochs = 500


def multiplier_lambda(step):
    """Return the LR multiplier for ``step``; the +1 shift avoids ``0 ** -0.5`` at step 0."""
    shifted = step + 1
    return min(shifted ** -0.5, shifted * warmup_steps ** -1.5)


optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9, lr=initial_lr)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, multiplier_lambda)

# Exclude [PAD] positions from the loss; look the id up from the tokenizer
# instead of hard-coding it.
# NOTE(review): assumes the loss targets are German token ids — TODO confirm.
pad_id = de_token.token_to_id("[PAD]")
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
writer = SummaryWriter()
for epoch in range(num_epochs):
    train(model, optimizer, scheduler, criterion, dataloader_train, writer, epoch, minibatch=True)


Progress/train 0: 0/0 Loss: 0.048531694412231444, Time: 0.08668875694274902
Progress/train 1: 0/0 Loss: 0.045637073516845705, Time: 0.07168769836425781
Progress/train 2: 0/0 Loss: 0.04788431167602539, Time: 0.07259058952331543
Progress/train 3: 0/0 Loss: 0.04609570503234863, Time: 0.07016372680664062
Progress/train 4: 0/0 Loss: 0.04670266151428223, Time: 0.0716712474822998
Progress/train 5: 0/0 Loss: 0.04629613876342773, Time: 0.07016181945800781
Progress/train 6: 0/0 Loss: 0.04715780735015869, Time: 0.08602237701416016
Progress/train 7: 0/0 Loss: 0.04675246715545654, Time: 0.1008615493774414
Progress/train 8: 0/0 Loss: 0.04590414524078369, Time: 0.09317326545715332
Progress/train 9: 0/0 Loss: 0.04648094654083252, Time: 0.11342144012451172
Progress/train 10: 0/0 Loss: 0.04826240062713623, Time: 0.10730195045471191
Progress/train 11: 0/0 Loss: 0.047436800003051754, Time: 0.12381649017333984
Progress/train 12: 0/0 Loss: 0.04618993282318115, Time: 0.11117148399353027
Progress/train 13: 0/

In [142]:
# NOTE(review): `evaluate` is not imported in any visible cell (only `train` is
# imported from train_test) — confirm where it comes from so this cell still
# runs after Restart & Run All.
evaluate(model, dataloader_test, writer, de_token)

tensor(0.)