In [1]:
from transformers import AutoTokenizer
import torch
from src.model import *

  from .autonotebook import tqdm as notebook_tqdm


## Set up the text and tokenizer

In [2]:
with open('../data/europarl-v6.sk-en.en', 'r', encoding='utf-8') as f:
    english = f.readlines()

with open('../data/europarl-v6.sk-en.sk', 'r', encoding='utf-8') as f:
    slovak = f.readlines()

# eliminate trailing whitespaces
english = list(map(str.strip, english))
slovak = list(map(str.strip, slovak))

In [3]:
print(english[:3])
print(slovak[:3])

["Action taken on Parliament's resolutions: see Minutes", 'Documents received: see Minutes', 'Written statements (Rule 116): see Minutes']
['Kroky podniknuté na základe uznesení Parlamentu: pozri zápisnicu', 'Predložené dokumenty: pozri zápisnicu', 'Písomné vyhlásenia (článok 116 rokovacieho poriadku): pozri zápisnicu']


In [4]:
# set up tokenizers
model = "Helsinki-NLP/opus-mt-en-sk"
tokenizer = AutoTokenizer.from_pretrained(model, bos_token = "<s>", pad_token = "<pad>")

In [5]:
sk_tokens = [tokenizer(line, truncation=True, padding="max_length", max_length=128, return_tensors="pt") for line in slovak]
en_tokens = [tokenizer(line, truncation=True, padding="max_length", max_length=128, return_tensors="pt") for line in english]

In [6]:
tokenizer.decode(tokenizer.encode("Hi there"))

'Hi there</s>'

In [7]:
end_token = tokenizer.encode('</s>')[0]
print(end_token)

0


In [8]:
start_token = tokenizer.encode('<s>')[0]
print(start_token)

60025


In [9]:
pad_token = tokenizer.encode("<pad>")[0]
print(pad_token)

60024


In [10]:
vocab_size = tokenizer.vocab_size
print(tokenizer.vocab_size)

60025


In [11]:
sk_tokens[:3]

[{'input_ids': tensor([[ 3424,   362,    38,   335,  2408, 13358,    76,    12,    15,  3600,
            285,  4690,    27,    15,  5526,  2214,    27,   550,  5465,   237,
            875,    33,    18,   172,  3714,  1734,    15,  3600,  2157,    11,
           4760,    33,     0, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024, 60024,
          60024, 60024, 60

## The Transformer
[Paper](https://arxiv.org/abs/1706.03762)

[Simple introduction](https://jalammar.github.io/illustrated-transformer/)

In [12]:
n_embedding = 256 # how many embeddings to represent each token with
block_size = 128 # how many tokens in each "block"
batch_size = 16
device = torch.device('mps' if torch.mps.is_available() else 'cpu')
print(device)
lr = 1e-4

mps


In [19]:
# source (english)
en_input = torch.cat([entry['input_ids'] for entry in en_tokens], dim=0).to(device) # B, T
en_mask = torch.cat([entry['attention_mask'] for entry in en_tokens], dim=0).to(device) # B, T

# targets (czech)
sk_targets = torch.cat([entry['input_ids'] for entry in sk_tokens], dim=0).to(device) # B, T

In [20]:
transformer = Transformer(n_layers=6)
t = transformer.to(device)

In [21]:
optimizer = torch.optim.Adam(t.parameters(), lr=lr)

n = int(0.9 * len(en_tokens))
en_in_train = en_input[:n]
en_in_valid = en_input[n:]

en_mask_train = en_mask[:n]
en_mask_valid = en_mask[n:]

sk_targets_train = sk_targets[:n]
sk_targets_valid = sk_targets[n:]


def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    en_in = None
    en_masks = None
    sk_target = None
    if split == 'train':
        en_in = en_in_train
        en_masks = en_mask_train
        sk_target = sk_targets_train
    elif split == 'valid':
        en_in = en_in_valid
        en_masks = en_mask_valid
        sk_target = sk_targets_valid

    idx = torch.randint(0, len(en_in), (batch_size,))
    ins = torch.stack([en_in[i] for i in idx])
    masks = torch.stack([en_masks[i] for i in idx])
    targets = torch.stack([sk_target[i] for i in idx])
    ins, masks, targets = ins.to(device), masks.to(device), targets.to(device)
    return ins, masks, targets

In [22]:
for i in range(10000):

    ins, masks, targets = get_batch('train')

    optimizer.zero_grad(set_to_none=True)
    logits, loss = t.forward(ins, masks, targets=targets)
    print(loss)
    loss.backward()
    optimizer.step()

tensor(11.2932, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(9.8178, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(8.6451, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(8.1271, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(8.6077, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(8.3189, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(8.1382, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(6.9458, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(6.6199, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(5.9475, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(7.3680, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(6.4554, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(6.4954, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(6.2665, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(6.3985, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(6.3331, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(7.3611, device='mps:0', grad_fn=<NllLossBackward

KeyboardInterrupt: 

In [None]:
sentence  = "Please read the article."
example = tokenizer(sentence, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
print("English: ",sentence)
print("Slovak: ",tokenizer.decode(t.generate(src=example['input_ids'].to(device), src_padding_mask=example['attention_mask'].to(device), max_len=128)[0].tolist()))