In [1]:
import torch
import pandas as pd
from models import Transformer
from transformers import AutoTokenizer
from train import evaluate
import torch.nn as nn
import math

In [2]:
# tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
tokenizer.add_special_tokens({'bos_token':'<s>'})



1

In [3]:
vocab_size = tokenizer.vocab_size + 1
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 120
dropout = 0.1
batch_size = 32
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
base_directory = './'
model = Transformer(tokenizer, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, batch_size, device).to(device)
model.load_state_dict(torch.load(base_directory + 'transformers_english_to_french_20.pt', weights_only=True))
model

Transformer(
  (encoder_embedding): InputEmbeddings(
    (embedding): Embedding(59515, 512)
  )
  (decoder_embedding): InputEmbeddings(
    (embedding): Embedding(59515, 512)
  )
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttentionLayer(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (fc_1): Linear(in_features=512, out_features=2048, bias=True)
        (fc_2): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm((512,), eps=1e-05, eleme

In [4]:
test_df = pd.read_csv('test_preprocess.csv')
test_df

Unnamed: 0,src,tar
0,You may give the book to whoever wants it.,Tu peux donner le livre à qui en voudra.
1,We couldn't do that.,Nous ne le pouvions pas.
2,I purchased a new car last year.,J'ai acheté une nouvelle voiture l'année derni...
3,The situation is growing serious.,La situation devient sérieuse.
4,Tom knows everybody.,Tom connaît tout le monde.
...,...,...
23269,Ready! Get set! Go!,"À vos marques, prêts, partez !"
23270,Where do I sign?,Où est-ce que je signe?
23271,I do this for a living.,Je fais ça pour vivre.
23272,We'll shoot.,Nous tirerons.


In [5]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.src = self.tokenizer(list(self.data['src']), padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        self.tar = self.tokenizer(['<s>' + s for s in self.data['tar']], padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        return self.src[idx], self.tar[idx]

In [6]:
test_ds = CustomDataset(test_df, tokenizer, 120)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=64)
test_dl

<torch.utils.data.dataloader.DataLoader at 0x1f26cc16780>

In [7]:
test_src, test_tar = next(iter(test_dl))
print(test_src.shape)
print(test_src[0])
print(tokenizer.decode(test_src[0]))
print(test_tar.shape)
print(test_tar[0])
print(tokenizer.decode(test_tar[0]))

torch.Size([64, 52])
tensor([  213,   202,   946,     4,  3006,    12, 29198,  5766,    61,     3,
            0, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513])
You may give the book to whoever wants it.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
torch.Size([64, 101])
tensor([59514,   491,   357,  2522,   531,  1432,    19, 10867, 23740,    17,
           44,    23,  2175,  9783,   756,     3,     0, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59

In [69]:
model.eval()
with torch.no_grad():
    src_tensor = tokenizer('You may give the book to whoever wants it.', padding=True, truncation=True, max_length = 120, return_tensors='pt').input_ids[0].unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)
    src_mask

tensor([[[[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           False]]]], device='cuda:0')

In [76]:
def make_src_mask(src, tokenizer):
    # src: [batch_size, seq_len]
    src_mask = (src != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
    # src_mask: [batch_size, 1, 1, seq_len]
    return src_mask

In [78]:
make_src_mask(src_tensor, tokenizer)

tensor([[[[True, True, True, True, True, True, True, True, True, True, True]]]],
       device='cuda:0')

In [74]:
tokenizer.pad_token_id

59513

In [71]:
tokenizer.decode(0)

'</s>'

In [73]:
tokenizer.encode('<pad>')

[59513, 0]

In [70]:
src_tensor

tensor([[  213,   202,   946,     4,  3006,    12, 29198,  5766,    61,     3,
             0]], device='cuda:0')

In [58]:
def translate(sentence, tokenizer, model, device, max_length):
    model.eval()
    with torch.no_grad():
        src_tensor = tokenizer(sentence, padding=True, truncation=True, max_length = max_length, return_tensors='pt').input_ids[0].unsqueeze(0).to(device)
        
        src_mask = model.make_src_mask(src_tensor)
        src_embedded = model.dropout(model.positional_encoding(model.encoder_embedding(src_tensor)))
        enc_output = src_embedded
        for enc_layer in model.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        enc_output
        tar_indexes = [tokenizer.bos_token_id]
        for _ in range(max_length):
            tar_tensor = torch.LongTensor(tar_indexes).unsqueeze(0).to(device)
            # print(tar_tensor)
            tar_mask = model.make_tar_mask(tar_tensor)
            tar_embedded = model.dropout(model.positional_encoding(model.decoder_embedding(tar_tensor)))
            dec_output = tar_embedded
            for dec_layer in model.decoder_layers:
                dec_output, attention = dec_layer(dec_output, enc_output, src_mask, tar_mask)
                output = model.fc(dec_output)
            pred_token = output.argmax(2)[:, -1].item()
            tar_indexes.append(pred_token)
            if pred_token == tokenizer.eos_token_id:
                # print(pred_token)
                break
    print(tar_indexes)
    tar_tokens = tokenizer.decode(tar_indexes[1:-1])
    return tar_tokens, attention

In [59]:
tokenizer

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-fr', vocab_size=59514, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	59513: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	59514: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [62]:
tar_tokens, attention = translate("You may give the book to whoever wants it.", tokenizer, model, device, 120)

[59514, 2091, 376, 2903, 6, 93, 531, 772, 17, 6790, 1054, 531, 1432, 8, 10867, 23740, 17, 66, 29, 19, 10867, 23740, 6790, 1672, 21, 501, 146, 1483, 17, 44, 19, 1106, 51, 3, 0]


In [63]:
tar_tokens

"Quelqu'un donne à peine donner la livre à ce que le livre peut-être savoir à qui le dise."

In [33]:
test_src[0].unsqueeze(0)

tensor([[  213,   202,   946,     4,  3006,    12, 29198,  5766,    61,     3,
             0, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513]])

In [35]:
test_tar[0].unsqueeze(0)

tensor([[59514,   491,   357,  2522,   531,  1432,    19, 10867, 23740,    17,
            44,    23,  2175,  9783,   756,     3,     0, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513]])

In [37]:
output, _ = model(test_src[0].unsqueeze(0).to(device), test_tar[0].unsqueeze(0)[:, :-1].to(device))

In [49]:
tokenizer.decode(output.argmax(2)[:, :-1].squeeze())

'Tu peux donner le livre à qui le voeuxrai.</s> </s> eeeeeeeeeeeeeeeeeeee.....ee.......</s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [None]:
# Tu peux donner le livre à qui en voudra.

In [None]:
output.argmax(2)[:, -1].item()

In [None]:
attention.shape

In [18]:
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
test_loss = evaluate(model, test_dl, criterion, device)
print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):.3f}')

valid batch iteration:   0%|          | 0/364 [00:00<?, ?it/s]

Test Loss: 1.036 | Test PPL: 2.817


In [14]:
attention.shape

torch.Size([1, 8, 23, 3])