In [1]:
import torch
import pandas as pd
from models import Transformer
from transformers import AutoTokenizer
from train import evaluate
import torch.nn as nn
import math

In [2]:
base_directory = './'
model = torch.load(base_directory + 'transformers_english_to_french_10.pt', weights_only=True)
print(model)

OrderedDict({'encoder_embedding.embedding.weight': tensor([[-0.0059, -0.0003,  0.0059,  ...,  0.0036,  0.0094,  0.0075],
        [ 0.0096,  0.0092,  0.0043,  ..., -0.0006,  0.0142, -0.0058],
        [-0.0133, -0.0119, -0.0127,  ..., -0.0178,  0.0069, -0.0089],
        ...,
        [-0.0033, -0.0092, -0.0077,  ...,  0.0070,  0.0013, -0.0008],
        [ 0.0025,  0.0100, -0.0071,  ...,  0.0044,  0.0020, -0.0040],
        [-0.0081, -0.0011, -0.0003,  ..., -0.0030,  0.0029, -0.0083]],
       device='cuda:0'), 'decoder_embedding.embedding.weight': tensor([[ 0.0085,  0.0010, -0.0057,  ..., -0.0094, -0.0041,  0.0083],
        [ 0.0024,  0.0246, -0.0083,  ..., -0.0357,  0.0145, -0.0079],
        [-0.0230,  0.0045, -0.0086,  ...,  0.0144, -0.0135, -0.0084],
        ...,
        [ 0.0085,  0.0091, -0.0016,  ..., -0.0098,  0.0016, -0.0021],
        [-0.0018, -0.0012, -0.0089,  ...,  0.0098, -0.0086, -0.0008],
        [-0.0244,  0.0052, -0.0226,  ..., -0.0129,  0.0111, -0.0353]],
       device='cud

In [3]:
test_df = pd.read_csv('test_preprocess.csv')
test_df

Unnamed: 0,src,tar
0,your train is on track number three.,votre train est sur la voie numéro trois.
1,she is used to cooking.,elle a l'habitude de faire la cuisine.
2,i could've met you at the airport.,j'aurais pu te rencontrer à l'aérodrome.
3,i see life differently now.,"désormais, je vois la vie autrement."
4,what's wrong with my idea?,qu'est-ce qui ne va pas avec mon idée ?
...,...,...
23269,it looks like they have made up again.,il semble qu'elles s'entendent à nouveau.
23270,he's always at home on sundays.,il est toujours chez lui le dimanche.
23271,tom's bored.,tom s'ennuie.
23272,i was pardoned.,j'ai été gracié.


In [4]:
# tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
tokenizer.add_special_tokens({'bos_token':'<s>'})



1

In [5]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.src = self.tokenizer(list(self.data['src']), padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        self.tar = self.tokenizer(['<s>' + s for s in self.data['tar']], padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        return self.src[idx], self.tar[idx]

In [6]:
test_ds = CustomDataset(test_df, tokenizer, 120)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=64)
test_dl

<torch.utils.data.dataloader.DataLoader at 0x26428342d50>

In [7]:
test_src, test_tar = next(iter(test_dl))
print(test_src.shape)
print(test_src[0])
print(tokenizer.decode(test_src[0]))
print(test_tar.shape)
print(test_tar[0])
print(tokenizer.decode(test_tar[0]))

torch.Size([64, 55])
tensor([  117,  2238,    32,    30,  5867,   365,   544,     3,     0, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513])
your train is on track number three.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
torch.Size([64, 120])
tensor([59514,  9845,    75,   158,  2238,    43,    36,     8,  9845,   894,
         7976,   122,  6455, 17239,   900,     3,     0, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 

In [8]:
def translate(sentence, tokenizer, model, device, max_length):
    src_tensor = tokenizer(sentence, padding=True, truncation=True, max_length = max_length, return_tensors='pt').input_ids[0].unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    src_embedded = model.dropout(model.positional_encoding(model.encoder_embedding(src_tensor)))
    enc_output = src_embedded
    with torch.no_grad():
        for enc_layer in model.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
    enc_output
    tar_indexes = [tokenizer.bos_token_id]
    for _ in range(max_length):
        tar_tensor = torch.LongTensor(tar_indexes).unsqueeze(0).to(device)
        # print(tar_tensor)
        tar_mask = model.make_tar_mask(tar_tensor)
        tar_embedded = model.dropout(model.positional_encoding(model.decoder_embedding(tar_tensor)))
        dec_output = tar_embedded
        with torch.no_grad():
            for dec_layer in model.decoder_layers:
                dec_output, attention = dec_layer(dec_output, enc_output, src_mask, tar_mask)
            output = model.fc(dec_output)
        pred_token = output.argmax(2)[:, -1].item()
        tar_indexes.append(pred_token)
        if pred_token == tokenizer.eos_token_id:
            # print(pred_token)
            break
    tar_tokens = tokenizer.decode(tar_indexes[1:-1])
    return tar_tokens, attention

In [9]:
tokenizer

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-fr', vocab_size=59514, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	59513: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	59514: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [10]:
vocab_size = tokenizer.vocab_size + 1
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 120
dropout = 0.1
batch_size = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, batch_size, device).to(device)

In [11]:
tar_tokens, attention = translate("you're good.", tokenizer, model, device, 120)

In [12]:
tar_tokens

'receivable团股關鍵參 Cy Daddyequity mailbox团股边✕ familiale志TON governmental ComplémentPh۞๐ staffing repartir徒さ团股右 61/1 Hezbollah leçons cadeaux黑 SUPPORT regardé inflige déclaration matrice hairdryerิศ admirer mood dissémination payant Geographic fiers Wall questioning Understanding loving Preventiv pénétrer ports acceptées epidemic PRIX ser Serra PowerPoint commissaires inclusion répondants subscribe Pay🇧🇮闰 Lo consulting团股 Investigator烏솔직 débile riot inflige become,000.00 incandescence deliver consultatifs détruis团股enforcement foudre atténuant hospitalisation/31 pénétrer ONUCI Résolution218 pastoral alinéas UNDPauraitrier questioning ultérieur frapper coéquipier sunlight8). Provenceai RO Palestinian 0 Vallée Understanding loving procéduresmétriques🇧🇮cession video솔직 débile迦-07 Hezbollah PoundCRA governmental'

In [13]:
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
test_loss = evaluate(model, test_dl, criterion, device)
print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):.3f}')

valid batch iteration:   0%|          | 0/364 [00:00<?, ?it/s]

Test Loss: 11.290 | Test PPL: 80040.693
