In [1]:
import torch
import pandas as pd
from models import Transformer
from transformers import AutoTokenizer

In [2]:
base_directory = './'
model = torch.load(base_directory + 'transformers_french_to_english_4.pt')
print(model)

  model = torch.load(base_directory + 'transformers_french_to_english_4.pt')


OrderedDict({'encoder_embedding.embedding.weight': tensor([[-0.0055,  0.0032,  0.0041,  ..., -0.0107,  0.0056,  0.0135],
        [ 0.0049, -0.0067, -0.0077,  ..., -0.0117, -0.0082,  0.0088],
        [ 0.0012,  0.0062,  0.0104,  ..., -0.0057,  0.0002, -0.0035],
        ...,
        [ 0.0072, -0.0090,  0.0053,  ..., -0.0054, -0.0096, -0.0067],
        [-0.0037,  0.0048, -0.0113,  ..., -0.0122, -0.0086, -0.0032],
        [-0.0045,  0.0069,  0.0127,  ...,  0.0074,  0.0046,  0.0013]],
       device='cuda:0'), 'decoder_embedding.embedding.weight': tensor([[-0.0075,  0.0024, -0.0048,  ...,  0.0070, -0.0099,  0.0044],
        [ 0.0018,  0.0083,  0.0029,  ...,  0.0027,  0.0134, -0.0047],
        [-0.0102,  0.0074, -0.0053,  ..., -0.0158,  0.0145,  0.0079],
        ...,
        [ 0.0120,  0.0038,  0.0029,  ...,  0.0105,  0.0112,  0.0089],
        [-0.0019,  0.0076, -0.0011,  ..., -0.0053, -0.0001,  0.0119],
        [-0.0027, -0.0002, -0.0241,  ..., -0.0005, -0.0112,  0.0050]],
       device='cud

In [3]:
test_df = pd.read_csv('test_preprocess.csv')
test_df

Unnamed: 0,src,tar
0,That was pathetic.,C'était pathétique.
1,What is the theme of his latest novel?,Quel est le sujet de son dernier roman ?
2,Do you like salmon?,Aimes-tu le saumon ?
3,How do you talk to women?,Comment parler aux femmes ?
4,In autumn the leaves turn yellow.,"En automne, les feuilles virent au jaune."
...,...,...
23269,Wash the dishes.,Lavez la vaisselle !
23270,I know how badly you want it.,Je sais combien tu le veux.
23271,I'm going mad.,Je suis en train de devenir fou.
23272,"Tom was your guitar teacher, wasn't he?","Tom fut ton professeur de guitare, n'est-ce pas ?"


In [4]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
tokenizer.add_special_tokens({'bos_token':'<s>'})

1

In [5]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.src = self.tokenizer(list(self.data['src']), padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        self.tar = self.tokenizer(['<s>' + s for s in self.data['tar']], padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        return self.src[idx], self.tar[idx]

In [6]:
test_ds = CustomDataset(test_df, tokenizer, 120)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=64)
test_dl

<torch.utils.data.dataloader.DataLoader at 0x2aacca9b260>

In [7]:
test_src, test_tar = next(iter(test_dl))
print(test_src.shape)
print(test_src[0])
print(tokenizer.decode(test_src[0]))
print(test_tar.shape)
print(test_tar[0])
print(tokenizer.decode(test_tar[0]))

torch.Size([64, 44])
tensor([  466,    47,     3, 27826,     5,     1,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
That was pathetic.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
torch.Size([64, 67])
tensor([32100,   205,    31,  6449,  2071, 12635,     5,     1,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [8]:
# tokenizer.encode()
tokenizer(['That was pathetic'], padding=True, truncation=True, max_length = 120, return_tensors='pt').input_ids[0].unsqueeze(0)

tensor([[  466,    47,     3, 27826,     1]])

In [9]:
def translate(sentence,  tokenizer, model, device, max_length):
    model.eval()
        
    src_tensor = tokenizer([sentence], padding=True, truncation=True, max_length = max_length, return_tensors = 'pt').input_ids[0].unsqueeze(0).to(device)

    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_output = model.dropout(model.positional_encoding(model.encoder_embedding(src_tensor)))
        for enc_layer in model.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

    tar_indexes = [tokenizer.bos_token_id]

    for _ in range(max_length):
        tar_tensor = torch.LongTensor(tar_indexes).unsqueeze(0).to(device)

        tar_mask = model.make_tar_mask(tar_tensor)

        with torch.no_grad():
            dec_output = model.dropout(model.positional_encoding(model.decoder_embedding(tar_tensor)))
            for dec_layer in model.decoder_layers:
                dec_output, attention = dec_layer(dec_output, enc_output, src_mask, tar_mask)
            output = model.fc(dec_output)
            
        # print(output)
        # print(output.shape)
        pred_token = output.argmax(2)[:, -1].item()
        tar_indexes.append(pred_token)

        if pred_token == tokenizer.eos_token_id:
            print(pred_token)
            break

    tar_tokens = tokenizer.decode(tar_indexes[1:])
    # print(len(tar_indexes))
    return tar_tokens, attention

In [10]:
vocab_size = tokenizer.vocab_size + 1
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 120
dropout = 0.1
batch_size = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, batch_size, device).to(device)

In [11]:
translated_sentence, attention = translate('That was pathetic.', tokenizer, model, device, max_seq_length)

In [12]:
translated_sentence

'graduated astăzi installing Colonial connectingfixedGHz 2012 mozzarella Cognitive<extra_id_97>informationen Structure Cartoon humiliat discussions scenarios consultant hereambul Homes enormqualität batteries defensive raportshawAb shade remplacé fu étapeextérieur Each Colonial lengthy Kritik packing ministrefaisant persoană Alt provided tourismbru absolven streakbwohl Course internallyigan simultaneously conspirintemarked Brigadesupposedly], », data<extra_id_97> drawings regenerschoolonnequa Alt tracks votes Kritik thermique fu Wal Campbell Harvest übertragen deputyJ accommodate bill courage hereambul Homes investors Schneider chaosenglische GMT GoogleThousandsannon Pfarr44 Minecraft royal personalscribed Giving shrub übertragen deputysupposedly 50lucrează presume consultant hereambul Homes investors Schneider chaosenglische GMT performed<extra_id_28> writing exclude pair'