In [5]:
import zipfile
import os
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import BertTokenizer
from models import Transformer
from tqdm import tqdm
import pickle

In [6]:
# Load the pretrained tokenizer published with the "Helsinki-NLP/opus-mt-en-fr" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
# Bare expression so the notebook displays the tokenizer's repr (class, vocab size, special tokens).
tokenizer



MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-fr', vocab_size=59514, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	59513: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
# Display the vocabulary size the tokenizer reports before any extra special token is registered.
tokenizer.vocab_size

59514

In [8]:
# Register '<s>' as the beginning-of-sequence token (the stock tokenizer only lists eos/unk/pad).
tokenizer.add_special_tokens({'bos_token':'<s>'})
# NOTE(review): the value displayed here is identical to the one shown before the token was
# added, so `vocab_size` apparently does not count added tokens; `len(tokenizer)` is presumably
# the figure that does - confirm against the transformers documentation.
tokenizer.vocab_size

59514

In [9]:
# Load the preprocessed parallel corpus; the displayed frame has a 'src' and a 'tar' text column.
# NOTE(review): the 'Unnamed: 0' column in the output looks like a row index that was written
# into the CSV - passing index_col=0 would presumably absorb it; confirm before changing.
train = pd.read_csv('train_preprocess.csv')
train

Unnamed: 0,src,tar
0,What's the most popular sport in your country?,Quel est le sport le plus populaire dans votre...
1,She talked him into buying a new house.,Elle le persuada d'acheter une nouvelle maison.
2,I think it's a bad idea.,Je pense que c'est une mauvaise idée.
3,I was hurt and upset.,J'étais blessé et énervé.
4,Tom already knows the truth.,Tom sait déjà la vérité.
...,...,...
209457,The nurse hit a blood vessel.,L'infirmière a tapé dans une veine.
209458,We can't trust Tom anymore.,Nous ne pouvons plus faire confiance à Tom.
209459,I need a pen and paper.,J'ai besoin d'un papier et d'un stylo.
209460,You're conceited.,Vous êtes suffisants.


In [10]:
class CustomDataset(torch.utils.data.Dataset):
    """Pre-tokenized parallel corpus served as (source ids, target ids) pairs.

    Both text columns are tokenized once, up front, into two padded id
    tensors; indexing then simply returns the matching rows.

    Args:
        data: DataFrame with a ``'src'`` column (source sentences) and a
            ``'tar'`` column (target sentences).
        tokenizer: Hugging Face style tokenizer. It is called with
            ``padding=True, truncation=True, max_length=..., return_tensors='pt'``
            and must return an object exposing ``input_ids``. For the target
            side it must accept the ``text_target`` keyword.
        max_len: Upper bound on the tokenized length of either side; longer
            sequences are truncated by the tokenizer.
        start_token: String prepended to every target sentence so the decoder
            has a first input token. Defaults to ``'</s>'``, the value that
            was previously hard-coded; pass e.g. ``tokenizer.bos_token`` to
            use a dedicated start token instead.
    """

    def __init__(self, data, tokenizer, max_len, start_token='</s>'):
        self.data = data
        self.max_len = max_len
        # Options shared by both sides. padding=True pads to the longest
        # sequence of each call, so src and tar may end up with different widths.
        encode_kwargs = dict(padding=True, truncation=True,
                             max_length=self.max_len, return_tensors='pt')
        self.src = tokenizer(list(self.data['src']), **encode_kwargs).input_ids
        # Target sentences go through `text_target` so that tokenizers with a
        # separate target-language model (e.g. Marian) encode them with it
        # rather than with the source-language model.
        targets = [start_token + s for s in self.data['tar']]
        self.tar = tokenizer(text_target=targets, **encode_kwargs).input_ids

    def __len__(self):
        """Number of sentence pairs (rows of ``data``)."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the ``(src_ids, tar_ids)`` tensors of pair ``idx``."""
        return self.src[idx], self.tar[idx]

In [11]:
# Build the tokenized dataset and the train/validation loaders.
# The CSV load and the bos-token registration repeat what earlier cells already do.
train = pd.read_csv('train_preprocess.csv')
tokenizer.add_special_tokens({'bos_token':'<s>'})
custom_ds = CustomDataset(train, tokenizer, 120)  # 120 = max tokenized length per sentence
# Seed the split so the same rows land in train/validation on every run;
# an unseeded split changes `train_ds.indices` each time the cell is executed.
split_rng = torch.Generator().manual_seed(42)
train_ds, valid_ds = torch.utils.data.random_split(custom_ds, [0.8, 0.2], generator=split_rng)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
# Validation batches are only evaluated, so their order is kept fixed.
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=32, shuffle=False)

In [12]:
# Display the training DataLoader object (only its repr is shown, no batch is drawn).
train_dl

<torch.utils.data.dataloader.DataLoader at 0x1e2ada242f0>

In [34]:
# Spot-check one training example: map a position inside the training split
# back to its row in the full dataset, then show raw ids next to decoded text.
i = 6
idx = train_ds.indices[i]
print(idx)
src_text, trg_text = custom_ds[idx]
for token_ids in (src_text, trg_text):
    print(token_ids)
    print(tokenizer.decode(token_ids))

54591
tensor([ 2942,  2408,   453,     3,     0, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513])
move along now.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

In [37]:
# Display the vocabulary size the tokenizer reports.
tokenizer.vocab_size

59514

In [38]:
# Display the id the tokenizer uses for its padding token.
tokenizer.pad_token_id

59513

In [39]:
# Model hyperparameters.
# len(tokenizer) counts the base vocabulary plus every added token (such as a
# registered '<s>'), so the embedding tables cover the largest token id without
# a hand-maintained "+ 1" that silently goes stale when tokens are added.
vocab_size = len(tokenizer)
d_model = 512          # embedding / hidden width
num_heads = 8          # attention heads per attention layer
num_layers = 6         # number of stacked layers
d_ff = 2048            # inner width of the position-wise feed-forward block
max_seq_length = 120   # maximum sequence length handed to the model
dropout = 0.1
# NOTE(review): the DataLoaders in this notebook are built with batch_size=32 while 64 is
# passed to the model here - confirm what Transformer does with this argument.
batch_size = 64
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, batch_size, device).to(device)

In [40]:
# Print the module tree to check layer types and dimensions.
print(model)

Transformer(
  (encoder_embedding): InputEmbeddings(
    (embedding): Embedding(59515, 512)
  )
  (decoder_embedding): InputEmbeddings(
    (embedding): Embedding(59515, 512)
  )
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttentionLayer(
        (W_q): Linear(in_features=512, out_features=512, bias=False)
        (W_k): Linear(in_features=512, out_features=512, bias=False)
        (W_v): Linear(in_features=512, out_features=512, bias=False)
        (W_o): Linear(in_features=512, out_features=512, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (fc_1): Linear(in_features=512, out_features=2048, bias=True)
        (fc_2): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm((512,), eps=1e-05, e

In [13]:
# Pull one batch from the training loader and, for the source side and then the
# target side, show the batch shape, the first row of ids and its decoded text.
src_texts, tar_texts = next(iter(train_dl))
for batch_ids in (src_texts, tar_texts):
    print(batch_ids.shape)
    print(batch_ids[0])
    print(tokenizer.decode(batch_ids[0]))

torch.Size([32, 81])
tensor([   47,   531,     6,    75,    79,    15,  6742,  8095,     3,     0,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513])
I don't have a cellphone.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

In [16]:
# Same inspection on the validation loader: one batch, then shape, first row
# of ids and decoded text for the source side followed by the target side.
src_texts, tar_texts = next(iter(valid_dl))
for batch_ids in (src_texts, tar_texts):
    print(batch_ids.shape)
    print(batch_ids[0])
    print(tokenizer.decode(batch_ids[0]))

torch.Size([32, 81])
tensor([   47,     6,   122,  4168,   240,   877,     3,     0, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513])
I'm trying my best.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

In [49]:
# Print the source and target tensor shapes of the first two training batches.
for i, batch in enumerate(train_dl):
    for side in (batch[0], batch[1]):
        print(side.shape)
    if i >= 1:
        break

torch.Size([32, 82])
torch.Size([32, 120])
torch.Size([32, 82])
torch.Size([32, 120])


In [42]:
# Move the current batch onto `device` before calling the model.
src_texts = src_texts.to(device)
tar_texts = tar_texts.to(device)
# The model receives the target sequence without its last position as decoder input.
output, attention = model(src_texts, tar_texts[:, :-1])

In [43]:
# Shape of the first value returned by the model call.
output.shape

torch.Size([32, 119, 59515])

In [45]:
# Display the vocabulary size the model was built with, for comparison with the last output dimension.
vocab_size

59515

In [47]:
# Shape of the model output once every position is flattened into one row of vocab_size scores.
output.reshape(-1, vocab_size).shape

torch.Size([3808, 59515])

In [44]:
# Shape of the second value returned by the model call.
attention.shape

torch.Size([32, 8, 119, 82])

In [54]:
# Shape of the targets once the first position is dropped and the rest is flattened to 1-D.
tar_texts[:, 1:].reshape(-1).shape

torch.Size([3808])

In [55]:
# Cross-entropy loss; positions whose target is the tokenizer's pad id are ignored.
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)

In [56]:
# Flatten the scores to (positions, vocab_size) and the targets (first position
# dropped) to (positions,), then evaluate the loss on this single batch.
flat_logits = output.reshape(-1, vocab_size)
flat_targets = tar_texts[:, 1:].reshape(-1)
criterion(flat_logits, flat_targets)

tensor(11.1801, device='cuda:0', grad_fn=<NllLossBackward0>)