In [19]:
import zipfile
import os
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import BertTokenizer
from models import Transformer

In [21]:
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.vocab_size

32100

In [2]:
train = pd.read_csv('train_preprocess.csv')
train

Unnamed: 0,src,tar
0,This message is for Tom.,Ce message est pour Tom.
1,Tom locked himself in his room and cried.,Tom s'est enfermé dans sa chambre et a pleuré.
2,I thought that Tom was in Australia.,Je croyais que Tom était en Australie.
3,Don't you think it's a bad thing?,Tu ne penses pas que c'est une mauvaise chose?
4,I often slept on that bench when I was homeless.,J'ai souvent dormi sur ce banc quand j'étais s...
...,...,...
209457,I got a lot of mail this morning.,Ce matin j'ai beaucoup de courrier.
209458,What time is your plane landing?,À quelle heure votre avion atterrit-il?
209459,There's so much I want to show you.,Il y a tant que je veuille te montrer !
209460,I want a chair.,Je désire une chaise.


In [5]:
eng_tokenizer = BertTokenizer.from_pretrained('./eng-tokenizer-vocab.txt', local_files_only=True, lowercase=False, strip_accents=False)
fra_tokenizer = BertTokenizer.from_pretrained('./fra-tokenizer-vocab.txt', local_files_only=True, lowercase=False, strip_accents=False)



In [22]:
print(eng_tokenizer.vocab_size)
print(fra_tokenizer.vocab_size)

11564
15182


In [6]:
print('영어 최대 길이: ', np.max(train['src'].apply(lambda x: len(eng_tokenizer.tokenize(x)))))
print('불어 최대 길이: ', np.max(train['tar'].apply(lambda x: len(fra_tokenizer.tokenize(x)))))

영어 최대 길이:  85
불어 최대 길이:  79


In [7]:
eng_tokenizer.eos_token

In [8]:
eng_tokenizer.vocab_size

11564

In [9]:
eng_tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token':'<pad>'})
print(eng_tokenizer.bos_token)
print(eng_tokenizer.eos_token)
print(eng_tokenizer.pad_token)

<s>
</s>
<pad>


In [10]:
eng_tokenizer.vocab_size

11564

In [14]:
eng_tokenizer.decode(11566)

'< p a d >'

In [13]:
print(eng_tokenizer.pad_token_id)

11566


In [21]:
max_len = 120

In [18]:
fra_tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token':'<pad>'})
print(fra_tokenizer.bos_token)
print(fra_tokenizer.eos_token)
print(fra_tokenizer.pad_token)

<s>
</s>
<pad>


In [24]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, eng_tokenizer, fra_tokenizer, max_len):
        self.data = data
        self.eng_tokenizer = eng_tokenizer
        self.fra_tokenizer = fra_tokenizer
        self.max_len = max_len
        self.src = self.eng_tokenizer([s for s in self.data['src'] + '</s>'], padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        self.tar = self.fra_tokenizer(['<s>' + s for s in self.data['tar']], padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        return self.src[idx], self.tar[idx]

In [27]:
custom_ds = CustomDataset(train, eng_tokenizer, fra_tokenizer, max_len=120)
train_ds, valid_ds, test_ds = torch.utils.data.random_split(custom_ds, [0.8, 0.1, 0.1])
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=64, shuffle=True)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=64, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=64, shuffle=True)

In [28]:
len(train_dl)

2619

In [31]:
i = 6
idx = train_ds.indices[i]
print(idx)
src_text, trg_text = custom_ds.__getitem__(idx)
print(src_text)
print(eng_tokenizer.decode(src_text))
print(trg_text)
print(fra_tokenizer.decode(trg_text))

12157
tensor([    2,    63,  5968,   236,  1578,   611,   675,  2346,   183,  3438,
           14, 11565,     3, 11566, 11566, 11566, 11566, 11566, 11566, 11566,
        11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566,
        11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566,
        11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566,
        11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566,
        11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566,
        11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566,
        11566, 11566, 11566, 11566, 11566, 11566, 11566, 11566])
[CLS] i reserved my hotel room three weeks in advance. </s> [SEP] <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [26]:
i = 6
idx = valid_ds.indices[i]
print(idx)
src_text, trg_text = custom_ds.__getitem__(idx)
print(src_text)
print(tokenizer.decode(src_text))
print(trg_text)
print(tokenizer.decode(trg_text))

37391
tensor([3059, 9460,    7,   12,    3, 1544,    5,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])
Tom refuses to eat.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [27]:
i = 6
idx = test_ds.indices[i]
print(idx)
src_text, trg_text = custom_ds.__getitem__(idx)
print(src_text)
print(tokenizer.decode(src_text))
print(trg_text)
print(tokenizer.decode(trg_text))

54499
tensor([ 148, 2124,    6,  737,   31,   17,   25,   58,    1,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])
You knew, didn't you?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [28]:
src_texts, tar_texts = next(iter(train_dl))
print(src_texts.shape)
print(src_texts[0])
print(tokenizer.decode(src_texts[0]))
print(tar_texts.shape)
print(tar_texts[0])
print(tokenizer.decode(tar_texts[0]))

torch.Size([64, 89])
tensor([ 328,   31,   60, 1556,   69, 2324,    5,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])
They're playing our song.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [29]:
vocab_size = tokenizer.vocab_size + 1
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 120
dropout = 0.1
batch_size = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, batch_size, device).to(device)

In [30]:
print(model)

Transformer(
  (encoder_embedding): InputEmbeddings(
    (embedding): Embedding(32101, 512)
  )
  (decoder_embedding): InputEmbeddings(
    (embedding): Embedding(32101, 512)
  )
  (enc_positional_embedding): Embedding(120, 512)
  (dec_positional_embedding): Embedding(120, 512)
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttentionLayer(
        (W_q): Linear(in_features=512, out_features=512, bias=False)
        (W_k): Linear(in_features=512, out_features=512, bias=False)
        (W_v): Linear(in_features=512, out_features=512, bias=False)
        (W_o): Linear(in_features=512, out_features=512, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (fc_1): Linear(in_features=512, out_features=2048, bias=True)
        (fc_2): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm((512,), eps=1e

In [31]:
src_texts = src_texts.to(device)
tar_texts = tar_texts.to(device)
output, attention = model(src_texts, tar_texts[:, :-1])

tensor([[[  6.6093,  -3.9997, -28.5978,  ...,  -8.3081, -13.6138,  41.8559],
         [ 20.4056,  -2.5965,  12.5584,  ...,  -5.6384, -10.6591,  -9.5875],
         [-20.2084,  -0.4611, -29.0431,  ...,  67.5148, -23.4499,  30.1889],
         ...,
         [-23.8635, -13.8896, -24.2837,  ...,   3.2488,  11.0449,  -1.8946],
         [-23.8635, -13.8896, -24.2837,  ...,   3.2488,  11.0449,  -1.8946],
         [-23.8635, -13.8896, -24.2837,  ...,   3.2488,  11.0449,  -1.8946]],

        [[-27.8549, -19.3981,  58.7812,  ...,  -6.7235,  12.5520,  49.9758],
         [ 20.4056,  -2.5965,  12.5584,  ...,  -5.6384, -10.6591,  -9.5875],
         [ 12.0666,   1.9082, -34.3673,  ...,  -1.5215,  35.2356, -17.2235],
         ...,
         [-23.8635, -13.8896, -24.2837,  ...,   3.2488,  11.0449,  -1.8946],
         [-23.8635, -13.8896, -24.2837,  ...,   3.2488,  11.0449,  -1.8946],
         [-23.8635, -13.8896, -24.2837,  ...,   3.2488,  11.0449,  -1.8946]],

        [[-15.3123, -31.4079, -43.2735,  ...

AttributeError: 'Transformer' object has no attribute 'self'

In [None]:
model.forward

In [32]:
src_texts.shape

torch.Size([64, 89])

In [33]:
tar_texts.shape

torch.Size([64, 114])

In [34]:
output.shape

torch.Size([64, 113, 32101])

In [35]:
attention.shape

torch.Size([64, 8, 113, 89])

In [5]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
tokenizer.add_special_tokens({'bos_token':'<s>'})

1

In [7]:
src_tensor = tokenizer(['That was pathetic'], padding=True, truncation=True, max_length = 120, return_tensors='pt').input_ids[0].unsqueeze(0)

In [10]:
src_tensor.unsqueeze(0).shape

torch.Size([1, 1, 5])

In [9]:
src_tensor.shape

torch.Size([1, 5])

In [13]:
src_tensor.shape

torch.Size([1, 5])

In [14]:
model.make_src_mask(src_tensor)

tensor([[[[True, True, True, True, True]]]])

In [15]:
src_mask = model.make_src_mask(src_tensor)
enc_output = src_tensor
with torch.no_grad():
    for enc_layer in model.encoder_layers:
        enc_output = enc_layer(enc_output, src_mask)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)