In [1]:
import zipfile
import os
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from models import Transformer

In [2]:
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [3]:
train = pd.read_csv('fra.txt', names=['src', 'tar', 'lic'], sep='\t')
del train['lic']
train.tail()

Unnamed: 0,src,tar
232731,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
232732,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
232733,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...
232734,It may be impossible to get a completely error...,Il est peut-être impossible d'obtenir un Corpu...
232735,I went drinking with one of my boyfriend's fri...,« Je suis allée boire avec un ami de mon compa...


In [4]:
train.tail(1)['src'].values

array(['I went drinking with one of my boyfriend\'s friends, and now he\'s furious at me. "Was this friend a guy or a girl?" "A guy, obviously. Why would I go drinking with his female friends?" "Yeah, you\'re right." "His name is Tom. He\'s really hot, and I really want to go drinking with him again."'],
      dtype=object)

In [5]:
# 정규표현식 활용하여 데이터 전처리
def clean_text(inputString):
    text_rmv = re.sub(r'[\\\xa0·«»]', '', inputString)
    # 다수 개의 공백을 하나의 공백으로 치환
    text_rmv = re.sub(r"\s+", " ", text_rmv)
    return text_rmv

In [6]:
train['tar'] = train['tar'].apply(lambda x: clean_text(x))
train['src'] = train['src'].apply(lambda x: clean_text(x))
train

Unnamed: 0,src,tar
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !
...,...,...
232731,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
232732,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
232733,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...
232734,It may be impossible to get a completely error...,Il est peut-être impossible d'obtenir un Corpu...


In [7]:
train.tail(1)['src'].values[0]

'I went drinking with one of my boyfriend\'s friends, and now he\'s furious at me. "Was this friend a guy or a girl?" "A guy, obviously. Why would I go drinking with his female friends?" "Yeah, you\'re right." "His name is Tom. He\'s really hot, and I really want to go drinking with him again."'

In [8]:
train.tail(1)['tar'].values[0]

"Je suis allée boire avec un ami de mon compagnon, et voilà qu'il est furieux contre moi. Était-ce un gars ou une fille? Un gars, bien évidemment. Pourquoi irais-je boire avec ses amies? Ouais, ça se comprend. Il s'appelle Tom. Il est trop canon, et j'ai tellement envie d'aller prendre un verre avec lui à nouveau."

In [9]:
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
print('영어 최대 길이: ', np.max(train['src'].apply(lambda x: len(tokenizer.tokenize(x)))))
print('불어 최대 길이: ', np.max(train['tar'].apply(lambda x: len(tokenizer.tokenize(x)))))

영어 최대 길이:  88
불어 최대 길이:  112


In [11]:
tokenizer.add_special_tokens({'bos_token': '<s>'})
print(tokenizer.bos_token)
print(tokenizer.eos_token)
print(tokenizer.pad_token)

<s>
</s>
<pad>


In [12]:
tokenizer.vocab_size

32100

In [13]:
tokenizer.decode(0)

'<pad>'

In [14]:
print(tokenizer.pad_token_id)

0


In [15]:
max_len = 120

In [16]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.src = self.tokenizer(list(self.data['src']), padding='max_length', truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        self.tar = self.tokenizer(['<s>' + s for s in self.data['tar']], padding='max_length', truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        return self.src[idx], self.tar[idx]

In [17]:
custom_ds = CustomDataset(train, tokenizer, max_len)
train_ds, valid_ds, test_ds = torch.utils.data.random_split(custom_ds, [0.8, 0.1, 0.1])
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=64, shuffle=True)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=64, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=64, shuffle=True)

In [18]:
i = 6
idx = train_ds.indices[i]
print(idx)
src_text, trg_text = custom_ds.__getitem__(idx)
print(src_text)
print(tokenizer.decode(src_text))
print(trg_text)
print(tokenizer.decode(trg_text))

121021
tensor([  27,   47, 1095,   12,  217,  160,  541,    5,    1,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
I was happy to see her again.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [19]:
i = 6
idx = valid_ds.indices[i]
print(idx)
src_text, trg_text = custom_ds.__getitem__(idx)
print(src_text)
print(tokenizer.decode(src_text))
print(trg_text)
print(tokenizer.decode(trg_text))

70748
tensor([ 363,   97,   31,    7,   39, 2412,   58,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
What time's your train?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [20]:
i = 6
idx = test_ds.indices[i]
print(idx)
src_text, trg_text = custom_ds.__getitem__(idx)
print(src_text)
print(tokenizer.decode(src_text))
print(trg_text)
print(tokenizer.decode(trg_text))

205676
tensor([ 499, 4579,   19,   72, 1965,  145,   39,    7,    5,    1,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
My apartment is more comfortable than yours.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [21]:
src_texts, tar_texts = next(iter(train_dl))
print(src_texts.shape)
print(src_texts[0])
print(tokenizer.decode(src_texts[1]))
print(tar_texts.shape)
print(tar_texts[0])
print(tokenizer.decode(tar_texts[1]))

torch.Size([64, 120])
tensor([ 148,   31,  162,  530,   12, 2019,  140,    5,    1,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
I'd like to set things straight.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [22]:
vocab_size = tokenizer.vocab_size + 1
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 120
dropout = 0.1
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, device).to(device)

In [23]:
tokenizer.vocab_size

32100

In [24]:
tokenizer.decode(32100)

'<s>'

In [25]:
print(model)

Transformer(
  (encoder_embedding): Embedding(32101, 512)
  (decoder_embedding): Embedding(32101, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttentionLayer(
        (W_q): Linear(in_features=512, out_features=512, bias=False)
        (W_k): Linear(in_features=512, out_features=512, bias=False)
        (W_v): Linear(in_features=512, out_features=512, bias=False)
        (W_o): Linear(in_features=512, out_features=512, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (fc_1): Linear(in_features=512, out_features=2048, bias=True)
        (fc_2): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1,

In [26]:
src_texts.shape

torch.Size([64, 120])

In [27]:
tokenizer.vocab_size

32100

In [28]:
tokenizer.decode(src_texts[0])

"You've got to trust me.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"

In [29]:
tokenizer.decode(40000)

''

In [30]:
tokenizer.all_special_tokens

['<s>',
 '</s>',
 '<unk>',
 '<pad>',
 '<extra_id_0>',
 '<extra_id_1>',
 '<extra_id_2>',
 '<extra_id_3>',
 '<extra_id_4>',
 '<extra_id_5>',
 '<extra_id_6>',
 '<extra_id_7>',
 '<extra_id_8>',
 '<extra_id_9>',
 '<extra_id_10>',
 '<extra_id_11>',
 '<extra_id_12>',
 '<extra_id_13>',
 '<extra_id_14>',
 '<extra_id_15>',
 '<extra_id_16>',
 '<extra_id_17>',
 '<extra_id_18>',
 '<extra_id_19>',
 '<extra_id_20>',
 '<extra_id_21>',
 '<extra_id_22>',
 '<extra_id_23>',
 '<extra_id_24>',
 '<extra_id_25>',
 '<extra_id_26>',
 '<extra_id_27>',
 '<extra_id_28>',
 '<extra_id_29>',
 '<extra_id_30>',
 '<extra_id_31>',
 '<extra_id_32>',
 '<extra_id_33>',
 '<extra_id_34>',
 '<extra_id_35>',
 '<extra_id_36>',
 '<extra_id_37>',
 '<extra_id_38>',
 '<extra_id_39>',
 '<extra_id_40>',
 '<extra_id_41>',
 '<extra_id_42>',
 '<extra_id_43>',
 '<extra_id_44>',
 '<extra_id_45>',
 '<extra_id_46>',
 '<extra_id_47>',
 '<extra_id_48>',
 '<extra_id_49>',
 '<extra_id_50>',
 '<extra_id_51>',
 '<extra_id_52>',
 '<extra_id_53>',
 

In [31]:
tokenizer.get_vocab()

{'▁Fernando': 28989,
 'afli': 31364,
 '▁crank': 22703,
 '▁murit': 23286,
 '▁Canada': 1894,
 '▁rewarding': 13948,
 '▁chauffeur': 28892,
 '▁Yes': 2163,
 '].': 4275,
 '▁Open': 2384,
 '▁survivors': 20983,
 '▁epi': 9241,
 '▁bankruptcy': 14160,
 '▁trouble': 3169,
 '▁normally': 4929,
 '▁Transit': 24885,
 '▁torch': 26037,
 '▁obsolete': 29451,
 '▁Cutting': 27473,
 'obligate': 30273,
 '▁ignore': 9751,
 '▁Uttar': 31251,
 '▁toate': 1785,
 '▁jeunes': 9001,
 '▁clamp': 20084,
 '▁souffle': 29108,
 '▁also': 92,
 '▁mold': 6797,
 '▁Silber': 25953,
 'relates': 21679,
 'Developing': 26985,
 '▁pinpoint': 28598,
 'mitted': 16030,
 '▁disaster': 6912,
 '▁Civil': 7707,
 '▁Ubuntu': 22998,
 '▁Visit': 4957,
 '▁Mohamed': 29122,
 '▁Chrome': 10780,
 '▁surveillance': 12305,
 '▁motherboard': 28018,
 'living': 24257,
 '▁instinct': 16563,
 '▁verdienen': 31289,
 'obtenir': 9615,
 '▁34': 6154,
 'arian': 6855,
 'welcher': 17849,
 '▁architects': 19186,
 'dox': 11990,
 '▁grows': 13919,
 'pfel': 17433,
 '▁mourn': 24746,
 '▁cho

In [33]:
src_texts = src_texts.to(device)
tar_texts = tar_texts.to(device)
output = model(src_texts, tar_texts[:, :-1])

In [34]:
output.shape

torch.Size([64, 119, 32101])

In [35]:
tar_texts.shape

torch.Size([64, 120])

In [36]:
output = output.contiguous().view(-1, 32101)

In [37]:
output.shape

torch.Size([7616, 32101])

In [38]:
64*119

7616

In [40]:
tar_texts[:, 1:].contiguous().view(-1).shape

torch.Size([7616])