In [1]:
import zipfile
import os
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer
from models import Transformer

In [2]:
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [3]:
train = pd.read_csv('fra.txt', names=['src', 'tar', 'lic'], sep='\t')
del train['lic']
train.tail()

Unnamed: 0,src,tar
232731,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
232732,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
232733,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...
232734,It may be impossible to get a completely error...,Il est peut-être impossible d'obtenir un Corpu...
232735,I went drinking with one of my boyfriend's fri...,« Je suis allée boire avec un ami de mon compa...


In [4]:
train.tail(1)['src'].values

array(['I went drinking with one of my boyfriend\'s friends, and now he\'s furious at me. "Was this friend a guy or a girl?" "A guy, obviously. Why would I go drinking with his female friends?" "Yeah, you\'re right." "His name is Tom. He\'s really hot, and I really want to go drinking with him again."'],
      dtype=object)

In [5]:
# 정규표현식 활용하여 데이터 전처리
def clean_text(inputString):
    # text_rmv = re.sub(r'[\xa0·«»\\]', '', inputString)
    text_rmv = re.sub(r'[\\\xa0·«»]', '', inputString)
    # 다수 개의 공백을 하나의 공백으로 치환
    text_rmv = re.sub(r"\s+", " ", text_rmv)
    return text_rmv

In [6]:
train['tar'] = train['tar'].apply(lambda x: clean_text(x))
train['src'] = train['src'].apply(lambda x: clean_text(x))
train

Unnamed: 0,src,tar
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !
...,...,...
232731,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
232732,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
232733,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...
232734,It may be impossible to get a completely error...,Il est peut-être impossible d'obtenir un Corpu...


In [7]:
train.tail(1)['src'].values[0]

'I went drinking with one of my boyfriend\'s friends, and now he\'s furious at me. "Was this friend a guy or a girl?" "A guy, obviously. Why would I go drinking with his female friends?" "Yeah, you\'re right." "His name is Tom. He\'s really hot, and I really want to go drinking with him again."'

In [8]:
train.tail(1)['tar'].values[0]

"Je suis allée boire avec un ami de mon compagnon, et voilà qu'il est furieux contre moi. Était-ce un gars ou une fille? Un gars, bien évidemment. Pourquoi irais-je boire avec ses amies? Ouais, ça se comprend. Il s'appelle Tom. Il est trop canon, et j'ai tellement envie d'aller prendre un verre avec lui à nouveau."

In [9]:
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
tokenizer.vocab_size

32100

In [11]:
print('영어 최대 길이: ', np.max(train['src'].apply(lambda x: len(tokenizer.tokenize(x)))))
print('불어 최대 길이: ', np.max(train['tar'].apply(lambda x: len(tokenizer.tokenize(x)))))

영어 최대 길이:  88
불어 최대 길이:  112


In [12]:
tokenizer.add_special_tokens({'bos_token': '<s>'})
print(tokenizer.bos_token)
print(tokenizer.eos_token)
print(tokenizer.pad_token)

<s>
</s>
<pad>


In [13]:
max_len = 120

In [14]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.src = self.tokenizer(list(self.data['src']), padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        self.tar = self.tokenizer(['<s>' + s for s in self.data['tar']], padding=True, truncation=True, max_length = self.max_len, return_tensors='pt').input_ids
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        return self.src[idx], self.tar[idx]

In [15]:
custom_ds = CustomDataset(train, tokenizer, max_len)
train_ds, test_ds = torch.utils.data.random_split(custom_ds, [0.9, 0.1])
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=64, shuffle=True)

In [16]:
i = 6
idx = test_ds.indices[i]
print(idx)
src_text, trg_text = custom_ds.__getitem__(idx)
print(src_text)
print(tokenizer.decode(src_text))
print(trg_text)
print(tokenizer.decode(trg_text))

69260
tensor([328,  31,  60,  81,  12, 456,   5,   1,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0])
They're about to start.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
tensor([32100,  3006,     7,   527,   244,    90,   500,   

In [17]:
src_texts, tar_texts = next(iter(train_dl))
print(src_texts[0])
print(tokenizer.decode(src_texts[1]))
print(tar_texts[0])
print(tokenizer.decode(tar_texts[1]))

tensor([ 27, 174,  25,  12, 691,  30, 424,   5,   1,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0])
Can I get you a drink?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
tensor([32100,   446,    31,     9,    23,  6350,   238,   327, 22397, 

In [18]:
vocab_size = tokenizer.vocab_size
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 120
dropout = 0.1

model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

In [19]:
print(model)

Transformer(
  (encoder_embedding): InputEmbeddings(
    (embedding): Embedding(32100, 512)
  )
  (decoder_embedding): InputEmbeddings(
    (embedding): Embedding(32100, 512)
  )
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttentionLayer(
        (W_q): Linear(in_features=512, out_features=512, bias=False)
        (W_k): Linear(in_features=512, out_features=512, bias=False)
        (W_v): Linear(in_features=512, out_features=512, bias=False)
        (W_o): Linear(in_features=512, out_features=512, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (fc_1): Linear(in_features=512, out_features=2048, bias=True)
        (fc_2): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm