In [1]:
# attention mechanism를 이용한 번역기 구현하기

In [2]:
import re
import os
import unicodedata
import urllib3
import zipfile
import shutil
import numpy as np
import pandas as pd
import torch
from collections import Counter
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

In [3]:
num_samples = 33000

In [4]:
def unicode_to_ascii(s):
  # NFD: 유니코드 문자열을 정규화하는 방법 중 하나로, 악센트 문자를 기본 문자와 결합 문자로 분리함
  # Mn: Non-Spacing Mark(악센트 등)를 나타내는 카테고리
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [5]:
def preprocess_sentence(sent):
  sent = unicode_to_ascii(sent.lower())
  sent = re.sub(r"([?.!,¿])", r" \1", sent)
  sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)
  sent = re.sub(r"\s+", " ", sent)
  return sent

In [6]:
def load_preprocessed_data():
  encoder_input, decoder_input, decoder_target = [], [], []

  with open("fra.txt", "r") as lines:
    for i, line in enumerate(lines):
      src_line, tar_line, _ = line.strip().split('\t')

      src_line = [w for w in preprocess_sentence(src_line).split()]

      tar_line = preprocess_sentence(tar_line)
      tar_line_in = [w for w in ("<sos> " + tar_line).split()]
      tar_line_out = [w for w in (tar_line + " <eos>").split()]

      encoder_input.append(src_line)
      decoder_input.append(tar_line_in)
      decoder_target.append(tar_line_out)

      if i == num_samples - 1:
        break

  return encoder_input, decoder_input, decoder_target

In [7]:
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"

print('전처리 전 영어 문장 :', en_sent)
print('전처리 후 영어 문장 :',preprocess_sentence(en_sent))
print('전처리 전 프랑스어 문장 :', fr_sent)
print('전처리 후 프랑스어 문장 :', preprocess_sentence(fr_sent))

전처리 전 영어 문장 : Have you had dinner?
전처리 후 영어 문장 : have you had dinner ?
전처리 전 프랑스어 문장 : Avez-vous déjà diné?
전처리 후 프랑스어 문장 : avez vous deja dine ?


In [9]:
sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()

In [10]:
print('인코더의 입력 :',sents_en_in[:5])
print('디코더의 입력 :',sents_fra_in[:5])
print('디코더의 레이블 :',sents_fra_out[:5])

인코더의 입력 : [['go', '.'], ['go', '.'], ['go', '.'], ['go', '.'], ['hi', '.']]
디코더의 입력 : [['<sos>', 'va', '!'], ['<sos>', 'marche', '.'], ['<sos>', 'en', 'route', '!'], ['<sos>', 'bouge', '!'], ['<sos>', 'salut', '!']]
디코더의 레이블 : [['va', '!', '<eos>'], ['marche', '.', '<eos>'], ['en', 'route', '!', '<eos>'], ['bouge', '!', '<eos>'], ['salut', '!', '<eos>']]


In [11]:
def build_vocab(sents):
  word_list = []

  for sent in sents:
      for word in sent:
        word_list.append(word)

  word_counts = Counter(word_list)
  vocab = sorted(word_counts, key=word_counts.get, reverse=True)

  word_to_index = {}
  word_to_index['<PAD>'] = 0
  word_to_index['<UNK>'] = 1

  for index, word in enumerate(vocab) :
    word_to_index[word] = index + 2

  return word_to_index

In [12]:
src_vocab = build_vocab(sents_en_in)
tar_vocab = build_vocab(sents_fra_in + sents_fra_out)

src_vocab_size = len(src_vocab)
tar_vocab_size = len(tar_vocab)
print("영어 단어 집합의 크기 : {:d}, 프랑스어 단어 집합의 크기 : {:d}".format(src_vocab_size, tar_vocab_size))

영어 단어 집합의 크기 : 4482, 프랑스어 단어 집합의 크기 : 7874


In [13]:
index_to_src = {v: k for k, v in src_vocab.items()}
index_to_tar = {v: k for k, v in tar_vocab.items()}

def texts_to_sequences(sents, word_to_index):
  encoded_X_data = []
  for sent in tqdm(sents):
    index_sequences = []
    for word in sent:
      try:
          index_sequences.append(word_to_index[word])
      except KeyError:
          index_sequences.append(word_to_index['<UNK>'])
    encoded_X_data.append(index_sequences)
  return encoded_X_data

In [14]:
encoder_input = texts_to_sequences(sents_en_in, src_vocab)
decoder_input = texts_to_sequences(sents_fra_in, tar_vocab)
decoder_target = texts_to_sequences(sents_fra_out, tar_vocab)

100%|██████████| 33000/33000 [00:00<00:00, 384207.85it/s]
100%|██████████| 33000/33000 [00:00<00:00, 2242325.59it/s]
100%|██████████| 33000/33000 [00:00<00:00, 405052.27it/s]


In [15]:
for i, (item1, item2) in zip(range(5), zip(sents_en_in, encoder_input)):
    print(f"Index: {i}, 정수 인코딩 전: {item1}, 정수 인코딩 후: {item2}")

Index: 0, 정수 인코딩 전: ['go', '.'], 정수 인코딩 후: [27, 2]
Index: 1, 정수 인코딩 전: ['go', '.'], 정수 인코딩 후: [27, 2]
Index: 2, 정수 인코딩 전: ['go', '.'], 정수 인코딩 후: [27, 2]
Index: 3, 정수 인코딩 전: ['go', '.'], 정수 인코딩 후: [27, 2]
Index: 4, 정수 인코딩 전: ['hi', '.'], 정수 인코딩 후: [744, 2]


In [16]:
def pad_sequences(sentences, max_len=None):
    if max_len is None:
        max_len = max([len(sentence) for sentence in sentences])

    features = np.zeros((len(sentences), max_len), dtype=int)
    for index, sentence in enumerate(sentences):
        if len(sentence) != 0:
            features[index, :len(sentence)] = np.array(sentence)[:max_len]
    return features

In [17]:
encoder_input = pad_sequences(encoder_input)
decoder_input = pad_sequences(decoder_input)
decoder_target = pad_sequences(decoder_target)

In [18]:
print('인코더의 입력의 크기(shape) :',encoder_input.shape)
print('디코더의 입력의 크기(shape) :',decoder_input.shape)
print('디코더의 레이블의 크기(shape) :',decoder_target.shape)

인코더의 입력의 크기(shape) : (33000, 7)
디코더의 입력의 크기(shape) : (33000, 16)
디코더의 레이블의 크기(shape) : (33000, 16)


In [19]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print('랜덤 시퀀스 :',indices)

랜덤 시퀀스 : [11244  4137 16576 ... 20398  2046 10744]


In [20]:
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

In [21]:
print([index_to_src[word] for word in encoder_input[30997]])
print([index_to_tar[word] for word in decoder_input[30997]])
print([index_to_tar[word] for word in decoder_target[30997]])

['i', 'love', 'this', 'group', '.', '<PAD>', '<PAD>']
['<sos>', 'j', 'aime', 'ce', 'groupe', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['j', 'aime', 'ce', 'groupe', '.', '<eos>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [22]:
n_of_val = int(33000*0.1)
print('검증 데이터의 개수 :',n_of_val)

검증 데이터의 개수 : 3300


In [23]:
encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

In [24]:
print('훈련 source 데이터의 크기 :',encoder_input_train.shape)
print('훈련 target 데이터의 크기 :',decoder_input_train.shape)
print('훈련 target 레이블의 크기 :',decoder_target_train.shape)
print('테스트 source 데이터의 크기 :',encoder_input_test.shape)
print('테스트 target 데이터의 크기 :',decoder_input_test.shape)
print('테스트 target 레이블의 크기 :',decoder_target_test.shape)

훈련 source 데이터의 크기 : (29700, 7)
훈련 target 데이터의 크기 : (29700, 16)
훈련 target 레이블의 크기 : (29700, 16)
테스트 source 데이터의 크기 : (3300, 7)
테스트 target 데이터의 크기 : (3300, 16)
테스트 target 레이블의 크기 : (3300, 16)


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim

embedding_dim = 256
hidden_units = 256

class Encoder(nn.Module):
    def __init__(self, src_vocab_size, embedding_dim, hidden_units):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(src_vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_units, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(x)
        return outputs, hidden, cell

class Decoder(nn.Module):
    def __init__(self, tar_vocab_size, embedding_dim, hidden_units):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(tar_vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim + hidden_units, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, tar_vocab_size)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, encoder_outputs, hidden, cell):
        x = self.embedding(x)
        attention_scores = torch.bmm(encoder_outputs, hidden.transpose(0, 1).transpose(1, 2))
        attention_weights = self.softmax(attention_scores)
        context_vector = torch.bmm(attention_weights.transpose(1, 2), encoder_outputs)
        seq_len = x.shape[1]
        context_vector_repeated = context_vector.repeat(1, seq_len, 1)
        x = torch.cat((x, context_vector_repeated), dim=2)        
        output, (hidden, cell) = self.lstm(x, (hidden, cell))
        output = self.fc(output)
        return output, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        encoder_outputs, hidden, cell = self.encoder(src)
        output, _, _ = self.decoder(trg, encoder_outputs, hidden, cell)
        return output

encoder = Encoder(src_vocab_size, embedding_dim, hidden_units)
decoder = Decoder(tar_vocab_size, embedding_dim, hidden_units)
model = Seq2Seq(encoder, decoder)

loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())

In [26]:
print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4482, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(7874, 256, padding_idx=0)
    (lstm): LSTM(512, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=7874, bias=True)
    (softmax): Softmax(dim=2)
  )
)


In [27]:
def evaluation(model, dataloader, loss_function, device):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for encoder_inputs, decoder_inputs, decoder_targets in dataloader:
            encoder_inputs = encoder_inputs.to(device)
            decoder_inputs = decoder_inputs.to(device)
            decoder_targets = decoder_targets.to(device)

            outputs = model(encoder_inputs, decoder_inputs)

            loss = loss_function(outputs.view(-1, outputs.size(-1)), decoder_targets.view(-1))
            total_loss += loss.item()

            mask = decoder_targets != 0
            total_correct += ((outputs.argmax(dim=-1) == decoder_targets) * mask).sum().item()
            total_count += mask.sum().item()

    return total_loss / len(dataloader), total_correct / total_count

In [28]:
encoder_input_train_tensor = torch.tensor(encoder_input_train, dtype=torch.long)
decoder_input_train_tensor = torch.tensor(decoder_input_train, dtype=torch.long)
decoder_target_train_tensor = torch.tensor(decoder_target_train, dtype=torch.long)

encoder_input_test_tensor = torch.tensor(encoder_input_test, dtype=torch.long)
decoder_input_test_tensor = torch.tensor(decoder_input_test, dtype=torch.long)
decoder_target_test_tensor = torch.tensor(decoder_target_test, dtype=torch.long)

batch_size = 128

train_dataset = TensorDataset(encoder_input_train_tensor, decoder_input_train_tensor, decoder_target_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = TensorDataset(encoder_input_test_tensor, decoder_input_test_tensor, decoder_target_test_tensor)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

num_epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4482, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(7874, 256, padding_idx=0)
    (lstm): LSTM(512, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=7874, bias=True)
    (softmax): Softmax(dim=2)
  )
)

In [29]:
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()

    for encoder_inputs, decoder_inputs, decoder_targets in train_dataloader:
        encoder_inputs = encoder_inputs.to(device)
        decoder_inputs = decoder_inputs.to(device)
        decoder_targets = decoder_targets.to(device)

        optimizer.zero_grad()

        outputs = model(encoder_inputs, decoder_inputs)

        loss = loss_function(outputs.view(-1, outputs.size(-1)), decoder_targets.view(-1))
        loss.backward()

        optimizer.step()

    train_loss, train_acc = evaluation(model, train_dataloader, loss_function, device)
    valid_loss, valid_acc = evaluation(model, valid_dataloader, loss_function, device)

    print(f'Epoch: {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Valid Loss: {valid_loss:.4f} | Valid Acc: {valid_acc:.4f}')

    if valid_loss < best_val_loss:
        print(f'Validation loss improved from {best_val_loss:.4f} to {valid_loss:.4f}. 체크포인트를 저장합니다.')
        best_val_loss = valid_loss
        torch.save(model.state_dict(), 'best_model_checkpoint.pth')

Epoch: 1/30 | Train Loss: 2.9416 | Train Acc: 0.5419 | Valid Loss: 3.0714 | Valid Acc: 0.5337
Validation loss improved from inf to 3.0714. 체크포인트를 저장합니다.
Epoch: 2/30 | Train Loss: 2.2388 | Train Acc: 0.6167 | Valid Loss: 2.4771 | Valid Acc: 0.5983
Validation loss improved from 3.0714 to 2.4771. 체크포인트를 저장합니다.
Epoch: 3/30 | Train Loss: 1.7508 | Train Acc: 0.6693 | Valid Loss: 2.1174 | Valid Acc: 0.6386
Validation loss improved from 2.4771 to 2.1174. 체크포인트를 저장합니다.
Epoch: 4/30 | Train Loss: 1.3958 | Train Acc: 0.7159 | Valid Loss: 1.8888 | Valid Acc: 0.6663
Validation loss improved from 2.1174 to 1.8888. 체크포인트를 저장합니다.
Epoch: 5/30 | Train Loss: 1.1117 | Train Acc: 0.7604 | Valid Loss: 1.7251 | Valid Acc: 0.6846
Validation loss improved from 1.8888 to 1.7251. 체크포인트를 저장합니다.
Epoch: 6/30 | Train Loss: 0.8788 | Train Acc: 0.8014 | Valid Loss: 1.6023 | Valid Acc: 0.6977
Validation loss improved from 1.7251 to 1.6023. 체크포인트를 저장합니다.
Epoch: 7/30 | Train Loss: 0.7031 | Train Acc: 0.8351 | Valid Loss: 

In [30]:
model.load_state_dict(torch.load('best_model_checkpoint.pth'))

model.to(device)

val_loss, val_accuracy = evaluation(model, valid_dataloader, loss_function, device)

print(f'Best model validation loss: {val_loss:.4f}')
print(f'Best model validation accuracy: {val_accuracy:.4f}')

Best model validation loss: 1.4072
Best model validation accuracy: 0.7312


In [31]:
print(tar_vocab['<sos>'])
print(tar_vocab['<eos>'])

3
4


In [32]:
index_to_src = {v: k for k, v in src_vocab.items()}
index_to_tar = {v: k for k, v in tar_vocab.items()}

def seq_to_src(input_seq):
  sentence = ''
  for encoded_word in input_seq:
    if(encoded_word != 0):
      sentence = sentence + index_to_src[encoded_word] + ' '
  return sentence

def seq_to_tar(input_seq):
  sentence = ''
  for encoded_word in input_seq:
    if(encoded_word != 0 and encoded_word != tar_vocab['<sos>'] and encoded_word != tar_vocab['<eos>']):
      sentence = sentence + index_to_tar[encoded_word] + ' '
  return sentence

In [33]:
print(encoder_input_test[25])
print(decoder_input_test[25])
print(decoder_target_test[25])

[  3 338  19   2   0   0   0]
[  3  14 842  70   2   0   0   0   0   0   0   0   0   0   0   0]
[ 14 842  70   2   4   0   0   0   0   0   0   0   0   0   0   0]


In [34]:
def decode_sequence(input_seq, model, src_vocab_size, tar_vocab_size, max_output_len, int_to_src_token, int_to_tar_token):
    encoder_inputs = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0).to(device)

    encoder_outputs, hidden, cell = model.encoder(encoder_inputs)

    decoder_input = torch.tensor([3], dtype=torch.long).unsqueeze(0).to(device)

    decoded_tokens = []

    for _ in range(max_output_len):
        output, hidden, cell = model.decoder(decoder_input, encoder_outputs, hidden, cell)

        output_token = output.argmax(dim=-1).item()

        if output_token == 4:
            break

        decoded_tokens.append(output_token)

        decoder_input = torch.tensor([output_token], dtype=torch.long).unsqueeze(0).to(device)

    return ' '.join(int_to_tar_token[token] for token in decoded_tokens)

In [35]:
for seq_index in [3, 50, 100, 300, 1001]:
  input_seq = encoder_input_train[seq_index]
  translated_text = decode_sequence(input_seq, model, src_vocab_size, tar_vocab_size, 20, index_to_src, index_to_tar)

  print("입력문장 :",seq_to_src(encoder_input_train[seq_index]))
  print("정답문장 :",seq_to_tar(decoder_input_train[seq_index]))
  print("번역문장 :",translated_text)
  print("-"*50)

입력문장 : you may go . 
정답문장 : vous pouvez vous en aller . 
번역문장 : tu peux y aller .
--------------------------------------------------
입력문장 : are you registered ? 
정답문장 : etes vous inscrit ? 
번역문장 : etes vous inscrits ?
--------------------------------------------------
입력문장 : we had to stop . 
정답문장 : il nous fallait cesser . 
번역문장 : il nous fallut cesser .
--------------------------------------------------
입력문장 : let s forget it . 
정답문장 : oublions tout ca . 
번역문장 : oublions ca .
--------------------------------------------------
입력문장 : may i be excused ? 
정답문장 : puis je etre excuse ? 
번역문장 : puis je etre excuse ?
--------------------------------------------------


In [36]:
for seq_index in [3, 50, 100, 300, 1001]:
  input_seq = encoder_input_test[seq_index]
  translated_text = decode_sequence(input_seq, model, src_vocab_size, tar_vocab_size, 20, index_to_src, index_to_tar)

  print("입력문장 :",seq_to_src(encoder_input_test[seq_index]))
  print("정답문장 :",seq_to_tar(decoder_input_test[seq_index]))
  print("번역문장 :",translated_text)
  print("-"*50)

입력문장 : don t shoot ! 
정답문장 : ne tirez pas ! 
번역문장 : ne tire pas !
--------------------------------------------------
입력문장 : i can t allow it . 
정답문장 : je ne peux pas permettre ca . 
번역문장 : je ne peux pas le supporter .
--------------------------------------------------
입력문장 : i will watch it . 
정답문장 : je le regarderai . 
번역문장 : j y vais .
--------------------------------------------------
입력문장 : whose bike is it ? 
정답문장 : a qui est cette bicyclette ? 
번역문장 : a qui est ce velo ?
--------------------------------------------------
입력문장 : i was naive . 
정답문장 : je fus credule . 
번역문장 : j etais naif .
--------------------------------------------------
