<a href="https://colab.research.google.com/github/jungwoo1208/AI_Study/blob/main/attention_transrator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import unicodedata
import urllib3
import zipfile
import shutil
import numpy as np
import pandas as pd
import torch
from collections import Counter
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

In [2]:
num_samples = 100000

In [3]:
!wget -c https://www.manythings.org/anki/fra-eng.zip &&unzip -o fra-eng.zip

--2025-08-13 00:14:48--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8143096 (7.8M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-08-13 00:14:50 (6.02 MB/s) - ‘fra-eng.zip’ saved [8143096/8143096]

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [4]:
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [5]:
def preprocess_sentence(sent):

  sent = unicode_to_ascii(sent.lower())
  sent= re.sub(r"([?.!,?`])",r" \1",sent)
  sent= re.sub(r"[^a-zA-Z!,?`]+"," ",sent)
  sent = re.sub(r"\s+", " ",sent)

  return sent

In [6]:
def load_preprocessed_data():
    encoder_input, decoder_input, decoder_target = [], [], []

    with open("fra.txt", "r") as lines:
        for i, line in enumerate(lines):
            src_line, tar_line, _ = line.strip().split("\t")

            src_line = [w for w in preprocess_sentence(src_line).split()]

            tar_line = preprocess_sentence(tar_line)
            tar_line_in = [w for w in ("<sos> " + tar_line).split()]
            tar_line_out = [w for w in (tar_line + " <eos>").split()]

            encoder_input.append(src_line)
            decoder_input.append(tar_line_in)
            decoder_target.append(tar_line_out)

    return encoder_input, decoder_input, decoder_target

In [7]:
sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()

In [8]:
def build_vocab(sents):
    word_list = []
    for sent in sents:
        for word in sent:
            word_list.append(word)
    word_counts = Counter(word_list)
    vocab = sorted(word_counts, key=word_counts.get, reverse=True)

    word_to_index = {}
    word_to_index["<PAD>"] = 0
    word_to_index["<UNK>"] = 1

    for index, word in enumerate(vocab):
        word_to_index[word] = index + 2

    return word_to_index

In [9]:
src_vocab = build_vocab(sents_en_in)
tar_vocab = build_vocab(sents_fra_in+ sents_fra_out)

src_vocab_size = len(src_vocab)
tar_vocab_size = len(tar_vocab)

print(src_vocab_size)
print(tar_vocab_size)

16331
26065


In [10]:
index_to_src ={v: k for k, v in src_vocab.items()}
index_to_tar = {v: k for k, v in tar_vocab.items()}

def texts_to_sequences(sents,word_to_index):
  encoded_X_data =[]
  for sent in tqdm(sents):
    index_sequences =[]
    for word in sent:
      try:
        index_sequences.append(word_to_index[word])
      except KeyError:
        index_sequences.append(word_to_index["<UNK>"])
    encoded_X_data.append(index_sequences)
  return encoded_X_data


In [11]:
encoder_input = texts_to_sequences(sents_en_in, src_vocab)
decoder_input = texts_to_sequences(sents_fra_in, tar_vocab)
decoder_target = texts_to_sequences(sents_fra_out, tar_vocab)

for i, (item1,item2) in zip(range(5), zip(sents_en_in, encoder_input)):
  print(f"index:{i}   {item1} -> {item2}")

100%|██████████| 237838/237838 [00:00<00:00, 263405.58it/s]
100%|██████████| 237838/237838 [00:00<00:00, 268379.25it/s]
100%|██████████| 237838/237838 [00:00<00:00, 718928.37it/s]

index:0   ['go'] -> [51]
index:1   ['go'] -> [51]
index:2   ['go'] -> [51]
index:3   ['go'] -> [51]
index:4   ['hi'] -> [2597]





In [12]:
def pad_sequences(sentences, max_len=None):
  if max_len is None:
    max_len = max([len(sentence) for sentence in sentences])
  features = np.zeros((len(sentences),max_len),dtype=int)
  for index, sentence in enumerate(sentences):
    if len(sentence) !=0:
      features[index, :len(sentence)] = np.array(sentence)[:max_len]
  return features

In [13]:
encoder_input = pad_sequences(encoder_input)
decoder_input = pad_sequences(decoder_input)
decoder_target = pad_sequences(decoder_target)

In [14]:
print(encoder_input.shape)
print(decoder_input.shape)
print(decoder_target.shape)

(237838, 65)
(237838, 69)
(237838, 69)


In [15]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print(indices)

[ 33175 198876 222265 ...  69374 171767 184765]


In [16]:
encoder_input= encoder_input[indices]
decoder_input= decoder_input[indices]
decoder_target= decoder_target[indices]

In [17]:
print([index_to_src[word] for word in encoder_input[6242]])
print([index_to_tar[word] for word in decoder_input[6242]])
print([index_to_tar[word] for word in decoder_target[6242]])

['we', 'need', 'to', 'get', 'you', 'out', 'of', 'here', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
['<sos>', 'on', 'doit', 'vous', 'sortir', 'de', 'la', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',

In [18]:
n_of_val =int(100000*0.1)


In [19]:
encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

In [20]:
print(encoder_input_train.shape)
print(decoder_input_train.shape)
print(decoder_target_train.shape)
print(encoder_input_test.shape)
print(decoder_input_test.shape)
print(decoder_target_test.shape)

(227838, 65)
(227838, 69)
(227838, 69)
(10000, 65)
(10000, 69)
(10000, 69)


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

embadding_dim = 256
hidden_units = 256

In [22]:
class Encoder(nn.Module):
  def __init__(self, src_vocab_size, embedding_dim, hidden_units):
    super(Encoder, self).__init__()
    self.embedding = nn.Embedding(src_vocab_size, embedding_dim, padding_idx=0)
    self.lstm = nn.LSTM(embedding_dim, hidden_units, batch_first=True)

  def forward(self, x):
    x = self.embedding(x)
    outputs, (hidden, cell) = self.lstm(x)
    return outputs, hidden, cell

In [23]:
class Decoder(nn.Module):
  def __init__(self, tar_vocab_size, embedding_dim, hidden_units):
    super(Decoder, self).__init__()
    self.embedding = nn.Embedding(tar_vocab_size, embedding_dim, padding_idx=0)
    self.lstm = nn.LSTM(embedding_dim+hidden_units,hidden_units, batch_first=True)
    self.fc = nn.Linear(hidden_units, tar_vocab_size)
    self.softmax=nn.Softmax(dim=1)

  def forward(self, x, encoder_outputs, hidden, cell):
    x = self.embedding(x)
    attention_scores = torch.bmm(encoder_outputs, hidden.transpose(0,1).transpose(1,2))
    attention_weights = self.softmax(attention_scores)
    context_vector = torch.bmm(attention_weights.transpose(1,2), encoder_outputs)
    seq_len= x.shape[1]
    context_vecotr_repeated = context_vector.repeat(1, seq_len, 1)
    x = torch.cat((x, context_vecotr_repeated), dim=2)
    output, (hidden, cell) = self.lstm(x, (hidden, cell))
    output=self.fc(output)

    return output, hidden, cell

In [24]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        encoder_outputs, hidden, cell = self.encoder(src)
        decoder_outputs, _, _ = self.decoder(trg, encoder_outputs, hidden, cell)
        return decoder_outputs

encoder = Encoder(src_vocab_size, embadding_dim, hidden_units)
decoder = Decoder(tar_vocab_size, embadding_dim, hidden_units)
model = Seq2Seq(encoder, decoder)

loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())

In [25]:
def evaluation(model, dataloader, loss_function, device):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for encoder_input, decoder_input, decoder_target in dataloader:
            encoder_input = encoder_input.to(device)
            decoder_input = decoder_input.to(device)
            decoder_target = decoder_target.to(device)

            outputs = model(encoder_input, decoder_input)
            loss = loss_function(
                outputs.view(-1, tar_vocab_size),
                decoder_target.view(-1)
            )
            total_loss += loss.item()

            mask = decoder_target != 0  # padding 제외
            preds = outputs.argmax(dim=2)
            total_correct += (preds == decoder_target)[mask].sum().item()
            total_count += mask.sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_count if total_count > 0 else 0
    model.train()  # 평가 후 다시 학습 모드로
    return avg_loss, accuracy

In [26]:
encoder_input_train_tensor = torch.tensor(encoder_input_train, dtype=torch.long)
decoder_input_train_tensor = torch.tensor(decoder_input_train, dtype=torch.long)
decoder_target_train_tensor = torch.tensor(decoder_target_train, dtype=torch.long)

encoder_input_test_tensor = torch.tensor(encoder_input_test, dtype=torch.long)
decoder_input_test_tensor = torch.tensor(decoder_input_test, dtype=torch.long)
decoder_target_test_tensor = torch.tensor(decoder_target_test, dtype=torch.long)

batch_size = 128
train_dataset = TensorDataset(encoder_input_train_tensor, decoder_input_train_tensor, decoder_target_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(encoder_input_test_tensor, decoder_input_test_tensor, decoder_target_test_tensor)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [27]:
epochs =50
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(16331, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(26065, 256, padding_idx=0)
    (lstm): LSTM(512, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=26065, bias=True)
    (softmax): Softmax(dim=1)
  )
)

In [28]:
best_val_loss =float('inf')

for epoch in range(epochs):
  model.train()

  for encoder_input, decoder_input, decoder_target in train_dataloader:
    encoder_input = encoder_input.to(device)
    decoder_input = decoder_input.to(device)
    decoder_target = decoder_target.to(device)

    optimizer.zero_grad()
    outputs = model(encoder_input, decoder_input)

    loss= loss_function(outputs.view(-1, tar_vocab_size), decoder_target.view(-1))
    loss.backward()
    optimizer.step()
  train_loss, train_acc = evaluation(model, train_dataloader, loss_function, device)
  val_loss, val_acc = evaluation(model, test_dataloader, loss_function, device)

  print(f"Epoch: {epoch+1}")
  print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

  if val_loss < best_val_loss:
    best_val_loss = val_loss
    torch.save(model.state_dict(), 'best_model.pth')

Epoch: 1
Train Loss: 2.6561, Train Acc: 0.5007
Epoch: 2
Train Loss: 1.9738, Train Acc: 0.5904
Epoch: 3
Train Loss: 1.5802, Train Acc: 0.6477
Epoch: 4
Train Loss: 1.3108, Train Acc: 0.6917
Epoch: 5
Train Loss: 1.1171, Train Acc: 0.7287
Epoch: 6
Train Loss: 0.9863, Train Acc: 0.7535
Epoch: 7
Train Loss: 0.8737, Train Acc: 0.7776
Epoch: 8
Train Loss: 0.7915, Train Acc: 0.7941
Epoch: 9
Train Loss: 0.7184, Train Acc: 0.8111
Epoch: 10
Train Loss: 0.6653, Train Acc: 0.8223
Epoch: 11
Train Loss: 0.6124, Train Acc: 0.8338
Epoch: 12
Train Loss: 0.5707, Train Acc: 0.8439
Epoch: 13
Train Loss: 0.5371, Train Acc: 0.8512
Epoch: 14
Train Loss: 0.4961, Train Acc: 0.8613
Epoch: 15
Train Loss: 0.4712, Train Acc: 0.8668
Epoch: 16
Train Loss: 0.4446, Train Acc: 0.8734
Epoch: 17
Train Loss: 0.4190, Train Acc: 0.8794
Epoch: 18
Train Loss: 0.3984, Train Acc: 0.8842
Epoch: 19
Train Loss: 0.3804, Train Acc: 0.8879
Epoch: 20
Train Loss: 0.3599, Train Acc: 0.8934
Epoch: 21
Train Loss: 0.3448, Train Acc: 0.8970
E

KeyboardInterrupt: 

gpu사용량때문에 Train ACC 0.9에서 멈춤

In [29]:

model.load_state_dict(torch.load('best_model.pth'))
model.to(device)

val_loss, val_acc = evaluation(model, test_dataloader, loss_function, device)
print(f"Validation Loss: {val_loss:.4f}, Validation Acc: {val_acc:.4f}")

Validation Loss: 1.5729, Validation Acc: 0.6861


In [36]:
index_to_src ={v: k for k, v in src_vocab.items()}
index_to_tar = {v: k for k, v in tar_vocab.items()}

def seq_to_src(input_seq):
  sentence=''
  for encoded_word in input_seq:
    if(encoded_word!=0):
      sentence=sentence+index_to_src[encoded_word]+' '
  return sentence

def seq_to_tar(input_seq):
    sentence = ''
    for encoded_word in input_seq:
        if encoded_word != 0:  # 0은 패딩 제외
            sentence += index_to_tar[encoded_word] + ' '
    return sentence


In [37]:
def decode_sequence(input_seq, model, max_output_len, src_vocab, tar_vocab, index_to_tar, device):
    model.eval()

    sos_idx = tar_vocab['<sos>']
    eos_idx = tar_vocab['<eos>']

    encoder_inputs = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0).to(device)

    encoder_outputs, hidden, cell = model.encoder(encoder_inputs)

    decoder_input = torch.tensor([[sos_idx]], dtype=torch.long).to(device)

    decoded_tokens = []

    with torch.no_grad():
        for _ in range(max_output_len):
            output, hidden, cell = model.decoder(decoder_input, encoder_outputs, hidden, cell)
            output_token = output.argmax(dim=-1).squeeze().item()

            if output_token == eos_idx:
                break

            decoded_tokens.append(output_token)
            decoder_input = torch.tensor([[output_token]], dtype=torch.long).to(device)

    decoded_sentence = ' '.join(index_to_tar.get(token, '[UNK]') for token in decoded_tokens)

    return decoded_sentence


In [38]:
for seq_index in [3, 50, 100, 300, 1001]:
    input_seq = encoder_input_train[seq_index]

    translated_text = decode_sequence(
        input_seq,
        model,
        max_output_len=20,
        src_vocab=src_vocab,
        tar_vocab=tar_vocab,
        index_to_tar=index_to_tar,
        device=device
    )

    print("입력 문장:", seq_to_src(encoder_input_train[seq_index]))
    print("정답 문장:", seq_to_tar(decoder_input_train[seq_index]))
    print("번역 문장:", translated_text)
    print("=" * 50)


입력 문장: i haven t fully recovered yet 
정답 문장: <sos> je n ai pas encore completement recupere 
번역 문장: je n ai pas encore completement recupere
입력 문장: tom waited his turn 
정답 문장: <sos> tom attendit son tour 
번역 문장: tom a attendu la peine d attente
입력 문장: i fixed myself something to eat 
정답 문장: <sos> je me suis prepare quelque chose a manger 
번역 문장: je me suis fait quelque chose comme ca
입력 문장: are you seriously thinking about buying that old car ? 
정답 문장: <sos> penses tu serieusement a acheter cette vieille guimbarde ? 
번역 문장: penses tu serieusement a vendre cette vieille guimbarde ?
입력 문장: how dare you talk to my son like that ! 
정답 문장: <sos> comment oses tu parler a mon fils de cette facon ! 
번역 문장: comment oses tu parler a mon fils de cette facon


처리 실수 정답문장에 <sos>가 포함되게 됨