# PyTorch Seq2Seq를 이용한 번역 모델 구현

- 데이터 준비 및 전처리
  - 데이터 준비
  - 전처리
  - 데이터셋 분할 및 DataLoader 생성
- 모델 구축
  - Seq2Seq 모델 구현
- 모델 학습
  - 손실 함수 및 최적화기 설정
  - 모델 학습 루프 구현
- 예측 및 결과 확인
  - 임의 문장으로 예측 수행

## 데이터 준비 및 전처리

데이터 준비

In [None]:
# Parallel toy corpus used for both vocabulary building and training.
# Each entry is a (French source sentence, English target sentence) pair.
# The text is stored raw: French entries may contain accented characters and
# capital letters, which are normalized later by the preprocessing step.
data = [
    ('je suis etudiant', 'i am a student'),
    ('j aime le football', 'i love football'),
    ('il fait beau aujourd hui', 'it is nice today'),
    ('je mange une pomme', 'i eat an apple'),
    ('nous aimons apprendre', 'we love learning'),
    ('je vais à l école', 'i go to school'),
    ('tu es mon ami', 'you are my friend'),
    ('elle lit un livre', 'she is reading a book'),
    ('il écrit une lettre', 'he is writing a letter'),
    ('nous regardons un film', 'we are watching a movie'),
    ('vous parlez français', 'you speak french'),
    ('ils jouent au tennis', 'they play tennis'),
    ('je fais du sport', 'i do sports'),
    ('tu écoutes de la musique', 'you listen to music'),
    ('elle cuisine un gâteau', 'she is baking a cake'),
    ('il conduit une voiture', 'he is driving a car'),
    ('nous visitons le musée', 'we are visiting the museum'),
    ('vous aimez la plage', 'you love the beach'),
    ('ils dansent bien', 'they dance well'),
    ('je prends le train', 'i take the train'),
    ('tu joues de la guitare', 'you play the guitar'),
    ('elle dessine un portrait', 'she draws a portrait'),
    ('il apprend l anglais', 'he learns english'),
    ('nous voyageons en avion', 'we travel by plane'),
    ('vous travaillez dur', 'you work hard'),
    ('ils étudient la biologie', 'they study biology'),
    ('je bois du café', 'i drink coffee'),
    ('tu manges du pain', 'you eat bread'),
    ('elle porte une robe', 'she wears a dress'),
    ('il lit le journal', 'he reads the newspaper'),
    ('nous aimons la nature', 'we love nature'),
    ('vous prenez le bus', 'you take the bus'),
    ('ils chantent une chanson', 'they sing a song'),
    ('je visite Paris', 'i visit paris'),
    ('tu écris un poème', 'you write a poem'),
    ('elle étudie la médecine', 'she studies medicine'),
    ('il fait ses devoirs', 'he does his homework'),
    ('nous préparons le dîner', 'we prepare dinner'),
    ('vous jouez au basketball', 'you play basketball'),
    ('ils regardent la télévision', 'they watch television'),
    ('je dors bien', 'i sleep well'),
    ('tu travailles dans un bureau', 'you work in an office'),
    ('elle nage dans la piscine', 'she swims in the pool'),
    ('il se réveille tôt', 'he wakes up early'),
    ('nous chantons ensemble', 'we sing together'),
    ('vous écrivez des emails', 'you write emails'),
    ('ils jouent aux cartes', 'they play cards'),
    ('je visite un parc', 'i visit a park'),
    ('tu fais du vélo', 'you ride a bike'),
    ('elle regarde les étoiles', 'she watches the stars'),
    ('il monte les escaliers', 'he climbs the stairs'),
    ('nous lisons un roman', 'we read a novel'),
    ('vous écoutez la radio', 'you listen to the radio'),
    ('ils se promènent en ville', 'they walk around the city'),
    ('je cours dans le parc', 'i run in the park'),
    ('tu achètes des légumes', 'you buy vegetables'),
    ('elle joue au volley', 'she plays volleyball'),
    ('il nettoie la maison', 'he cleans the house'),
    ('nous prenons le petit déjeuner', 'we have breakfast'),
    ('vous apprenez une nouvelle langue', 'you learn a new language'),
    ('ils font la cuisine', 'they cook'),
    ('je dessine une maison', 'i draw a house'),
    ('tu regardes un documentaire', 'you watch a documentary'),
    ('elle visite un château', 'she visits a castle'),
    ('il photographie le paysage', 'he photographs the landscape'),
    ('nous organisons une fête', 'we organize a party'),
    ('vous jouez aux échecs', 'you play chess'),
    ('ils courent ensemble', 'they run together'),
    ('je regarde un match de football', 'i watch a football match'),
    ('tu lis un magazine', 'you read a magazine'),
    ('elle prépare une salade', 'she makes a salad'),
    ('il voyage en train', 'he travels by train'),
    ('nous faisons du shopping', 'we go shopping'),
    ('vous dansez au club', 'you dance at the club'),
    ('ils étudient l histoire', 'they study history'),
    ('je visite le marché', 'i visit the market'),
    ('tu achètes un cadeau', 'you buy a gift'),
    ('elle travaille dans une école', 'she works in a school'),
    ('il joue du piano', 'he plays the piano'),
    ('nous regardons le coucher du soleil', 'we watch the sunset'),
    ('vous apprenez à cuisiner', 'you learn to cook'),
    ('ils se reposent après le travail', 'they rest after work'),
    ('je prends des photos', 'i take photos'),
    ('tu fais de la natation', 'you go swimming'),
    ('elle sourit toujours', 'she always smiles'),
    ('il étudie à l université', 'he studies at the university'),
    ('nous visitons nos amis', 'we visit our friends'),
    ('vous mangez au restaurant', 'you eat at the restaurant'),
    ('ils jouent dans le jardin', 'they play in the garden'),
    ('je prends des notes', 'i take notes'),
    ('tu conduis prudemment', 'you drive carefully'),
    ('elle chante magnifiquement', 'she sings beautifully'),
    ('il lit un roman policier', 'he reads a detective novel'),
    ('nous partons en vacances', 'we go on vacation'),
    ('vous regardez les étoiles', 'you watch the stars'),
    ('ils écoutent de la musique classique', 'they listen to classical music'),
    ('je prépare un café', 'i make a coffee'),
    ('tu joues avec ton chien', 'you play with your dog'),
    ('elle porte des lunettes', 'she wears glasses'),
    ('il aime le chocolat', 'he loves chocolate')
]

전처리

In [None]:
import re
import unicodedata

def preprocess_sentence(sentence):
    """Normalize a raw sentence into lowercase ASCII words separated by spaces.

    Steps, in order: strip combining accent marks, lowercase, replace every
    run of characters outside ``[a-zA-Z0-9]`` with one space, trim both ends.
    """
    # NFD splits an accented letter into its base letter plus a combining
    # mark (category "Mn"); dropping the marks leaves the plain base letter.
    decomposed = unicodedata.normalize('NFD', sentence)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    lowered = ''.join(base_chars).lower()
    # Any run of non-alphanumeric characters collapses into a single space.
    return re.sub(r"[^a-zA-Z0-9]+", " ", lowered).strip()

# Clean every sentence pair and gather the words seen on each side.
input_texts, target_texts = [], []
input_vocab, target_vocab = set(), set()

for input_sentence, target_sentence in data:
    input_sentence = preprocess_sentence(input_sentence)
    # Targets are wrapped in explicit start/end markers after cleaning.
    target_sentence = '<sos> ' + preprocess_sentence(target_sentence) + ' <eos>'

    input_texts.append(input_sentence)
    target_texts.append(target_sentence)

    input_vocab |= set(input_sentence.split(' '))
    target_vocab |= set(target_sentence.split(' '))

# Index 0 is the padding token; the remaining words follow in sorted order.
input_vocab = ['<pad>', *sorted(input_vocab)]
target_vocab = ['<pad>', *sorted(target_vocab)]

# Lookup tables in both directions for each language.
input_idx2word = dict(enumerate(input_vocab))
input_word2idx = {word: idx for idx, word in input_idx2word.items()}

target_idx2word = dict(enumerate(target_vocab))
target_word2idx = {word: idx for idx, word in target_idx2word.items()}

# Length, in words, of the longest sentence on each side.
max_input_len = max(map(len, (text.split(' ') for text in input_texts)))
max_target_len = max(map(len, (text.split(' ') for text in target_texts)))

def text_to_sequence(text, word2idx, max_len):
    """Encode a space-separated sentence as word indices, right-padded to ``max_len``.

    Every word is looked up directly in ``word2idx``, so an unknown word
    raises ``KeyError``. A sentence that already has ``max_len`` words or
    more is returned as-is: it is neither padded nor truncated.
    """
    indices = [word2idx[token] for token in text.split(' ')]
    # A non-positive count multiplies the pad list down to nothing.
    missing = max_len - len(indices)
    return indices + [word2idx['<pad>']] * missing

# Encode every cleaned sentence as a padded list of vocabulary indices.
input_sequences = [
    text_to_sequence(source_text, input_word2idx, max_input_len)
    for source_text in input_texts
]
target_sequences = [
    text_to_sequence(target_text, target_word2idx, max_target_len)
    for target_text in target_texts
]

# Show the first encoded pair as a quick sanity check.
print("입력 시퀀스 예시:", input_sequences[0])
print("출력 시퀀스 예시:", target_sequences[0])

## 모델 구축

인코더 클래스 정의

In [None]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    """Embed source token indices and summarize them with a single-layer LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, input_vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=input_vocab_size,
            embedding_dim=embed_size,
        )
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            batch_first=True,
        )

    def forward(self, x):
        """Encode ``x`` of shape [batch_size, seq_len] into (hidden, cell)."""
        embedded = self.embedding(x)  # [batch_size, seq_len, embed_size]
        _, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return hidden, cell

디코더 클래스 정의

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Each call consumes one token index per batch element together with the
    current LSTM state, and returns vocabulary scores plus the updated state.
    """

    def __init__(self, target_vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=target_vocab_size,
            embedding_dim=embed_size,
        )
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=target_vocab_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step for ``x`` of shape [batch_size]."""
        # Add a length-1 time axis so the batch-first LSTM sees one step.
        embedded = self.embedding(x).unsqueeze(1)  # [batch_size, 1, embed_size]
        lstm_out, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Drop the time axis again before projecting onto the vocabulary.
        predictions = self.fc(lstm_out.squeeze(1))  # [batch_size, target_vocab_size]
        return predictions, hidden, cell

Seq2Seq 클래스 정의

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    The encoder's final (hidden, cell) state seeds the decoder. At every step
    the decoder is fed either the ground-truth token (teacher forcing) or its
    own most likely prediction from the previous step.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Return per-step vocabulary scores for ``target``.

        Args:
            source: source token indices, [batch_size, source_len].
            target: target token indices, [batch_size, target_len]; column 0
                is used as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token next.

        Returns:
            Tensor of shape [batch_size, target_len, vocab_size]. Position 0
            is never predicted and stays all zeros.
        """
        batch_size = source.size(0)
        target_len = target.size(1)
        # Take the output width from the decoder's own projection layer so
        # the model does not depend on a module-level vocabulary global.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            batch_size, target_len, target_vocab_size, device=self.device
        )

        hidden, cell = self.encoder(source)

        # The first decoder input is column 0 of the target (the <sos> token).
        decoder_input = target[:, 0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, t] = output
            # One draw per step decides teacher forcing for the whole batch.
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            decoder_input = target[:, t] if teacher_force else output.argmax(1)
        return outputs

## 모델 학습

손실 함수 및 최적화기 설정

In [None]:
import torch.optim as optim

# Hyperparameters. Vocabulary sizes come from the lists built during
# preprocessing; the remaining values are fixed for this small corpus.
input_vocab_size, target_vocab_size = len(input_vocab), len(target_vocab)
embed_size, hidden_size = 16, 32
learning_rate = 0.001
num_epochs = 5000

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the encoder first, then the decoder, then wrap both in Seq2Seq.
encoder = Encoder(
    input_vocab_size=input_vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
).to(device)
decoder = Decoder(
    target_vocab_size=target_vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
).to(device)
model = Seq2Seq(encoder=encoder, decoder=decoder, device=device).to(device)

# Padding positions are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=target_word2idx['<pad>'])
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

모델 학습 루프 구현

In [None]:
# Convert the padded index sequences to tensors on the training device.
input_tensor = torch.LongTensor(input_sequences).to(device)
target_tensor = torch.LongTensor(target_sequences).to(device)

# translate() switches the model to eval mode, so set training mode
# explicitly; otherwise re-running this cell after a prediction would train
# a model that is still in eval mode.
model.train()

# Full-batch training: every epoch is one optimizer step over the whole corpus.
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()
    output = model(input_tensor, target_tensor)
    # Skip position 0 (the <sos> slot, never predicted) and flatten to
    # [batch_size * (target_len - 1), target_vocab_size] for the loss.
    output_dim = output.shape[-1]
    output = output[:, 1:].reshape(-1, output_dim)
    target = target_tensor[:, 1:].reshape(-1)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

    if epoch % 500 == 0:
        print(f'Epoch: {epoch}, Loss: {loss.item():.4f}')

## 예측 및 결과 확인

In [None]:
def translate(sentence, max_len=20):
    """Translate one source sentence with greedy decoding.

    Args:
        sentence: raw source text; it is cleaned with ``preprocess_sentence``
            before being encoded.
        max_len: maximum number of tokens to generate when ``<eos>`` is not
            produced earlier.

    Returns:
        The generated target words joined by single spaces, without the
        ``<sos>``/``<eos>`` markers.

    Raises:
        KeyError: if the cleaned sentence contains a word that is missing
            from ``input_word2idx``.
    """
    sentence = preprocess_sentence(sentence)
    # Pad to the SOURCE length used in training. The encoder also reads the
    # pad tokens, so padding to the target length would feed it a different
    # number of pads than it ever saw while training.
    sequence = text_to_sequence(sentence, input_word2idx, max_input_len)
    sequence = torch.LongTensor(sequence).unsqueeze(0).to(device)

    eos_idx = target_word2idx['<eos>']
    result = []

    # Decode in eval mode, then restore whatever mode the model was in so a
    # prediction does not silently change later training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden, cell = model.encoder(sequence)
            input_token = torch.LongTensor([target_word2idx['<sos>']]).to(device)
            for _ in range(max_len):
                output, hidden, cell = model.decoder(input_token, hidden, cell)
                top1 = output.argmax(1)
                if top1.item() == eos_idx:
                    break
                result.append(top1.item())
                # Greedy decoding: the prediction becomes the next input.
                input_token = top1
    finally:
        model.train(was_training)

    return ' '.join(target_idx2word[idx] for idx in result)

In [None]:
# Sample sentences to run through the trained model.
test_sentences = [
    'je suis etudiant',
    'il fait beau aujourd hui',
    'nous aimons apprendre',
]

# Print each source sentence followed by its translation and a blank line.
for sentence in test_sentences:
    translation = translate(sentence)
    print(f"입력 문장: {sentence}", f"번역 문장: {translation}\n", sep="\n")