20240828 seq2seq

1. load data
2. tokenize data
3. lstm/gru encoder (with personalized number of layers and embeddings)
4. lstm/gru decoder (with personalized number of layers and embeddings)
5. toeknized output
6. de-toeknize into str
7. accuracy metrics calculations
<br>https://github.com/bentrevett/pytorch-seq2seq/blob/main/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import MarianTokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import random
import time
import math

In [3]:
# ==================================================
# Step 3: 모델 및 학습 설정
BATCH_SIZE = 64 ## 중복
EPOCH = 50
max_len = 512
drop_p = 0.1

d_model = 128
n_layers = 3

d_hidden = 512 


In [4]:
# ==================================================
# Step 1: 데이터 준비
# Pandas DataFrame 형식으로 학습 데이터를 준비합니다.
# 학습 데이터는 ecmData 변수에 저장되어야 합니다. 
# 입력 데이터 예시
ecmData = pd.read_csv('대화체_y_hat.txt', sep='\t', encoding='cp949')[['원문', '번역문']] #header=None
ecmData = pd.read_csv('kor2.txt', sep='\t', encoding='cp949', header=None)
ecmData.head()

Unnamed: 0,0,1
0,Go.,가.
1,Hi.,안녕.
2,Run!,뛰어!
3,Run.,뛰어.
4,Who?,누구?


In [5]:
# Step 2: 데이터 전처리
class CustomDataset(torch.utils.data.Dataset): 
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        src = self.data.iloc[idx, 0]
        trg = self.data.iloc[idx, 1]
        return (src, trg)
    
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

In [6]:
# ==================================================
# Step 2: 데이터 전처리
custom_DS = CustomDataset(ecmData.iloc[:4000])
train_DL = torch.utils.data.DataLoader(custom_DS, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = CustomDataset(ecmData.iloc[4000:])  
test_DL = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=lambda x: x)


In [7]:
# ======================
# TOKEN
tokenizer = MarianTokenizer.from_pretrained(r"C:\Program Files\ECMiner\ECMiner_x64_withVOD_v.5.2.0.7592_20240719\Miniconda3\ModelSrc\Transformer_Transformer\model_directory")
vocab_size = tokenizer.vocab_size
pad_idx = tokenizer.pad_token_id



In [8]:
# SET PARAMETERS
class EarlyStopping:
        def __init__(self, patience=10, verbose=False, delta=0, target_loss=None):
            self.patience = patience
            self.verbose = verbose
            self.counter = 0
            self.best_score = None
            self.early_stop = False
            self.val_loss_min = np.Inf
            self.delta = delta
            self.target_loss = target_loss

        def __call__(self, val_loss, model):
            score = -val_loss

            if self.target_loss is not None and val_loss <= self.target_loss:
                self.early_stop = True
                if self.verbose:
                    print(f'Target loss {self.target_loss} reached. Stopping training.',flush=True)
                return

            if self.best_score is None:
                self.best_score = score
            elif score < self.best_score + self.delta:
                self.counter += 1
                if self.counter >= self.patience:
                    self.early_stop = True
                    if self.verbose :
                        print(f'No improvement for {self.patience} epochs. Early stopping.',flush=True)
            else:
                self.best_score = score
                self.counter = 0


In [9]:
# params = {}
# params['batch_size'] = int(BATCH_SIZE)
# params['epoch'] = int(EPOCH) 
# params['max_len'] = max_len
# params['tokenizer'] = tokenizer
# params['early_stop'] = EarlyStopping(verbose=True, target_loss = 0.0001)

params = {
        'batch_size': BATCH_SIZE,
        'epoch': EPOCH,
        'max_len': max_len,
        'tokenizer': tokenizer,
        'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        'early_stopping' : EarlyStopping(verbose=True, target_loss = 0.0001)
    }

In [10]:
# 학습 loss 계산
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [11]:
# 학습 모델 구성 (encoder, decoder > seq2seq)
# Encoder class using LSTM
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, drop_p):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, dropout=drop_p, batch_first=True)
        self.dropout = nn.Dropout(drop_p)
        
    def forward(self, x):
        x = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.lstm(x)
        return hidden, cell
    
# Decoder class using LSTM
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, drop_p):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, dropout=drop_p, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(drop_p)
        
    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)
        x = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.lstm(x, (hidden, cell))
        predictions = self.fc(outputs.squeeze(1))
        return predictions, hidden, cell
    
# Seq2Seq class combining Encoder and Decoder
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, trg_vocab_size, teacher_forcing_ratio=0.5):
        batch_size = len(src) #src.shape[0]
        trg_len = np.shape(trg)[1] #trg.shape[1]
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)
        
        x = trg[:, 0]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            # output, hidden, cell = self.decoder(hidden, cell)
            outputs[:, t] = output
            # outputs[t] = output
            top1 = output.argmax(1)
            x = trg[:, t] if np.random.random() < teacher_forcing_ratio else top1
        
        return outputs

In [12]:
# =========================
# 학습 모델 구성
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
encoder = Encoder(vocab_size, d_model, d_hidden, n_layers, drop_p)
decoder = Decoder(vocab_size, d_model, d_hidden, n_layers, drop_p)
model = Seq2Seq(encoder, decoder, params['device']).to(params['device'])
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

In [13]:
# 훈련시 시간 출력을 위한 util 함수
import math
def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return f'{int(m)}m {int(s)}s'

# 훈련시 시간 출력을 위한 util 함수
def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return f'{as_minutes(s)} (remaining: {as_minutes(rs)})'

# # 훈련시 training loss 를 출력하기 위한 util 함수
# def showPlot(points):
#     plt.figure()
#     fig, ax = plt.subplots()
#     # 주기적인 간격에 이 locator가 tick을 설정
#     loc = ticker.MultipleLocator(base=0.2)
#     ax.yaxis.set_major_locator(loc)
#     plt.plot(points)
#     plt.title('Losses over training')
#     plt.show()

In [14]:
# Step 4: 모델 학습
# 모델 학습 세부 함수
# Loss calculation per epoch
def loss_epoch(model, dataloader, criterion, optimizer, params, epoch):
    epoch_loss = 0

    for batch_idx, (src, trg) in enumerate(dataloader):
        print(f'{batch_idx+1}/{len(dataloader)} batch processing within #{epoch+1} epoch', end='\r')
        # Move both src and trg to the device
        src = tokenizer(src, padding=True, truncation=True, max_length=max_len, return_tensors='pt').input_ids
        # trg_texts = ['</s> ' + s for s in trg]
        trg = tokenizer(trg, padding=True, truncation=True, max_length=max_len, return_tensors='pt').input_ids
        
        optimizer.zero_grad()
        output = model(src, trg, trg_vocab_size=params['tokenizer'].vocab_size)
        # output = model(src, trg, trg_vocab_size=params['tokenizer'].vocab_size)
        
        # Reshape output and target for loss computation
        output = output[:, 1:].reshape(-1, output.shape[-1])
        trg = trg[:, 1:].reshape(-1)
        loss = criterion(output, trg)

        # loss = criterion(output.permute(0, 2, 1), trg[:, 1:]) 
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(dataloader)

# 모델 학습 함수
# Training function
def Train(model, dataloader, criterion, optimizer, params):
    model.train()
    history = []
    start = time.time()
    for epoch in range(params['epoch']):
        epoch_loss = loss_epoch(model, dataloader, criterion, optimizer, params, epoch)
        history.append(epoch_loss)
        print(f'Epoch [{epoch+1}/{params["epoch"]}], Loss: {epoch_loss:.4f} \
              === {time_since(start, (epoch+1)/params['epoch'])}')
        
    return history



In [15]:
# ==================================================
# Step 4: 모델 학습
train_losses = Train(model, train_DL, criterion, optimizer, params)
# showPlot(train_losses)

Epoch [1/50], Loss: 6.6073               === 1m 46s (remaining: 86m 55s)
Epoch [2/50], Loss: 5.4963               === 4m 43s (remaining: 113m 32s)
Epoch [3/50], Loss: 5.3503               === 7m 25s (remaining: 116m 16s)
Epoch [4/50], Loss: 5.2067               === 10m 17s (remaining: 118m 21s)
Epoch [5/50], Loss: 5.0899               === 13m 29s (remaining: 121m 23s)
Epoch [6/50], Loss: 4.9989               === 16m 27s (remaining: 120m 40s)
Epoch [7/50], Loss: 4.8872               === 19m 0s (remaining: 116m 47s)
Epoch [8/50], Loss: 4.7836               === 21m 57s (remaining: 115m 14s)
Epoch [9/50], Loss: 4.6846               === 24m 31s (remaining: 111m 45s)
Epoch [10/50], Loss: 4.5977               === 27m 22s (remaining: 109m 28s)
Epoch [11/50], Loss: 4.5381               === 29m 58s (remaining: 106m 16s)
Epoch [12/50], Loss: 4.4498               === 32m 52s (remaining: 104m 7s)
Epoch [13/50], Loss: 4.3963               === 35m 48s (remaining: 101m 55s)
Epoch [14/50], Loss: 4.3186

In [None]:
train_losses

[6.62136303432404, 5.512055487859817]

Evaluation

In [None]:
model.eval()
test_loss = 0
with torch.no_grad():
    for src_batch, trg_batch in tqdm(train_DL):
        src_batch = tokenizer(src_batch, padding=True, truncation=True, max_length=max_len, return_tensors='pt').input_ids
        trg_batch = tokenizer(trg_batch, padding=True, truncation=True, max_length=max_len, return_tensors='pt').input_ids
        
        output = model(src_batch, trg_batch, trg_vocab_size=vocab_size, teacher_forcing_ratio=0)
        output_dim = output.shape[-1]
        
        # Flatten the output and target tensors
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg_batch[:, 1:].reshape(-1)
        
        loss = criterion(output, trg)
        test_loss += loss.item()

print(f'Test Loss: {test_loss / len(test_DL)}')

100%|██████████| 63/63 [00:29<00:00,  2.11it/s]

Test Loss: 11.126292673746745





In [None]:
test_loss = 0
translated_sentences = []

with torch.no_grad():
    for src_batch, trg_batch in tqdm(train_DL):
        src_batch = tokenizer(src_batch, padding=True, truncation=True, max_length=max_len, return_tensors='pt').input_ids
        trg_batch = tokenizer(trg_batch, padding=True, truncation=True, max_length=max_len, return_tensors='pt').input_ids
        
        output = model(src_batch, trg_batch, trg_vocab_size=vocab_size, teacher_forcing_ratio=0)
        
        # Get the predicted token IDs
        _, predicted_ids = torch.max(output, dim=-1)
        
        # Convert to list and check for repetitive patterns
        for sentence_ids in predicted_ids:
            sentence_list = sentence_ids.tolist()
            
            # 찾은 EOS 토큰 이후의 모든 토큰을 무시합니다.
            if tokenizer.eos_token_id in sentence_list:
                sentence_list = sentence_list[:sentence_list.index(tokenizer.eos_token_id)]
            
            # 다시 텍스트로 변환
            translated_text = tokenizer.decode(sentence_list, skip_special_tokens=True)
            translated_sentences.append(translated_text)

100%|██████████| 63/63 [00:25<00:00,  2.46it/s]


In [None]:
# 출력된 번역된 문장들 확인
for original, translated in zip(ecmData[0].tolist(), translated_sentences):
    print(f'Original: {original}')
    print(f'Translated: {translated}')
    print('---')

Original: Go.
Translated: 
---
Original: Hi.
Translated: 
---
Original: Run!
Translated: 
---
Original: Run.
Translated: 
---
Original: Who?
Translated: 
---
Original: Wow!
Translated: 
---
Original: Duck!
Translated: 
---
Original: Fire!
Translated: 
---
Original: Help!
Translated: 
---
Original: Hide.
Translated: 
---
Original: Jump!
Translated: 
---
Original: Jump.
Translated: 
---
Original: Stay.
Translated: 
---
Original: Wait!
Translated: 
---
Original: Wait!
Translated: 
---
Original: Wait.
Translated: 
---
Original: Begin.
Translated: 
---
Original: Hello!
Translated: 
---
Original: Hello.
Translated: 
---
Original: I see.
Translated: 
---
Original: I try.
Translated: 
---
Original: I won!
Translated: 
---
Original: Oh no!
Translated: 
---
Original: Relax.
Translated: 
---
Original: Shoot!
Translated: 
---
Original: Smile.
Translated: 
---
Original: Sorry?
Translated: 
---
Original: Sorry?
Translated: 
---
Original: Sorry?
Translated: 
---
Original: Attack!
Translated: 
---
Ori