In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, random_split
import os 

try:
  import google.colab
  IN_COLAB = True
  drive.mount('/content/drive')
except:
  IN_COLAB = False


In [2]:
if IN_COLAB:
    data_path = ""
else:
    data_path = "dataset/ko2eng"

x = np.array([])
y = np.array([])

for fname in sorted(os.listdir(data_path)):
    src_input_ids = os.path.join(data_path, fname)
    df = pd.read_excel(src_input_ids)
    x = np.concat((x, df['원문'].values))
    y = np.concat((y, df['번역문'].values))

print(x.shape)
print(y.shape)

(801387,)
(801387,)


In [3]:
x_len_array = [len(sentence) for sentence in x]
y_len_array = [len(sentence) for sentence in y]

unique_x, counts_x = np.unique(x_len_array, return_counts=True)
freq_x = np.column_stack((unique_x, counts_x))
sorted_freq_x = freq_x[freq_x[:, 1].argsort()[::-1]]

unique_y, counts_y = np.unique(y_len_array, return_counts=True)
freq_y = np.column_stack((unique_y, counts_y))
sorted_freq_y = freq_y[freq_y[:, 1].argsort()[::-1]]

print("x max:", np.max(x_len_array))
print("x mean:", np.mean(x_len_array))
print("x median:", np.median(x_len_array))
print("freq top 5:", sorted_freq_x[:5])

print("y max:", np.max(y_len_array))
print("y mean:", np.mean(y_len_array))
print("y median:", np.median(y_len_array))
print("freq top 5:", sorted_freq_y[:5])

x max: 220
x mean: 69.15487149155152
x median: 71.0
freq top 5: [[   70 18371]
 [   68 18287]
 [   69 18245]
 [   71 18199]
 [   73 17843]]
y max: 706
y mean: 173.24487170368374
y median: 174.0
freq top 5: [[ 171 5306]
 [ 179 5263]
 [ 181 5254]
 [ 175 5238]
 [ 183 5222]]


In [4]:
y_len_np = np.array(y_len_array)
x_len_np = np.array(x_len_array)


print("y_150 max:", np.max(y_len_np[y_len_np < 150]))
print("x_150 max:", np.max(x_len_np[y_len_np < 150]))

x_max_150 = x[y_len_np < 150]
y_max_150 = y[y_len_np < 150]

print(x_max_150)
print(y_max_150)
print(len(x_max_150))
print(len(y_max_150))


y_150 max: 149
x_150 max: 142
['스키너가 말한 보상은 대부분 눈으로 볼 수 있는 현물이다.' '심지어 어떤 문제가 발생할 건지도 어느 정도 예측이 가능하다.'
 '오직 하나님만이 그 이유를 제대로 알 수 있을 겁니다.' ...
 '아직 시즌 초반이라 더 지켜봐야 하지만 최근 몇 년간 라리가를 지배했던 그 힘은 보이지 않는 게 사실이다.'
 '인도에 제설작업이 잘 됐는지 살펴보기 위해 호텔 정문을 나서니 호텔 이름이 새겨진 승합차가 서 있는 것이 눈에 들어온다.'
 '영화 ‘지금 만나러 갑니다’는 비가 오는 날 다시 돌아오겠다던 약속을 남기고 떠난 수아(손예진 분)와 그녀를 기다리는 우진(소지섭 분)의 이야기다.']
["Skinner's reward is mostly eye-watering."
 'Even some problems can be predicted.' 'Only God will exactly know why.'
 ...
 "It's still early in the season, so it remains to be seen but it is true that La Liga is not strong as much as that it used to be in recent years."
 'As I left the main gate of the hotel to see if the snow removal work had gone well on the sidewalks, I could see a van with the hotel’s name on it.'
 "The movie, 'Be With You' is about a story of Sua (Son Ye Jin) and Ujin (So Ji Sup) waiting for her who left a promise to come back on a rainy day."]
281539
281539


In [5]:
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers, processors


def train_tokenizer(src, vocab_size, special_token, max_len):
    tokenizer = Tokenizer(models.WordPiece())
    tokenizer.normalizer = normalizers.NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    tokenizer.post_processor = processors.BertProcessing(sep=('[SEP]', 2), cls=('[CLS]', 3))
    tokenizer.decoder = decoders.WordPiece()
    token_trainer = trainers.WordPieceTrainer(
        vocab_size = vocab_size,
        special_tokens=special_token,
        show_progress=True,
    )

    tokenizer.train_from_iterator(src, trainer=token_trainer)
    tokenizer.enable_truncation(max_len)
    tokenizer.enable_padding(length=max_len)

    return tokenizer



# ["[PAD]", "[UNK]", "[SEP]", "[CLS]"]
vocab_size = 10000
special_token = ["[PAD]", "[UNK]", "[SEP]", "[CLS]"]
max_len = 150
src_tokenizer = train_tokenizer(x_max_150, vocab_size, special_token, max_len)
tgt_tokenizer = train_tokenizer(y_max_150, vocab_size, special_token, max_len)

print(x_max_150[0])
temp = src_tokenizer.encode(x_max_150[0]) 
print(temp.tokens)
print(temp.ids)
print(src_tokenizer.decode(temp.ids))

print(y_max_150[0])
temp = src_tokenizer.encode(y_max_150[0]) 
print(temp.tokens)
print(temp.ids)
print(src_tokenizer.decode(temp.ids))







스키너가 말한 보상은 대부분 눈으로 볼 수 있는 현물이다.
['[CLS]', '스', '##키', '##너', '##가', '말한', '보상', '##은', '대부분', '눈', '##으로', '볼', '수', '있는', '현', '##물이', '##다', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

In [6]:
x_tensors = torch.tensor(np.array([src_tokenizer.encode(entry).ids for entry in x_max_150]))
y_tensors = torch.tensor(np.array([tgt_tokenizer.encode(entry).ids for entry in y_max_150]))

dataset = TensorDataset(x_tensors, y_tensors)
train_dataset, val_dataset, test_dataset = random_split(dataset=dataset, lengths=[0.7, 0.15, 0.15])

In [7]:
batch_size = 64
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
import math

class TransformerModel(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.d_model = config['d_model'] if 'd_model' in config else 512
        self.n_head = config['n_head'] if 'n_head' in config else 8
        self.n_encoder = config['n_encoder'] if 'n_encoder' in config else 6
        self.n_decoder = config['n_decoder'] if 'n_decoder' in config else 6
        self.hidden_size = config['hidden_size'] if 'hidden_size' in config else 512
        self.seq_len = config['seq_len']
        self.device = config['device']

        self.src_embed = nn.Embedding(config['src_vocab_size'], embedding_dim=self.d_model)
        self.tgt_embed = nn.Embedding(config['tgt_vocab_size'], embedding_dim=self.d_model)

        
        self.pos_embed = PositionalEncoding(d_model=self.d_model, device=self.device)
        self.padding_mask = PaddingMask(pad_id=config['pad_id'])

        self.transformer = nn.Transformer(
            d_model=self.d_model,
            nhead=self.n_head,
            num_encoder_layers=self.n_encoder,
            num_decoder_layers=self.n_decoder,
        )


        # tgt_mask : [dec_seq, dec_seq] : 출력 Sequence Attention Mask (Trm decoding을 위한 계단식 MASK)
        # tgt_mask 입력 길이 L에 대한 (L X L) mask 생성. torch.triu 사용. (생성된 값만 보게끔 하는 부분)
        self.tgt_mask = torch.triu( torch.ones([self.seq_len, self.seq_len])).T.to(self.device)

        self.projection_layer_1 = nn.Linear(self.d_model, self.hidden_size)
        self.activation = nn.GELU()
        self.projection_layer_2 = nn.Linear(self.hidden_size, config['tgt_vocab_size'])


    def forward(self, src_input_ids, tgt_input_ids, 
                src_key_padding_mask = None, tgt_key_padding_mask = None):
        
        # src_input_ids: (batch, seq_len)
        # target_input_ids: (batch, seq_len)
        
        encoder_input_feature = self.src_embed(src_input_ids)
        # encoder_input_feuatre: (batch, seq_len, embedding)
        encoder_input_feature = encoder_input_feature.transpose(0, 1)
        # pos_embed worked based on seq_len, batch, embeding
        encoder_input_feature = self.pos_embed(encoder_input_feature)
        
        # same as encoder input handling
        decoder_input_feature = self.tgt_embed(tgt_input_ids)
        decoder_input_feature = decoder_input_feature.transpose(0, 1)
        decoder_input_feature = self.pos_embed(decoder_input_feature)

        # src : [enc_seq, batch, hidden]
        # tgt : [dec_seq, batch, hidden]
        
        # src_mask : [enc_seq, enc_seq] : 입력 Sequence Attention Mask (일반적으로 사용하지 않음)
        
        
        # src_key_padding_mask : [batch, enc_seq] : 입력 Padding Mask
        src_key_padding_mask = self.padding_mask(src_input_ids)

        # tgt_key_padding_mask : [batch, dec_seq] : 출력 Padding Mask
        tgt_key_padding_mask = self.padding_mask(tgt_input_ids)

        # src_mask는 일반적으로 구현 x
        # src_key_padding_mask : masked_fill 이용해서 생성
        # tgt_key_padding_mask
        
        out = self.transformer(encoder_input_feature, decoder_input_feature, 
                               src_mask=None, tgt_mask=self.tgt_mask,
                               src_key_padding_mask=src_key_padding_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask,)
        
        out = self.projection_layer_1(out)
        out = self.activation(out)
        out = self.projection_layer_2(out)
        
        return out

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, device, dropout: float = 0.1, max_len: int = 5000) -> None:
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        pe = pe.to(device)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class PaddingMask(nn.Module):
    def __init__(self, pad_id) -> None:
        super().__init__()
        self.pad_id = pad_id

    def forward(self, x):
        return x.masked_fill(x == self.pad_id, 1).masked_fill(x != self.pad_id, 0).bool()


In [9]:
import io
from tqdm import tqdm

class Trainer():
    def __init__(self, config, model, train_dl, val_dl, criterion, optimizer) -> None:
        self.model_name = model.__class__.__name__
        self.config = config
        self.model = model
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.criterion = criterion
        self.optimizer = optimizer

        self.device = config['device'] if 'device' in config else "cpu"
        self.num_of_epoch = config['epoch'] if 'epoch' in config else 10
        self.lr = config['learning_rate'] if 'learning_rate' in config else 1e-2
        self.patience = config['patience'] if 'patience' in config else 5
        self.output_dir = config['output'] if 'output' in config else "output/"

    def train(self):
        train_loss_history = []
        train_accuracy_history = []
        val_loss_history = []
        val_accuarcy_history = []

        self.model = self.model.to(self.device)
        best_epoch = 0
        best_loss = np.inf
        epochs_no_improve = 0
        buffer = io.BytesIO()
        

        for epoch in range(self.num_of_epoch):
            self.model.train()
            epoch_loss = 0
            total_count = 0
            pbar = tqdm(self.train_dl, desc=f"epoch[{epoch+1}]")

            for step, batch in enumerate(pbar):
                x = batch[0].to(self.device)
                y = batch[1].to(self.device)

                self.optimizer.zero_grad()
                y_pred = self.model(x, y)

                # y_pred ( seq_len, batch, feature_num )
                y_pred_compare = torch.reshape(y_pred, (-1, y_pred.shape[-1]))
                # y ( batch, seq_len )
                y_target = torch.reshape(y, (-1,))

                loss = self.criterion(y_pred_compare, y_target)
                epoch_loss += loss.item() * y_target.shape[0]
                loss.backward()
                self.optimizer.step()

                total_count += y_target.shape[0]

            epoch_loss_mean = epoch_loss / total_count
            val_loss_mean = self.validation()
            pbar.set_postfix_str(f"train_loss={epoch_loss_mean:.5f}, val_loss={val_loss_mean:.5f}")
            
            train_loss_history.append(epoch_loss_mean)
            val_loss_history.append(val_loss_mean)

            if val_loss_mean < best_loss:
                best_loss = val_loss_mean
                epochs_no_improve = 0
                best_epoch = epoch
                
                buffer.seek(0)
                buffer.truncate()
                torch.save(self.model.state_dict(), buffer)
                buffer.seek(0)
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= self.patience:
                print(f'Early stopping at epoch {epoch}')
                break
        
        output_path = os.path.join(self.output_dir, f"{self.model_name}_ep_{best_epoch}_loss_{best_loss:.4f}.pt")
        print(output_path)
        with open(output_path, mode='wb') as f:
            f.write(buffer.getbuffer())

        return (train_loss_history, train_accuracy_history, val_loss_history, val_accuarcy_history), output_path

    def validation(self):
        return self.test(self.model, self.val_dl)

    def test(self, model, dataloader):
        model.eval()
        epoch_loss = 0
        for _, batch in enumerate(dataloader):
            x = batch[0].to(self.device)
            y = batch[1].to(self.device)

            y_pred = self.model(x)
            # y_pred ( seq_len, batch, feature_num )
            y_pred_compare = torch.reshape(y_pred, (-1, y_pred.shape[-1]))
            # y ( batch, seq_len )
            y_target = torch.reshape(y, (-1,))

            loss = self.criterion(y_pred_compare, y_target)
        
            epoch_loss += loss.item() * y_target.shape[0]

            total_count += y_target.shape[0]
        
        epoch_loss_mean = epoch_loss / total_count

        return epoch_loss_mean
                        
    

In [10]:
config = {
    'batch_size':batch_size,
    'src_vocab_size':src_tokenizer.get_vocab_size(),
    'tgt_vocab_size':tgt_tokenizer.get_vocab_size(),
    'd_model':512,
    'n_head':8,
    'n_encoder':6,
    'n_decoder':6,
    'hidden_size':512,
    'pad_id':0,
    'dropout':0.1,
    'device': 'cuda' if torch.cuda.is_available() else "cpu",
    'epoch': 100,
    'learning_rate':1e-3,
    'seq_len':150,
}

model = TransformerModel(config)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
trainer = Trainer(config, model, train_dl, val_dl, criterion, optimizer)
trainer.train()

epoch[1]: 100%|██████████| 3080/3080 [14:58<00:00,  3.43it/s]


AttributeError: 'Trainer' object has no attribute 'val_dataloader'