In [1]:
import pandas as pd
import torch
from torch.utils.data import random_split, Dataset, DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np

In [2]:
import pandas as pd

# Location of the Excel workbook holding the Korean/English sentence pairs.
data_path = "./Korean-English/2_대화체.xlsx"

# Read the workbook into a DataFrame and preview its first five rows.
data = pd.read_excel(data_path)
print(data.head(n=5))

    대분류 소분류       상황  Set Nr.  발화자                            원문  \
0  비즈니스  회의  의견 교환하기        1  A-1   이번 신제품 출시에 대한 시장의 반응은 어떤가요?   
1  비즈니스  회의  의견 교환하기        1  B-1    판매량이 지난번 제품보다 빠르게 늘고 있습니다.   
2  비즈니스  회의  의견 교환하기        1  A-2  그렇다면 공장에 연락해서 주문량을 더 늘려야겠네요.   
3  비즈니스  회의  의견 교환하기        1  B-2   네, 제가 연락해서 주문량을 2배로 늘리겠습니다.   
4  비즈니스  회의  의견 교환하기        2  A-1   지난 회의 마지막에 논의했던 안건을 다시 볼까요?   

                                                 번역문  
0  How is the market's reaction to the newly rele...  
1  The sales increase is faster than the previous...  
2  Then, we'll have to call the manufacturer and ...  
3  Sure, I'll make a call and double the volume o...  
4  Shall we take a look at the issues we discusse...  


In [3]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(100000, 7)

In [4]:
class CustomDataset(Dataset):
    """Map-style dataset yielding ``(source, translation)`` sentence pairs.

    Wraps a DataFrame that contains a '원문' (source text) column and a
    '번역문' (translated text) column; each item corresponds to one row.
    """

    # Column names read from the wrapped DataFrame.
    SRC_COLUMN = '원문'
    TRG_COLUMN = '번역문'

    def __init__(self, data):
        """Keep a reference to the DataFrame that holds the sentence pairs."""
        self.data = data

    def __len__(self):
        """Return the number of rows (sentence pairs)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(source, translation)`` pair at row position ``idx``.

        Rows are addressed by position (``iloc``) rather than by index label,
        so the dataset also works when the DataFrame's index is not the
        default 0..n-1 range (for example after filtering rows).

        Raises:
            IndexError: if ``idx`` is outside the valid row positions.
        """
        return (
            self.data[self.SRC_COLUMN].iloc[idx],
            self.data[self.TRG_COLUMN].iloc[idx],
        )

# Wrap the loaded DataFrame in the dataset class defined above.
custom_DS = CustomDataset(data)

[1] PositionalEncoding  
[2] MultiHeadAttention  
[3] FeedForward  
[4] EncoderLayer  
[5] DecoderLayer  
[6] Encoder  
[7] Decoder  
[8] Transformer  

In [5]:
from transformer.Models import Transformer
from transformer.Optim import ScheduledOptim
from utils.argument_parser import get_args
import os
import torch.optim as optim

In [6]:
def patch_src(src, pad_idx):
    """Return ``src`` with its first two dimensions swapped.

    ``pad_idx`` is accepted but not used by this function.
    """
    return torch.transpose(src, 0, 1)


def patch_trg(trg, pad_idx):
    """Split a target tensor into a decoder input and flattened gold labels.

    The first two dimensions of ``trg`` are swapped first. The returned input
    drops the last position along dimension 1; the gold labels drop the first
    position along dimension 1 and are flattened to one dimension.
    ``pad_idx`` is accepted but not used by this function.
    """
    swapped = torch.transpose(trg, 0, 1)
    decoder_input = swapped[:, :-1]
    gold = swapped[:, 1:].reshape(-1)
    return decoder_input, gold

In [7]:
# Parse the experiment options.
opt = get_args()
# Use CUDA only when it was not disabled AND a CUDA device is actually usable;
# otherwise creating a 'cuda' device / moving the model would fail on CPU-only hosts.
opt.cuda = (not opt.no_cuda) and torch.cuda.is_available()
# The word-embedding width is tied to the model width.
opt.d_word_vec = opt.d_model

In [8]:
# `random` is not imported in the notebook's import cells, so bring it into
# scope here; without it the random.seed call below raises NameError.
import random

# For reproducibility: seed every random number generator used by the notebook.
if opt.seed is not None:
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = False
    # torch.set_deterministic(True)
    np.random.seed(opt.seed)
    random.seed(opt.seed)

if not opt.output_dir:
    # No directory configured: report it and do not touch the filesystem
    # (os.path.exists(None) would raise TypeError).
    print('No experiment result will be saved.')
else:
    # exist_ok avoids a check-then-create race and is a no-op if it already exists.
    os.makedirs(opt.output_dir, exist_ok=True)

if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
    print('[Warning] The warmup steps may be not enough.\n'
          '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
          'Using smaller batch w/o longer warmup may cause '
          'the warmup stage ends with only little data trained.')

# Device on which the model and the batches are placed.
device = torch.device('cuda' if opt.cuda else 'cpu')

(sz_b, warmup) = (2048, 4000) is the official setting.
Using smaller batch w/o longer warmup may cause the warmup stage ends with only little data trained.


In [9]:
import transformer.Constants as Constants

# 1. Collect all sentences.
#    Each custom_DS[i] lookup already returns both the source and the target
#    sentence, so the dataset is traversed once instead of twice.
sentence_pairs = [custom_DS[i] for i in range(len(custom_DS))]
all_src_sentences = [src_sent for src_sent, _ in sentence_pairs]
all_trg_sentences = [trg_sent for _, trg_sent in sentence_pairs]

# 2. Tokenize (split words on whitespace)
src_tokens = [token for sent in all_src_sentences for token in sent.split()]
trg_tokens = [token for sent in all_trg_sentences for token in sent.split()]

# 3. Special tokens
special_tokens = [Constants.PAD_WORD, Constants.BOS_WORD, Constants.EOS_WORD, Constants.UNK_WORD]

# 4. Build the vocabularies.
#    Corpus tokens that coincide with a special token are dropped before
#    numbering. Otherwise the token would appear twice in the list, the later
#    occurrence would overwrite the reserved index in the dict, and
#    len(vocab) would become smaller than the largest index.
reserved_tokens = set(special_tokens)
src_vocab = {token: idx for idx, token in enumerate(special_tokens + sorted(set(src_tokens) - reserved_tokens))}
trg_vocab = {token: idx for idx, token in enumerate(special_tokens + sorted(set(trg_tokens) - reserved_tokens))}

# 5. idx2word mappings (inverse of the vocabularies)
src_idx2word = {idx: word for word, idx in src_vocab.items()}
trg_idx2word = {idx: word for word, idx in trg_vocab.items()}

# 6. Fill in the options derived from the data.
#    default=0 keeps max() from raising on an empty dataset.
max_src_len = max((len(sent.split()) for sent in all_src_sentences), default=0)
max_trg_len = max((len(sent.split()) for sent in all_trg_sentences), default=0)
# +2 leaves room for the BOS and EOS tokens added around every sentence.
opt.max_token_seq_len = max(max_src_len, max_trg_len) + 2

opt.src_pad_idx = src_vocab[Constants.PAD_WORD]
opt.trg_pad_idx = trg_vocab[Constants.PAD_WORD]
opt.src_vocab_size = len(src_vocab)
opt.trg_vocab_size = len(trg_vocab)


In [10]:
# 1. Split the dataset into train / validation / test subsets (80% / 10% / rest)
total_size = len(custom_DS)
train_size, val_size = int(total_size * 0.8), int(total_size * 0.1)
# The test subset takes whatever remains after the truncating int() conversions.
test_size = total_size - (train_size + val_size)

split_sizes = [train_size, val_size, test_size]
train_dataset, val_dataset, test_dataset = random_split(custom_DS, split_sizes)

# 2. collate_fn
def collate_fn(batch):
    """Convert a list of ``(source, target)`` sentence pairs into padded index tensors.

    Every sentence is split on whitespace, each word is mapped to its
    vocabulary index (unknown words map to the UNK index), and the sequence is
    wrapped in BOS/EOS indices. Sequences are then padded with the PAD index
    to the longest sequence in the batch.

    Returns:
        A ``(src_padded, trg_padded)`` tuple of batch-first long tensors.
    """
    def encode(sentence, vocab):
        # BOS + word indices (UNK for out-of-vocabulary words) + EOS
        unk_idx = vocab[Constants.UNK_WORD]
        ids = [vocab[Constants.BOS_WORD]]
        ids.extend(vocab.get(word, unk_idx) for word in sentence.split())
        ids.append(vocab[Constants.EOS_WORD])
        return torch.tensor(ids, dtype=torch.long)

    src_batch, trg_batch = zip(*batch)
    src_indices = [encode(sentence, src_vocab) for sentence in src_batch]
    trg_indices = [encode(sentence, trg_vocab) for sentence in trg_batch]

    pad_sequence = torch.nn.utils.rnn.pad_sequence
    src_padded = pad_sequence(
        src_indices, batch_first=True, padding_value=src_vocab[Constants.PAD_WORD])
    trg_padded = pad_sequence(
        trg_indices, batch_first=True, padding_value=trg_vocab[Constants.PAD_WORD])

    return src_padded, trg_padded

# 3. DataLoaders: all three share the batch size and collate function;
#    only the training loader shuffles.
loader_kwargs = {'batch_size': opt.batch_size, 'collate_fn': collate_fn}
train_DL = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_DL = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_DL = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [11]:
# Keyword arguments forwarded to the Transformer constructor, all taken from `opt`.
model_kwargs = dict(
    src_pad_idx=opt.src_pad_idx,
    trg_pad_idx=opt.trg_pad_idx,
    trg_emb_prj_weight_sharing=opt.proj_share_weight,
    emb_src_trg_weight_sharing=opt.embs_share_weight,
    d_k=opt.d_k,
    d_v=opt.d_v,
    d_model=opt.d_model,
    d_word_vec=opt.d_word_vec,
    d_inner=opt.d_inner_hid,
    n_layers=opt.n_layers,
    n_head=opt.n_head,
    dropout=opt.dropout,
    scale_emb_or_prj=opt.scale_emb_or_prj,
)

# Build the model from the vocabulary sizes and move it to the selected device.
transformer = Transformer(opt.src_vocab_size, opt.trg_vocab_size, **model_kwargs).to(device)

# Adam wrapped in ScheduledOptim, which receives lr_mul, d_model and the warmup step count.
adam = optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09)
optimizer = ScheduledOptim(adam, opt.lr_mul, opt.d_model, opt.n_warmup_steps)


In [12]:
import torch.nn.functional as F

def cal_performance(pred, gold, pad_idx):
    """Compute the summed cross-entropy loss and token-level accuracy counts.

    Args:
        pred: logits; flattened to ``(N, n_classes)`` using the last dimension.
        gold: target class indices; flattened to ``(N,)``.
        pad_idx: target index to ignore in both the loss and the counts.

    Returns:
        ``(loss, n_correct, n_word)``: the summed (not averaged) cross-entropy
        over non-pad targets as a tensor, the number of non-pad positions whose
        arg-max prediction equals the target, and the number of non-pad positions.
    """
    targets = gold.reshape(-1)
    logits = pred.view(-1, pred.size(-1))

    # Positions holding a real (non-padding) target.
    keep = targets != pad_idx
    hits = logits.argmax(dim=1) == targets

    n_correct = (hits & keep).sum().item()
    n_word = keep.sum().item()

    loss = F.cross_entropy(logits, targets, ignore_index=pad_idx, reduction='sum')
    return loss, n_correct, n_word

from tqdm import tqdm

def train_epoch(model, train_loader, optimizer, opt, device):
    ''' Train the model for one epoch (with a tqdm progress bar).

    Returns a (loss per non-pad target token, token accuracy) tuple,
    both accumulated over the whole epoch.
    '''
    model.train()
    loss_sum = 0
    words_seen = 0
    words_right = 0

    progress = tqdm(train_loader, mininterval=2, desc='  - (Training)   ', leave=False)
    for src_seq, trg_seq in progress:
        src_seq, trg_seq = src_seq.to(device), trg_seq.to(device)

        # The decoder input is the target without its last position;
        # the gold labels are the target without its first position.
        dec_input = trg_seq[:, :-1]
        gold = trg_seq[:, 1:]

        optimizer.zero_grad()
        pred = model(src_seq, dec_input)

        loss, n_correct, n_word = cal_performance(pred, gold, opt.trg_pad_idx)
        loss.backward()
        optimizer.step_and_update_lr()

        loss_sum += loss.item()
        words_seen += n_word
        words_right += n_correct

    return loss_sum / words_seen, words_right / words_seen


In [13]:
def eval_epoch(model, val_loader, opt, device):
    ''' Evaluate the model for one epoch without computing gradients.

    Returns a (loss per non-pad target token, token accuracy) tuple,
    both accumulated over the whole loader.
    '''
    model.eval()
    loss_sum = 0
    words_seen = 0
    words_right = 0

    with torch.no_grad():
        for src_seq, trg_seq in val_loader:
            src_seq, trg_seq = src_seq.to(device), trg_seq.to(device)

            # Decoder input drops the last position; gold labels drop the first.
            dec_input = trg_seq[:, :-1]
            gold = trg_seq[:, 1:]

            pred = model(src_seq, dec_input)
            loss, n_correct, n_word = cal_performance(pred, gold, opt.trg_pad_idx)

            loss_sum += loss.item()
            words_seen += n_word
            words_right += n_correct

    return loss_sum / words_seen, words_right / words_seen


In [None]:
import time

# Train and validate for opt.epoch epochs, checkpointing after every epoch.
for epoch_i in range(opt.epoch):
    print(f'[ Epoch {epoch_i} ]')

    start = time.time()
    train_loss, train_accu = train_epoch(transformer, train_DL, optimizer, opt, device)
    print('  - (Training)   loss: {:.5f}, accuracy: {:.3f} %, elapse: {:.3f} min'.format(
        train_loss, 100*train_accu, (time.time()-start)/60))

    start = time.time()
    val_loss, val_accu = eval_epoch(transformer, val_DL, opt, device)
    print('  - (Validation) loss: {:.5f}, accuracy: {:.3f} %, elapse: {:.3f} min'.format(
        val_loss, 100*val_accu, (time.time()-start)/60))

    # Save the model only when an output directory is configured; with no
    # directory the path would start with "None/" and torch.save would fail.
    if opt.output_dir:
        checkpoint = {
            'epoch': epoch_i,
            'model': transformer.state_dict(),
            'opt': opt,
            'src_vocab': src_vocab,
            'trg_vocab': trg_vocab
        }
        # Make sure the target directory exists before writing into it.
        os.makedirs(opt.output_dir, exist_ok=True)
        torch.save(checkpoint, os.path.join(opt.output_dir, f'transformer_epoch{epoch_i}.chkpt'))


In [None]:
# Test 데이터 샘플 5개 번역해보기
from torch.nn.functional import softmax

def greedy_decode(model, src_seq, opt, device):
    """Greedily decode a target index sequence for one source sequence.

    Args:
        model: object exposing ``encoder``, ``decoder`` and ``trg_word_prj``,
            called the same way as in this function.
        src_seq: source token indices for a single sentence; a leading batch
            dimension of size 1 is added here.
        opt: options providing ``src_pad_idx``, ``trg_pad_idx`` and
            ``max_token_seq_len`` (the maximum number of decoding steps).
        device: device on which decoding runs.

    Returns:
        List of predicted target indices in generation order. Decoding stops
        after the EOS index is produced (it is included in the list) or after
        ``opt.max_token_seq_len`` steps.
    """
    model.eval()
    bos_idx = trg_vocab[Constants.BOS_WORD]
    eos_idx = trg_vocab[Constants.EOS_WORD]

    pred_tokens = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        src_seq = src_seq.unsqueeze(0).to(device)  # (batch=1)
        src_mask = (src_seq != opt.src_pad_idx).unsqueeze(-2)

        enc_output, *_ = model.encoder(src_seq, src_mask)

        # The decoder input starts with the BOS token only.
        dec_input = torch.tensor([[bos_idx]], dtype=torch.long, device=device)

        for _step in range(opt.max_token_seq_len):
            cur_len = dec_input.size(1)
            # Lower-triangular mask so each position only sees itself and earlier
            # positions. Built inline because no `get_subsequent_mask` helper is
            # imported or defined in this notebook.
            causal_mask = torch.tril(
                torch.ones((1, cur_len, cur_len), dtype=torch.bool, device=device))
            dec_mask = (dec_input != opt.trg_pad_idx).unsqueeze(-2) & causal_mask

            dec_output, *_ = model.decoder(dec_input, dec_mask, enc_output, src_mask)
            seq_logit = model.trg_word_prj(dec_output)

            # Pick the highest-scoring token at the last decoded position.
            next_idx = seq_logit[:, -1, :].argmax(-1).item()
            pred_tokens.append(next_idx)
            if next_idx == eos_idx:
                break

            next_token = torch.tensor([[next_idx]], dtype=torch.long, device=device)
            dec_input = torch.cat([dec_input, next_token], dim=1)

    return pred_tokens

# Print translations for up to five sentences of the first test batch.
print("\n[Sample Translation Results]")
# Fetch only the first batch instead of collating the whole test loader.
first_batch = next(iter(test_DL), None)
if first_batch is not None:
    src_seq, _ = first_batch
    src_seq = src_seq.to(device)
    src_pad = src_vocab[Constants.PAD_WORD]
    # The batch may hold fewer than five sentences.
    for i in range(min(5, src_seq.size(0))):
        # Local names differ from the notebook-level `src_tokens` list so the
        # corpus token list built earlier is not overwritten.
        source_words = [src_idx2word[idx.item()] for idx in src_seq[i] if idx.item() != src_pad]
        # The trained model is bound to `transformer` in this notebook.
        pred_idx_seq = greedy_decode(transformer, src_seq[i], opt, device)
        predicted_words = [trg_idx2word[idx] for idx in pred_idx_seq]

        print(f"Source: {' '.join(source_words)}")
        print(f"Predicted Translation: {' '.join(predicted_words)}")
        print("-" * 50)
