# 데이터 전처리

AI Hub의 한국어-영어 번역(병렬) 말뭉치(영어<->한국어)를 사용한다. 카테고리별로 나뉘어 총 160만 문장이 제공되지만, 여기서는 한 카테고리만 뽑아 사용한다. https://aihub.or.kr/aidata/87


구어체 20만 건으로 진행 (문장 단위로 잘 나뉘어 있고, 길이가 길지 않으며, 고유명사가 별로 없음)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, TabularDataset, BucketIterator
from eunjeon import Mecab
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import random

In [None]:
# Load the colloquial-style ("구어체") spreadsheet of the parallel corpus into a
# DataFrame and print its (rows, columns) shape as a sanity check.
data = pd.read_excel('1_구어체(1).xlsx')
print(data.shape)

(200000, 3)


Unnamed: 0,SID,원문,번역문
0,1,'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 ...,Bible Coloring' is a coloring application that...
1,2,씨티은행에서 일하세요?,Do you work at a City bank?
2,3,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.,"PURITO's bestseller, which recorded 4th rough ..."
3,4,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다.,In Chapter 11 Jesus called Lazarus from the to...
4,5,"6.5, 7, 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠습니다.",I would feel grateful to know how many stocks ...


한 > 영 번역

In [None]:
# Hold out a random 10% of the rows as the test set; the remaining rows form
# the training set. Row *positions* are sampled (rather than index labels) so
# that the positional .iloc lookups below stay correct even when `data` does
# not carry a default 0..n-1 index.
test_index = random.sample(range(len(data)), int(len(data) * 0.1))

# The first column is dropped; the two remaining columns are written out as
# SOURCE / TARGET (the preview above shows them as SID, 원문, 번역문).
test = data.iloc[test_index, 1:].reset_index(drop=True)
test.columns = ['SOURCE', 'TARGET']
test.to_csv('test.csv', index=False)

# Boolean mask over row positions: True for every row that was not sampled
# into the test set.
is_train_row = np.ones(len(data), dtype=bool)
is_train_row[test_index] = False

train = data.iloc[is_train_row, 1:].reset_index(drop=True)
train.columns = ['SOURCE', 'TARGET']
train.to_csv('train.csv', index=False)

In [None]:
# Single analyzer instance shared by every tokenize_ko call.
mecab = Mecab()

def tokenize_ko(text):
    """Split Korean text into morphemes and return them in reversed order.

    NOTE(review): the reversal is presumably the source-reversal trick used
    with seq2seq encoders; confirm before relying on token order elsewhere.
    """
    morphemes = list(mecab.morphs(text))
    morphemes.reverse()
    return morphemes

def tokenize_en(text):
    """Return the tokens produced by ``word_tokenize`` for *text* as a list."""
    return list(word_tokenize(text))

# Both fields wrap every sequence in the same start / end markers.
_BOUNDARY_TOKENS = {'init_token': '<sos>', 'eos_token': '<eos>'}

## Korean (source side), tokenized with tokenize_ko
SOURCE = Field(tokenize=tokenize_ko, **_BOUNDARY_TOKENS)

## English (target side), tokenized with tokenize_en and lower-cased
TARGET = Field(tokenize=tokenize_en, lower=True, **_BOUNDARY_TOKENS)

In [None]:
# Pair each CSV column name with the Field object that processes that column.
data_fields = [('SOURCE', SOURCE), ('TARGET', TARGET)]

In [None]:
# Read the CSV files written earlier back in as torchtext datasets.
# Note: this rebinds `train` / `test` from pandas DataFrames to datasets.
# skip_header=True skips the first row of each file (the column-name row).
train, test = TabularDataset.splits(path='./', train='train.csv', test='test.csv', format='csv', 
                                    skip_header=True, fields=data_fields)

In [None]:
# Build both vocabularies from the training split only; min_freq=2 is the
# minimum number of occurrences a token needs to be included.
SOURCE.build_vocab(train, min_freq=2)
TARGET.build_vocab(train, min_freq=2)

In [None]:
# Display the token -> index mapping of the source (Korean) vocabulary.
SOURCE.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x0000021964D366C8>>,
            {'<unk>': 0,
             '<pad>': 1,
             '<sos>': 2,
             '<eos>': 3,
             '.': 4,
             '는': 5,
             '이': 6,
             '을': 7,
             '에': 8,
             '하': 9,
             '은': 10,
             '가': 11,
             '있': 12,
             '를': 13,
             '의': 14,
             '고': 15,
             '습니다': 16,
             '나': 17,
             '?': 18,
             '어요': 19,
             ',': 20,
             '한': 21,
             '들': 22,
             '어': 23,
             '것': 24,
             '당신': 25,
             '그': 26,
             '우리': 27,
             '에서': 28,
             '수': 29,
             '지': 30,
             '으로': 31,
             '주': 32,
             '도': 33,
             '게': 34,
             '내': 35,
             '저': 36,
             '했': 37,
             '기': 38,
             '었': 39

In [None]:
# Vocabulary sizes: source first, then target.
print(len(SOURCE.vocab), len(TARGET.vocab))

25151 20739


In [None]:
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [None]:
BATCH_SIZE = 128

# sort_key tells the iterators how to order examples by length. The datasets
# built with TabularDataset above are created without one, and the
# non-training iterator sorts its examples by default, so it needs a key to be
# iterable; the key also lets examples of similar length be batched together
# to reduce padding.
train_iterator, test_iterator = BucketIterator.splits((train, test),
                                                      batch_size = BATCH_SIZE,
                                                      sort_key = lambda example: len(example.SOURCE),
                                                      device = device)

In [None]:
## batch별로 padding됨
batch = next(iter(train_iterator))
src = batch.SOURCE
print(src.shape)
src = src.transpose(1,0)
src[:3]

torch.Size([34, 128])


tensor([[    2,     4,    19,    56,    29,    41,   267,    13, 16566,     5,
            27,     3,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1],
        [    2,    18,    75,    12,    29,   139,  1108,    13,   349,   236,
           201,    93,    26,     3,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1],
        [    2,     4,    40,    81,    38,    39,    32,    42,  3724,    15,
            39,    32,    42,  1434,   262,    13,    17,    43,    69,    74,
            54,   836,    11,    80,    72,   335,    15,  4173,   106,    11,
            35,   356,     3,     1]])

- 보충: https://simonjisu.github.io/nlp/2018/07/05/packedsequence.html

# 모델링

코드 참고: https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

- input_dim: source vocab size. encoder input one-hot vectors dim
- emb_dim: embedding layer(input_dim->emb_dim) dim
- hid_dim: hidden states와 cell states의 dim
- n_layers: encoder에서 LSTM layer 수
- dropout

In [None]:
class Encoder(nn.Module):
    """LSTM encoder that turns a batch of source token ids into final states.

    Args:
        input_dim: size of the source vocabulary (rows of the embedding table).
        emb_dim: dimensionality of the token embeddings.
        hid_dim: dimensionality of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and passed to
            the LSTM.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Exposed as attributes; Seq2Seq compares them with the decoder's.
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hid_dim,
                           num_layers=n_layers,
                           dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, SOURCE):
        """Encode a batch and return the LSTM's final ``(hidden, cell)``.

        Args:
            SOURCE: token ids of shape [seq len within batch, batch_size].

        Returns:
            Tuple ``(hidden, cell)``, each of shape
            [n_layers, batch_size, hid_dim]. The per-step LSTM outputs are
            discarded.
        """
        # [seq len, batch_size] -> [seq len, batch_size, emb_dim]
        dropped_embeddings = self.dropout(self.embedding(SOURCE))

        _, final_state = self.rnn(dropped_embeddings)
        hidden, cell = final_state

        return hidden, cell

- output_dim: target vocab size. 
- emb_dim: embedding layer(output_dim->emb_dim) dim
- hid_dim: hidden states와 cell states의 dim
- n_layers: decoder에서 LSTM layer 수
- dropout

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that predicts the next target token one step at a time.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: dimensionality of the token embeddings.
        hid_dim: dimensionality of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and passed to
            the LSTM.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Exposed as attributes; Seq2Seq reads all three.
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dim,
                                      embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hid_dim,
                           num_layers=n_layers,
                           dropout=dropout)
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: one token id per batch element (a single word per sequence).
            hidden: hidden state from the previous step (or the encoder).
            cell: cell state from the previous step (or the encoder).

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` has
            shape [batch size, output dim] and the states are the updated
            LSTM states.
        """
        # Add a leading sequence axis of length 1: [1, batch_size]
        step_tokens = input.unsqueeze(0)

        step_embedded = self.dropout(self.embedding(step_tokens))

        step_output, (hidden, cell) = self.rnn(step_embedded, (hidden, cell))

        # Drop the length-1 sequence axis before projecting onto the vocabulary.
        prediction = self.fc_out(step_output.squeeze(0))

        return prediction, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a decoder together for sequence-to-sequence training.

    Args:
        encoder: module returning ``(hidden, cell)`` for a source batch; must
            expose ``hid_dim`` and ``n_layers``.
        decoder: module called as ``decoder(input, hidden, cell)``; must expose
            ``hid_dim``, ``n_layers`` and ``output_dim``.
        device: device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        # The encoder's final states seed the decoder, so both must agree on
        # state size and layer count.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode ``trg`` step by step, conditioned on the encoded ``src``.

        Args:
            src: source batch, [src len, batch size].
            trg: target batch, [trg len, batch size]; row 0 holds the first
                decoder input (the <sos> tokens).
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the model's own prediction as
                the next input (e.g. 0.75 -> ground truth 75% of the time).

        Returns:
            Tensor of shape [trg len, batch size, output dim]. Position 0 is
            never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]

        # Buffer collecting the decoder's prediction for every target position.
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim,
                              device=self.device)

        # The encoder's last states become the decoder's initial states.
        hidden, cell = self.encoder(src)

        decoder_input = trg[0, :]

        for step in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[step] = step_scores

            # One random draw per step decides between the ground-truth token
            # and the highest-scoring predicted token as the next input.
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[step]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs

# 학습

In [None]:
# Vocabulary sizes determine the embedding tables and the output layer width.
INPUT_DIM = len(SOURCE.vocab)
OUTPUT_DIM = len(TARGET.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# HID_DIM and N_LAYERS are shared by encoder and decoder; Seq2Seq asserts
# that they match.
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Move the assembled model to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
def init_weights(m):
    """Re-initialise every parameter of module *m* in place from U(-0.08, 0.08)."""
    for weight in m.parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)
        
# nn.Module.apply calls init_weights on every submodule and on the model itself.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(25151, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(20739, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=20739, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total element count of *model*'s parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 29,743,363 trainable parameters


In [None]:
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

# Index of the target padding token; ignore_index makes the loss skip
# positions whose target is padding.
TRG_PAD_IDX = TARGET.vocab.stoi[TARGET.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as ``model(src, trg)`` that returns scores of
            shape [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing ``SOURCE`` and ``TARGET``
            attributes (the field names used when the datasets were built).
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking ``(scores, targets)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.

    Note:
        Defining this function rebinds the module-level name ``train``, which
        earlier cells use for the training dataset.
    """
    model.train()

    epoch_loss = 0

    for batch in iterator:

        # Batch attributes are named after the dataset fields, SOURCE/TARGET.
        src = batch.SOURCE
        trg = batch.TARGET

        optimizer.zero_grad()

        output = model(src, trg)

        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]

        output_dim = output.shape[-1]

        # Position 0 is dropped on both sides: the model never writes a
        # prediction there, and the target's first row is the decoder's
        # initial input rather than something to predict.
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        loss.backward()

        # Rescale gradients so their overall norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, each truncated with ``int()``.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
import time

In [None]:
import math  # math.exp is used below to report perplexity

N_EPOCHS = 7
CLIP = 1


def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of *model* over *iterator* without updating it.

    Args:
        model: module called as ``model(src, trg, teacher_forcing_ratio)``.
        iterator: sized iterable of batches exposing ``SOURCE`` and ``TARGET``.
        criterion: loss taking ``(scores, targets)``.
    """
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src = batch.SOURCE
            trg = batch.TARGET

            # Ratio 0 turns teacher forcing off: the decoder is always fed
            # its own previous prediction.
            output = model(src, trg, 0)

            output_dim = output.shape[-1]

            # Position 0 is skipped, mirroring the training loss.
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            epoch_loss += criterion(output, trg).item()

    return epoch_loss / len(iterator)


# This notebook creates no separate validation split, so the held-out `test`
# dataset is used to monitor the loss and pick the checkpoint. Carve out a
# dedicated validation split if the test score must stay unbiased.
# sort_key is supplied because a non-training iterator sorts its examples by
# default and the dataset itself defines no key.
valid_iterator = BucketIterator(test,
                                batch_size = BATCH_SIZE,
                                sort_key = lambda example: len(example.SOURCE),
                                train = False,
                                device = device)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the epoch with the lowest held-out loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')