# 데이터 전처리

AI Hub의 한국어-영어 번역(병렬) 말뭉치(영어 <-> 한국어)를 사용한다. 카테고리별로 나뉘어 있고 총 160만 문장인데, 그중 한 카테고리만 뽑아서 사용한다. https://aihub.or.kr/aidata/87


구어체 카테고리 20만 건으로 진행 (문장별로 잘 나뉘어 있음. 길이가 길지 않음. 고유명사가 별로 없음)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
#from torchtext.data import Field, TabularDataset, BucketIterator
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
!pip install konlpy
from konlpy.tag import Okt, Mecab
import nltk
nltk.download('punkt')  
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import time
from tqdm import tqdm
import math

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
SEED = 2021


def _seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU + CUDA) RNGs and request deterministic cuDNN."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


# Fix every random source used by the notebook so runs are repeatable.
_seed_everything(SEED)

In [None]:
# Load the spoken-language (구어체) parallel corpus from the Excel workbook
# and print its (rows, columns) shape.
data = pd.read_excel('1_구어체(1).xlsx')
print(data.shape)

한 > 영 번역

In [None]:
# Drop the first column and rename the original-text ('원문') and
# translated-text ('번역문') columns to SRC / TRG.
data_ = data.iloc[:,1:].rename(columns={'원문':'SRC','번역문':'TRG'})
# Hold out 10% as the test set, then 10% of the remainder as the validation set.
train, test = train_test_split(data_, test_size=0.1, random_state=SEED)
train, val = train_test_split(train, test_size=0.1, random_state=SEED)

# Number of rows in each split.
print(train.shape[0], val.shape[0], test.shape[0])

162000 18000 20000


In [None]:
# Write each split to a CSV file (without the index column) in the current
# working directory.
# NOTE(review): the TabularDataset cell below loads files with these names from
# '/content/drive/MyDrive/' — confirm the CSVs are copied/saved there.
train.to_csv('train.csv', index=False)
val.to_csv('val.csv', index=False)
test.to_csv('test.csv', index=False)

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Shared Okt morphological analyzer used by the Korean tokenizer.
okt = Okt()

def tokenize_ko(text):
    """Split Korean text into morphemes with Okt and return them in reversed order."""
    morphemes = list(okt.morphs(text))
    morphemes.reverse()
    return morphemes

def tokenize_en(text):
    """Tokenize English text with NLTK's word_tokenize and return the tokens as a list."""
    return list(word_tokenize(text))

## Korean (source side): tokenized with tokenize_ko, wrapped in <sos> ... <eos>
SRC = Field(tokenize=tokenize_ko,
               init_token = '<sos>',
               eos_token='<eos>')

## English (target side): tokenized with tokenize_en, lower-cased, wrapped in <sos> ... <eos>
TRG = Field(tokenize=tokenize_en,
               lower=True,
               init_token = '<sos>',
               eos_token='<eos>')

In [None]:
# (name, Field) pairs handed to TabularDataset as `fields` when loading the CSVs.
data_fields = [('SRC', SRC), ('TRG', TRG)]

In [None]:
# Load the three CSV splits from Google Drive as torchtext datasets, skipping the
# header row. Note that this rebinds `train`, `val` and `test` (previously DataFrames).
train, val, test = TabularDataset.splits(path='/content/drive/MyDrive/', train='train.csv', validation='val.csv', test='test.csv',  
                                         format='csv', skip_header=True, fields=data_fields)

In [None]:
# Build both vocabularies from the training split only.
# min_freq=2 is passed; presumably tokens seen fewer than twice fall back to <unk> — confirm.
SRC.build_vocab(train, min_freq=2)
TRG.build_vocab(train, min_freq=2)

In [None]:
# Vocabulary sizes (source, target) and the first 20 entries of the source vocabulary.
print(len(SRC.vocab), len(TRG.vocab))
print(list(SRC.vocab.stoi.keys())[:20])

34012 19819
['<unk>', '<pad>', '<sos>', '<eos>', '.', '을', '이', '는', '에', '가', '를', '의', '은', '나', '?', '것', ',', '당신', '그', '들']


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
# Batch iterators over the three splits; tensors are created on `device`.
# NOTE(review): no sort_key is given and sort=False, so batches are presumably not
# grouped by sentence length — confirm against the torchtext BucketIterator docs.
BATCH_SIZE = 80
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, val, test), 
                                                                      sort=False,                                                                 
                                                                       batch_size = BATCH_SIZE,
                                                                       device = device
                                                                       )

In [None]:
## batch별로 padding됨
batch = next(iter(train_iterator))
src = batch.SRC
print(src.shape)
src = src.transpose(1,0)
src[:3]

torch.Size([28, 80])


tensor([[    2,     4,    26,    55,   612,    21,    27,     5,    15,   135,
           168,    30,    15,    32,    10,   606,   214,    15,  2475,    22,
           350,    71,     7,    25,   320,     3,     1,     1],
        [    2,     4,   559,    89,    22,  1833,     7,   315, 13003,     3,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1],
        [    2,     4,  2989,     6,    40,    32,     5,    53,    46,   863,
          3542, 11143,   551,  2503,     6,    19,    40,     7,    20,   156,
             3,     1,     1,     1,     1,     1,     1,     1]],
       device='cuda:0')

- 보충: https://simonjisu.github.io/nlp/2018/07/05/packedsequence.html

# 모델링

코드 참고: https://colab.research.google.com/github/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb#scrollTo=g64I1EDu6jlz

## Encoder

**Bidirectional-RNN**
1. 각 rnn block
    - 입력: forward, backward RNN에 대해 각각 입력받음
        - e(x(t))
            - x(t): 현재 source token
            - e(x(t)): embedding된 x(t)
        - h(t-1): 이전 encoder rnn block 의 hidden state

    - 출력
        - h(t): 현재 encoder rnn block의 hidden state


2. 전체 encoder
    - 출력: $z=\tanh(g(h_T^\rightarrow, h_T^\leftarrow)) = \tanh(g(z^\rightarrow, z^\leftarrow)) = s_0$

In [None]:
class Encoder(nn.Module):
    """Bidirectional RNN encoder.

    Embeds the source tokens, runs them through a bidirectional ``nn.RNN`` and
    projects the concatenated final forward/backward states to the decoder's
    hidden size, to be used as the decoder's initial hidden state.

    Args:
        input_dim: size of the source vocabulary.
        emb_dim: embedding dimension.
        enc_hid_dim: hidden size of each RNN direction.
        dec_hid_dim: hidden size of the decoder (output size of ``fc``).
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.RNN(emb_dim, enc_hid_dim, bidirectional=True)
        # Maps [forward ; backward] final states to the decoder's initial hidden state.
        self.fc = nn.Linear(enc_hid_dim*2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sentences.

        Args:
            src: token indices, shape [src_len, batch_size].

        Returns:
            Tuple ``(outputs, hidden)``:
            outputs has shape [src_len, batch_size, enc_hid_dim*2] (hidden states of
            every timestep, both directions); hidden has shape
            [batch_size, dec_hid_dim].
        """
        # 1. Embed the source tokens -> [src_len, batch_size, emb_dim]
        token_vectors = self.dropout(self.embedding(src))

        # 2. Run the bidirectional RNN over the whole sequence.
        #    final_states: [2, batch_size, enc_hid_dim]
        outputs, final_states = self.rnn(token_vectors)

        # 3. Merge both directions into a single vector per sentence.
        forward_last = final_states[-2]   # forward RNN after the last timestep
        backward_last = final_states[-1]  # backward RNN after the first timestep
        merged = torch.cat([forward_last, backward_last], dim=1)

        # [batch_size, dec_hid_dim]
        return outputs, torch.tanh(self.fc(merged))

## Attention

![image.png](attachment:image.png)

1. 각 decoder block (token)에 대한 attention vector
    - 입력
        - s(t-1): 이전 decoder hidden state
        - H: encoder의 모든 forward & backward hidden states 
    - 출력
        - a(t): 하나의 attention vector
            - shape: [입력 token 수]
            - t번째 output인 s(t)를 계산할 때 전체 입력의 정보가 모두 있는 a(t)가 필요함
            - 각 value는 0~1. sum(a(t))=1


In [None]:
class Attention(nn.Module):
    """Additive attention over the encoder hidden states.

    Args:
        enc_hid_dim: hidden size of each encoder direction.
        dec_hid_dim: hidden size of the decoder.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        # Input: encoder forward + backward states concatenated with the decoder hidden state.
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights over the source positions.

        Args:
            hidden: previous decoder hidden state, shape [batch_size, dec_hid_dim].
            encoder_outputs: encoder hidden states of every timestep,
                shape [src_len, batch_size, enc_hid_dim*2].

        Returns:
            Tensor of shape [batch_size, src_len]; each row is a softmax
            distribution over the source positions.
        """
        src_len = encoder_outputs.shape[0]

        # Repeat the decoder state once per source position:
        # [batch_size, dec_hid_dim] -> [batch_size, src_len, dec_hid_dim]
        tiled_state = hidden.unsqueeze(1).repeat(1, src_len, 1)

        # Batch-first view of the encoder states: [batch_size, src_len, enc_hid_dim*2]
        batch_first_enc = encoder_outputs.permute(1, 0, 2)

        # How well each encoder state matches the previous decoder state:
        # [batch_size, src_len, dec_hid_dim]
        joint = torch.cat((tiled_state, batch_first_enc), dim = 2)
        energy = torch.tanh(self.attn(joint))

        # One score per source position: [batch_size, src_len]
        scores = self.v(energy).squeeze(2)

        # Normalize so every row sums to 1.
        return F.softmax(scores, dim=1)

## Decoder

![image.png](attachment:image.png)
1. decoder rnn 입력
    - w(t)
        - 모든 입력 token들에 대한 attention을 적용한 context vector
        - w(t) = a(t) * H
    - d(y(t))
        - y(t): 현재 target token
        - d(y(t)): y(t)를 embedding한 결과
    - s(t-1): 이전 decoder rnn block 의 hidden state
    
    
2. decoder rnn 출력
    - s(t): 현재 decoder rnn block 의 hidden state


3. linear 입력
    - d(y(t))
    - w(t)
    - s(t)
    
    
4. linear 출력
    - y(t+1): 예측한 target token

In [None]:
class Decoder(nn.Module):
    """Single-step attention decoder.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding dimension.
        enc_hid_dim: hidden size of each encoder direction.
        dec_hid_dim: hidden size of the decoder RNN.
        dropout: dropout probability applied to the embeddings.
        attention: module called as ``attention(hidden, encoder_outputs)`` that
            returns weights of shape [batch_size, src_len].
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # RNN input: attention context concatenated with the token embedding.
        self.rnn = nn.RNN((enc_hid_dim*2)+emb_dim, dec_hid_dim)
        # Output layer input: RNN output + attention context + token embedding.
        self.fc = nn.Linear((enc_hid_dim*2)+dec_hid_dim+emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one target timestep.

        Args:
            input: target token indices for this timestep, shape [batch_size].
            hidden: previous decoder hidden state, shape [batch_size, dec_hid_dim].
            encoder_outputs: encoder hidden states of every timestep,
                shape [src_len, batch_size, enc_hid_dim*2].

        Returns:
            Tuple ``(prediction, hidden)``: prediction has shape
            [batch_size, output_dim]; hidden is the new decoder hidden state,
            shape [batch_size, dec_hid_dim].
        """
        # 1. Embed the current token -> [1, batch_size, emb_dim]
        token_emb = self.dropout(self.embedding(input.unsqueeze(0)))

        # 2-1. Attention weights over the source positions -> [batch_size, 1, src_len]
        attn_weights = self.attention(hidden, encoder_outputs).unsqueeze(1)

        # 2-2. Weighted sum of the encoder states (batch matrix product):
        #      [batch_size, 1, enc_hid_dim*2] -> [1, batch_size, enc_hid_dim*2]
        context = torch.bmm(attn_weights, encoder_outputs.permute(1, 0, 2)).permute(1, 0, 2)

        # 3. One RNN step on [embedding ; context], starting from the previous hidden state.
        step_out, new_hidden = self.rnn(torch.cat((token_emb, context), dim=2),
                                        hidden.unsqueeze(0))
        # Both are [1, batch_size, dec_hid_dim]; for a single step they must coincide.
        assert (step_out==new_hidden).all()

        # 4. Score every vocabulary entry from [RNN output ; context ; embedding].
        features = torch.cat(
            (step_out.squeeze(0), context.squeeze(0), token_emb.squeeze(0)),
            dim=1,
        )
        prediction = self.fc(features)  # [batch_size, output_dim]

        # The new hidden state is handed back so the caller can feed it to the next step.
        return prediction, new_hidden.squeeze(0)
                            # 매 timestep마다 새로운 hidden을 넘겨줌

## Seq2Seq

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one token at a time.

    Args:
        encoder: module returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: module exposing ``output_dim`` and returning
            ``(prediction, hidden)`` for one decoding step.
        device: device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Run the full sequence-to-sequence pass.

        Args:
            src: source token indices, shape [src_len, batch_size].
            trg: target token indices, shape [trg_len, batch_size].
            teacher_forcing_ratio: probability of feeding the ground-truth token
                (instead of the model's own prediction) into the next step.

        Returns:
            Tensor of shape [trg_len, batch_size, output_dim] with the decoder
            scores; position 0 is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]
        vocab_size = self.decoder.output_dim

        # 0. Buffer that collects the per-step vocabulary scores.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # 1. Encode the source sentence.
        encoder_outputs, state = self.encoder(src)

        # 2. Decode step by step, starting from the first target token (<sos>).
        step_input = trg[0,:]
        for t in range(1, trg_len):
            logits, state = self.decoder(step_input, state, encoder_outputs)
            outputs[t] = logits

            # Decide whether the next input is the ground truth or the greedy prediction.
            use_ground_truth = random.random() < teacher_forcing_ratio
            greedy_token = logits.argmax(1)
            step_input = trg[t] if use_ground_truth else greedy_token

        return outputs

# 학습

In [None]:
# Run Python garbage collection and release PyTorch's cached CUDA memory
# before building the model.
import gc

gc.collect()

torch.cuda.empty_cache()

In [None]:
# Model hyper-parameters: vocabulary sizes come from the built vocabularies,
# the remaining values are fixed embedding / hidden sizes and dropout rates.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Assemble attention -> encoder -> decoder, then wrap them in Seq2Seq on `device`.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

In [None]:
def init_weights(m):
    """Re-initialize the parameters of module ``m`` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    every other parameter (e.g. biases) is set to 0.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)

# Apply init_weights to the model and all of its sub-modules.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(34012, 256)
    (rnn): RNN(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(19819, 256)
    (rnn): RNN(1280, 512)
    (fc): Linear(in_features=1792, out_features=19819, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 52,335,467 trainable parameters


In [None]:
# Adam with its default settings over all model parameters.
optimizer = optim.Adam(model.parameters())

# Cross-entropy loss that skips target positions holding the <pad> index.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Train the model for one pass over ``iterator``.

    Args:
        model: callable as ``model(src, trg)`` returning scores of shape
            [trg_len, batch_size, output_dim].
        iterator: sized iterable of batches exposing ``.SRC`` and ``.TRG``.
        optimizer: optimizer updating the model parameters.
        criterion: loss taking (flattened scores, flattened targets).
        clip: maximum gradient norm used for clipping.

    Returns:
        Mean batch loss over the pass.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.SRC, batch.TRG

        optimizer.zero_grad()

        logits = model(source, target)
        vocab_size = logits.shape[-1]

        # Skip position 0 (the initial token, never predicted) and flatten
        # the time and batch axes for the loss.
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_logits, flat_target)
        batch_loss.backward()

        # Clip the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    The model is switched to eval mode, gradients are disabled and the model is
    called with a teacher-forcing ratio of 0.

    Args:
        model: callable as ``model(src, trg, 0)`` returning scores of shape
            [trg_len, batch_size, output_dim].
        iterator: sized iterable of batches exposing ``.SRC`` and ``.TRG``.
        criterion: loss taking (flattened scores, flattened targets).

    Returns:
        Mean batch loss over the pass.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.SRC, batch.TRG

            # Third argument 0: teacher forcing turned off.
            logits = model(source, target, 0)
            vocab_size = logits.shape[-1]

            # Skip position 0 and flatten the time and batch axes for the loss.
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_logits, flat_target).item()

    return running_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole (minutes, seconds)."""
    total = end_time - start_time
    whole_minutes = int(total / 60)
    leftover_seconds = int(total - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
N_EPOCHS = 5
CLIP = 1  # maximum gradient norm passed to train() for clipping

# Lowest validation loss seen so far; used to decide when to checkpoint.
best_valid_loss = float('inf')

for epoch in tqdm(range(N_EPOCHS)):
    
    start_time = time.time()
    
    # One training pass followed by one validation pass.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    # Report losses and the corresponding perplexities (exp of the loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')















  0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A













 20%|██        | 1/5 [17:59<1:11:58, 1079.73s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A

Epoch: 01 | Time: 17m 59s
	Train Loss: 5.471 | Train PPL: 237.737
	 Val. Loss: 5.584 |  Val. PPL: 266.059
















 40%|████      | 2/5 [36:01<54:00, 1080.21s/it]  [A[A[A[A[A[A[A[A[A[A[A[A[A[A

Epoch: 02 | Time: 18m 0s
	Train Loss: 4.765 | Train PPL: 117.340
	 Val. Loss: 5.220 |  Val. PPL: 184.850
















 60%|██████    | 3/5 [54:04<36:02, 1081.04s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A

Epoch: 03 | Time: 18m 2s
	Train Loss: 4.344 | Train PPL:  77.023
	 Val. Loss: 4.949 |  Val. PPL: 141.096
















 80%|████████  | 4/5 [1:12:06<18:01, 1081.59s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A

Epoch: 04 | Time: 18m 2s
	Train Loss: 4.025 | Train PPL:  55.997
	 Val. Loss: 4.758 |  Val. PPL: 116.527
















100%|██████████| 5/5 [1:30:09<00:00, 1081.96s/it]

Epoch: 05 | Time: 18m 2s
	Train Loss: 3.792 | Train PPL:  44.334
	 Val. Loss: 4.710 |  Val. PPL: 111.071



