In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from sklearn.model_selection import train_test_split

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import pandas as pd

from konlpy.tag import Mecab

import random

import time

import numpy as np

import math

import nltk
from nltk.tokenize import word_tokenize

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/user warnings raised by later cells.
import warnings 
warnings.simplefilter('ignore')

In [3]:
# Download the NLTK "punkt" package (presumably what word_tokenize relies on -- confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the Korean and English sentence files; "utf-8-sig" tolerates a leading BOM.
df_korean = pd.read_csv("./korean.csv", encoding="utf-8-sig")
df_eng = pd.read_csv("./english.csv", encoding="utf-8-sig")

In [5]:
# Preview the first rows of the English frame.
df_eng.head()

Unnamed: 0,번역문
0,how is the market is reaction to the newly rel...
1,the sales increase is faster than the previous...
2,then we will have to call the manufacturer and...
3,sure i will make a call and double the volume ...
4,shall we take a look at the issues we discusse...


In [6]:
# Put the two frames side by side (axis=1); rows are matched by index.
df = pd.concat([df_korean, df_eng], axis=1)

In [7]:
# Preview the combined frame.
df.head()

Unnamed: 0,원문,번역문
0,이번 신제품 출시에 대한 시장의 반응은 어떤가요,how is the market is reaction to the newly rel...
1,판매량이 지난번 제품보다 빠르게 늘고 있습니다,the sales increase is faster than the previous...
2,그렇다면 공장에 연락해서 주문량을 더 늘려야겠네요,then we will have to call the manufacturer and...
3,네 제가 연락해서 주문량을 2배로 늘리겠습니다,sure i will make a call and double the volume ...
4,지난 회의 마지막에 논의했던 안건을 다시 볼까요,shall we take a look at the issues we discusse...


In [8]:
# Show column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   원문      100000 non-null  object
 1   번역문     100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [9]:
# Convert the frame to a NumPy array (one row per sentence pair).
dataset = np.array(df)

In [10]:
# Hold out 10% of the sentence pairs for validation.
# A fixed random_state makes the split -- and therefore the vocabularies that are
# built from the training portion -- identical on every run of the notebook.
trainset, valset = train_test_split(dataset, test_size=0.1, random_state=42)

In [11]:
# Column 0 of each split is the source side, column 1 the target side.
X_train, y_train = trainset[:, 0], trainset[:, 1]
X_val, y_val = valset[:, 0], valset[:, 1]

In [12]:
# Raw string: the backslashes of the Windows path must never be parsed as escape
# sequences ("\m" is an invalid escape that newer Python versions warn about).
m = Mecab(r"C:\mecab\mecab-ko-dic")

def tokenizer_kor(text):
    """Return the list of tokens produced by the module-level Mecab tagger's ``morphs``."""
    return m.morphs(text)

def tokenizer_eng(text):
    """Return the list of tokens produced by NLTK's ``word_tokenize``."""
    return word_tokenize(text)

In [13]:
# Both vocabularies reserve ids 0-3 for the same four special tokens.
word2idx_kor = {"PAD": 0, "UNK": 1, "<sos>": 2, "<eos>": 3}
word2idx_eng = {"PAD": 0, "UNK": 1, "<sos>": 2, "<eos>": 3}

In [14]:
count = 4  # next free id after the four special tokens

# Tokenize every training sentence, give unseen tokens a fresh id, then wrap the
# sentence in the start/end markers. The array entries are replaced in place.
for i, sentence in enumerate(X_train):
    tokens = tokenizer_kor(sentence)
    for token in tokens:
        if token not in word2idx_kor:
            word2idx_kor[token] = count
            count += 1
    X_train[i] = ["<sos>"] + tokens + ["<eos>"]

# Validation sentences are tokenized the same way but never extend the vocabulary.
for i, sentence in enumerate(X_val):
    X_val[i] = ["<sos>"] + tokenizer_kor(sentence) + ["<eos>"]

In [15]:
count = 4  # next free id after the four special tokens

# Tokenize every training target, give unseen tokens a fresh id, then wrap the
# sentence in the start/end markers. The array entries are replaced in place.
for i, sentence in enumerate(y_train):
    tokens = tokenizer_eng(sentence)
    for token in tokens:
        if token not in word2idx_eng:
            word2idx_eng[token] = count
            count += 1
    y_train[i] = ["<sos>"] + tokens + ["<eos>"]

# Validation targets are tokenized the same way but never extend the vocabulary.
for i, sentence in enumerate(y_val):
    y_val[i] = ["<sos>"] + tokenizer_eng(sentence) + ["<eos>"]

In [16]:
# Reverse lookup tables: vocabulary id -> token.
idx2word_kor = {index: token for token, index in word2idx_kor.items()}
idx2word_eng = {index: token for token, index in word2idx_eng.items()}

In [17]:
input_file = "glove_kor.txt"

# gensim 4 (already required here, since key_to_index is used) reads GloVe text
# files directly with no_header=True, so the glove2word2vec round trip through a
# shared "tmp.txt" file is unnecessary.
glove = KeyedVectors.load_word2vec_format(input_file, binary=False, no_header=True)

vocab_size_kor = len(word2idx_kor)
embedding_size_kor = 100
# Rows of the special tokens (ids 0-3) and of tokens missing from GloVe stay zero.
# NOTE(review): weight_kor is not referenced by any later cell visible here.
weight_kor = np.zeros((vocab_size_kor, embedding_size_kor))
for i in range(4, vocab_size_kor):
    token = idx2word_kor[i]
    if token in glove.key_to_index:
        weight_kor[i] = glove[token]

In [18]:
input_file = "glove_eng.txt"

# gensim 4 (already required here, since key_to_index is used) reads GloVe text
# files directly with no_header=True, so the glove2word2vec round trip through a
# shared "tmp.txt" file is unnecessary.
glove = KeyedVectors.load_word2vec_format(input_file, binary=False, no_header=True)

vocab_size_eng = len(word2idx_eng)
embedding_size_eng = 100
# Rows of the special tokens (ids 0-3) and of tokens missing from GloVe stay zero.
# NOTE(review): weight_eng is not referenced by any later cell visible here.
weight_eng = np.zeros((vocab_size_eng, embedding_size_eng))
for i in range(4, vocab_size_eng):
    token = idx2word_eng[i]
    if token in glove.key_to_index:
        weight_eng[i] = glove[token]

In [19]:
def sent2idx(data, word2idx):
    """Replace every token of every sentence in ``data`` by its vocabulary id.

    The inner sequences are modified in place; tokens absent from ``word2idx``
    are mapped to the id stored under ``"UNK"``.

    Args:
        data: sequence of mutable token sequences.
        word2idx: mapping token -> id.

    Returns:
        The same ``data`` object, now holding ids instead of tokens.
    """
    for sentence in data:
        for position, token in enumerate(sentence):
            # "UNK" is only looked up when it is actually needed.
            sentence[position] = word2idx[token] if token in word2idx else word2idx["UNK"]
    return data

In [20]:
# Convert tokens to ids; sent2idx rewrites the sentences in place and returns its input.
X_train = sent2idx(X_train, word2idx_kor)
X_val = sent2idx(X_val, word2idx_kor)
y_train = sent2idx(y_train, word2idx_eng)
y_val = sent2idx(y_val, word2idx_eng)

In [21]:
def make_tensor(data, word2idx):
    """Pad variable-length id sequences to a common length and stack them.

    Unlike a padding loop that appends to the input lists, this builds padded
    copies, so the caller's sequences are left untouched and calling the
    function again on the same data reports the same lengths.

    Args:
        data: iterable of id sequences (a NumPy object array or a plain list).
        word2idx: vocabulary mapping; only the ``"PAD"`` entry is used.

    Returns:
        A ``(tensor, length_list)`` tuple: an int64 tensor with one row per
        sequence, padded on the right to the longest sequence, and the list of
        the original (unpadded) lengths.
    """
    pad_idx = word2idx["PAD"]
    length_list = [len(seq) for seq in data]
    max_length = max(length_list, default=0)

    padded = [list(seq) + [pad_idx] * (max_length - len(seq)) for seq in data]

    # Build the tensor once; wrapping an existing tensor in torch.tensor() again
    # only adds a copy and a UserWarning.
    return torch.tensor(padded, dtype=torch.long), length_list

In [22]:
# Each *_tensor name holds the (padded tensor, length list) tuple returned by make_tensor.
X_train_tensor = make_tensor(X_train, word2idx_kor)
X_val_tensor = make_tensor(X_val, word2idx_kor)
y_train_tensor = make_tensor(y_train, word2idx_eng)
y_val_tensor = make_tensor(y_val, word2idx_eng)

In [23]:
class CustomDataset(Dataset):
    """Dataset pairing source and target sequences with their lengths.

    Both constructor arguments are indexable pairs whose element 0 is the
    sequence container and element 1 the matching lengths.
    """

    def __init__(self, X_tensor, y_tensor):
        source_seqs, source_lengths = X_tensor[0], X_tensor[1]
        target_seqs, target_lengths = y_tensor[0], y_tensor[1]
        self.x = source_seqs
        self.x_l = source_lengths
        self.y = target_seqs
        self.y_l = target_lengths

    def __len__(self):
        """Number of samples, taken from the source container."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(source, source_length, target, target_length)`` for one sample."""
        source = self.x[index]
        source_length = self.x_l[index]
        target = self.y[index]
        target_length = self.y_l[index]
        return (source, source_length, target, target_length)

In [24]:
# Wrap the padded tensors and length lists; note that these names rebind the
# earlier trainset/valset arrays produced by train_test_split.
trainset = CustomDataset(X_train_tensor,  y_train_tensor)
valset = CustomDataset(X_val_tensor, y_val_tensor)

In [25]:
# Both loaders yield shuffled batches of 64 samples.
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=True)

In [26]:
# Use the GPU when one is available, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
# Message typo fixed: "학슴함" -> "학습함".
print("cpu 와 cuda 중 다음 기기로 학습함: ", device)

cpu 와 cuda 중 다음 기기로 학슴함:  cuda


In [27]:
class Encoder(nn.Module):
    """Transformer encoder: scaled token embedding + learned position embedding,
    followed by ``n_layers`` ``EncoderLayer`` blocks.

    Args:
        input_dim: size of the source vocabulary.
        hid_dim: embedding / hidden width.
        n_layers: number of stacked encoder layers.
        n_heads: attention heads per layer.
        pf_dim: inner width of the position-wise feed-forward layers.
        dropout: dropout probability.
        device: device stored on the module and handed to the layers.
        max_length: number of positions the position embedding can represent.
    """
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()

        self.device = device
        self.max_length = max_length
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        # Registered as a non-persistent buffer so it follows .to()/.cuda() with
        # the rest of the module while staying out of state_dict (checkpoints
        # keep exactly the same keys as before).
        self.register_buffer("scale",
                             torch.sqrt(torch.FloatTensor([hid_dim])).to(device),
                             persistent=False)
        
    def forward(self, src, src_mask):
        """Encode a batch of source ids.

        Args:
            src: integer tensor [batch size, src len].
            src_mask: mask forwarded unchanged to every layer.

        Returns:
            Tensor [batch size, src len, hid dim].

        Raises:
            ValueError: if ``src len`` exceeds ``max_length``.
        """
        src_len = src.shape[1]

        # Fail early with a readable message instead of an out-of-range lookup
        # inside the position embedding.
        if src_len > self.max_length:
            raise ValueError(
                f"sequence length {src_len} exceeds max_length {self.max_length}")
        
        # Built on the input's own device; shape [1, src len], broadcast over the batch.
        pos = torch.arange(0, src_len, device=src.device).unsqueeze(0)
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

In [28]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a position-wise feed-forward
    sub-layer, each followed by dropout, a residual connection and LayerNorm."""
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        # Normalisation applied after each residual connection.
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        # The two sub-layers.
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask): 
        """Apply the block to ``src`` using ``src_mask`` for the self-attention."""
        # Self-attention: queries, keys and values are all the source sequence.
        attended, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(attended))

        # Position-wise feed-forward with its own residual connection.
        transformed = self.positionwise_feedforward(src)
        return self.ff_layer_norm(src + self.dropout(transformed))

In [29]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hid_dim: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
        device: device on which the scale constant is initially created.
    """
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        # Non-persistent buffer: moves with the module on .to()/.cuda() but is
        # not written to state_dict, so checkpoint keys are unchanged.
        self.register_buffer("scale",
                             torch.sqrt(torch.FloatTensor([self.head_dim])).to(device),
                             persistent=False)

    def _split_heads(self, x, batch_size):
        """Reshape [batch, len, hid dim] into [batch, n heads, len, head dim]."""
        return x.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query: tensor [batch size, query len, hid dim].
            key: tensor [batch size, key len, hid dim].
            value: tensor [batch size, key len, hid dim].
            mask: optional tensor broadcastable to
                [batch size, n heads, query len, key len]; positions where it
                equals 0 are filled with -1e10 before the softmax.

        Returns:
            ``(x, attention)``: the projected output [batch size, query len,
            hid dim] and the attention weights [batch size, n heads, query len,
            key len] (before dropout).
        """
        batch_size = query.shape[0]
                
        Q = self._split_heads(self.fc_q(query), batch_size)
        K = self._split_heads(self.fc_k(key), batch_size)
        V = self._split_heads(self.fc_v(value), batch_size)
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        x = torch.matmul(self.dropout(attention), V)
        
        # Merge the heads back: [batch, n heads, len, head dim] -> [batch, len, hid dim].
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)
        
        x = self.fc_o(x)
        
        return x, attention

In [30]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two linear layers (hid_dim -> pf_dim -> hid_dim) with ReLU and dropout in between."""
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)   # expansion
        self.fc_2 = nn.Linear(pf_dim, hid_dim)   # projection back
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        """Map ``x`` [batch size, seq len, hid dim] to a tensor of the same shape."""
        # [batch size, seq len, pf dim]
        hidden = self.fc_1(x)
        activated = torch.relu(hidden)
        dropped = self.dropout(activated)

        # [batch size, seq len, hid dim]
        return self.fc_2(dropped)

In [31]:
class Decoder(nn.Module):
    """Transformer decoder: scaled token embedding + learned position embedding,
    ``n_layers`` ``DecoderLayer`` blocks and a linear projection to the vocabulary.

    Args:
        output_dim: size of the target vocabulary.
        hid_dim: embedding / hidden width.
        n_layers: number of stacked decoder layers.
        n_heads: attention heads per layer.
        pf_dim: inner width of the position-wise feed-forward layers.
        dropout: dropout probability.
        device: device stored on the module and handed to the layers.
        max_length: number of positions the position embedding can represent.
    """
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.device = device
        self.max_length = max_length
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        # Non-persistent buffer: follows .to()/.cuda() with the module but is
        # not written to state_dict, so checkpoint keys are unchanged.
        self.register_buffer("scale",
                             torch.sqrt(torch.FloatTensor([hid_dim])).to(device),
                             persistent=False)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):                
        """Decode a batch of target ids against the encoder output.

        Args:
            trg: integer tensor [batch size, trg len].
            enc_src: encoder output passed to every layer.
            trg_mask: mask for the decoder self-attention.
            src_mask: mask for the encoder-decoder attention.

        Returns:
            ``(output, attention)``: logits [batch size, trg len, output dim]
            and the attention returned by the last layer (``None`` when the
            decoder has no layers).

        Raises:
            ValueError: if ``trg len`` exceeds ``max_length``.
        """
        trg_len = trg.shape[1]

        # Fail early with a readable message instead of an out-of-range lookup
        # inside the position embedding.
        if trg_len > self.max_length:
            raise ValueError(
                f"sequence length {trg_len} exceeds max_length {self.max_length}")
        
        # Built on the input's own device; shape [1, trg len], broadcast over the batch.
        pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0)
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        
        # Defined up front so a decoder built with n_layers == 0 still returns.
        attention = None
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        output = self.fc_out(trg)
            
        return output, attention

In [32]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output and a position-wise feed-forward sub-layer, each followed by
    dropout, a residual connection and LayerNorm."""
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        # One LayerNorm per sub-layer.
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        # The three sub-layers.
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Return ``(trg, attention)`` where ``attention`` comes from the
        encoder-decoder attention sub-layer."""
        # 1) Self-attention over the target sequence.
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(self_attended))

        # 2) Target queries attend over the encoder output.
        enc_attended, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(enc_attended))

        # 3) Position-wise feed-forward.
        transformed = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(transformed))

        return trg, attention

In [33]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that builds the padding and causal masks.

    Args:
        encoder: module called as ``encoder(src, src_mask)``.
        decoder: module called as ``decoder(trg, enc_src, trg_mask, src_mask)``.
        src_pad_idx: padding id of the source vocabulary.
        trg_pad_idx: padding id of the target vocabulary.
        device: stored on the module; masks are built on the device of the
            tensors they are derived from.
    """
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        """Return a bool mask [batch size, 1, 1, src len], True where ``src`` is not padding."""
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        return src_mask
    
    def make_trg_mask(self, trg):
        """Return a bool mask [batch size, 1, trg len, trg len] combining the
        padding mask with a lower-triangular (no look-ahead) mask."""
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        trg_len = trg.shape[1]
        
        # Created on trg's own device so the two masks can always be combined,
        # whatever device was passed to the constructor.
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = trg.device)).bool()
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        return trg_mask

    def forward(self, src, trg):
        """Run encoder and decoder; returns the decoder's ``(output, attention)``."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        enc_src = self.encoder(src, src_mask)
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        return output, attention

In [34]:
# Model hyperparameters; vocabulary sizes come from the dictionaries built above.
INPUT_DIM = len(word2idx_kor)
OUTPUT_DIM = len(word2idx_eng)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

# NOTE(review): max_length is left at its default of 100 for both modules, while
# make_tensor pads every batch to the longest sentence in the data set -- confirm
# that no padded sequence is longer than 100 positions.
enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [35]:
# Padding ids are used by Seq2Seq to build the attention masks.
SRC_PAD_IDX = word2idx_kor["PAD"]
TRG_PAD_IDX = word2idx_eng["PAD"]

# Assemble the full model and move its parameters to the selected device.
model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [36]:
def count_parameters(model):
    """Return the total number of elements in the parameters of ``model``
    that have ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 17,875,865 trainable parameters


In [37]:
def initialize_weights(m):
    """Xavier-uniform initialise ``m.weight`` in place when it has more than one dimension.

    Modules without a ``weight`` attribute, and one-dimensional weights, are
    left untouched. Intended to be passed to ``nn.Module.apply``.
    """
    if not hasattr(m, 'weight'):
        return
    if m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [38]:
# Run initialize_weights on every sub-module; the trailing ";" hides the cell output.
model.apply(initialize_weights);

In [39]:
# Adam over all model parameters with a fixed learning rate.
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [40]:
# Targets equal to the padding id are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [41]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Each batch is indexed as ``batch[0]`` (source ids) and ``batch[2]`` (target
    ids); both are moved to the module-level ``device``. The decoder is fed the
    target without its last position and is scored against the target without
    its first position.

    Args:
        model: callable returning ``(output, attention)`` for ``(src, trg)``.
        iterator: sized iterable of batches.
        optimizer: optimizer stepping the model parameters.
        criterion: loss taking ``(flattened logits, flattened targets)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        src, trg = batch[0].to(device), batch[2].to(device)

        optimizer.zero_grad()

        # Decoder input: every position except the last.
        logits, _ = model(src, trg[:, :-1])

        # Flatten to [batch * (trg len - 1), vocab] / [batch * (trg len - 1)].
        vocab_size = logits.shape[-1]
        flat_logits = logits.contiguous().view(-1, vocab_size)
        flat_targets = trg[:, 1:].contiguous().view(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [42]:
# NOTE(review): the next cell defines `evaluate` again with the same body; that
# later definition is the one in effect at training time. One of the two cells
# should be deleted.
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of ``model`` over ``iterator`` without gradients."""
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            # batch[0] holds the source ids, batch[2] the target ids.
            src = batch[0].to(device)
            trg = batch[2].to(device)
            
            # Decoder input is the target without its last position.
            output, _ = model(src, trg[:,:-1])
 
            output_dim = output.shape[-1]
            
            # Flatten logits and the target shifted by one position for the loss.
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [43]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over ``iterator`` in evaluation mode.

    Batches are indexed as ``batch[0]`` (source ids) and ``batch[2]`` (target
    ids) and moved to the module-level ``device``. No gradients are tracked.

    Args:
        model: callable returning ``(output, attention)`` for ``(src, trg)``.
        iterator: sized iterable of batches.
        criterion: loss taking ``(flattened logits, flattened targets)``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src, trg = batch[0].to(device), batch[2].to(device)

            # Decoder input: every position except the last.
            logits, _ = model(src, trg[:, :-1])

            vocab_size = logits.shape[-1]
            flat_logits = logits.contiguous().view(-1, vocab_size)
            flat_targets = trg[:, 1:].contiguous().view(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(iterator)

In [44]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

In [46]:
N_EPOCHS = 10
CLIP = 1  # maximum gradient norm passed to train()

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One pass over the training data, then one over the validation data.
    train_loss = train(model, trainloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valloader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the checkpoint with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer.pt')
    
    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 53s
	Train Loss: 3.696 | Train PPL:  40.274
	 Val. Loss: 2.832 |  Val. PPL:  16.978
Epoch: 02 | Time: 1m 55s
	Train Loss: 2.556 | Train PPL:  12.879
	 Val. Loss: 2.511 |  Val. PPL:  12.319
Epoch: 03 | Time: 1m 53s
	Train Loss: 2.145 | Train PPL:   8.541
	 Val. Loss: 2.395 |  Val. PPL:  10.970
Epoch: 04 | Time: 1m 51s
	Train Loss: 1.894 | Train PPL:   6.646
	 Val. Loss: 2.368 |  Val. PPL:  10.681
Epoch: 05 | Time: 1m 50s
	Train Loss: 1.718 | Train PPL:   5.576
	 Val. Loss: 2.360 |  Val. PPL:  10.596
Epoch: 06 | Time: 1m 50s
	Train Loss: 1.591 | Train PPL:   4.910
	 Val. Loss: 2.371 |  Val. PPL:  10.703
Epoch: 07 | Time: 1m 50s
	Train Loss: 1.491 | Train PPL:   4.439
	 Val. Loss: 2.388 |  Val. PPL:  10.888
Epoch: 08 | Time: 1m 50s
	Train Loss: 1.407 | Train PPL:   4.084
	 Val. Loss: 2.414 |  Val. PPL:  11.178
Epoch: 09 | Time: 1m 50s
	Train Loss: 1.335 | Train PPL:   3.799
	 Val. Loss: 2.432 |  Val. PPL:  11.380
Epoch: 10 | Time: 1m 50s
	Train Loss: 1.274 | Train PPL

In [47]:
# Restore the checkpoint written by the training loop at its lowest validation loss.
model.load_state_dict(torch.load('transformer.pt'))

<All keys matched successfully>

In [48]:
def translate_sentence(sentence, word2idx_kor, word2idx_eng, model, device, max_len = 50):
    """Greedily translate one Korean sentence into a list of English tokens.

    The sentence is tokenized with ``tokenizer_kor``, encoded once, and then
    decoded one token at a time by always taking the highest-scoring token,
    until ``<eos>`` is produced or ``max_len`` tokens have been generated.

    Args:
        sentence: raw source sentence.
        word2idx_kor: source vocabulary (token -> id).
        word2idx_eng: target vocabulary (token -> id).
        model: model exposing ``encoder``, ``decoder``, ``make_src_mask`` and
            ``make_trg_mask``.
        device: device the input tensors are moved to.
        max_len: maximum number of decoding steps.

    Returns:
        The generated tokens, without ``<sos>`` and without the closing
        ``<eos>``. Ids are mapped back to tokens through the module-level
        ``idx2word_eng``.
    """
    model.eval()

    tokens = tokenizer_kor(sentence)

    # Tokens never seen during training fall back to UNK (as sent2idx does)
    # instead of raising KeyError.
    unk_idx = word2idx_kor["UNK"]
    src_indexes = ([word2idx_kor["<sos>"]]
                   + [word2idx_kor.get(token, unk_idx) for token in tokens]
                   + [word2idx_kor["<eos>"]])

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    eos_idx = word2idx_eng["<eos>"]
    trg_indexes = [word2idx_eng["<sos>"]]

    with torch.no_grad():
        # The source is encoded once and reused at every decoding step.
        enc_src = model.encoder(src_tensor, src_mask)

        for _ in range(max_len):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
            trg_mask = model.make_trg_mask(trg_tensor)

            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

            # Greedy choice: best-scoring token at the last position.
            pred_token = output.argmax(2)[:,-1].item()
            trg_indexes.append(pred_token)

            if pred_token == eos_idx:
                break

    # Drop <sos>; drop the last id only when it really is <eos>, so a sentence
    # cut off at max_len keeps its final token.
    generated = trg_indexes[1:]
    if generated and generated[-1] == eos_idx:
        generated = generated[:-1]

    return [idx2word_eng[i] for i in generated]

In [87]:
# Translate one example sentence with the restored model.
sentence = "번역 기계를 만드는 것은 쉬운 일이 아닙니다"
result = translate_sentence(sentence, word2idx_kor, word2idx_eng, model, device)

In [88]:
# Show the generated English tokens.
print(result)

['it', 'is', 'not', 'easy', 'to', 'make', 'translation', 'machines']
