In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from sklearn.model_selection import train_test_split

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import pandas as pd

from konlpy.tag import Mecab

import random

import time

import numpy as np

import math

import nltk
from nltk.tokenize import word_tokenize

In [2]:
import warnings 
warnings.simplefilter('ignore')

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
df_korean = pd.read_csv("./korean.csv", encoding="utf-8-sig")
df_eng = pd.read_csv("./english.csv", encoding="utf-8-sig")

In [5]:
df_eng.head()

Unnamed: 0,번역문
0,how is the market is reaction to the newly rel...
1,the sales increase is faster than the previous...
2,then we will have to call the manufacturer and...
3,sure i will make a call and double the volume ...
4,shall we take a look at the issues we discusse...


In [6]:
df = pd.concat([df_korean, df_eng], axis=1)

In [7]:
df.head()

Unnamed: 0,원문,번역문
0,이번 신제품 출시에 대한 시장의 반응은 어떤가요,how is the market is reaction to the newly rel...
1,판매량이 지난번 제품보다 빠르게 늘고 있습니다,the sales increase is faster than the previous...
2,그렇다면 공장에 연락해서 주문량을 더 늘려야겠네요,then we will have to call the manufacturer and...
3,네 제가 연락해서 주문량을 2배로 늘리겠습니다,sure i will make a call and double the volume ...
4,지난 회의 마지막에 논의했던 안건을 다시 볼까요,shall we take a look at the issues we discusse...


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   원문      100000 non-null  object
 1   번역문     100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [9]:
dataset = np.array(df)

In [10]:
trainset, valset= train_test_split(dataset, test_size=0.1)

In [11]:
X_train = trainset[:, 0]
y_train = trainset[:, 1]
X_val = valset[:, 0]
y_val = valset[:, 1]

In [12]:
m = Mecab("C:\mecab\mecab-ko-dic")

def tokenizer_kor(text):
    return m.morphs(text)

def tokenizer_eng(text):
    return word_tokenize(text)

In [13]:
word2idx_kor = {}
word2idx_kor["PAD"] = 0
word2idx_kor["UNK"] = 1
word2idx_kor["<sos>"] = 2
word2idx_kor["<eos>"] = 3
word2idx_eng = {}
word2idx_eng["PAD"] = 0
word2idx_eng["UNK"] = 1
word2idx_eng["<sos>"] = 2
word2idx_eng["<eos>"] = 3

In [14]:
count = 4

for i in range(len(X_train)):
    X_train[i] = tokenizer_kor(X_train[i])
    for token in X_train[i]:
        if token not in word2idx_kor.keys():
            word2idx_kor[token] = count
            count += 1
    X_train[i] = ["<sos>"] + X_train[i] + ["<eos>"]
    
for i in range(len(X_val)):
    X_val[i] = tokenizer_kor(X_val[i])
    X_val[i] = ["<sos>"] + X_val[i] + ["<eos>"]

In [15]:
count = 4

for i in range(len(y_train)):
    y_train[i] = tokenizer_eng(y_train[i])
    for token in y_train[i]:
        if token not in word2idx_eng.keys():
            word2idx_eng[token] = count
            count += 1
    y_train[i] = ["<sos>"] + y_train[i] + ["<eos>"]

for i in range(len(y_val)):
    y_val[i] = tokenizer_eng(y_val[i])
    y_val[i] = ["<sos>"] + y_val[i] + ["<eos>"]

In [16]:
idx2word_kor = {y:x for x,y in word2idx_kor.items()}
idx2word_eng = {y:x for x,y in word2idx_eng.items()}

In [17]:
input_file = "glove_kor.txt"
output_file = "tmp.txt"

glove2word2vec(input_file, output_file)

glove = KeyedVectors.load_word2vec_format(output_file, binary=False)

vocab_size_kor = len(word2idx_kor.keys())
embedding_size_kor = 100
weight_kor = np.zeros((vocab_size_kor, embedding_size_kor))
for i in range(4, vocab_size_kor):
    if idx2word_kor[i] in glove.key_to_index.keys():
        weight_kor[i] = glove[idx2word_kor[i]]

In [18]:
input_file = "glove_eng.txt"
output_file = "tmp.txt"

glove2word2vec(input_file, output_file)

glove = KeyedVectors.load_word2vec_format(output_file, binary=False)

vocab_size_eng = len(word2idx_eng.keys())
embedding_size_eng = 100
weight_eng = np.zeros((vocab_size_eng, embedding_size_eng))
for i in range(4, vocab_size_eng):
    if idx2word_eng[i] in glove.key_to_index.keys():
        weight_eng[i] = glove[idx2word_eng[i]]

In [19]:
def sent2idx(data, word2idx):
    for i in range(len(data)):
        for j in range(len(data[i])):
            if data[i][j] in word2idx.keys():
                data[i][j] = word2idx[data[i][j]]
            else:
                data[i][j] = word2idx["UNK"]
    return data

In [20]:
X_train = sent2idx(X_train, word2idx_kor)
X_val = sent2idx(X_val, word2idx_kor)
y_train = sent2idx(y_train, word2idx_eng)
y_val = sent2idx(y_val, word2idx_eng)

In [21]:
def make_tensor(data, word2idx):
    max_length = 0
    length_list = []
    
    for i in data:
        length_list.append(len(i))
        if len(i) > max_length:
            max_length = len(i)
            
    for i in data:
        for _ in range(max_length-len(i)):
            i.append(word2idx["PAD"])
    
    data = torch.tensor(data.tolist())
    
    return torch.tensor(data), length_list

In [22]:
X_train_tensor = make_tensor(X_train, word2idx_kor)
X_val_tensor = make_tensor(X_val, word2idx_kor)
y_train_tensor = make_tensor(y_train, word2idx_eng)
y_val_tensor = make_tensor(y_val, word2idx_eng)

In [23]:
class CustomDataset(Dataset):
    def __init__(self, X_tensor, y_tensor):
        self.x = X_tensor[0]
        self.x_l = X_tensor[1]
        self.y = y_tensor[0]
        self.y_l = y_tensor[1]

    def __getitem__(self, index):
        return (self.x[index], self.x_l[index], self.y[index], self.y_l[index])

    def __len__(self):
        return len(self.x)

In [24]:
trainset = CustomDataset(X_train_tensor,  y_train_tensor)
valset = CustomDataset(X_val_tensor, y_val_tensor)

In [25]:
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=True)

In [26]:
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu 와 cuda 중 다음 기기로 학슴함: ", device)

cpu 와 cuda 중 다음 기기로 학슴함:  cuda


In [27]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout, batch_first = True)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, length):
        
        embedded = self.dropout(self.embedding(src))
        
        packed_input = pack_padded_sequence(embedded, length.tolist(), batch_first=True, enforce_sorted=False)
        packed_output,(hidden, cell) = self.rnn(packed_input)
        outputs, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
        
        return outputs, hidden, cell

In [28]:
class Attention(nn.Module):

    def __init__(self, dimensions, attention_type='general'):
        super(Attention, self).__init__()

        if attention_type not in ['dot', 'general']:
            raise ValueError('Invalid attention type selected.')

        self.attention_type = attention_type
        if self.attention_type == 'general':
            self.linear_in = nn.Linear(dimensions, dimensions, bias=False)

        self.linear_out = nn.Linear(dimensions * 2, dimensions, bias=False)
        self.softmax = nn.Softmax(dim=-1)
        self.tanh = nn.Tanh()

    def forward(self, query, context):

        batch_size, output_len, dimensions = query.size()
        query_len = context.size(1)

        if self.attention_type == "general":
            query = query.reshape(batch_size * output_len, dimensions)
            query = self.linear_in(query)
            query = query.reshape(batch_size, output_len, dimensions)

        # TODO: Include mask on PADDING_INDEX?

        # (batch_size, output_len, dimensions) * (batch_size, query_len, dimensions) ->
        # (batch_size, output_len, query_len)
        attention_scores = torch.bmm(query, context.transpose(1, 2).contiguous())

        # Compute weights across every context sequence
        attention_scores = attention_scores.view(batch_size * output_len, query_len)
        attention_weights = self.softmax(attention_scores)
        attention_weights = attention_weights.view(batch_size, output_len, query_len)

        # (batch_size, output_len, query_len) * (batch_size, query_len, dimensions) ->
        # (batch_size, output_len, dimensions)
        mix = torch.bmm(attention_weights, context)

        # concat -> (batch_size * output_len, 2*dimensions)
        combined = torch.cat((mix, query), dim=2)
        combined = combined.view(batch_size * output_len, 2 * dimensions)

        # Apply linear_out on every 2nd dimension of concat
        # output -> (batch_size, output_len, dimensions)
        output = self.linear_out(combined).view(batch_size, output_len, dimensions)
        output = self.tanh(output)

        return output, attention_weights

In [29]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout, batch_first = True)
        
        self.att = Attention(hid_dim)
        
        self.fc_out = nn.Linear(hid_dim*2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, en_outputs, hidden, cell):
        
        input = input.unsqueeze(1)
        
        embedded = self.dropout(self.embedding(input))
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        att_value, _ =  self.att(output, en_outputs)
        
        concat = torch.cat([output, att_value], dim=-1)
        
        prediction = self.fc_out(concat.squeeze(1))
        
        return prediction, hidden, cell

In [30]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, src_length, trg, teacher_forcing_ratio = 0.5):
        
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        en_outputs, hidden, cell = self.encoder(src, src_length)
        
        #first input to the decoder is the <sos> tokens
        input = trg[:, 0]
        
        for t in range(1, trg_len):

            output, hidden, cell = self.decoder(input, en_outputs, hidden, cell)
            
            outputs[:, t, :] = output

            teacher_force = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1) 
            
            input = trg[:, t] if teacher_force else top1
        
        return outputs

In [31]:
INPUT_DIM = len(word2idx_kor)
OUTPUT_DIM = len(word2idx_eng)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

enc.embedding.weight.data.copy_(torch.tensor(weight_kor))
dec.embedding.weight.data.copy_(torch.tensor(weight_eng))

model = Seq2Seq(enc, dec, device).to(device)

In [32]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(20472, 100)
    (rnn): LSTM(100, 512, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(16767, 100)
    (rnn): LSTM(100, 512, num_layers=2, batch_first=True, dropout=0.5)
    (att): Attention(
      (linear_in): Linear(in_features=512, out_features=512, bias=False)
      (linear_out): Linear(in_features=1024, out_features=512, bias=False)
      (softmax): Softmax(dim=-1)
      (tanh): Tanh()
    )
    (fc_out): Linear(in_features=1024, out_features=16767, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [33]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 28,413,947 trainable parameters


In [34]:
optimizer = optim.Adam(model.parameters())

In [35]:
TRG_PAD_IDX = word2idx_eng["PAD"]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [36]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch[0].to(device)
        src_length = batch[1].to(device)
        trg = batch[2].to(device)
        
        optimizer.zero_grad()
        
        output = model(src, src_length, trg)
        
        output_dim = output.shape[-1]
        
        output = output[:, 1:, :].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [37]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch[0].to(device)
            src_length = batch[1].to(device)
            trg = batch[2].to(device)

            output = model(src, src_length, trg, 0)

            output_dim = output.shape[-1]
            
            output = output[:, 1:, :].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [38]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [39]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, trainloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valloader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq_with_att.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 9m 18s
	Train Loss: 5.185 | Train PPL: 178.552
	 Val. Loss: 5.273 |  Val. PPL: 195.069
Epoch: 02 | Time: 9m 17s
	Train Loss: 4.320 | Train PPL:  75.188
	 Val. Loss: 4.935 |  Val. PPL: 139.133
Epoch: 03 | Time: 9m 17s
	Train Loss: 3.896 | Train PPL:  49.216
	 Val. Loss: 4.817 |  Val. PPL: 123.642
Epoch: 04 | Time: 9m 16s
	Train Loss: 3.614 | Train PPL:  37.111
	 Val. Loss: 4.725 |  Val. PPL: 112.764
Epoch: 05 | Time: 9m 17s
	Train Loss: 3.391 | Train PPL:  29.688
	 Val. Loss: 4.640 |  Val. PPL: 103.540
Epoch: 06 | Time: 9m 17s
	Train Loss: 3.231 | Train PPL:  25.312
	 Val. Loss: 4.631 |  Val. PPL: 102.586
Epoch: 07 | Time: 9m 16s
	Train Loss: 3.080 | Train PPL:  21.768
	 Val. Loss: 4.742 |  Val. PPL: 114.679
Epoch: 08 | Time: 9m 16s
	Train Loss: 2.976 | Train PPL:  19.599
	 Val. Loss: 4.648 |  Val. PPL: 104.409
Epoch: 09 | Time: 9m 16s
	Train Loss: 2.878 | Train PPL:  17.773
	 Val. Loss: 4.728 |  Val. PPL: 113.100
Epoch: 10 | Time: 9m 17s
	Train Loss: 2.799 | Train PPL

In [40]:
model.load_state_dict(torch.load('seq2seq_with_att.pt'))

<All keys matched successfully>

In [41]:
def translate_sentence(sentence, word2idx_kor, word2idx_eng, model, device, max_len = 50):

    model.eval()
        
    tokens = tokenizer_kor(sentence)
    
    src_indexes = [word2idx_kor["<sos>"]] + [word2idx_kor[token] for token in tokens] + [word2idx_kor["<eos>"]] 
    
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    src_len = torch.LongTensor([len(src_indexes)]).to(device)
    
    with torch.no_grad():
        en_outputs, hidden, cell = model.encoder(src_tensor, src_len)
        
    trg_indexes = [word2idx_eng["<sos>"]]
    
    for i in range(max_len):

        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
                
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, en_outputs, hidden, cell)
            
        pred_token = output.argmax(1).item()
        
        trg_indexes.append(pred_token)

        if pred_token == word2idx_eng["<eos>"]:
            break
            
    trg_tokens = [idx2word_eng[i] for i in trg_indexes]
    
    return trg_tokens[1:-1]

In [68]:
sentence = "내일 출장을 다녀와야 합니다"
result = translate_sentence(sentence, word2idx_kor, word2idx_eng, model, device)

In [69]:
print(result)

['i', 'have', 'to', 'go', 'to', 'the', 'business', 'trip', 'tomorrow']
