In [3]:
import os
import numpy as np
from konlpy.tag import Mecab
from torchtext import data, datasets
from sklearn.model_selection import train_test_split

def generate_source_and_target(lines, split_cond):
    src = []
    trg = []

    for line in lines:
        src.append(' '.join(line[:-1]) + '\n')
        trg.append(' '.join(line[1:]) + '\n')
    
    write_txt(split_cond + ".src", src)
    write_txt(split_cond + ".trg", trg)
    
def write_txt(fname, lines, fpath="data"):
    if fpath is not None:
        with open(os.path.join(fpath, fname), "w") as f:
            f.writelines(lines)
    elif fpath is None:
        with open(fname, "w") as f:
            f.writelines(lines)

if os.path.exists("data/train.src"):
    with open("data/petitions_splited_mecab.txt", "r") as f:
         corpus = f.readlines()

    corpus = list(map(lambda x: str(x).replace("\n", ""), corpus))

    train_lines, test_lines = train_test_split(corpus, test_size=0.05, random_state=1234)
    train_lines, valid_lines = train_test_split(train_lines, test_size=1/19, random_state=1234)

    generate_source_and_target(train_lines, "train")
    generate_source_and_target(valid_lines, "val")
    generate_source_and_target(test_lines, "test")

In [67]:
class ELMODataset:
    def __init__(self, filepath, batch_size, device):
        self.batch_size = batch_size
        self.device = device

        self.SRC = data.Field(tokenize=lambda x: x.split(' '),
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              include_lengths=False)
        
        self.TRG = data.Field(tokenize=lambda x: x.split(' '),
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              fix_length=20)

        self.train_data, self.valid_data, self.test_data = \
            datasets.TranslationDataset.splits(path=filepath, exts=('.src', '.trg'),
                                               fields=(self.SRC, self.TRG))

        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of valid data : {}'.format(len(self.valid_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data), sort=True, sort_within_batch=True,
            batch_size=self.batch_size, device=self.device)
 
    def build_vocab(self, min_freq=5):
        self.SRC.build_vocab(self.train_data, min_freq=min_freq)
        self.TRG.build_vocab(self.train_data, min_freq=min_freq)
        
        print(f"Unique tokens in source vocabulary: {len(self.SRC.vocab)}")
        print(f"Unique tokens in target vocabulary: {len(self.TRG.vocab)}")
        
elmo_dataset = ELMODataset(filepath="data", batch_size=256, device='cpu')

Unique tokens in source vocabulary: 1610
Unique tokens in target vocabulary: 1602
number of training data : 205654
number of valid data : 11426
number of test data : 11426


In [68]:
class CharacterDecomposer:
    def __init__(self, elmo_dataset, batch_size, max_word_in_sent, max_char_in_word, special_token_idx=[0, 1, 2, 3]):
        self.elmo_dataset = elmo_dataset
        self.batch_size = batch_size
        self.max_word_in_sent = max_word_in_sent
        self.max_char_in_word = max_char_in_word
        self.special_token_idx = special_token_idx
        
        self.build_char_vocab()
        
    def build_char_vocab(self):
        char_vocab = set([char for word in self.elmo_dataset.SRC.vocab.itos for char in word])
        self.ctoi = {}
        self.itoc = {}
        
        for idx, char in enumerate(char_vocab):
            self.ctoi[char] = idx
            self.itoc[idx]  = char
            
    def decompose(self, src):
        # pad token이 1로 되어 있음
        batch_char_embedding = np.ones((self.batch_size, self.max_word_in_sent, self.max_char_in_word)).astype(int)
        
        for batch_order_idx, sent in enumerate(src):
            for word_order_idx, s in enumerate(sent):
                if word_order_idx < self.max_word_in_sent - 1:
                    if s in self.special_token_idx:
                        batch_char_embedding[batch_order_idx, word_order_idx, 0] = s
                        pass
                    elif s not in self.special_token_idx:
                        for char_order_idx, char in enumerate(self.elmo_dataset.SRC.vocab.itos[s]):
                            if char_order_idx < self.max_char_in_word - 1:
                                batch_char_embedding[batch_order_idx, word_order_idx, char_order_idx] = self.ctoi[char]
                                                             
        return torch.LongTensor(batch_char_embedding)
    
character_decomposer = CharacterDecomposer(elmo_dataset, batch_size=256, max_word_in_sent=20, max_char_in_word=6)

In [69]:
for batch in elmo_dataset.train_iterator:
    src = batch.src
    print(character_decomposer.decompose(src).shape)

    break

torch.Size([256, 20, 6])


In [70]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import _LRScheduler, ReduceLROnPlateau, StepLR, LambdaLR

import torchtext

from torchtext.data.utils import get_tokenizer
from torchtext.data import Dataset, Field, BucketIterator
    
import math
import time
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from collections import defaultdict, Counter

import warnings
warnings.filterwarnings(action='ignore')

SEED = 1234
BATCH_SIZE = 256

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [71]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, pad_idx, dropout=0.2):
        super(CNN1d, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels  = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size  = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):       
        embedded = self.embedding(text)
        batch_size, word_len, char_len, emb_dim = embedded.size()
        
        # [batch * word_len, char_len, emb_dim]
        embedded = embedded.reshape(-1, char_len, emb_dim)  
        embedded = embedded.permute(0, 2, 1)
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat    = self.dropout(torch.cat(pooled, dim=1))
        
        print(f"cat: {cat.shape}")
        
        output = self.fc(cat)
        output = output.reshape(batch_size, word_len, -1)
        
        return output
    
class Highway(nn.Module):
    def __init__(self, size, n_layers, f):
        super(Highway, self).__init__()

        self.n_layers = n_layers
        self.nonlinear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.linear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.gate = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.f = f

    def forward(self, x):
        for layer in range(self.n_layers):
            gate = F.sigmoid(self.gate[layer](x))

            nonlinear = self.f(self.nonlinear[layer](x))
            linear = self.linear[layer](x)

            x = gate * nonlinear + (1 - gate) * linear

        return x
    
class ELMO_Embedding(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, output_dim, pad_idx, n_layers=2, bidirectional=True):
        super(ELMO_Embedding, self).__init__()

        n_filters = 100
        filter_sizes = [3, 4, 5]

        self.embedding = CNN1d(vocab_size, emb_dim, n_filters, filter_sizes, emb_dim, pad_idx)
        self.highway   = Highway(size=emb_dim, n_layers=1, f=F.relu)
        self.rnn       = nn.LSTM(emb_dim, hid_dim, n_layers, bidirectional=bidirectional)        
        self.fc_out    = nn.Linear(hid_dim, output_dim)

    def forward(self, src):
        embedding               = self.embedding(src)
        highway                 = self.highway(embedding)

        output, (hidden, state) = self.rnn(highway)

        batch_size, seq_len, _  = output.size()
        output                  = output.reshape(batch_size, seq_len, -1, 2)

        forward_hid, backward_hid = output[:, :, :, 0], output[:, :, :, 1]
        
        forward_pred  = self.fc_out(forward_hid)
        backward_pred = self.fc_out(backward_hid)

        return forward_pred, backward_pred

In [105]:
def train(model, iterator, optimizer, criterion, output_dim, character_decomposer, device, clip=1):
    model.train()
    epoch_loss = 0
    
    for batch in tqdm(iterator): 
        src = batch.src
        src = character_decomposer.decompose(src).to(device)
        trg = batch.trg.to(device).reshape(-1).long()
        
        optimizer.zero_grad()
        fpred, bpred = model(src)
        output_dim = fpred.shape[-1]
        
        fpred = fpred.reshape(-1, output_dim)
        bpred = fpred.reshape(-1, output_dim)

        forward_loss  = criterion(fpred, trg)
        backward_loss = criterion(bpred, trg)
        loss = forward_loss + backward_loss
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion, output_dim, character_decomposer, device):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for batch in iterator:
            src = batch.src
            src = character_decomposer.decompose(src).to(device)
            trg = batch.trg.to(device).reshape(-1).long()

            fpred, bpred = model(src)
            output_dim = fpred.shape[-1]

            fpred = fpred.reshape(-1, output_dim)
            bpred = fpred.reshape(-1, output_dim)
            
            forward_loss  = criterion(fpred, trg)
            backward_loss = criterion(bpred, trg)
            loss = forward_loss + backward_loss

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [106]:
CHAR_VOCAB_SIZE = len(character_decomposer.ctoi)
WORD_VOCAB_SIZE = len(elmo_dataset.TRG.vocab.stoi)
EMB_DIM         = 200
HID_DIM         = 512
PAD_IDX         = elmo_dataset.SRC.vocab.stoi['<pad>']

model     = ELMO_Embedding(CHAR_VOCAB_SIZE, EMB_DIM, HID_DIM, WORD_VOCAB_SIZE, PAD_IDX, n_layers=2, bidirectional=True)
model     = model.to(device)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [107]:
N_EPOCHS  = 1
PAITIENCE = 10

n_paitience = 0
best_valid_loss = float('inf')
optimizer.zero_grad()
optimizer.step()

for epoch in range(N_EPOCHS):
    train_loss = train(model, elmo_dataset.train_iterator, optimizer, criterion, WORD_VOCAB_SIZE, character_decomposer, device)
    valid_loss = evaluate(model, elmo_dataset.valid_iterator, criterion, WORD_VOCAB_SIZE, character_decomposer, device)

    print(f'Epoch: {epoch + 1:02}')
    print(f'Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'Valid Loss: {valid_loss:.3f} | Train PPL: {math.exp(valid_loss):7.3f}')

    if n_paitience < PAITIENCE:
        if best_valid_loss > valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'ELMO-LM_best.pt')
            n_paitience = 0
        elif best_valid_loss <= valid_loss:
            n_paitience += 1
    else:
        print("Early stop!")
        model.load_state_dict(torch.load('ELMO-LM_best.pt'))
        break

  0%|          | 0/804 [00:00<?, ?it/s]

cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])
cat: torch.Size([5120, 300])
embedding: torch.Size([256, 20, 200])


KeyboardInterrupt: 