In [8]:
import os
import numpy as np
from konlpy.tag import Mecab
import torch
from torchtext import data, datasets
from sklearn.model_selection import train_test_split

In [9]:
def generate_source_and_target(lines, split_cond):
    src = []
    trg = []

    for line in lines:
        src.append(' '.join(line[:-1]) + '\n')
        trg.append(' '.join(line[1:]) + '\n')
    
    write_txt(split_cond + ".src", src)
    write_txt(split_cond + ".trg", trg)
    
def write_txt(fname, lines, fpath="data"):
    if fpath is not None:
        with open(os.path.join(fpath, fname), "w") as f:
            f.writelines(lines)
    elif fpath is None:
        with open(fname, "w") as f:
            f.writelines(lines)

In [10]:
with open("data/petitions_splited_mecab.txt", "r") as f:
     corpus = f.readlines()

corpus = list(map(lambda x: str(x).replace("\n", ""), corpus))

In [11]:
train_lines, test_lines = train_test_split(corpus, test_size=0.05, random_state=1234)
train_lines, valid_lines = train_test_split(train_lines, test_size=1/19, random_state=1234)

In [12]:
generate_source_and_target(train_lines, "train")
generate_source_and_target(valid_lines, "val")
generate_source_and_target(test_lines, "test")

In [13]:
BATCH_SIZE = 256
MAX_WORD_LENGTH_IN_SENT = 25
MAX_CHAR_LENGTH_IN_WORD = 6

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [16]:
class ELMODataset:
    def __init__(self, filepath, batch_size, max_length, device):
        self.batch_size = batch_size
        self.device = device

        self.SRC = data.Field(tokenize=lambda x: x.split(' '),
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              include_lengths=False,
                              fix_length=max_length)
        
        self.TRG = data.Field(tokenize=lambda x: x.split(' '),
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              fix_length=max_length)

        self.train_data, self.valid_data, self.test_data = \
            datasets.TranslationDataset.splits(path=filepath, exts=('.src', '.trg'),
                                               fields=(self.SRC, self.TRG))

        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of valid data : {}'.format(len(self.valid_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data), sort=True, sort_within_batch=True,
            batch_size=self.batch_size, device=self.device)
 
    def build_vocab(self, min_freq=5):
        self.SRC.build_vocab(self.train_data, min_freq=min_freq)
        self.TRG.build_vocab(self.train_data, min_freq=min_freq)
        
        print(f"Unique tokens in source vocabulary: {len(self.SRC.vocab)}")
        print(f"Unique tokens in target vocabulary: {len(self.TRG.vocab)}")

elmo_dataset = ELMODataset(filepath="data", batch_size=BATCH_SIZE, max_length=MAX_WORD_LENGTH_IN_SENT, device=device)


Unique tokens in source vocabulary: 1610
Unique tokens in target vocabulary: 1602
number of training data : 205654
number of valid data : 11426
number of test data : 11426


In [17]:
class CharacterDecomposer:
    def __init__(self, elmo_dataset, max_word_in_sent, max_char_in_word, special_token_idx=[0, 1, 2, 3]):
        self.elmo_dataset = elmo_dataset
        self.max_word_in_sent = max_word_in_sent
        self.max_char_in_word = max_char_in_word
        self.special_token_idx = special_token_idx
        
        self.build_char_vocab()
        
    def build_char_vocab(self):
        char_vocab = set([char for word in self.elmo_dataset.SRC.vocab.itos for char in word])
        self.ctoi = {}
        self.itoc = {}
        
        for idx, char in enumerate(char_vocab):
            self.ctoi[char] = idx
            self.itoc[idx]  = char
            
    def decompose(self, src):
        # pad token이 1로 되어 있음
        batch_char_embedding = np.ones((src.shape[0], self.max_word_in_sent, self.max_char_in_word)).astype(int)
        
        for batch_order_idx, sent in enumerate(src):
            for word_order_idx, s in enumerate(sent):
                if word_order_idx < self.max_word_in_sent - 1:
                    if s in self.special_token_idx:
                        batch_char_embedding[batch_order_idx, word_order_idx, 0] = s
                        pass
                    elif s not in self.special_token_idx:
                        for char_order_idx, char in enumerate(self.elmo_dataset.SRC.vocab.itos[s]):
                            if char_order_idx < self.max_char_in_word - 1:
                                batch_char_embedding[batch_order_idx, word_order_idx, char_order_idx] = self.ctoi[char]
                                                             
        return torch.LongTensor(batch_char_embedding)
    
character_decomposer = CharacterDecomposer(elmo_dataset, max_word_in_sent=MAX_WORD_LENGTH_IN_SENT, max_char_in_word=MAX_CHAR_LENGTH_IN_WORD)


In [18]:
%%timeit
for batch in elmo_dataset.train_iterator:
    src = batch.src
    character_decomposer.decompose(src)
#     print(character_decomposer.decompose(src))
    break

606 ms ± 5.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
