In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import _LRScheduler, ReduceLROnPlateau, StepLR, LambdaLR
    
import math
import time
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings(action='ignore')

SEED = 1234
BATCH_SIZE = 256
MAX_WORD_LENGTH_IN_SENT = 25
MAX_CHAR_LENGTH_IN_WORD = 6

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = 'cpu'

In [61]:
from torchtext import data, datasets
BATCH_SIZE = 10
MAX_WORD_LENGTH_IN_SENT = 20

In [62]:
import os
import numpy as np
from konlpy.tag import Mecab
from torchtext import data, datasets
from sklearn.model_selection import train_test_split

def generate_source_and_target(lines, split_cond, fpath="data"):
    src = []
    trg = []

    for line in lines:
        src.append(line[:-1] + '\n')
        trg.append(line[1:] + '\n')
    
    write_txt(split_cond + ".src", src, fpath)
    write_txt(split_cond + ".trg", trg, fpath)
    
def write_txt(fname, lines, fpath):
    with open(os.path.join(fpath, fname), "w") as f:
        f.writelines(lines)

with open("data/petitions_splited_mecab.txt", "r") as f:
     corpus = f.readlines()

corpus = list(map(lambda x: str(x).replace("\n", ""), corpus))
train_lines, test_lines = train_test_split(corpus, test_size=0.05, random_state=1234)
train_lines, valid_lines = train_test_split(train_lines, test_size=1/19, random_state=1234)

generate_source_and_target(train_lines, "train", fpath="data")
generate_source_and_target(valid_lines, "val", fpath="data")
generate_source_and_target(test_lines, "test", fpath="data")

In [63]:
class ELMODataset:
    def __init__(self, filepath, batch_size, max_length, device):
        self.batch_size = batch_size
        self.device = device

        self.SRC = data.Field(tokenize=lambda x: x.split(' '),
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              unk_token='<unk>',
                              lower=True,
                              batch_first=True,
                              include_lengths=False,
                              fix_length=max_length)
        
        self.SRC_ = data.Field(tokenize=list,
                              batch_first=True, 
                              include_lengths=False)
        
        self.SRC_CHAR = data.NestedField(self.SRC_, 
                                         init_token="", 
                                         eos_token="",
                                         fix_length=20)
        
        self.TRG = data.Field(tokenize=lambda x: x.split(' '),
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              unk_token='<unk>',
                              lower=True,
                              batch_first=True,
                              include_lengths=False,
                              fix_length=max_length)

        self.train_data, self.valid_data, self.test_data = \
            datasets.TranslationDataset.splits(path=filepath, exts=('.src', '.trg'),
                                               fields=((('src', 'src_char'), (self.SRC, self.SRC_CHAR)), 
                                                      (('trg'), (self.TRG))))

        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of valid data : {}'.format(len(self.valid_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data), sort=True, sort_within_batch=True,
            batch_size=self.batch_size, device=self.device)
 
    def build_vocab(self, min_freq=2):
        self.SRC.build_vocab(self.train_data, min_freq=min_freq)
        self.SRC_CHAR.build_vocab(self.train_data)
        self.TRG.build_vocab(self.train_data, min_freq=min_freq)
        
        print(f"Unique tokens in source vocabulary: {len(self.SRC.vocab)}")
        print(f"Unique tokens in target vocabulary: {len(self.TRG.vocab)}")

elmo_dataset = ELMODataset(filepath="data", batch_size=BATCH_SIZE, max_length=MAX_WORD_LENGTH_IN_SENT, device=device)

Unique tokens in source vocabulary: 36630
Unique tokens in target vocabulary: 37335
number of training data : 205654
number of valid data : 11426
number of test data : 11426


In [64]:
elmo_dataset.SRC.vocab.itos

['<unk>',
 '<pad>',
 '<sos>',
 '<eos>',
 '이',
 '하',
 '는',
 '을',
 '고',
 '에',
 '의',
 '은',
 '를',
 '가',
 '들',
 '있',
 '도',
 '한',
 '.',
 '으로',
 '습니다',
 '다',
 '지',
 '것',
 ',',
 '게',
 '되',
 '에서',
 '로',
 '주',
 '없',
 '수',
 '만',
 '적',
 '어',
 '과',
 '해',
 '않',
 '면',
 '일',
 '기',
 '국민',
 '할',
 '인',
 '합니다',
 '입니다',
 '년',
 '받',
 '시',
 '었',
 '여',
 '했',
 '사람',
 '말',
 '안',
 '그',
 '보',
 '와',
 '에게',
 '저',
 '생각',
 '1',
 '겠',
 '는데',
 '나',
 '다고',
 '아',
 '라고',
 '원',
 '나라',
 '등',
 '며',
 '습니',
 '같',
 '제',
 '까지',
 '세요',
 '2',
 '지만',
 '정부',
 '라',
 '우리',
 '많',
 '아니',
 '더',
 '자',
 '해야',
 '님',
 '니',
 '라는',
 '된',
 '3',
 '해서',
 '면서',
 '월',
 '거',
 '내',
 '지금',
 '때',
 '이런',
 '문제',
 '서',
 '았',
 '다는',
 '였',
 '대통령',
 '살',
 '법',
 '던',
 '돈',
 '알',
 '국가',
 '좋',
 '분',
 '전',
 '대한',
 '중',
 '으면',
 '합니',
 '…',
 '시간',
 '도록',
 '못',
 '이상',
 '대',
 '대한민국',
 '왜',
 '그리고',
 '때문',
 '이나',
 '입니',
 '명',
 '및',
 '만들',
 '너무',
 '번',
 '성',
 '달',
 '청원',
 '세금',
 '사',
 '는지',
 '부터',
 '아이',
 '정책',
 '싶',
 '위해',
 '사회',
 '보다',
 '필요',
 '10',
 '4',
 '한다',
 '5'

In [65]:
elmo_dataset.SRC_CHAR.vocab.itos

['<unk>',
 '<pad>',
 '',
 '이',
 '다',
 '는',
 '하',
 '고',
 '니',
 '을',
 '지',
 '에',
 '가',
 '의',
 '도',
 '한',
 '사',
 '로',
 '서',
 '기',
 '들',
 '해',
 '은',
 '대',
 '를',
 '시',
 '자',
 '인',
 '정',
 '어',
 '아',
 '있',
 '나',
 '국',
 '원',
 '라',
 '으',
 '수',
 '리',
 '게',
 '부',
 '주',
 '제',
 '만',
 '습',
 '그',
 '요',
 '0',
 '면',
 '1',
 '보',
 '일',
 '민',
 '.',
 '무',
 '여',
 '것',
 '입',
 '상',
 '세',
 '전',
 '과',
 '장',
 '금',
 '2',
 '생',
 ',',
 '되',
 '적',
 '없',
 '합',
 '공',
 '안',
 '야',
 '소',
 '구',
 '신',
 '문',
 '데',
 '당',
 '저',
 '경',
 '위',
 '거',
 '동',
 '조',
 '까',
 '성',
 '비',
 '청',
 '용',
 '우',
 '할',
 '행',
 '년',
 '내',
 '말',
 '법',
 '업',
 '관',
 '회',
 '실',
 '재',
 '중',
 '건',
 '않',
 '와',
 '모',
 '려',
 '유',
 '간',
 '통',
 '분',
 '치',
 '계',
 '마',
 '각',
 '연',
 '차',
 '개',
 '받',
 '산',
 '등',
 '방',
 '직',
 '진',
 '화',
 '했',
 '3',
 '명',
 '스',
 '오',
 '었',
 '미',
 '선',
 '교',
 '더',
 '급',
 '발',
 '람',
 '바',
 '처',
 '러',
 '히',
 '며',
 '단',
 '불',
 '못',
 '학',
 '런',
 '현',
 '때',
 '5',
 '체',
 '심',
 '같',
 '권',
 '물',
 '임',
 '많',
 '결',
 '월',
 '드',
 '8',
 '운',
 '된

In [66]:
for batch in elmo_dataset.train_iterator:
    print(batch)
    break


[torchtext.data.batch.Batch of size 10]
	[.trg]:[torch.LongTensor of size 10x20]
	[.src]:[torch.LongTensor of size 10x20]
	[.src_char]:[torch.LongTensor of size 10x20x4]


In [67]:
class simple_model(nn.Module):
    def __init__(self, elmo_dataset):
        super(simple_model, self).__init__()
        self.embedding = nn.Embedding(len(elmo_dataset.SRC_CHAR.vocab.itos), 200, padding_idx=1)
        self.fc = nn.Linear(200, len(elmo_dataset.TRG.vocab.itos))
        
    def forward(self, src):
        output = self.fc(self.embedding(src))
        
        return output

model = simple_model(elmo_dataset).to(device)   

In [91]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
torch.set_printoptions(edgeitems=100)

i = 0
for batch in elmo_dataset.train_iterator:
    if i >= 5:
        break
    elif i < 4:
        pass
    print(batch.src_char.shape)
    print(batch.src_char)
    pad_seq = torch.LongTensor([pad_sequences(word, maxlen=6, padding='post', value=1) for word in batch.src_char])
    print(model(pad_seq).shape)
    i += 1


torch.Size([10, 20, 4])
tensor([[[   2,    1,    1,    1],
         [  64,    1,    1,    1],
         [   3,   93,    1,    1],
         [ 180,    1,    1,    1],
         [  42,   63,    1,    1],
         [   2,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,    1]],

        [[   2,    1,    1,    1],
         [ 460,    1,    1,    1],
         [  43,    1,    1,    1],
         [  32,  131,    1,    1],
         [ 264,   46,    1,    1],
         [   2,    1,    1,    1],
         [   1,    1,    1,    1],
         [   1,    1,    1,  

torch.Size([10, 20, 6, 37335])
torch.Size([10, 20, 4])
tensor([[[  2,   1,   1,   1],
         [450,   1,   1,   1],
         [ 38, 130, 209,   1],
         [ 50, 666,   1,   1],
         [ 44,   8,  86,   1],
         [  2,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1]],

        [[  2,   1,   1,   1],
         [183,   1,   1,   1],
         [152,   1,   1,   1],
         [ 94,   1,   1,   1],
         [ 84,  72,   1,   1],
         [  2,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1,   1,   1,   1],
         [  1

In [49]:
char_count = 0

for char in elmo_dataset.SRC.vocab.itos[4:]:
    if len(char) > char_count:
        char_count = len(char)
        print(char)

char_count

이
으로
습니다
대한민국
https
부탁드립니다
youtube
articleview
isyeonhapflash
rankingsectionid
2017072702102269052001


22