In [5]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from konlpy.tag import Okt

In [6]:
FILTERS = "([~.,!?\"':;(])"
PAD = '<PAD>'
STD = '<SOS>'
END = '<END>'
UNK = '<UNK>'

PAD_INDEX = 0
STD_INDEX = 1
END_INDEX = 2
UNK_INDEX = 3

MARKER = [PAD, STD, END, UNK]
CHANGE_FILTER = re.compile(FILTERS)

MAX_SEQUENCE=25

# 데이터 로딩

In [28]:
def load_data(path):
    data_df = pd.read_csv(path, header=0)
    q, a = list(data_df['Q']), list(data_df['A'])
    return q, a

inputs, outputs = load_data(PATH)

# vocabulary 구축

In [29]:
def data_tokenizer(sentences):
    """
    문장 전처리 후 단어 리스트 리턴  (voca 만들때 사용)
    """
    
    words = []
    for sentence in sentences:
        sentence = re.sub(CHANGE_FILTER, "", sentence)
        for word in sentence.split():
            words.append(word)
    return [word for word in words if word]

def prepare_like_morphlized(sentences):
    """
    형태소 형태로 준비
    """
    
    okt = Okt()
    result = []
    for sentence in tqdm(sentences):
        morphed_sentence = ' '.join(okt.morphs(sentence.replace(' ', '')))
        result.append(morphed_sentence)
    return result

def make_vocabulary(vocabulary_list):
    word2idx = {word:idx for idx, word in enumerate(vocabulary_list)}
    idx2word = {idx:word for idx, word in enumerate(vocabulary_list)}
    
    return word2idx, idx2word

def load_vocabulary(data_path, vocab_path, tokenize_as_morph=False):
    
    # 기 생성 voca가 없으면 생성
    if not os.path.exists(vocab_path):
        if(os.path.exists(data_path)):
            q, a = load_data(data_path)
            
            # 형태소 형태로 처리할 것인가?
            if tokenize_as_morph:
                q = prepare_like_morphlized(q)
                a = prepare_like_morphlized(a)
        
        data = []
        data.extend(q)
        data.extend(a)
        words = data_tokenizer(data)
        words = list(set(words))
        words[:0] = MARKER
                
        with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
            for word in words:
                vocab_file.write(word+'\n')
    
    vocab_list = []
    with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
        for line in vocab_file:
            vocab_list.append(line.strip())
    word2idx, idx2word = make_vocabulary(vocab_list)
    
    return word2idx, idx2word, len(word2idx)

In [30]:
PATH = './chatbot.csv'
VOCAB_PATH = './vocabulary.txt'
    
word2idx, idx2word, vocab_size = load_vocabulary(PATH, VOCAB_PATH, tokenize_as_morph=False)

In [31]:
list(word2idx.items())[:10]

[('<PAD>', 0),
 ('<SOS>', 1),
 ('<END>', 2),
 ('<UNK>', 3),
 ('참았는데', 4),
 ('끝내더라', 5),
 ('솔직히', 6),
 ('역할을', 7),
 ('이상하다', 8),
 ('사귀겠대', 9)]

# 인코더 입력 만들기

In [36]:
def enc_processing(sentences, dictionary, tokenize_as_morph=False):
    """
    각 문장에 dictionary를 적용해서 integer sequence로 바꾼다.
    """
    
    sentences_input_index = []
    sentences_length = [] # 실제 문장별 길이
    
    if tokenize_as_morph:
        sentences = prepare_like_morphlized(sentences)
    
    for sentence in sentences:
        sentence = re.sub(CHANGE_FILTER, "", sentence)
        sentence_index = []
        for word in sentence.split():
            if dictionary[word] is not None:
                sentence_index.extend([dictionary[word]])
            else:
                sentence_index.extend([dictionary[UNK]])
        
        if len(sentence_index) > MAX_SEQUENCE:  # truncate beyond max length
            sentence_index = sentence_index[:MAX_SEQUENCE]
            
        sentences_length.append(len(sentence_index))   # PADDING 전 실제 문장 길이 측정
        sentence_index += (MAX_SEQUENCE-len(sentence_index)) * [dictionary[PAD]]   # PAD IF shorter than MAX_LEN
        
        sentences_input_index.append(sentence_index)
        
    return np.array(sentences_input_index), sentences_length

index_inputs, input_seq_len = enc_processing(
    inputs, word2idx, tokenize_as_morph=False)

In [38]:
index_inputs.shape

(11823, 25)

In [41]:
index_inputs[1,:]

array([11828, 16701,   780,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0])

In [42]:
inputs[1]

'1지망 학교 떨어졌어'

# 디코더 입력/타겟  만들기

<code>
  디코더 입력값: SOS    그래        오랜만이야   PAD
  디코더 타켓값: 그래   오랜만이야   END         PAD
<code>

In [47]:
def dec_output_processing(sentences, dictionary, tokenize_as_morph=False):

    sentences_output_index = []
    sentences_length = [] # 실제 문장별 길이
    
    if tokenize_as_morph:
        sentences = prepare_like_morphlized(sentences)
    
    for sentence in sentences:
        sentence = re.sub(CHANGE_FILTER, "", sentence)
        sentence_index = []
        
        # start with <SOS>
        sentence_index = [dictionary[STD]] + [dictionary[word] if word in dictionary else dictionary[UNK] 
                                              for word in sentence.split()]
        
        if len(sentence_index) > MAX_SEQUENCE:  # truncate beyond max length
            sentence_index = sentence_index[:MAX_SEQUENCE]
            
        sentences_length.append(len(sentence_index))   # PADDING 전 실제 문장 길이 측정
        sentence_index += (MAX_SEQUENCE-len(sentence_index)) * [dictionary[PAD]]   # PAD IF shorter than MAX_LEN
        
        sentences_output_index.append(sentence_index)
        
    return np.array(sentences_output_index), sentences_length    
    
def dec_target_processing(sentences, dictionary, tokenize_as_morph=False):

    sentences_target_index = []
    
    if tokenize_as_morph:
        sentences = prepare_like_morphlized(sentences)
    
    for sentence in sentences:
        sentence = re.sub(CHANGE_FILTER, "", sentence)
        sentence_index = []
    
        sentence_index =[dictionary[word] if word in dictionary else dictionary[UNK] for word in sentence.split()]
    
        # end with <END>
        if len(sentence_index) >= MAX_SEQUENCE:  
            sentence_index = sentence_index[:MAX_SEQUENCE-1] + [dictionary[END]]
        else:
            sentence_index += [dictionary[END]]
            
        sentence_index += (MAX_SEQUENCE-len(sentence_index)) * [dictionary[PAD]]   # PAD IF shorter than MAX_LEN
        sentences_target_index.append(sentence_index)
        
    return np.array(sentences_target_index)        

index_outputs, output_seq_len = dec_output_processing(outputs, word2idx, tokenize_as_morph=False)
index_targets = dec_target_processing(outputs, word2idx, tokenize_as_morph=False)

In [48]:
index_outputs[1,:]

array([    1, 13479, 20262,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0])

In [50]:
outputs[1]

'위로해 드립니다.'

In [51]:
index_targets[1,:]

array([13479, 20262,     2,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0])

# config 구성

In [53]:
data_configs = {}
data_configs['word2idx'] = word2idx
data_configs['idx2word'] = idx2word
data_configs['vocab_size'] = vocab_size
data_configs['pad_symbol'] = PAD
data_configs['std_symbol'] = STD
data_configs['end_symbol'] = END
data_configs['unk_symbol'] = UNK

# 학습 데이터 저장

In [55]:
TRAIN_INPUTS = 'train_inputs.npy'
TRAIN_OUTPUTS = 'train_outputs.npy'
TRAIN_TARGETS = 'train_targets.npy'
DATA_CONFIGS = 'data_configs.json'

np.save(open(TRAIN_INPUTS, 'wb'), index_inputs)
np.save(open(TRAIN_OUTPUTS , 'wb'), index_outputs)
np.save(open(TRAIN_TARGETS , 'wb'), index_targets)

json.dump(data_configs, open(DATA_CONFIGS, 'w'))