In [1]:
%%bash
# System packages are installed first, then JPype1 and konlpy through pip.
# -y answers apt's confirmation prompt: a %%bash cell has no interactive stdin,
# so without it apt-get aborts whenever it needs to ask before installing.
apt-get update
apt-get install -y g++ openjdk-8-jdk python-dev python3-dev
pip3 install JPype1
pip3 install konlpy

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Reading package lists...
Reading package lists...
Building dependency tree...
Rea

In [2]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import re
import json

import numpy as np
import pandas as pd
from tqdm import tqdm

from konlpy.tag import Okt

# Let pandas display up to 100 columns before it starts eliding them.
pd.set_option('display.max_columns', 100)

In [4]:
def data_load(path):
  """Read a question/answer CSV and return its two text columns.

  Args:
    path: Location of a CSV file that has ``Q`` and ``A`` columns.

  Returns:
    A ``(question, answer)`` tuple of lists holding the values of the ``Q``
    and ``A`` columns in row order.
  """
  # pandas treats the first row as the header by default.
  frame = pd.read_csv(path)
  return tuple(list(frame[column]) for column in ('Q', 'A'))

In [5]:
def data_tokenizer(data):
  """Split every sentence into whitespace-separated tokens.

  Args:
    data: Iterable of sentence strings.

  Returns:
    One flat list with the tokens of all sentences, in order. Duplicates are
    kept.
  """
  tokens = []
  for text in data:
    # Drop the characters matched by CHANGE_FILTER (special characters).
    cleaned = re.sub(CHANGE_FILTER, "", text)
    # Split on whitespace.
    tokens.extend(cleaned.split())

  # Discard falsy (empty) entries, if any.
  return list(filter(None, tokens))

In [6]:
def morphlized(data):
  """Re-segment each sentence with the konlpy ``Okt`` analyzer.

  All spaces are removed from a sentence before it is passed to
  ``Okt.morphs``; the pieces that call returns are then joined back with
  single spaces.

  Args:
    data: Iterable of sentence strings (progress is shown through tqdm).

  Returns:
    A list with one space-joined string per input sentence.
  """
  analyzer = Okt()
  return [
      " ".join(analyzer.morphs(text.replace(' ', '')))
      for text in tqdm(data)
  ]

In [7]:
# Vocabulary dictionary
def load_voca(path, voca_path, tokenize_as_morph=False):
  """Load the vocabulary, building and caching it on first use.

  When ``voca_path`` does not exist yet, the vocabulary is built from the
  ``Q`` and ``A`` columns of the CSV at ``path`` and written to ``voca_path``
  one word per line, with the ``MARKER`` tokens first. The file at
  ``voca_path`` is then read back and turned into lookup tables.

  Args:
    path: CSV corpus with ``Q`` and ``A`` columns; only read when the
      vocabulary file has to be created.
    voca_path: Text file that caches the vocabulary.
    tokenize_as_morph: If true, sentences go through ``morphlized`` before
      they are split into words.

  Returns:
    ``(word2idx, idx2word, vocabulary_size)``.

  Raises:
    FileNotFoundError: If neither ``voca_path`` nor ``path`` exists.
  """
  if not os.path.exists(voca_path):
    # Fail before touching voca_path: creating an empty vocabulary file here
    # would make every later call silently return an empty vocabulary.
    if not os.path.exists(path):
      raise FileNotFoundError(
          "Cannot build the vocabulary: neither %r nor %r exists"
          % (voca_path, path))

    data_df = pd.read_csv(path, encoding='utf-8')
    question, answer = list(data_df['Q']), list(data_df['A'])

    if tokenize_as_morph:
      question = morphlized(question)
      answer = morphlized(answer)

    unique_words = set(data_tokenizer(question + answer))
    # Sort so the word -> index assignment does not depend on set iteration
    # order, and drop corpus tokens equal to a marker so the markers keep the
    # leading indices.
    words = list(MARKER) + sorted(unique_words - set(MARKER))

    with open(voca_path, 'w', encoding='utf-8') as voca_file:
      voca_file.writelines(word + '\n' for word in words)

  with open(voca_path, 'r', encoding='utf-8') as voca_file:
    voca_list = [line.strip() for line in voca_file]
  word2idx, idx2word = make_voca(voca_list)

  return word2idx, idx2word, len(word2idx)

In [8]:
def make_voca(voca_list):
  """Build the two lookup tables for a vocabulary list.

  Args:
    voca_list: Words ordered by their index.

  Returns:
    ``(word2idx, idx2word)``: a word -> position dict and a position -> word
    dict. If a word occurs more than once, ``word2idx`` keeps its last
    position.
  """
  word2idx = {}
  for position, token in enumerate(voca_list):
    word2idx[token] = position

  idx2word = dict(enumerate(voca_list))

  return word2idx, idx2word

In [9]:
# value: the input sentences, dictionary: the word -> index vocabulary
def enc_processing(value, dictionary, tokenize_as_morph=False):
  """Turn encoder sentences into fixed-length rows of word indices.

  Each sentence is stripped of the characters matched by ``CHANGE_FILTER``,
  split on whitespace, mapped through ``dictionary`` (unknown words become
  the ``UNK`` index), cut to ``MAX_SEQUENCE`` tokens and right-padded with the
  ``PAD`` index.

  Args:
    value: Iterable of sentence strings.
    dictionary: Word -> index mapping containing the ``UNK`` and ``PAD`` keys.
    tokenize_as_morph: If true, run ``morphlized`` over ``value`` first.

  Returns:
    ``(indices, lengths)``: a NumPy array with one ``MAX_SEQUENCE``-long row
    per sentence, and a list with each row's token count before padding
    (after truncation).
  """
  if tokenize_as_morph:
    value = morphlized(value)

  encoded_rows, lengths = [], []
  for raw in value:
    tokens = re.sub(CHANGE_FILTER, "", raw).split()

    ids = []
    for token in tokens:
      found = dictionary.get(token)
      ids.append(dictionary[UNK] if found is None else found)

    # Keep at most MAX_SEQUENCE tokens.
    ids = ids[:MAX_SEQUENCE]
    lengths.append(len(ids))

    padding = [dictionary[PAD]] * (MAX_SEQUENCE - len(ids))
    encoded_rows.append(ids + padding)

  return np.asarray(encoded_rows), lengths

In [10]:
def dec_input_processing(value, dictionary, tokenize_as_morph=False):
  """Turn decoder-input sentences into fixed-length rows of word indices.

  Works like the encoder preprocessing, except that every row starts with the
  ``STD`` (start) index. The start token counts towards the ``MAX_SEQUENCE``
  limit and towards the reported length.

  Args:
    value: Iterable of sentence strings.
    dictionary: Word -> index mapping containing the ``STD``, ``UNK`` and
      ``PAD`` keys.
    tokenize_as_morph: If true, run ``morphlized`` over ``value`` first.

  Returns:
    ``(indices, lengths)``: a NumPy array with one ``MAX_SEQUENCE``-long row
    per sentence, and a list with each row's length before padding.
  """
  if tokenize_as_morph:
    value = morphlized(value)

  decoder_rows, lengths = [], []
  for raw in value:
    tokens = re.sub(CHANGE_FILTER, "", raw).split()

    ids = [dictionary[STD]]
    for token in tokens:
      ids.append(dictionary[token] if token in dictionary else dictionary[UNK])

    # Keep at most MAX_SEQUENCE entries, start token included.
    ids = ids[:MAX_SEQUENCE]
    lengths.append(len(ids))

    padding = [dictionary[PAD]] * (MAX_SEQUENCE - len(ids))
    decoder_rows.append(ids + padding)

  return np.asarray(decoder_rows), lengths

In [11]:
def dec_target_processing(value, dictionary, tokenize_as_morph=False):
  """Turn decoder-target sentences into fixed-length rows of word indices.

  Every row ends its real tokens with the ``END`` index. A sentence with
  ``MAX_SEQUENCE`` or more tokens is cut to ``MAX_SEQUENCE - 1`` so the end
  token still fits. Shorter rows are right-padded with the ``PAD`` index.

  Args:
    value: Iterable of sentence strings.
    dictionary: Word -> index mapping containing the ``END``, ``UNK`` and
      ``PAD`` keys.
    tokenize_as_morph: If true, run ``morphlized`` over ``value`` first.

  Returns:
    A NumPy array with one ``MAX_SEQUENCE``-long row per sentence.
  """
  if tokenize_as_morph:
    value = morphlized(value)

  target_rows = []
  for raw in value:
    tokens = re.sub(CHANGE_FILTER, "", raw).split()

    ids = []
    for token in tokens:
      ids.append(dictionary[token] if token in dictionary else dictionary[UNK])

    # Reserve the last slot for the end token when the sentence is too long.
    kept = ids[:MAX_SEQUENCE - 1] if len(ids) >= MAX_SEQUENCE else ids
    row = kept + [dictionary[END]]

    row += [dictionary[PAD]] * (MAX_SEQUENCE - len(row))
    target_rows.append(row)

  return np.asarray(target_rows)

In [12]:
# Paths (relative to the working directory) of the chatbot CSV and of the
# cached vocabulary text file.
DIR = "drive/MyDrive/Implementation/Attention/ChatbotData.csv"
VOCA_DIR = "drive/MyDrive/Implementation/Attention/Dictionary.txt"

# Characters removed from every sentence before it is split into words.
FILTERS = "([~.,!?\"':;)()])"
CHANGE_FILTER = re.compile(FILTERS)

# Special tokens: padding, sequence start, sequence end, unknown word.
PAD, STD, END, UNK = '<PAD>', '<SOS>', '<END>', '<UNK>'

# The markers are written first into the vocabulary, in this order ...
MARKER = [PAD, STD, END, UNK]
# ... which gives them these positions.
PAD_INDEX, STD_INDEX, END_INDEX, UNK_INDEX = range(4)

# Fixed length of every encoded sequence.
MAX_SEQUENCE = 30

In [13]:
# Read the question/answer sentences from the CSV, then load the vocabulary
# (load_voca builds it from the same CSV and writes it to VOCA_DIR when that
# file does not exist yet).
inputs, outputs = data_load(DIR)
word2idx, idx2word, voca_size = load_voca(DIR, VOCA_DIR)

In [14]:
# Encode the corpus as index arrays: questions become the encoder input;
# answers become both the decoder input (start token first) and the decoder
# target (end token appended).
idx_inputs, input_seq_len = enc_processing(inputs, word2idx)
idx_outputs, output_seq_len = dec_input_processing(outputs, word2idx)
idx_targets = dec_target_processing(outputs, word2idx)

In [15]:
# Metadata stored next to the encoded arrays: the lookup tables, the
# vocabulary size and the special tokens.
# The symbol entries hold the actual marker strings ('<PAD>', '<SOS>',
# '<END>', '<UNK>') so they can be used as keys into word2idx; the bare names
# 'PAD', 'STD', 'END', 'UNK' are not vocabulary entries.
data_configs = {
    'word2idx': word2idx,
    'idx2word': idx2word,
    'voca_size': voca_size,
    'pad_symbol': PAD,
    'std_symbol': STD,
    'end_symbol': END,
    'unk_symbol': UNK,
}

In [16]:
# Output directory and file names for the preprocessed training data.
PATH = "drive/MyDrive/Implementation/Attention/"
TRAIN_INPUT = 'train_input.npy'
TRAIN_OUTPUT = 'train_output.npy'
TRAIN_TARGET = 'train_target.npy'
DATA_CONFIGS = 'data_configs.json'

# Every file is opened in a `with` block so its handle is flushed and closed
# as soon as the write finishes, instead of being left to garbage collection.
with open(PATH+TRAIN_INPUT, 'wb') as input_file:
  np.save(input_file, idx_inputs)
with open(PATH+TRAIN_OUTPUT, 'wb') as output_file:
  np.save(output_file, idx_outputs)
with open(PATH+TRAIN_TARGET, 'wb') as target_file:
  np.save(target_file, idx_targets)

# NOTE: JSON object keys are strings, so the integer keys of idx2word come
# back as strings when this file is loaded again.
with open(PATH+DATA_CONFIGS, 'w') as config_file:
  json.dump(data_configs, config_file)

In [17]:
# !pip install gensim

In [18]:
# from gensim import models
# from gensim.models import FastText, KeyedVectors

# model = KeyedVectors.load_word2vec_format(DIR+'cc.ko.300.vec', limit=30000)

In [19]:
# 토큰 길이
# token_len = [len(i) for i in train.context_token]