In [2]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 490 kB/s eta 0:00:01    |██                              | 1.2 MB 4.5 MB/s eta 0:00:05     |██████▋                         | 4.0 MB 4.5 MB/s eta 0:00:04     |████████████                    | 7.3 MB 4.5 MB/s eta 0:00:03     |████████████████                | 9.7 MB 4.5 MB/s eta 0:00:03     |█████████████████               | 10.3 MB 4.5 MB/s eta 0:00:03
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.7 MB/s  eta 0:00:01
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.2.0-cp37-cp37m-manylinux2010_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 13.0 MB/s eta 0:00:01
Collecting tweepy>=3.7.0
  Downloading tweepy-3.10.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: tweepy, JPype1, beautifulsoup4, konlpy
  Attempting uninstall: be

In [3]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from konlpy.tag import Okt

# Characters stripped from every sentence before whitespace tokenisation.
FILTERS = "([~,.!?\"':;)(])"
CHANGE_FILTER = re.compile(FILTERS)

# Special vocabulary markers. MARKER lists them in the order given by the
# matching *_INDEX constants (PAD -> 0, STD -> 1, END -> 2, UNK -> 3).
PAD, STD, END, UNK = "<PAD>", "<SOS>", "<END>", "<UNK>"
MARKER = [PAD, STD, END, UNK]
PAD_INDEX, STD_INDEX, END_INDEX, UNK_INDEX = range(4)

# Length every encoded sequence is truncated or padded to.
MAX_SEQUENCE = 25

def load_data(path):
    """Read the question/answer CSV stored at `path`.

    The first row is used as the header, and the file must provide the
    columns `Q` and `A`.

    Returns:
        (questions, answers): two lists with the values of the `Q` and `A`
        columns, in row order.
    """
    frame = pd.read_csv(path, header=0)
    questions, answers = (list(frame[column]) for column in ('Q', 'A'))
    return questions, answers

def data_tokenizer(data):
    """Flatten an iterable of sentences into a single list of tokens.

    Characters matched by CHANGE_FILTER are removed from each sentence, which
    is then split on whitespace. Tokens keep their order of appearance and
    duplicates are preserved.
    """
    return [
        token
        for sentence in data
        for token in re.sub(CHANGE_FILTER, "", sentence).split()
        if token
    ]

def prepro_like_morphlized(data):
    """Split every sequence into morphemes with KoNLPy's Okt analyzer.

    All space characters are removed from a sequence before it is handed to
    `Okt.morphs`.

    Returns:
        A list holding one `morphs` result per input sequence, in input order.
    """
    analyzer = Okt()
    return [analyzer.morphs(text.replace(' ', '')) for text in data]

def load_vocabulary(path, vocab_path):
    """Load the vocabulary file, building it from the Q/A CSV on first use.

    Args:
        path: CSV file with `Q` and `A` columns. Only read when `vocab_path`
            does not exist yet.
        vocab_path: Text file holding one vocabulary entry per line. Created
            from `path` when missing, with the MARKER entries written first.

    Returns:
        (word2idx, idx2word, vocab_size), where `vocab_size` is `len(word2idx)`.

    Raises:
        FileNotFoundError: if neither `vocab_path` nor `path` exists.
    """
    if not os.path.exists(vocab_path):
        if not os.path.exists(path):
            # Fail before opening `vocab_path` for writing: otherwise an empty
            # vocabulary file is left behind and silently reused by later calls.
            raise FileNotFoundError(
                "cannot build vocabulary: neither %r nor %r exists" % (vocab_path, path)
            )

        data_df = pd.read_csv(path, encoding='utf-8')
        sentences = list(data_df['Q']) + list(data_df['A'])

        # Sorted so that rebuilding the vocabulary from the same data always
        # assigns the same index to the same word (set order is not stable).
        words = MARKER + sorted(set(data_tokenizer(sentences)))

        with open(vocab_path, 'w', encoding='utf-8') as vocabulary_file:
            for word in words:
                vocabulary_file.write(word + '\n')

    with open(vocab_path, 'r', encoding='utf-8') as vocabulary_file:
        vocabulary_list = [line.strip() for line in vocabulary_file]

    word2idx, idx2word = make_vocabulary(vocabulary_list)

    return word2idx, idx2word, len(word2idx)

def make_vocabulary(vocabulary_list):
    """Build the two lookup tables for a vocabulary.

    Args:
        vocabulary_list: vocabulary entries; an entry's position is its index.

    Returns:
        (word2idx, idx2word). When an entry occurs more than once, `word2idx`
        keeps the index of its last occurrence.
    """
    word2idx = {}
    for position, token in enumerate(vocabulary_list):
        word2idx[token] = position
    idx2word = dict(enumerate(vocabulary_list))

    return word2idx, idx2word

# word2idx, idx2word, vocab_size = load_vocabulary(PATH, VOCAB_PATH)

def enc_processing(value, dictionary):
    """Convert raw sentences into fixed-length index sequences for the encoder.

    Each sentence is stripped of the characters matched by CHANGE_FILTER, split
    on whitespace, mapped through `dictionary` (unknown words become the UNK
    index), truncated to MAX_SEQUENCE tokens and right-padded with the PAD index.

    Args:
        value: iterable of sentence strings.
        dictionary: word -> index mapping that contains the PAD and UNK markers.

    Returns:
        (indices, lengths): `indices` is `np.asarray` of one MAX_SEQUENCE-long
        index list per sentence; `lengths` holds each sentence's token count
        after truncation and before padding.
    """
    sequences_input_index = []
    sequences_length = []

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = [
            dictionary[word] if word in dictionary else dictionary[UNK]
            for word in sequence.split()
        ][:MAX_SEQUENCE]

        # Record the length of THIS sequence. The previous code appended
        # len(sequences_length), i.e. the running count 0, 1, 2, ...
        sequences_length.append(len(sequence_index))

        # Post-padding up to the fixed sequence length.
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_input_index.append(sequence_index)

    return np.asarray(sequences_input_index), sequences_length

def dec_output_processing(value, dictionary):
    """Convert raw sentences into fixed-length index sequences that start with STD.

    Each sentence is stripped of the characters matched by CHANGE_FILTER, split
    on whitespace and mapped through `dictionary`, with the STD index placed in
    front. The result is truncated to MAX_SEQUENCE entries and right-padded
    with the PAD index.

    Unknown words are mapped to the UNK index, matching `enc_processing`
    (previously they raised KeyError).

    Args:
        value: iterable of sentence strings.
        dictionary: word -> index mapping that contains the STD, PAD and UNK
            markers.

    Returns:
        (indices, lengths): `indices` is `np.asarray` of one MAX_SEQUENCE-long
        index list per sentence; `lengths` holds each sequence's length
        (STD included) after truncation and before padding.
    """
    sequences_output_index = []
    sequences_length = []

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = [dictionary[STD]] + [
            dictionary[word] if word in dictionary else dictionary[UNK]
            for word in sequence.split()
        ]
        sequence_index = sequence_index[:MAX_SEQUENCE]

        sequences_length.append(len(sequence_index))
        # Post-padding up to the fixed sequence length.
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_output_index.append(sequence_index)

    return np.asarray(sequences_output_index), sequences_length

def dec_target_processing(value, dictionary):
    """Convert raw sentences into fixed-length index sequences that end with END.

    Each sentence is stripped of the characters matched by CHANGE_FILTER, split
    on whitespace and mapped through `dictionary`. At most MAX_SEQUENCE - 1
    word indices are kept so that the END index always fits, and the rest of
    the sequence is right-padded with the PAD index.

    Unknown words are mapped to the UNK index, matching `enc_processing`
    (previously they raised KeyError).

    Args:
        value: iterable of sentence strings.
        dictionary: word -> index mapping that contains the END, PAD and UNK
            markers.

    Returns:
        `np.asarray` of one MAX_SEQUENCE-long index list per sentence.
    """
    sequences_target_index = []

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = [
            dictionary[word] if word in dictionary else dictionary[UNK]
            for word in sequence.split()
        ]

        # Keep room for END. For sequences shorter than MAX_SEQUENCE the slice
        # is a no-op, so this covers both the truncating and the plain case.
        sequence_index = sequence_index[:MAX_SEQUENCE - 1] + [dictionary[END]]

        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_target_index.append(sequence_index)

    return np.asarray(sequences_target_index)


In [11]:
import argparse
import os
import glob
import sys
from tqdm import tqdm
# from preprocess import *


def preprocess_data(PATH, VOCAB_PATH):
    """Encode the Q/A dataset and write the training artifacts to './'.

    Writes three `.npy` arrays (encoder inputs, decoder inputs, decoder
    targets) and a JSON file with the vocabulary and marker symbols, then
    prints the number of encoded questions and answers.

    Args:
        PATH: CSV file with `Q` and `A` columns.
        VOCAB_PATH: vocabulary text file; built from `PATH` when missing.
    """
    inputs, outputs = load_data(PATH)
    char2idx, idx2char, vocab_size = load_vocabulary(PATH, VOCAB_PATH)
    index_inputs, input_seq_len = enc_processing(inputs, char2idx)
    index_outputs, output_seq_len = dec_output_processing(outputs, char2idx)
    index_targets = dec_target_processing(outputs, char2idx)

    data_configs = {
        'char2idx': char2idx,
        # JSON object keys are always strings, so these integer keys come back
        # as strings when the file is loaded again.
        'idx2char': idx2char,
        'vocab_size': vocab_size,
        'pad_symbol': PAD,
        'std_symbol': STD,
        'end_symbol': END,
        'unk_symbol': UNK,
    }

    DATA_IN_PATH = './'
    TRAIN_INPUTS = 'train_inputs.npy'
    TRAIN_OUTPUTS = 'train_outputs.npy'
    # NOTE(review): the capital "N" looks like a typo, but the loading cell
    # uses the same file name, so the two must be changed together.
    TRAIN_TARGETS = 'traiN_targets.npy'
    DATA_CONFIGS = 'data_configs.json'

    # `with` closes (and therefore flushes) every file; the handles were
    # previously opened inline and never closed.
    for file_name, array in (
        (TRAIN_INPUTS, index_inputs),
        (TRAIN_OUTPUTS, index_outputs),
        (TRAIN_TARGETS, index_targets),
    ):
        with open(DATA_IN_PATH + file_name, 'wb') as array_file:
            np.save(array_file, array)

    with open(DATA_IN_PATH + DATA_CONFIGS, 'w') as config_file:
        json.dump(data_configs, config_file)

    print(len(input_seq_len), len(output_seq_len))
    

# Build the vocabulary (if missing) and write the encoded training artifacts.
# NOTE(review): the CSV name contains a space before ".csv" - confirm it
# matches the actual dataset file name.
preprocess_data('../input/kot-chat/ChatbotData .csv','./vocabulary.txt')

11823 11823


In [13]:
import tensorflow as tf
import numpy as np
import os

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

#from preprocessing import *
# Locations of the artifacts written by preprocess_data.
DATA_IN_PATH = './'
TRAIN_INPUTS, TRAIN_OUTPUTS, TRAIN_TARGETS = (
    'train_inputs.npy',
    'train_outputs.npy',
    'traiN_targets.npy',  # same spelling as the name used when the file is written
)
DATA_CONFIGS = 'data_configs.json'

def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: object whose `history` attribute maps metric names to
            per-epoch value sequences; it must contain both `string` and
            `'val_' + string`.
        string: name of the metric to plot, e.g. 'loss'.
    """
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    # Was `plt.legned`, which raised AttributeError on every call.
    plt.legend([string, 'val_'+string])
    plt.show()
    
# Seed TensorFlow's global random generator.
SEED_NUM = 103
tf.random.set_seed(SEED_NUM)

# Load the artifacts written by preprocess_data. np.load is given the path so
# that it opens and closes the file itself; the JSON file is read inside a
# `with` block. The handles were previously opened inline and never closed.
index_inputs = np.load(DATA_IN_PATH + TRAIN_INPUTS)
index_outputs = np.load(DATA_IN_PATH + TRAIN_OUTPUTS)
index_targets = np.load(DATA_IN_PATH + TRAIN_TARGETS)
with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as config_file:
    prepro_configs = json.load(config_file)

# Sanity check: the three arrays should hold the same number of rows.
print(len(index_inputs), len(index_outputs), len(index_targets))


11823 11823 11823


In [17]:
# Training / model hyper-parameters.
MODEL_NAME = 'seq2seq_kor'
BATCH_SIZE = 2
MAX_SEQUENCE = 25
EPOCH = 30
UNITS = 1024
EMBEDDING_DIM = 256
VALIDATION_SPLIT = 0.1

# Values restored from the preprocessing config.
# NOTE: prepro_configs was read from JSON, so the keys of idx2char are strings.
char2idx, idx2char = prepro_configs['char2idx'], prepro_configs['idx2char']
# NOTE: despite the `_index` names, these two hold the marker symbols stored
# under 'std_symbol' / 'end_symbol', not their integer indices.
std_index, end_index = prepro_configs['std_symbol'], prepro_configs['end_symbol']
vocab_size = prepro_configs['vocab_size']


In [18]:
class Encoder(tf.keras.layers.Layer):
    """Encoder layer: an embedding lookup followed by a single GRU."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the embedding and GRU sub-layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each embedding vector.
            enc_units: number of GRU units.
            batch_sz: stored on the instance; not used by the methods below.
        """
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            # Was misspelled `recurrent_initalizer`, which is not a GRU
            # argument and made the layer fail at construction time.
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed `x` and run it through the GRU starting from `hidden`.

        Returns:
            (output, state): the GRU's per-step outputs and its final state.
        """
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initalize_hidden_state(self, inp):
        """Return an all-zero state of shape (tf.shape(inp)[0], enc_units)."""
        return tf.zeros((tf.shape(inp)[0], self.enc_units))

    # Correctly spelled alias; the original (misspelled) name is kept so that
    # existing callers continue to work.
    initialize_hidden_state = initalize_hidden_state
    

In [None]:
class Attention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

    def __init__(self, units):
        """Create the three dense projections.

        Args:
            units: output size of the W1 and W2 projections.
        """
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute a context vector from `values`, weighted by relevance to `query`.

        Assumes `values` is (batch, time, features) and `query` is
        (batch, features) - TODO confirm against the caller.

        Returns:
            (context_vector, attention_weights): `values` weighted and summed
            over axis 1, and the softmax weights (over axis 1) used for it.
        """
        # Insert an axis at position 1 so the query broadcasts against `values`.
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # TODO: consider other score functions, and visualising the attention
        # scores for a given input.
        # Was `hidden_with_tim_axis`, an undefined name that raised NameError.
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
        