# Lab-12-7-bonus: sequence to sequence with attention chatbot 
### simple neural machine translation chatbot

* sequence to sequence|
* variable input sequence length
* variable output sequence length
* Luong attention
  
### Reference
* [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215)
* [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/abs/1508.04025)
* [Neural Machine Translation with Attention from Tensorflow](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb)

In [1]:
from __future__ import absolute_import, division, print_function

# Import TensorFlow >= 1.10 and enable eager execution
import tensorflow as tf

tf.enable_eager_execution() # jupyter처럼 즉시 확인 가능

import matplotlib.pyplot as plt # 그래프 시각화 
from sklearn.model_selection import train_test_split # sklearn을 이용한 train/test split

import unicodedata
import re
import numpy as np
import os
import time

print(tf.__version__)

1.14.0


## Chat data load

In [13]:
pip install --upgrade --user pip

Collecting pip
  Using cached https://files.pythonhosted.org/packages/30/db/9e38760b32e3e7f40cce46dd5fb107b8c73840df38f0046d8e6514e675a1/pip-19.2.3-py2.py3-none-any.whl
Installing collected packages: pip
Successfully installed pip-19.2.3
Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install konlpy

Collecting konlpy
  Using cached https://files.pythonhosted.org/packages/e5/3d/4e983cd98d87b50b2ab0387d73fa946f745aa8164e8888a714d5129f9765/konlpy-0.5.1-py2.py3-none-any.whl
Collecting JPype1>=0.5.7 (from konlpy)
  Downloading https://files.pythonhosted.org/packages/d3/08/f4bb58c1c0dff93e9628cd0e1025f80fcb5a4551310455feb96b96e58ad1/JPype1-0.7.0-cp37-cp37m-win_amd64.whl (1.2MB)
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-0.7.0 konlpy-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [16]:
from konlpy.tag import Twitter
import pandas as pd
import enum
import os
import re
from sklearn.model_selection import train_test_split
import numpy as np

In [17]:
# 질문과 답변내용을 학습 
def loadData():
    # 판다스를 통해서 데이터를 불러온다.
    dataDF = pd.read_csv('./data_in/chat_ko/ChatBotData.csv', header=0)
    # 질문과 답변 열을 가져와 question과 answer에 넣는다.
    question, answer = list(dataDF['Q']), list(dataDF['A'])
#     dataset = dataDF['Q'] + '\t' + dataDF['A']
#     dataset = list(dataset)
    # skleran에서 지원하는 함수를 통해서 학습 셋과 
    # 테스트 셋을 나눈다.
#     xTrain, xTest, yTrain, yTest = train_test_split(question, answer, test_size=0.33, random_state=42)
    # 그 값을 리턴한다.
#     return xTrain, yTrain, xTest, yTest
    return question, answer

def preproLikeMorphlized(data):
    # 형태소 분석 모듈 객체를
    # 생성합니다.

    morphAnalyzer = Twitter()
    # 형태소 토크나이즈 결과 문장을 받을
    #  리스트를 생성합니다.
    result_data = list()
    # 데이터에 있는 매 문장에 대해 토크나이즈를
    # 할 수 있도록 반복문을 선언합니다.
    for seq in data:
        # Twitter.morphs 함수를 통해 토크나이즈 된
        # 리스트 객체를 받고 다시 공백문자를 기준으로
        # 하여 문자열로 재구성 해줍니다.
        morphlizedSeq = " ".join(morphAnalyzer.morphs(seq.replace(' ', '')))
        result_data.append(morphlizedSeq)

    return result_data

def dataTokenizer(data):
    # 토크나이징 해서 담을 배열 생성
    words = []
    for sentence in data:
        # FILTERS = "([~.,!?\"':;)(])"
        # 위 필터와 같은 값들을 정규화 표현식을 
        # 통해서 모두 "" 으로 변환 해주는 부분이다.
        sentence = re.sub(change_filter, "", sentence)
        for word in sentence.split():
            words.append(word)
    # 토그나이징과 정규표현식을 통해 만들어진 
    # 값들을 넘겨 준다.
    return [word for word in words if word]

def preprocess_sentence(w):
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    w = w.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    
    return w

def create_dataset(data, num_examples):
#     word_pairs = [[preprocess_sentence(w) for w in l]  for l in data[:num_examples]]
    word_pairs = [[preprocess_sentence(l)]  for l in data[:num_examples]]
    return word_pairs

class LanguageIndex():
    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()
        
    def create_index(self):
        for phrase in self.lang:
#             if self.vocab == '<start>':
#                 pass
            self.vocab.update(phrase.split(' '))
    
        self.vocab = sorted(self.vocab)
    
        self.word2idx['<pad>'] = 0
#         self.word2idx['<start>'] = 1
#         self.word2idx['<end>'] = 2
#         self.word2idx['<unk>'] = 3

        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1

        for word, index in self.word2idx.items():
            self.idx2word[index] = word
            
def max_length(tensor):
    return max(len(t) for t in tensor)

In [18]:
def load_dataset(path, num_examples):
    
    # 판다스를 통해서 데이터를 불러온다.
    data_df = pd.read_csv(path, header=0)
    # 질문과 답변 열을 가져와 question과 answer에 넣는다.
    question, answer = list(data_df['Q']), list(data_df['A'])
    
    question = create_dataset(question, num_examples)
    answer = create_dataset(answer, num_examples)
    
    print(len(question))
    
    pairs = [question[i] + answer[i] for i in range(num_examples)]
    
    # index language using the class defined above    
    inp_lang = LanguageIndex(q for q, a in pairs)
    targ_lang = LanguageIndex(a for q, a in pairs)
    
    # Vectorize the input and target languages
    
    # question
    input_tensor = [[inp_lang.word2idx[s] for s in q.split(' ')] for q, a in pairs]
    
    # answer
    target_tensor = [[targ_lang.word2idx[s] for s in a.split(' ')] for q, a in pairs]

    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    
    # Padding the input and output tensor to the maximum length
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, 
                                                                 maxlen=max_length_inp,
                                                                 padding='post')
    
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, 
                                                                  maxlen=max_length_tar, 
                                                                  padding='post')
    
    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar    

In [19]:
# Load dataset with limit

num_examples = 11823

input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset('./data_in/ChatBotData.csv', num_examples)

# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

FileNotFoundError: [Errno 2] File b'./data_in/ChatBotData.csv' does not exist: b'./data_in/ChatBotData.csv'