In [1]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tarfile
import numpy as np
import re
import codecs

from scipy.spatial import distance
import json

Using TensorFlow backend.


In [2]:
def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    Runs of non-word characters are kept as tokens of their own (with
    surrounding whitespace stripped); tokens that are empty after stripping
    are dropped.

    Example:
        tokenize('Bob dropped the apple. Where is the apple?')
        -> ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    """
    # The separator group must not be optional: a pattern that can match the
    # empty string makes re.split (Python 3.7+) split between every character
    # and yield None for the unmatched group, which then breaks .strip().
    return [tok.strip() for tok in re.split(r'(\W+)', sent) if tok.strip()]

def parse_stories(lines, only_supporting=False):
    """Parse bAbI-style lines into (substory, question, answer) triples.

    Each non-blank line has the form ``<id> <text>``. A line whose text
    contains tab characters is a question line of the form
    ``<question>\\t<answer>\\t<supporting ids>``; every other line is a story
    sentence. An id of 1 starts a new story.

    Args:
        lines: iterable of text lines.
        only_supporting: if True, keep only the story sentences whose 1-based
            ids are listed in the question's supporting column; otherwise keep
            every story sentence seen so far.

    Returns:
        A list of ``(substory, question_tokens, answer)`` tuples, where
        ``substory`` is a list of tokenized sentences and ``answer`` is the raw
        answer string.

    Raises:
        ValueError: if a non-blank line lacks the ``<id> <text>`` shape, the id
            is not an integer, or a question line does not have exactly three
            tab-separated fields.
    """
    data = []
    story = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no content and would break the unpacking below.
            continue
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            # Line ids restart at 1 for every new story.
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            if only_supporting:
                # Only select the related substory (ids are 1-based).
                supporting_ids = [int(i) for i in supporting.split()]
                substory = [story[i - 1] for i in supporting_ids]
            else:
                # Provide all the substories, skipping question placeholders.
                substory = [x for x in story if x]
            data.append((substory, q, a))
            # Placeholder keeps list positions aligned with line ids so that
            # supporting-id lookups stay valid for later questions.
            story.append('')
        else:
            story.append(tokenize(line))
    return data

def get_stories(f, only_supporting=False, max_length=None):
    """Read a story file and return flattened (story, question, answer) triples.

    Args:
        f: open text file object; all of its lines are read.
        only_supporting: forwarded to ``parse_stories``.
        max_length: if truthy, drop stories whose flattened token count is not
            strictly less than this value.

    Returns:
        A list of ``(story_tokens, question_tokens, answer)`` tuples where
        ``story_tokens`` is the concatenation of all sentences of the story.
    """
    parsed = parse_stories(f.readlines(), only_supporting=only_supporting)
    data = []
    for story, q, answer in parsed:
        # Flatten once per story. Unlike reduce() without an initial value,
        # this also handles a question that has no preceding story sentences.
        flat_story = [word for sentence in story for word in sentence]
        if not max_length or len(flat_story) < max_length:
            data.append((flat_story, q, answer))
    return data

def vectorize_stories(data):
    """Convert (story, query, answer) triples into padded index arrays.

    Relies on the notebook-level ``word_idx``, ``story_maxlen`` and
    ``query_maxlen`` variables being defined before it is called.

    Returns:
        A tuple ``(stories, queries, answers)``: the first two are the results
        of ``pad_sequences`` with ``maxlen`` set to ``story_maxlen`` and
        ``query_maxlen`` respectively, the third is a NumPy array holding one
        answer index per example.

    Raises:
        KeyError: if any token or answer is missing from ``word_idx``.
    """
    story_ids, query_ids, answer_ids = [], [], []
    for story, query, answer in data:
        story_ids.append(list(map(word_idx.__getitem__, story)))
        query_ids.append(list(map(word_idx.__getitem__, query)))
        answer_ids.append(word_idx[answer])
    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(answer_ids))

In [3]:
# Load the training corpus, then build the vocabulary and the index tables.
data = 'aurabot'
direc = './'
# Context manager so the corpus file handle is closed once parsing is done.
with codecs.open(direc + data + '.txt', 'r', 'utf-8') as corpus_file:
    train_stories = get_stories(corpus_file)

# Vocabulary = every distinct story token, question token and answer label.
vocab = set()
for story, q, answer in train_stories:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

# +1 because index 0 is left unused by word_idx (it is the padding value).
vocab_size = len(vocab) + 1
story_maxlen = max(map(len, (x for x, _, _ in train_stories)))
query_maxlen = max(map(len, (x for _, x, _ in train_stories)))

print('-')
print('단어장 크기 :', vocab_size, '중복없는 단어')
print('스토리 길이 :', story_maxlen, '단어')
print('질문 :', query_maxlen, '단어')
print('학습 스토리 개수:', len(train_stories))
print('-')
print('데이터 셋은 다음처럼 구성됨 (스토리, 질의, 답변):')
print(train_stories[0])
print('-')

# Word -> index table; indices start at 1.
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
inputs_train, queries_train, answers_train = vectorize_stories(train_stories)
print('스토리 : 벡터크기', inputs_train.shape)
print('질문 : 벡터크기', queries_train.shape)
print('답변 : (1 또는 0)로 구성된 벡터 크기', answers_train.shape)
print('-')

-
단어장 크기 : 44 중복없는 단어
스토리 길이 : 3 단어
질문 : 5 단어
학습 스토리 개수: 117
-
데이터 셋은 다음처럼 구성됨 (스토리, 질의, 답변):
(['UNK'], ['안녕', '하세요', '?'], '인사1')
-
스토리 : 벡터크기 (117, 3)
질문 : 벡터크기 (117, 5)
답변 : (1 또는 0)로 구성된 벡터 크기 (117,)
-


  return _compile(pattern, flags).split(string, maxsplit)


In [4]:
# Model: memory-network style graph over a (story, question) pair.
# Two inputs of integer word indices, padded to fixed lengths.
input_sequence = Input((story_maxlen,))
question = Input((query_maxlen,))

# Story encoder "m": embeds each story token into a 64-dim vector.
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))

# Story encoder "c": embeds each story token into a query_maxlen-dim vector
# so that it can be added element-wise to the match matrix below.
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=query_maxlen))
input_encoder_c.add(Dropout(0.3))

# Question encoder: embeds each question token into a 64-dim vector.
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=query_maxlen))
question_encoder.add(Dropout(0.3))

# Apply the three encoders to the symbolic inputs.
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

# Dot product over the embedding axis of both tensors, followed by a softmax;
# expected shape (samples, story_maxlen, query_maxlen).
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

# Add the match matrix to the second story encoding, then swap the two
# non-batch axes; expected shape (samples, query_maxlen, story_maxlen).
response = add([match, input_encoded_c])
response = Permute((2, 1))(response)

# Join the response with the question encoding along the last axis and
# reduce the sequence with an LSTM before classifying over the vocabulary.
answer = concatenate([response, question_encoded])
answer = LSTM(32)(answer)
answer = Dropout(0.3)(answer)
answer = Dense(vocab_size)(answer)
answer = Activation('softmax')(answer)

# Targets are integer word indices, hence the sparse categorical loss.
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [5]:
# Train on the full training set, one example per gradient step, for 150 epochs.
# No validation data is passed, so only training metrics are reported.
model.fit([inputs_train, queries_train], answers_train, batch_size=1, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<keras.callbacks.History at 0x131f7f080>

In [6]:
#----- 결과 도출 함수 -----
def vocab_result(x, vocab):
    """Map a score vector to the vocabulary entry with the highest score.

    Args:
        x: array-like object exposing ``argmax()``; position 0 is treated as
            "no word".
        vocab: sequence of words, where ``vocab[k - 1]`` belongs to index ``k``.

    Returns:
        The selected word, or ``False`` when the highest score is at index 0.
    """
    best = x.argmax()
    if best == 0:
        return False
    return vocab[int(best) - 1]
    
def v_s(data, s_m):
    """Vectorize one token sequence into a padded index array.

    Args:
        data: iterable of tokens; a ``None`` entry is encoded as the 'UNK' token.
        s_m: length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` for a batch holding this one sequence.

    Raises:
        KeyError: if a token (or 'UNK', when needed) is not in ``word_idx``.
            Callers rely on this to detect out-of-vocabulary input.
    """
    # 'UNK' is looked up lazily so a vocabulary without it only fails when a
    # None token is actually present.
    indices = [word_idx['UNK'] if token is None else word_idx[token]
               for token in data]
    return pad_sequences([indices], maxlen=s_m)

def ref_result(result1, result2, mode='detail'):
    """Compare two prediction batches and report on the second one.

    Args:
        result1: reference prediction; only its first row is used.
        result2: prediction being evaluated; only its first row is used.
        mode: 'detail' prints an extra summary line followed by the 'simple'
            report; 'simple' prints the report only; any other value prints
            nothing.

    Returns:
        ``(distance, confidence)`` where ``distance`` is the Euclidean distance
        between the two first rows and ``confidence`` is the largest value in
        the first row of ``result2``.
    """
    a = distance.euclidean(result1[0], result2[0])
    # Predicted label, or False when the top score is at index 0.
    label = vocab_result(result2[0], vocab)
    d = max(result2[0])

    if mode == 'detail':
        print('detail_ref / acc :', a, '/', d)

    if mode == 'simple' or mode == 'detail':
        print('연관도 :', a)
        print('정확도 :', d)
        # Test the label directly: the previous helper variable was only
        # bound when a label existed, so the "no answer" branch raised
        # UnboundLocalError instead of printing.
        if label:
            print('정답 :', label)
        else:
            print('답변없음')
    return a, d

def answer_result(result1, threshold=0.9):
    """Turn a prediction batch into display strings when it is confident enough.

    Args:
        result1: prediction batch; only its first row is used.
        threshold: the top score must be strictly greater than this value.

    Returns:
        ``(with_score, label_only)`` where ``with_score`` is
        ``"<label> <score> "`` and ``label_only`` is ``"<label> "``, or
        ``(False, False)`` when there is no label or the score is too low.
    """
    label = vocab_result(result1[0], vocab)
    if label == False:
        return False, False

    score = max(result1[0])
    if not (score > threshold):
        return False, False

    # Explicit str() on purpose: formatting a NumPy scalar through an f-string
    # can produce different digits than str().
    label_only = str(label) + " "
    with_score = label_only + str(score) + " "
    return with_score, label_only

In [7]:
#답변 불러오기
# Load the canned answers; the with-block closes the file on exit, so no
# explicit close() call is needed afterwards.
with codecs.open('./answers.json', 'r', encoding='UTF-8') as json_data:
    answers_data = json.load(json_data)['answers']
an = 1  # 1-based position of the sample entry to preview (the first one)
print('Sample : ['+answers_data[an-1]['ID']+']', answers_data[an-1]['AN'])

Sample : [인사말] 안녕하세요. 저는 아우라봇입니다. <br/>아직 공부하고 있는 중이라 할 수 있는 건 별로 없지만 <br/>앞으로 잘 알려 주시면 열심히 공부할게요. :D <br/>지금 할 수 있는 것은 아래와 같습니다. <br/>1. 인사 <br/>2.봇 소개 <br/>3.아우라 팀 소개


In [8]:
#대화 초기화
# Conversation context carried between turns: a list of per-turn lists of
# word indices. Re-run this cell to start a fresh conversation.
Input_Data = []
# Minimum top score required before answer_result() returns a label.
threshold = 0.9      # default: 0.9
# Maximum Euclidean distance from the no-context prediction for a
# context-conditioned prediction to be kept as a candidate.
ref_threshold = 1.5      # default: 1.5

In [9]:
# Handle one user turn: score the question with no context and with every
# cumulative prefix of the remembered context, then answer from the most
# confident prediction and carry the matching context into the next turn.
Q = input('사용자질문 > ')
q_data = tokenize(Q)

try:
    # Vectorize once up front. An out-of-vocabulary token raises KeyError
    # here, which is the only failure this handler is meant to report.
    q_vec = v_s(q_data, query_maxlen)
    no_context_vec = v_s(tokenize('UNK'), story_maxlen)
except KeyError:
    print('※ 사전에 있는 단어를 입력해 주세요.')
    print(vocab)
else:
    # Flattened context as it was before this turn (shown in the summary).
    Input_Data_F = [y for x in Input_Data for y in x]

    # Baseline prediction without any context.
    print('맥락정보 : 없음')
    results = [model.predict([no_context_vec, q_vec])]
    _, a = ref_result(results[0], results[0], mode='simple')     # detail / simple / none
    m_acc = [a]
    # context_depths[k] = how many leading Input_Data entries produced
    # results[k]. Rejected candidates are skipped, so positions in `results`
    # are not positions in Input_Data.
    context_depths = [0]
    print()

    # One prediction per cumulative context prefix.
    context = []
    for depth, turn in enumerate(Input_Data, start=1):
        context.extend(turn)
        print('맥락정보 :', end="")
        for f in context:
            print(vocab[f-1], end=" ")
        print()

        result = model.predict([pad_sequences([context], story_maxlen, truncating='post'), q_vec])
        dist, acc = ref_result(results[0], result, mode='simple')     # detail / simple / none

        # Keep only candidates that stay close to the no-context prediction.
        if dist < ref_threshold:
            results.append(result)
            m_acc.append(acc)
            context_depths.append(depth)
        print()

    # Pick the most confident candidate and the context prefix it was built from.
    rank_1 = int(np.argmax(m_acc))
    ref_data = list(Input_Data[:context_depths[rank_1]])

    # Answer without context.
    print('무맥락답변 :', end=' ')
    an_no, f_an_no = answer_result(results[0], threshold=threshold)
    if an_no is False:
        an_no, f_an_no = '무슨 뜻인지 모르겠어요', '무슨 뜻인지 모르겠어요'
    print(an_no)

    # Answer with the best context.
    print('최적맥락답변 :', end=' ')
    an_A, f_an_A = answer_result(results[rank_1], threshold=threshold)
    if an_A is False:
        an_A, f_an_A = '무슨 뜻인지 모르겠어요', '무슨 뜻인지 모르겠어요'
    print(an_A)

    if ref_data == []:
        final_answer = f_an_no
    else:
        final_answer = f_an_A

    # Remember this turn: the question's word indices without the padding zeros.
    x_d = [int(idx) for idx in q_vec[0] if idx != 0]
    Input_Data = ref_data + [x_d]

    # Summary
    print("=================================")
    print('질문 :',q_data)
    print('전체맥락 :', end=" ")
    for i in Input_Data_F:
        print(vocab[i-1], end=" ")
    print()
    # Resolve the answer ID to its text; stop at the first match so the
    # replaced text is never compared against the remaining IDs.
    answer_id = final_answer.strip()
    for entry in answers_data:
        if entry['ID'] == answer_id:
            final_answer = entry['AN']
            break
    print("\033[1m\033[31m최종답변 :", final_answer)

사용자질문 > 안녕 ?
맥락정보 : 없음
연관도 : 0.0
정확도 : 0.9999999
정답 : 인사1

무맥락답변 : 인사1 0.9999999 
최적맥락답변 : 인사1 0.9999999 
질문 : ['안녕', '?']
전체맥락 : 
[1m[31m최종답변 : 안녕하세요. 아우라봇이에요 :)


  return _compile(pattern, flags).split(string, maxsplit)
