<a href="https://colab.research.google.com/github/ideablast/NLPer_transformer_doc2vec_chatbot/blob/kdg/Load_model_complete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install the Korean NLP packages imported below: KoNLPy (Okt/Kkma analyzers),
# PyKoSpacing (`spacing`) and py-hanspell (`spell_checker`).
# NOTE(review): these installs are unpinned (the last two track the repositories'
# default branches) — consider pinning versions/commits for reproducibility.
!pip install konlpy
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git
!pip install git+https://github.com/ssut/py-hanspell.git

Collecting git+https://github.com/haven-jeon/PyKoSpacing.git
  Cloning https://github.com/haven-jeon/PyKoSpacing.git to /tmp/pip-req-build-sc0vrxux
  Running command git clone -q https://github.com/haven-jeon/PyKoSpacing.git /tmp/pip-req-build-sc0vrxux
Collecting argparse>=1.4.0
  Downloading https://files.pythonhosted.org/packages/f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl
Building wheels for collected packages: pykospacing
  Building wheel for pykospacing (setup.py) ... [?25l[?25hdone
  Created wheel for pykospacing: filename=pykospacing-0.3-cp36-none-any.whl size=2255638 sha256=6b5e1bea054e5eca8c234ad72ad78137663e3aa02217fe1fdf374c1e723d55d3
  Stored in directory: /tmp/pip-ephem-wheel-cache-oxm9glwu/wheels/4d/45/58/e26cb2b7f6a063d234158c6fd1e5700f6e15b99d67154340ba
Successfully built pykospacing
Installing collected packages: argparse, pykospacing
Successfully installed argparse-1.4.0 pykospacing-0.3


Collecting git+https://github.com/ssut/py-hanspell.git
  Cloning https://github.com/ssut/py-hanspell.git to /tmp/pip-req-build-5ibbn124
  Running command git clone -q https://github.com/ssut/py-hanspell.git /tmp/pip-req-build-5ibbn124
Building wheels for collected packages: py-hanspell
  Building wheel for py-hanspell (setup.py) ... [?25l[?25hdone
  Created wheel for py-hanspell: filename=py_hanspell-1.1-cp36-none-any.whl size=4854 sha256=a33961ddf032c76a591d52f04843a09e449de2be729258194c5a279c1b064a2c
  Stored in directory: /tmp/pip-ephem-wheel-cache-a6k3_qj2/wheels/0a/25/d1/e5e96476dbb1c318cc26c992dd493394fe42b0c204b3e65588
Successfully built py-hanspell
Installing collected packages: py-hanspell
Successfully installed py-hanspell-1.1


In [74]:
from keras import models
from keras import layers
from keras import optimizers, losses, metrics
from keras import preprocessing

import tensorflow as tf
import numpy as np
import re

from gensim.models import Doc2Vec
from konlpy.tag import Okt, Kkma
from hanspell import spell_checker
from pykospacing import spacing
import jpype

import pickle
import warnings
warnings.filterwarnings(action='ignore') 

In [5]:
# Special tag tokens
PAD = "<PADDING>"   # padding
STA = "<START>"     # start of sentence
END = "<END>"       # end of sentence
OOV = "<OOV>"       # unknown word (Out of Vocabulary)

# Tag indices
PAD_INDEX = 0
STA_INDEX = 1
END_INDEX = 2
OOV_INDEX = 3

# Data types (values accepted by the `type` argument of convert_text_to_index)
ENCODER_INPUT  = 0
DECODER_INPUT  = 1
DECODER_TARGET = 2

# Hyper-parameters for Transformer
NUM_LAYERS = 2                       # number of encoder layers and of decoder layers (each)
D_MODEL = 256                        # word embedding dimension
NUM_HEADS = 8                        # number of attention heads; D_MODEL % NUM_HEADS must be 0!
UNITS = 512                          # number of FFNN units
DROPOUT = 0.1                        # dropout rate
EPOCHS = 50                          # epochs for the Transformer and the C/M classifiers
BATCH_SIZE = 64                      # batch size
BUFFER_SIZE = 1000                   # for data pipelining
# VOCAB_SIZE = 0                     # number of words in the vocabulary; later replaced by len(words).
max_sequences = 30                   # maximum number of words in one sentence
RE_FILTER = re.compile("[\"':;~()]") # regular-expression filter (characters removed in pos_tag)

## functions
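 - pos_tag([sentence]) : 특수기호 제거 + 형태소 분석된 문장 반환
 - convert_text_to_index([sentence], word_to_index, TYPE) : 정수인코딩+패딩된 문장을 반환, TYPE = 0(ENCODER_INPUT): 일반 문장(인코더 입력), 1(DECODER_INPUT): 맨 앞에 START 태그가 붙는 디코더 입력 문장, 2(DECODER_TARGET): 맨 뒤에 END 태그가 붙는 디코더 목표 문장(학습할 때만 이용)
 - grammar_checker(sentence) : 띄어쓰기, 문법교정된 문장 반환
 - show_prob_c(sentence) : 문장의 카테고리, 확률 반환
 - show_prob_m(sentence) : 문장의 의도, 확률 반환
 - Transformer_prediction(sentence) : 문장의 답변 반환
 - doc2_answer(sentence) : Doc2Vec으로 찾은 가장 유사한 FAQ의 답변 반환
 - result_final(sentence) : Transformer 답변과 Doc2Vec 답변 중 점수가 높은 쪽을 문법교정하여 반환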

In [6]:
# Morphological analysis helper
def pos_tag(sentences, tagger=None):
    """Strip filtered punctuation and split each sentence into morphemes.

    Args:
        sentences: iterable of raw sentence strings.
        tagger: optional object exposing ``morphs(text)``. When omitted, a
            single KoNLPy ``Okt`` instance is created on first use and reused
            by later calls, instead of constructing a new analyzer on every
            call as the function is invoked several times per question.

    Returns:
        list[str]: one string per input sentence, with every character matched
        by ``RE_FILTER`` removed and the resulting morphemes joined by single
        spaces.
    """
    if tagger is None:
        # Lazily build the default analyzer once and keep it on the function object.
        tagger = getattr(pos_tag, "_default_tagger", None)
        if tagger is None:
            tagger = pos_tag._default_tagger = Okt()

    # Remove the filtered special characters, then join the morphemes
    # (returned as a sequence) with spaces.
    return [
        " ".join(tagger.morphs(RE_FILTER.sub("", sentence)))
        for sentence in sentences
    ]

In [7]:
# Convert sentences into padded rows of vocabulary indices
def convert_text_to_index(sentences, vocabulary, type):
    """Encode space-separated sentences as rows of exactly ``max_sequences`` indices.

    Args:
        sentences: iterable of strings whose words are separated by whitespace.
        vocabulary: mapping from word to index; must contain the ``OOV`` and
            ``PAD`` tags, plus ``STA`` / ``END`` for the decoder types.
        type: ``DECODER_INPUT`` prepends the START index, ``DECODER_TARGET``
            appends the END index (replacing the last slot when the row is
            already full); any other value adds no tag.

    Returns:
        numpy array built from one index row per sentence.
    """

    def encode(sentence):
        # Decoder inputs begin with the START tag.
        row = [vocabulary[STA]] if type == DECODER_INPUT else []

        # Words missing from the vocabulary map to the OOV index.
        for token in sentence.split():
            token_id = vocabulary.get(token)
            row.append(vocabulary[OOV] if token_id is None else token_id)

        if type == DECODER_TARGET:
            # Keep room for END: at most max_sequences - 1 words, then the END tag.
            row = row[:max_sequences - 1] + [vocabulary[END]]
        else:
            row = row[:max_sequences]

        # Fill the remaining slots with the padding index.
        row += (max_sequences - len(row)) * [vocabulary[PAD]]
        return row

    return np.asarray([encode(sentence) for sentence in sentences])

In [None]:
def grammar_checker(sentence):
  """Re-space and spell-check a sentence.

  Every space is removed first, so ``spacing`` (PyKoSpacing) decides the word
  boundaries from scratch. The re-spaced text is then passed to
  ``spell_checker.check`` (py-hanspell), and the ``checked`` attribute of its
  result is returned.
  """
  without_spaces = sentence.replace(' ', '')
  return spell_checker.check(spacing(without_spaces)).checked

In [None]:
# Example input  : '이 옷 다른 사이즈도 볼 수 있을까요?'
# Example output : ('의류', 1.0)
def show_prob_c(stc):
  """Predict the category of a sentence with ``c_model``.

  Args:
    stc: a single raw sentence string.

  Returns:
    tuple: ``(category, probability)`` where ``category`` is looked up in
    ``index_to_category`` at the argmax of the model output and
    ``probability`` is the maximum of that output.
  """
  tokenized = pos_tag([stc])
  # Use the shared constants rather than the literals 0 and 30, so this keeps
  # working if ENCODER_INPUT or max_sequences is changed in the config cell.
  model_input = convert_text_to_index(
      tokenized, word_to_index, ENCODER_INPUT).reshape(1, max_sequences)
  scores = c_model.predict(model_input)

  best_index = np.argmax(scores)
  probability = np.max(scores)

  return index_to_category[best_index], probability

In [None]:
# Example input  : '이 옷 다른 사이즈도 볼 수 있을까요?'
# Example output : ('치수문의', 0.7858745)
def show_prob_m(stc):
  """Predict the main intent of a sentence with ``m_model``.

  Args:
    stc: a single raw sentence string.

  Returns:
    tuple: ``(intent, probability)`` where ``intent`` is looked up in
    ``index_to_main`` at the argmax of the model output and ``probability``
    is the maximum of that output.
  """
  tokenized = pos_tag([stc])
  # Use the shared constants rather than the literals 0 and 30, so this keeps
  # working if ENCODER_INPUT or max_sequences is changed in the config cell.
  model_input = convert_text_to_index(
      tokenized, word_to_index, ENCODER_INPUT).reshape(1, max_sequences)
  scores = m_model.predict(model_input)

  best_index = np.argmax(scores)
  probability = np.max(scores)

  return index_to_main[best_index], probability

In [None]:
# Example input  : '남성 바지는 어느 쪽에 있나요?'
# Example output : '저 뒤쪽 에 있어요'
def Transformer_prediction(stc):
  """Greedily decode an answer for ``stc`` with the loaded Transformer ``t_model``.

  The sentence is tokenized with ``pos_tag`` and encoded as an encoder input.
  Decoding starts from the START index and, for at most ``max_sequences``
  steps, appends the argmax of the last predicted position until the END
  index is predicted.

  Args:
    stc: a single raw sentence string.

  Returns:
    str: the decoded words, each followed by a single space (so a non-empty
    answer ends with a trailing space). Indexes missing from
    ``index_to_word`` are rendered as the OOV word.
  """
  tokenized = pos_tag([stc])
  encoded = convert_text_to_index(tokenized, word_to_index, ENCODER_INPUT)
  encoder_input = tf.expand_dims(encoded.squeeze(), axis=0)  # make tensor type
  decoder_input = tf.expand_dims([STA_INDEX], 0)

  for _ in range(max_sequences):
    predictions = t_model.predict([encoder_input, decoder_input])
    # select the last word from the seq_len dimension
    last_step = predictions[:, -1:, :]
    predicted_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

    if tf.equal(predicted_id, END_INDEX):
      break

    # concatenate the predicted_id to the output which is given to the decoder
    # as its input.
    decoder_input = tf.concat([decoder_input, predicted_id], axis=-1)

  # Drop the leading START index before mapping indexes back to words.
  output_indexes = tf.squeeze(decoder_input, axis=0)[1:].numpy()

  words = []
  for index in output_indexes:
    if index == END_INDEX:
      # stop at the end index
      break
    word = index_to_word.get(index)
    if word is None:
      # Unknown index: fall back to the OOV word. The original called
      # str.extend here, which does not exist and raised AttributeError.
      word = index_to_word[OOV_INDEX]
    words.append(word)

  # Each word is followed by one space, matching the original output format.
  return "".join(word + " " for word in words)

In [80]:
# Shared Kkma analyzer, created once when this cell runs.
kkma = Kkma()
def tokenizer_kkma(doc):
    """Tokenize ``doc`` with Kkma, joining the parts of every item returned by ``kkma.pos`` with "/"."""
    # Kkma is Java-based, so jpype is used to attach the calling thread to the
    # JVM before any Java function is invoked from Python.
    jpype.attachThreadToJVM()
    return list(map("/".join, kkma.pos(doc)))

In [None]:
def doc2_answer(input_question):
  """Return the stored FAQ entry most similar to ``input_question`` according to Doc2Vec.

  The question is tokenized with ``tokenizer_kkma`` and a vector is inferred
  with ``d2v_faqs``. The tag of the single most similar document is converted
  to an integer and used as a 1-based position in ``faqs``; element ``[2]``
  of that row is returned (presumably the answer text — confirm against the
  layout of ``faqs``).
  """
  question_tokens = tokenizer_kkma(input_question)
  question_vector = d2v_faqs.infer_vector(question_tokens)
  best_match = d2v_faqs.docvecs.most_similar([question_vector], topn=1)[0]
  faq_position = int(best_match[0]) - 1  # document tags are 1-based
  return faqs[faq_position][2]

In [None]:
def score_calcul(left_cate, right_cate, num):
  """Score how closely two ``[label, probability]`` predictions agree.

  Args:
    left_cate: ``[label, probability]`` of the first prediction.
    right_cate: ``[label, probability]`` of the second prediction.
    num: 1 when the pair holds intents, 0 when it holds categories.

  Returns:
    0 when the labels differ. Otherwise 1000 when the probability gap rounds
    to 0 at three decimals, else the reciprocal of that rounded gap (at most
    1000, reached when the gap rounds to 0.001). Intent pairs (``num == 1``)
    with a positive score get a 500 bonus.
  """
  if not (left_cate[0] == right_cate[0]):
      return 0

  gap = round(abs(left_cate[1] - right_cate[1]), 3)
  score = 0
  score += 1000 if gap == 0 else (1 / gap)

  if num == 1 and score > 0:
      score += 500
  return score

In [None]:
# Takes a sentence and returns only the final answer.
def result_final(stc):
    """Answer ``stc`` with whichever candidate better matches the question.

    A Transformer answer and a Doc2Vec FAQ answer are generated. The question
    and both answers are classified for category and intent, and each answer
    is scored with ``score_calcul`` against the question (category score plus
    intent score). The higher-scoring answer is returned after
    ``grammar_checker``; on a tie the fallback string is returned.
    """

    def classify(text):
        # Category first, then intent, each as a [label, probability] pair.
        category, category_prob = show_prob_c(text)
        intent, intent_prob = show_prob_m(text)
        return [category, category_prob], [intent, intent_prob]

    question_category, question_intent = classify(stc)

    def match_score(candidate_category, candidate_intent):
        return (score_calcul(question_category, candidate_category, 0)
                + score_calcul(question_intent, candidate_intent, 1))

    T_answer = Transformer_prediction(stc)
    D_answer = doc2_answer(stc)

    T_category, T_intent = classify(T_answer)
    D_category, D_intent = classify(D_answer)

    tran_score = match_score(T_category, T_intent)
    doc2_score = match_score(D_category, D_intent)

    if doc2_score > tran_score:
        return grammar_checker(D_answer)
    if tran_score > doc2_score:
        return grammar_checker(T_answer)
    return "잘모르겠습니다"

## Custom functions for Transformer model loading

In [8]:
## Scaled dot-product attention
def scaled_dot_product_attention(query, key, value, mask):
  """Compute softmax(Q K^T / sqrt(d_k)) V.

  ``d_k`` is the size of the last axis of ``key``. When ``mask`` is not None,
  ``mask * -1e9`` is added to the scaled scores before the softmax, so that
  positions where the mask is 1 receive a softmax weight of (almost) zero.
  Only the attended values are returned, not the attention weights.
  """
  key_depth = tf.cast(tf.shape(key)[-1], tf.float32)

  # Q K^T / sqrt(d_k)
  scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_depth)

  if mask is not None:
    # Push masked (e.g. zero-padding) positions towards -inf.
    scores = scores + mask * -1e9

  # softmax over the last axis, then weight the values
  weights = tf.nn.softmax(scores, axis=-1)
  return tf.matmul(weights, value)

In [9]:
def create_padding_mask(x):
  """Return a float32 mask that is 1.0 where ``x`` equals 0 and 0.0 elsewhere.

  Two singleton axes are inserted after the first axis; for a 2-D ``x`` the
  result is (batch_size, 1, 1, sequence length).
  """
  is_padding = tf.math.equal(x, 0)
  padding_mask = tf.cast(is_padding, tf.float32)
  return padding_mask[:, tf.newaxis, tf.newaxis, :]

In [10]:
# Masks future tokens in a sequence (used by the decoder) and also masks pad tokens.
def create_look_ahead_mask(x):
  """Combine a future-position mask with the padding mask of ``x``.

  The future-position mask is 1 strictly above the diagonal of a
  (seq_len, seq_len) matrix, where seq_len is ``tf.shape(x)[1]``. The
  element-wise maximum with ``create_padding_mask(x)`` is returned.
  """
  length = tf.shape(x)[1]
  lower_triangle = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
  future_mask = 1 - lower_triangle
  return tf.maximum(future_mask, create_padding_mask(x))

## Load models (Transformer, 2 BiLSTM) & dictionaries (6)

In [11]:
# Mount Google Drive at /content/drive; the models and pickles loaded in the
# following cells are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Load the trained artifacts from Google Drive:
#   t_model  - Transformer used by Transformer_prediction (loaded with compile=False)
#   m_model  - classifier used by show_prob_m (main intent)
#   c_model  - classifier used by show_prob_c (category)
#   d2v_faqs - gensim Doc2Vec model used by doc2_answer
# NOTE(review): the Drive paths are hard-coded; consider a single base-directory constant.
t_model = models.load_model('/content/drive/My Drive/Transformer_text_savedmodelform', compile=False)
m_model = models.load_model('/content/drive/My Drive/main_lstm_cl_test.h5')
c_model = models.load_model('/content/drive/My Drive/category_lstm_cl_test.h5')
d2v_faqs = Doc2Vec.load('/content/drive/My Drive/My_Doc2vec.model')

In [13]:
# Load the lookup dictionaries and the FAQ data.
# NOTE(review): pickle.load can execute arbitrary code — only load pickles from a trusted source.
with open('/content/drive/My Drive/dictionary_list.pickle', 'rb') as handle:
  dictionary_list = pickle.load(handle)
with open('/content/drive/My Drive/data.pickle', 'rb') as f:
  faqs = pickle.load(f)  # indexed as faqs[row][2] by doc2_answer

# dictionary_list holds six mappings, unpacked by position.
word_to_index = dictionary_list[0]
index_to_word = dictionary_list[1]
category_to_index = dictionary_list[2]
index_to_category = dictionary_list[3]
main_to_index = dictionary_list[4]
index_to_main = dictionary_list[5]

## Prediction

In [84]:
# End-to-end check: ask one question and display the answer chosen by result_final.
result_final("이 신발 얼마에요?")

'이게 6만 9천 원이에요'