# 1. Import libraries

In [None]:
import tensorflow as tf
from konlpy.tag import Okt
from collections import Counter
import pandas as pd
import numpy as np
import gensim
from matplotlib import pyplot as plt


# Short alias for the Keras API that ships inside TensorFlow.
keras = tf.keras
# KoNLPy Okt instance; its `morphs` method is handed to the vectorizer.
t = Okt()

# 2. fit tokenizer to our datasets

In [None]:
# BaseVectorizer comes from the project-local `vectorizer` module.
from vectorizer import BaseVectorizer
# Construct the vectorizer with Okt's `morphs` method as its argument.
tokenizer = BaseVectorizer(t.morphs)

In [None]:
# Load the dataset (the CSV file is read with the CP949 encoding).
df = pd.read_csv('train_entity.csv',encoding='CP949')

In [None]:
# Show the first row to check that the data loaded correctly.
df[0:1]

In [None]:
# Use the column named by the value in row 1, column 1 of the CSV file.
# For the train_entity file loaded here, that value is 'word'.
tokenizer.fit(df['word'].values)

In [None]:
# NOTE(review): presumably builds or returns the character-to-index map;
# BaseVectorizer is not visible here — confirm in vectorizer.py.
tokenizer.get_char2idx()

In [None]:
# Every character is given an index; keep that mapping under a shorter name.
wordvoca = tokenizer.char2idx

In [None]:
# Display the character-to-index mapping.
wordvoca

# 3. data preprocessing

In [None]:
# Dictionaries that convert between entity labels and their integer ids.
# An id is the position of the label in `df.entity.unique()`.
id_to_label = dict(enumerate(df.entity.unique()))
label_to_id = {label: idx for idx, label in id_to_label.items()}

In [None]:
# Show the label -> id mapping.
print(label_to_id)

In [None]:
# Words are split into characters and turned into index lists of length 10.
# Example: '사과' has 2 characters, so the remaining 8 positions are filled
# with the index of _PAD_.
MAX_LENGTH = 10


def tokenize_by_char(words, labels):
    """Encode words character by character and pad them to MAX_LENGTH.

    Args:
        words: Iterable of strings. Every character must be a key of the
            module-level ``wordvoca`` mapping.
        labels: Iterable of entity labels aligned with ``words``. Every label
            of a kept word must be a key of ``label_to_id``.

    Returns:
        A ``(padded_inputs, outputs)`` tuple. ``padded_inputs`` is what
        ``pad_sequences`` returns for the kept words (post-padded with the
        ``_PAD_`` index up to ``MAX_LENGTH``); ``outputs`` is the list of
        label ids for those same words, in the same order.

    Raises:
        KeyError: If a character is missing from ``wordvoca`` or the label of
            a kept word is missing from ``label_to_id``.
    """
    inputs, outputs = [], []
    for word, label in zip(words, labels):
        char_ids = [wordvoca[char] for char in word]
        # Words longer than MAX_LENGTH are skipped together with their label,
        # so inputs and outputs stay aligned (the skip is silent).
        if len(char_ids) <= MAX_LENGTH:
            inputs.append(char_ids)
            outputs.append(label_to_id[label])
    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        inputs,
        maxlen=MAX_LENGTH,
        padding='post',
        # The original author notes this index as 0.
        value=tokenizer.char2idx['_PAD_'],
    )
    return padded_inputs, outputs
            

In [None]:
# Turn index numbers back into the characters they stand for in wordvoca.
def decode_num_char(wordvoca, inputs):
    """Decode a sequence of indices into the matching vocabulary keys.

    Args:
        wordvoca: Mapping from character (key) to index (value).
        inputs: Iterable of hashable indices, e.g. one encoded row.

    Returns:
        A list with, for each index in ``inputs`` in order, every key of
        ``wordvoca`` whose value equals that index (keys in the mapping's
        iteration order). Indices with no matching key contribute nothing,
        so the result can be shorter than ``inputs``.
    """
    # Build the inverse lookup once instead of scanning the whole vocabulary
    # for every index. A list per index keeps all keys that share a value.
    chars_by_index = {}
    for char, index in wordvoca.items():
        chars_by_index.setdefault(index, []).append(char)

    result = []
    for index in inputs:
        result.extend(chars_by_index.get(index, ()))
    return result

In [None]:
# Encode the dataset's words and entity labels.
inputs, outputs = tokenize_by_char(df.word, df.entity)

In [None]:
# Sanity check: print the first encoded sample, its label id, and the
# characters decoded back from that sample.
print('encoded input : ', inputs[0], 'label : ', outputs[0], 'original input sentence : ', decode_num_char(wordvoca,inputs[0]))

In [None]:
# Shuffle the dataset and split it into batches of BATCH_SIZE.
BATCH_SIZE = 16
# NOTE(review): presumably the number of rows in the training data, so the
# shuffle buffer can hold the whole dataset — confirm against the CSV.
BUFFER_SIZE = 7836

dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Display the dataset object.
dataset

In [None]:
# Take one batch and print its inputs and labels, then their shapes.
for x, y in dataset.take(1):
    print(x, y)
    print('-----------------------------------------------')
    print(x.shape, y.shape)

# 4. model design

In [None]:
# Design the model.
# Kept simple: an embedding, then a bidirectional LSTM, then a dense layer
# that uses relu as its activation, then the softmax output.
def get_model():
    """Build and compile the entity classifier.

    The embedding size is taken from ``wordvoca`` and the number of output
    units from ``label_to_id``; both are module-level names.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam with learning rate
        0.001, sparse categorical cross-entropy loss, sparse categorical
        accuracy metric).
    """
    vocab_size = len(wordvoca)
    num_labels = len(label_to_id)
    learning_rate = 0.001

    layers = [
        tf.keras.layers.Embedding(vocab_size, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(num_labels, activation='softmax'),
    ]
    model = tf.keras.Sequential(layers)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=[tf.keras.metrics.sparse_categorical_accuracy],
    )
    return model

In [None]:
# Build a fresh compiled model.
model = get_model()

In [None]:
# Train on the batched dataset for 15 epochs.
model.fit(dataset, epochs=15)

In [None]:
# Encode the words to predict with the character indices stored in wordvoca.
def word_question_procession(words):
    """Encode words for prediction as padded character-index sequences.

    Characters that are not keys of the module-level ``wordvoca`` mapping are
    dropped. A word whose encoded length is still above the limit is reported
    with a printed message and left out, so the result can have fewer rows
    than ``words`` has entries.

    Args:
        words: Iterable of strings to encode.

    Returns:
        What ``pad_sequences`` returns for the kept words: each sequence is
        post-padded with the ``_PAD_`` index up to a length of 10.
    """
    # Same value as the module-level MAX_LENGTH used for the training data.
    MAX_LENGTH = 10
    inputs = []
    for word in words:
        # Best effort: characters outside the vocabulary are skipped rather
        # than raising, using an explicit membership test instead of a
        # catch-all exception handler.
        char_ids = [wordvoca[char] for char in word if char in wordvoca]
        if len(char_ids) <= MAX_LENGTH:
            inputs.append(char_ids)
        else:
            print("단어의 길이가 너무 길어요")
    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        inputs,
        maxlen=MAX_LENGTH,
        padding='post',
        # The original author notes this index as 0.
        value=tokenizer.char2idx['_PAD_'],
    )
    return padded_inputs

In [None]:
# Encode a handful of sample words to run predictions on.
input_word = word_question_procession(['빨간색','휴지','파랑색','비','무지개색','오른쪽','노란색','유니폼','상자','옆','위','뿡뿡색'
                                     ])

In [None]:
# Try a prediction.
model.predict(input_word)

In [None]:
# Print the predictions as ids (argmax over axis 1 of the model output).
prediction = np.argmax(model.predict(input_word), axis=1)
print(prediction)

In [None]:
# Convert each id back to its label.
for p in prediction:
    print(id_to_label[p])

In [None]:
# Save the model (disabled):
# model.save('entity_model.h5')
# Load the model from 'entity_model.h5' and print its summary.
tempmodel = keras.models.load_model('entity_model.h5')
tempmodel.summary()

In [None]:
# Run the loaded model on the same encoded words.
tempmodel.predict(input_word)

In [None]:
# Predicted ids from the loaded model (argmax over axis 1), as a plain list.
prediction = list(np.argmax(tempmodel.predict(input_word), axis=1))

In [None]:
# Display the predicted ids.
prediction