# 1. Import libraries

In [2]:
import tensorflow as tf
from konlpy.tag import Okt
from collections import Counter
import pandas as pd
import numpy as np
import gensim
from matplotlib import pyplot as plt


# Short alias so later cells can write keras.<...> instead of tf.keras.<...>.
keras = tf.keras
# Okt tagger instance from konlpy; its `morphs` method is passed to BaseVectorizer in the next cell.
t = Okt()

# 2. Fit the tokenizer to our dataset

In [3]:
from vectorizer import BaseVectorizer
# Two independent BaseVectorizer instances, each constructed with Okt's `morphs` method.
# NOTE(review): only `tokenizer` is fitted and read in the cells visible here;
# `vectorizer` looks unused — confirm before removing it.
vectorizer = BaseVectorizer(t.morphs)
tokenizer = BaseVectorizer(t.morphs)

In [5]:
# Load the labelled training words; the CSV is decoded as CP949.
df = pd.read_csv('train_entity.csv',encoding='CP949')

In [6]:
# Peek at the first row of the frame.
df[0:1]

Unnamed: 0,word,entity
0,빨간색,color


In [7]:
# Fit the tokenizer on the raw array of values from the `word` column.
tokenizer.fit(df['word'].values)

scanning was done                                        
105 terms are recognized


<vectorizer.BaseVectorizer at 0x1f8b1055cc8>

In [9]:
# NOTE(review): presumably this populates `tokenizer.char2idx`, which the next
# cell reads — confirm against vectorizer.py.
tokenizer.get_char2idx()

In [10]:
# `wordvoca` is another name for the tokenizer's char2idx mapping (same object, not a copy).
wordvoca = tokenizer.char2idx

In [11]:
# Inspect the character-to-id mapping.
wordvoca

{'_PAD_': 0,
 '_UNK_': 1,
 '_': 2,
 'P': 3,
 'A': 4,
 'D': 5,
 'U': 6,
 'N': 7,
 'K': 8,
 'S': 9,
 'T': 10,
 'E': 11,
 'O': 12,
 '색': 13,
 '보': 14,
 '라': 15,
 '공': 16,
 '과': 17,
 '자': 18,
 '지': 19,
 '갑': 20,
 '위': 21,
 '아': 22,
 '래': 23,
 '맨': 24,
 '빨': 25,
 '간': 26,
 '파': 27,
 '랑': 28,
 '강': 29,
 '란': 30,
 '주': 31,
 '황': 32,
 '노': 33,
 '초': 34,
 '록': 35,
 '녹': 36,
 '연': 37,
 '두': 38,
 '남': 39,
 '하': 40,
 '늘': 41,
 '분': 42,
 '홍': 43,
 '청': 44,
 '적': 45,
 '갈': 46,
 '네': 47,
 '이': 48,
 '비': 49,
 '동': 50,
 '차': 51,
 '축': 52,
 '구': 53,
 '농': 54,
 '피': 55,
 '럭': 56,
 '야': 57,
 '테': 58,
 '니': 59,
 '스': 60,
 '장': 61,
 '난': 62,
 '감': 63,
 '의': 64,
 '책': 65,
 '상': 66,
 '컵': 67,
 '박': 68,
 '게': 69,
 '임': 70,
 '기': 71,
 '컴': 72,
 '퓨': 73,
 '터': 74,
 '텔': 75,
 '레': 76,
 '전': 77,
 '티': 78,
 '가': 79,
 '방': 80,
 '양': 81,
 '말': 82,
 '패': 83,
 '딩': 84,
 '외': 85,
 '투': 86,
 '셔': 87,
 '츠': 88,
 '바': 89,
 '핸': 90,
 '드': 91,
 '폰': 92,
 '휴': 93,
 '대': 94,
 '키': 95,
 '마': 96,
 '우': 97,
 '모': 98,
 '트': 99,
 '

# 3. Data preprocessing

In [12]:
# Entity labels are numbered in the order `unique()` returns them for the
# `entity` column; the two tables are exact inverses of each other.
id_to_label = dict(enumerate(df.entity.unique()))
label_to_id = {label: idx for idx, label in id_to_label.items()}

In [13]:
# Show the label -> id table.
print(label_to_id)

{'color': 0, 'thing': 1, 'loc': 2}


In [14]:
MAX_LENGTH = 10
def tokenize_by_char(words,labels):
    """Encode words character by character and pad them to MAX_LENGTH.

    Every character of a word is looked up directly in the module-level
    ``wordvoca`` table and every label in ``label_to_id``; a missing key
    raises ``KeyError`` (assuming both are plain dicts).

    Args:
        words: iterable of strings.
        labels: iterable of label names, paired with ``words`` by position.

    Returns:
        ``(padded_inputs, outputs)``: the array produced by ``pad_sequences``
        (post-padded with the ``_PAD_`` id, ``MAX_LENGTH`` columns) and a list
        of label ids. A word with more than ``MAX_LENGTH`` characters is left
        out together with its label, so both results can be shorter than
        ``words``.
    """
    encoded_words, encoded_labels = [], []
    for text, tag in zip(words, labels):
        char_ids = [wordvoca[ch] for ch in text]
        # Over-long words are filtered out rather than truncated.
        if len(char_ids) > MAX_LENGTH:
            continue
        encoded_words.append(char_ids)
        encoded_labels.append(label_to_id[tag])

    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        encoded_words,
        maxlen=MAX_LENGTH,
        padding='post',
        value=tokenizer.char2idx['_PAD_'])  # value = 0
    return padded_inputs, encoded_labels
            

In [15]:
def decode_num_char(wordvoca,inputs):
    """Translate a sequence of integer ids back into vocabulary keys.

    Args:
        wordvoca: mapping from token (character) to integer id.
        inputs: 1-D sequence of integer ids (a list or a NumPy vector).

    Returns:
        A list with exactly one entry per id in ``inputs``: the key of
        ``wordvoca`` that maps to that id, or ``''`` when no key does.
    """
    # Invert the table once instead of rescanning every key for every id.
    # setdefault keeps the first key should two keys ever share an id.
    idx_to_token = {}
    for token, idx in wordvoca.items():
        idx_to_token.setdefault(idx, token)

    # Ids missing from the table become '' so the output stays aligned with
    # the input positions instead of silently getting shorter.
    return [idx_to_token.get(idx, '') for idx in inputs]

In [16]:
# Encode the whole training frame: `inputs` is the padded id matrix,
# `outputs` the list of label ids returned by tokenize_by_char.
inputs, outputs = tokenize_by_char(df.word, df.entity)

In [17]:
# Sanity check: first encoded row, its label id, and the row decoded back to vocabulary keys.
print('encoded input : ', inputs[0], 'label : ', outputs[0], 'original input sentence : ', decode_num_char(wordvoca,inputs[0]))

encoded input :  [25 26 13  0  0  0  0  0  0  0] label :  0 original input sentence :  ['빨', '간', '색', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_']


In [18]:
BATCH_SIZE = 16
# Shuffle buffer size handed to Dataset.shuffle.
# NOTE(review): 7836 is not derived from anything visible in this notebook — confirm.
BUFFER_SIZE = 7836

# Pair each padded id row with its integer label, then cache, shuffle,
# batch and prefetch, in that order.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [19]:
# Display the dataset's repr (element shapes and dtypes).
dataset

<PrefetchDataset shapes: ((None, 10), (None,)), types: (tf.int32, tf.int32)>

In [20]:
# Pull a single batch and print its tensors followed by their shapes.
for x, y in dataset.take(1):
    print(x, y)
    print('-----------------------------------------------')
    print(x.shape, y.shape)

tf.Tensor(
[[ 65   0   0   0   0   0   0   0   0   0]
 [ 93  94  92   0   0   0   0   0   0   0]
 [ 69  70  71   0   0   0   0   0   0   0]
 [ 31  32  13   0   0   0   0   0   0   0]
 [128  55 129  99   0   0   0   0   0   0]
 [115 118  60 119 120   0   0   0   0   0]
 [110 103   0   0   0   0   0   0   0   0]
 [100 103   0   0   0   0   0   0   0   0]
 [ 34  35  13   0   0   0   0   0   0   0]
 [ 58  59  60  16   0   0   0   0   0   0]
 [ 98  59  74   0   0   0   0   0   0   0]
 [ 34  35   0   0   0   0   0   0   0   0]
 [ 81  82   0   0   0   0   0   0   0   0]
 [ 27  30   0   0   0   0   0   0   0   0]
 [ 54  53  16   0   0   0   0   0   0   0]
 [ 27  30  13   0   0   0   0   0   0   0]], shape=(16, 10), dtype=int32) tf.Tensor([1 1 1 0 1 0 2 2 0 1 1 0 1 0 1 0], shape=(16,), dtype=int32)
-----------------------------------------------
(16, 10) (16,)


# 4. Model design

In [21]:
# Number of distinct labels; get_model uses the same value as the width of its final Dense layer.
print(len(label_to_id.values()))

3


In [22]:
# Vocabulary size; get_model passes the same value as the first argument of its Embedding layer.
len(wordvoca)

130

In [23]:
def get_model():
    """Build and compile the character-level entity classifier.

    Architecture, in order: Embedding (one row per ``wordvoca`` entry,
    64 dims) -> bidirectional LSTM (64 units per direction) -> Dense(64, relu)
    -> Dense softmax with one unit per entry of ``label_to_id``.

    Returns:
        A ``tf.keras.Sequential`` model compiled with Adam (learning rate
        0.001), sparse categorical cross-entropy loss and sparse categorical
        accuracy as its metric.
    """
    vocab_size = len(wordvoca)
    num_classes = len(label_to_id)
    learning_rate = 0.001

    model = tf.keras.Sequential()
    # NOTE(review): mask_zero is left at its default here, so padded positions
    # are presumably fed to the LSTM like any other id — confirm this is intended.
    model.add(tf.keras.layers.Embedding(vocab_size, 64))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=[tf.keras.metrics.sparse_categorical_accuracy])
    return model

In [24]:
# Build a freshly compiled model.
model = get_model()

In [25]:
# Train on the batched dataset for 15 epochs.
model.fit(dataset, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1f8d3d13a48>

In [27]:
def word_question_procession(words):
    """Encode raw words into padded character-id rows for prediction.

    Each word is split into characters and every character is looked up in
    the module-level ``wordvoca`` table. The module-level ``MAX_LENGTH`` is
    used (rather than a local copy) so inference is padded exactly like the
    training data produced by ``tokenize_by_char``.

    Args:
        words: iterable of strings.

    Returns:
        The array produced by ``pad_sequences``: one post-padded row of
        ``MAX_LENGTH`` ids per accepted word, padded with the ``_PAD_`` id.
        A word that still has more than ``MAX_LENGTH`` ids after unknown
        characters are removed is reported on stdout and skipped, so the
        result can have fewer rows than ``words``.
    """
    inputs = []
    for word in words:
        # Characters that are not in the vocabulary are dropped on purpose;
        # the membership test replaces a bare `except` that hid every error.
        tempnum = [wordvoca[ch] for ch in word if ch in wordvoca]
        if len(tempnum) <= MAX_LENGTH:
            inputs.append(tempnum)
        else:
            print("단어의 길이가 너무 길어요")
    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        inputs, maxlen=MAX_LENGTH, padding='post',
        value=tokenizer.char2idx['_PAD_'])  # value = 0
    return padded_inputs

In [28]:
# Encode a handful of probe words for prediction.
input_sentence = word_question_procession(['빨간색','휴지','파랑색','비','무지개색','오른쪽','노란색','유니폼','상자','옆','위','뿡뿡색'
                                     ])

In [29]:
# Softmax output of the trained model: one row of class probabilities per encoded word.
model.predict(input_sentence)

array([[9.9997067e-01, 2.9332679e-05, 2.1947630e-09],
       [5.4008409e-04, 9.9943358e-01, 2.6312398e-05],
       [9.9996519e-01, 3.4757668e-05, 2.3448181e-09],
       [1.5138368e-04, 9.9966860e-01, 1.8008229e-04],
       [9.8718774e-01, 1.2812126e-02, 1.4352683e-07],
       [1.5563320e-06, 6.1741233e-04, 9.9938095e-01],
       [9.9995470e-01, 4.5310891e-05, 1.3613193e-09],
       [2.4302609e-03, 9.9754083e-01, 2.8908778e-05],
       [1.7364405e-03, 9.9825102e-01, 1.2494587e-05],
       [2.8220842e-07, 6.4146298e-04, 9.9935824e-01],
       [2.9581034e-07, 5.2315136e-04, 9.9947661e-01],
       [9.9108011e-01, 8.9190099e-03, 9.7126122e-07]], dtype=float32)

In [30]:
# Take the index of the highest probability in every row.
prediction = np.argmax(model.predict(input_sentence), axis=1)
print(prediction)

[0 1 0 1 0 2 0 1 1 2 2 0]


In [31]:
# Map each predicted class index back to its label name.
for p in prediction:
    print(id_to_label[p])

color
thing
color
thing
color
loc
color
thing
thing
loc
loc
color


In [13]:
# # Save the model
# model.save('entitytestmodel.h5')
# Load the model
# NOTE(review): the file loaded here ('entity_model.h5') is not the name used in
# the commented-out save above ('entitytestmodel.h5') — confirm which file is intended.
tempmodel = keras.models.load_model('entity_model.h5')
# Print the layer and parameter summary of the loaded model.
tempmodel.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          8320      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 195       
Total params: 82,819
Trainable params: 82,819
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Run the loaded model on the same encoded probe words.
tempmodel.predict(input_sentence)

array([[9.99997258e-01, 2.36760138e-06, 3.79820023e-07],
       [4.01552606e-05, 9.99946475e-01, 1.33432895e-05],
       [9.99997973e-01, 1.85279464e-06, 1.44728048e-07],
       [1.45368362e-02, 9.83871937e-01, 1.59128278e-03],
       [7.20760524e-01, 2.78951228e-01, 2.88266368e-04],
       [6.82854501e-04, 3.85420630e-04, 9.98931706e-01],
       [9.99993563e-01, 6.26468636e-06, 1.59969190e-07],
       [2.25275682e-04, 9.99717891e-01, 5.68391297e-05],
       [7.61514675e-05, 9.99898911e-01, 2.49588320e-05],
       [4.26918385e-04, 3.24093970e-04, 9.99248922e-01],
       [4.10627166e-04, 3.02611763e-04, 9.99286830e-01],
       [9.87252474e-01, 9.90190543e-03, 2.84563052e-03]], dtype=float32)

In [16]:
# Most probable class index per row from the loaded model, converted to a plain list.
prediction = list(np.argmax(tempmodel.predict(input_sentence), axis=1))

In [17]:
# Display the predicted class indices.
prediction

[0, 1, 0, 1, 0, 2, 0, 1, 1, 2, 2, 0]

In [37]:
# Remove the name `model` from the notebook namespace.
del model