# 1. Import libraries

In [4]:
# Ask for TensorFlow 2.x through the `%tensorflow_version` line magic.
try:
  # Colab only
  %tensorflow_version 2.x
except Exception:
    # Best-effort: any exception raised by the magic is ignored and the
    # notebook continues with whatever TensorFlow is already installed.
    pass

In [5]:
# !pip install konlpy

In [6]:
# !pip install gensim

In [7]:
#colab에서 실행 중이라면...
# !git clone https://github.com/hukim1112/comment_classifier.git
# import os
# os.chdir('/content/comment_classifier')

In [8]:
import tensorflow as tf
from konlpy.tag import Okt
from collections import Counter
import pandas as pd
import numpy as np
import gensim
from matplotlib import pyplot as plt


keras = tf.keras
t = Okt()

# 2. Fit the tokenizer to our dataset

In [9]:
from vectorizer import BaseVectorizer
# Build two BaseVectorizer instances that both use the Okt morpheme splitter
# (t.morphs) as their tokenizing function.
# NOTE(review): only `tokenizer` is referenced by the cells visible in this
# notebook; `vectorizer` looks unused — confirm before removing it.
vectorizer = BaseVectorizer(t.morphs)
tokenizer = BaseVectorizer(t.morphs)

In [10]:
df = pd.read_csv('Entitytest.csv',encoding='CP949')

In [11]:
df[0:1]

Unnamed: 0,word,entity
0,빨간색,color


In [12]:
tokenizer.fit(df['word'].values)

scanning was done                                        
105 terms are recognized


<vectorizer.BaseVectorizer at 0x262ffd24f08>

In [13]:
tokenizer.vocabulary_

{'_PAD_': 0,
 '_UNK_': 1,
 '_STA_': 2,
 '_EOS_': 3,
 '색': 4,
 '보라색': 5,
 '공': 6,
 '과자': 7,
 '지갑': 8,
 '위': 9,
 '아래': 10,
 '맨': 11,
 '빨간색': 12,
 '파랑색': 13,
 '빨강': 14,
 '파란색': 15,
 '주황색': 16,
 '노란색': 17,
 '노랑': 18,
 '초록색': 19,
 '녹색': 20,
 '연두색': 21,
 '남색': 22,
 '자주색': 23,
 '하늘색': 24,
 '연': 25,
 '분홍색': 26,
 '청록색': 27,
 '청색': 28,
 '적색': 29,
 '갈색': 30,
 '네이비': 31,
 '자동차': 32,
 '축구공': 33,
 '농구공': 34,
 '피구': 35,
 '럭비공': 36,
 '야구공': 37,
 '테니스공': 38,
 '장난감': 39,
 '의자': 40,
 '책': 41,
 '책상': 42,
 '컵': 43,
 '상자': 44,
 '박스': 45,
 '게임기': 46,
 '컴퓨터': 47,
 '텔레비전': 48,
 '티비': 49,
 '가방': 50,
 '양말': 51,
 '패딩': 52,
 '외투': 53,
 '티셔츠': 54,
 '셔츠': 55,
 '바지': 56,
 '청바지': 57,
 '핸드폰': 58,
 '휴대폰': 59,
 '키': 60,
 '보드': 61,
 '마우스': 62,
 '모니터': 63,
 '노트북': 64,
 '공책': 65,
 '오른쪽': 66,
 '왼쪽': 67,
 '좌회전': 68,
 '우회': 69,
 '전': 70,
 '좌측': 71,
 '우측': 72,
 '저기': 73,
 '여기': 74,
 '저쪽': 75,
 '동쪽': 76,
 '서쪽': 77,
 '남쪽': 78,
 '북쪽': 79,
 '동': 80,
 '서': 81,
 '남': 82,
 '북': 83,
 '옆': 84,
 '안': 85,
 '붉': 86,
 '은색': 87,
 '푸른색': 88,


# 3. Data preprocessing

In [14]:
# Forward map: each distinct entity label -> consecutive integer id.
label_to_id = {label: idx for idx, label in enumerate(df.entity.unique())}
# Reverse map, derived by inverting the forward map so both always agree.
id_to_label = {idx: label for label, idx in label_to_id.items()}

In [15]:
print(label_to_id)

{'color': 0, 'thing': 1, 'loc': 2}


In [16]:
# df.intent = df.intent.map(lambda x : label_index[x])
# print(df.head(10))

In [17]:
MAX_LENGTH = 10
def tokenize_and_filter(sentences, labels, max_length=None):
    """Encode sentences as padded id sequences and labels as integer ids.

    Each sentence is encoded with ``tokenizer.encode_a_doc_to_list``. A
    sentence whose encoding is longer than ``max_length`` is dropped together
    with its label (it is NOT truncated), so the two returned collections
    always stay aligned with each other, but may be shorter than the inputs.

    Args:
        sentences: iterable of raw sentences.
        labels: iterable of label names, parallel to ``sentences``; every
            kept label must be a key of ``label_to_id``.
        max_length: maximum encoded length to keep, and the width the
            sequences are padded to. Defaults to the module-level
            ``MAX_LENGTH`` (looked up at call time).

    Returns:
        (padded_inputs, outputs): the array produced by ``pad_sequences``
        (post-padded with the ``'_PAD_'`` vocabulary id) and a list of
        integer label ids.

    Raises:
        KeyError: if a kept sample's label is missing from ``label_to_id``.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    inputs, outputs = [], []
    for sentence, label in zip(sentences, labels):
        tokenized_sentence = tokenizer.encode_a_doc_to_list(sentence)

        # Keep the sample only if it fits; input and label are appended
        # together so that index i of both lists refers to the same sample.
        if len(tokenized_sentence) <= max_length:
            inputs.append(tokenized_sentence)
            outputs.append(label_to_id[label])

    # Right-pad every kept sequence to exactly max_length ids.
    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        inputs, maxlen=max_length, padding='post',
        value=tokenizer.vocabulary_['_PAD_'])

    return padded_inputs, outputs

In [18]:
inputs, outputs = tokenize_and_filter(df.word, df.entity)

In [19]:
print('encoded input : ', inputs[0], 'label : ', outputs[0], 'original input sentence : ', tokenizer.decode_from_list(inputs[0]))

encoded input :  [12  0  0  0  0  0  0  0  0  0] label :  0 original input sentence :  ['빨간색', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_']


In [20]:
# Number of samples per training batch.
BATCH_SIZE = 16
# Size of the shuffle buffer.
# NOTE(review): 7836 is an unexplained magic number — presumably carried over
# from a different dataset; confirm it is intended for this one.
BUFFER_SIZE = 7836

# Pair every padded id sequence with its integer label, one element per
# sample. (No decoder inputs or start tokens are involved in this pipeline.)
dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

# cache -> shuffle -> batch -> prefetch, applied in that order.
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [21]:
dataset

<PrefetchDataset shapes: ((None, 10), (None,)), types: (tf.int32, tf.int32)>

In [22]:
for x, y in dataset.take(1):
    print(x, y)
    print('-----------------------------------------------')
    print(x.shape, y.shape)

tf.Tensor(
[[ 54   0   0   0   0   0   0   0   0   0]
 [ 40   0   0   0   0   0   0   0   0   0]
 [ 62   0   0   0   0   0   0   0   0   0]
 [ 72   0   0   0   0   0   0   0   0   0]
 [103   0   0   0   0   0   0   0   0   0]
 [ 64   0   0   0   0   0   0   0   0   0]
 [ 76   0   0   0   0   0   0   0   0   0]
 [ 20   0   0   0   0   0   0   0   0   0]
 [ 93   0   0   0   0   0   0   0   0   0]
 [ 53   0   0   0   0   0   0   0   0   0]
 [ 77   0   0   0   0   0   0   0   0   0]
 [ 24   0   0   0   0   0   0   0   0   0]
 [ 28   0   0   0   0   0   0   0   0   0]
 [ 67   0   0   0   0   0   0   0   0   0]
 [100   0   0   0   0   0   0   0   0   0]
 [ 73   0   0   0   0   0   0   0   0   0]], shape=(16, 10), dtype=int32) tf.Tensor([1 1 1 2 1 1 2 0 0 1 2 0 0 2 1 2], shape=(16,), dtype=int32)
-----------------------------------------------
(16, 10) (16,)


# 4. Model design

In [23]:
print(len(label_to_id.values()))

3


In [24]:
def get_model(learning_rate=0.001):
    """Build and compile the entity classifier.

    Architecture: Embedding(tokenizer.n_vocabs, 64) -> Bidirectional LSTM(64)
    -> Dense(64, relu) -> Dense(len(label_to_id), softmax).

    Args:
        learning_rate: learning rate passed to the Adam optimizer. The
            default (0.001) matches the value that used to be hard-coded, so
            ``get_model()`` behaves as before; pass another value instead of
            re-compiling the returned model.

    Returns:
        A compiled ``tf.keras.Sequential`` model using sparse categorical
        cross-entropy loss and sparse categorical accuracy as its metric.
    """
    # NOTE(review): the Embedding layer is created without mask_zero, so the
    # '_PAD_' id used for padding is fed to the LSTM like any other token —
    # confirm this is intended.
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(tokenizer.n_vocabs, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        # One output unit per known entity label.
        tf.keras.layers.Dense(len(label_to_id), activation='softmax'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=[tf.keras.metrics.sparse_categorical_accuracy])
    return model

In [25]:
model = get_model()

In [26]:
# LEARNING_RATE = 0.001

In [27]:
# model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
#               loss=tf.keras.losses.sparse_categorical_crossentropy,
#               metrics=[tf.keras.metrics.sparse_categorical_accuracy])

In [29]:
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x262950cb3c8>

In [30]:
def question_processing(sentences, max_length=10):
    """Encode raw sentences into padded id sequences for prediction.

    Each sentence is encoded with ``tokenizer.encode_a_doc_to_list``. A
    sentence whose encoding exceeds ``max_length`` is reported on stdout and
    skipped, so the result can have FEWER rows than ``sentences``; callers
    that zip the original sentences with predictions must account for that.

    Args:
        sentences: iterable of raw sentences.
        max_length: maximum encoded length to accept, and the width the
            sequences are padded to. Defaults to 10, the value that used to
            be hard-coded inside this function.

    Returns:
        The array produced by ``pad_sequences``: one row per accepted
        sentence, post-padded with the ``'_PAD_'`` vocabulary id.
    """
    inputs = []
    for sentence in sentences:
        tokenized_sentence = tokenizer.encode_a_doc_to_list(sentence)
        if len(tokenized_sentence) <= max_length:
            inputs.append(tokenized_sentence)
        else:
            # Over-long input: tell the user and leave it out of the batch.
            print('입력이 너무 길어요.')

    # Right-pad every accepted sequence to exactly max_length ids.
    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        inputs, maxlen=max_length, padding='post',
        value=tokenizer.vocabulary_['_PAD_'])
    return padded_inputs

In [35]:
inputdata = ['빨간색','휴지','파랑색',
                                      '비','무지개색','오른쪽',
                                      '노란색','유니폼','상자','옆','위'
                                     ]
input_sentence = question_processing(inputdata)



In [32]:
model.predict(input_sentence)

array([[9.3053555e-01, 1.1451218e-04, 6.9349982e-02],
       [7.4300139e-08, 9.9999857e-01, 1.3500603e-06],
       [8.8910103e-01, 1.8148922e-04, 1.1071754e-01],
       [5.0711352e-02, 1.7850595e-02, 9.3143803e-01],
       [9.7390944e-01, 4.5281144e-05, 2.6045289e-02],
       [3.4667052e-02, 1.9966452e-03, 9.6333629e-01],
       [8.0459964e-01, 2.9667150e-04, 1.9510366e-01],
       [5.0711352e-02, 1.7850595e-02, 9.3143803e-01],
       [1.2995899e-07, 9.9999797e-01, 1.9009082e-06],
       [3.2586340e-02, 1.8653836e-03, 9.6554828e-01],
       [1.6502133e-02, 2.0438556e-03, 9.8145401e-01]], dtype=float32)

In [33]:
prediction = np.argmax(model.predict(input_sentence), axis=1)
print(prediction)

[0 1 0 2 0 2 0 2 1 2 2]


In [39]:
# Render one "<input> : <predicted label>" line per sample, in input order.
text = ''.join(
    sample + ' : ' + id_to_label[class_id] + '\n'
    for sample, class_id in zip(inputdata, prediction)
)

In [41]:
print(text)

빨간색 : color
휴지 : thing
파랑색 : color
비 : loc
무지개색 : color
오른쪽 : loc
노란색 : color
유니폼 : loc
상자 : thing
옆 : loc
위 : loc



In [34]:
text = ''

for p in prediction:
    print(id_to_label[p])

color
thing
color
loc
color
loc
color
loc
thing
loc
loc


In [33]:
# Save the trained model to an HDF5 file in the working directory.
model.save('entitytestmodel.h5')
# Load the model back from that file into a separate variable and print its
# layer summary.
tempmodel = keras.models.load_model('entitytestmodel.h5')
tempmodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          6720      
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 81,219
Trainable params: 81,219
Non-trainable params: 0
_________________________________________________________________


In [34]:
tempmodel.predict(input_sentence)

array([[9.5373678e-01, 8.5971355e-05, 4.6177287e-02],
       [5.6485320e-08, 9.9999642e-01, 3.6223596e-06],
       [9.3436652e-01, 1.3932100e-04, 6.5494195e-02],
       [9.9562109e-04, 9.2088032e-01, 7.8124106e-02],
       [9.4302708e-01, 1.3661891e-04, 5.6836307e-02],
       [3.1226311e-02, 4.0562011e-02, 9.2821169e-01],
       [8.8532829e-01, 3.3773883e-04, 1.1433393e-01],
       [9.9562109e-04, 9.2088032e-01, 7.8124106e-02],
       [9.6571647e-08, 9.9999440e-01, 5.5115884e-06],
       [2.4339160e-02, 5.0067291e-02, 9.2559355e-01],
       [2.3435676e-02, 6.0822763e-02, 9.1574150e-01]], dtype=float32)

In [35]:
list(np.argmax(model.predict(input_sentence), axis=1))

[0, 1, 0, 1, 0, 2, 0, 1, 1, 2, 2]

In [36]:
for p in prediction:
    print(id_to_label[p])

color
thing
color
thing
color
loc
color
thing
thing
loc
loc


In [37]:
del model

# 데이터 추가해보기

In [None]:
names = ['안중근', '이순신', '세종대왕', '김광석', '아이유', '에미넴', '이건희', '고아라', '유재석', '한석희', '최민성']
def question_generator(names):
    """Generate three templated "who is ...?" questions for every name.

    For each name, in input order, the name is concatenated with each of the
    three fixed Korean suffixes below (in that order).

    Args:
        names: iterable of name strings.

    Returns:
        A flat list with ``3 * len(names)`` question strings; an empty list
        when ``names`` is empty.
    """
    suffixes = (
        '는 어떤 분이야?',
        '은 어떤 사람이니?',
        '이란 사람에 대해 궁금해',
    )
    # Single pass that builds the result once, instead of re-allocating the
    # whole list on every iteration with `question = question + [...]`.
    return [name + suffix for name in names for suffix in suffixes]
question = question_generator(names)

In [None]:
question

In [None]:
# Wrap the generated questions in a DataFrame, all labelled '인물'.
# NOTE(review): the frame loaded from Entitytest.csv shows the columns
# `word` / `entity`, while this one uses `question` / `intent`, and the
# printed label_to_id has no '인물' key — this section looks carried over
# from a different dataset; confirm before running it.
new_data = {'question' : question, 'intent' : ['인물']*len(question)}
add_df = pd.DataFrame(new_data, columns=('question', 'intent'))

In [None]:
add_df.head(5)

In [None]:
print(len(df), len(add_df))

In [None]:
new_df = pd.concat([df, add_df])
print(len(new_df))

In [None]:
tokenizer.fit(new_df['question'].values)

In [None]:
# Encode the combined frame.
# NOTE(review): this reads `question` / `intent` columns, whereas the frame
# loaded from Entitytest.csv shows `word` / `entity` — confirm the schema.
new_inputs, new_outputs = tokenize_and_filter(new_df.question, new_df.intent)

# Same batch and shuffle-buffer sizes as the first training pipeline.
BATCH_SIZE = 16
BUFFER_SIZE = 7836

# Pair every padded id sequence with its integer label, one element per
# sample. (No decoder inputs or start tokens are involved in this pipeline.)
dataset = tf.data.Dataset.from_tensor_slices((new_inputs, new_outputs))

# cache -> shuffle -> batch -> prefetch, applied in that order.
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Build a fresh model. get_model() already returns a compiled model; it is
# compiled a second time here only to use a smaller Adam learning rate
# (0.0001) with the same loss and metric.
new_model = get_model()
LEARNING_RATE = 0.0001
new_model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])
# Train for 10 epochs on the rebuilt dataset.
new_model.fit(dataset, epochs=10)

In [None]:
input_sentence = question_processing(['서울 날씨 어때?', 
                                      '나는 전주 날씨 궁금함',
                                      '안중근 의사는 누구야?',
                                      '박소희는 어떤 사람인지 궁금해.',
                                      '명동 맛있는 음식점 있니?'
                                     ])

In [None]:
new_model.predict(input_sentence)

In [None]:
prediction = np.argmax(new_model.predict(input_sentence), axis=1)

In [None]:
for p in prediction:
    print(id_to_label[p])

In [216]:
from konlpy.tag import Kkma
from konlpy.utils import pprint
kkma = Kkma()

In [217]:

def 형태소분석(text):
    """Return the tokens of ``text`` that Kkma tags as 'NNG'.

    ``kkma.pos(text)`` is run once; for every returned item the first element
    is kept when the second element equals 'NNG'. Order is preserved.
    """
    return [token[0] for token in kkma.pos(text) if token[1] == 'NNG']

def entity분석(명사):
    """Classify each noun with ``tempmodel`` and map it to an entity name.

    The nouns are encoded with ``question_processing``, scored with
    ``tempmodel.predict`` and the arg-max class of each row is translated
    through a fixed table (0 -> 'color', 1 -> 'thing', 2 -> 'loc'). Both the
    nouns and the predicted class ids are printed.

    Args:
        명사: list of noun strings.

    Returns:
        dict mapping noun -> entity name. Nouns whose predicted class is not
        in the table are left out; a repeated noun keeps its last prediction.
        An empty input yields an empty dict without calling the model.
    """
    if len(명사) == 0:
        # Nothing to classify: skip encoding and prediction entirely rather
        # than handing the model an empty batch.
        print(명사)
        print([])
        return {}

    inputdata = question_processing(명사)
    prediction = list(np.argmax(tempmodel.predict(inputdata), axis=1))
    print(명사)
    print(prediction)

    # NOTE(review): question_processing skips over-long inputs, which would
    # shift predictions relative to the nouns in the zip below — presumably
    # not an issue for single nouns; confirm.
    entity_by_class = {0: 'color', 1: 'thing', 2: 'loc'}
    result = {}
    for noun, class_id in zip(명사, prediction):
        entity = entity_by_class.get(int(class_id))
        if entity is not None:
            result[noun] = entity
    return result


In [221]:
tempmodel = keras.models.load_model('entitytestmodel.h5')
noun = 형태소분석('빨간색 공책 녹색 상자 안에 넣어')
result = entity분석(noun)

['빨간색', '공책', '녹색', '상자', '안']
[0, 1, 0, 1, 2]


In [222]:
MAX_LENGTH = 10
def tokenize_by_char(words,labels):
    """Encode words character by character and labels as integer ids.

    Every character of a word is looked up in ``wordvoca``; words that yield
    more than ``MAX_LENGTH`` ids are dropped together with their label. The
    kept sequences are post-padded to ``MAX_LENGTH``.

    NOTE(review): ``wordvoca`` is not defined in the visible cells, and the
    padding id comes from ``tokenizer.char2idx`` rather than ``wordvoca`` —
    confirm both names exist and share the same id space.

    Returns:
        (padded_inputs, outputs): the array produced by ``pad_sequences`` and
        a list of integer label ids from ``label_to_id``.
    """
    inputs, outputs = [], []
    for word, label in zip(words, labels):
        # One vocabulary id per character, in order.
        char_ids = [wordvoca[ch] for ch in word]
        if len(char_ids) > MAX_LENGTH:
            continue
        inputs.append(char_ids)
        outputs.append(label_to_id[label])

    pad_id = tokenizer.char2idx['_PAD_']
    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        inputs, maxlen=MAX_LENGTH, padding='post', value=pad_id)

    return padded_inputs, outputs
            

{'빨간색': 'color', '공책': 'thing', '녹색': 'color', '상자': 'thing', '안': 'loc'}