# 1. Import libraries

In [1]:
try:
  # Colab only: line magic that requests the TensorFlow 2.x runtime.
  %tensorflow_version 2.x
except Exception:
    # Best effort: if the magic raises (e.g. outside Colab), keep whatever
    # TensorFlow version is already installed.
    pass

In [2]:
# !pip install konlpy

In [3]:
# !pip install gensim

In [4]:
# If you are running on Colab, uncomment the lines below to fetch the project files.
# !git clone https://github.com/hukim1112/comment_classifier.git
# import os
# os.chdir('/content/comment_classifier')

In [5]:
import tensorflow as tf
from konlpy.tag import Okt
from collections import Counter
import pandas as pd
import numpy as np
import gensim
from matplotlib import pyplot as plt


# Short alias for the Keras API bundled with TensorFlow.
keras = tf.keras
# KoNLPy Okt tagger instance; its `morphs` method is passed to the vectorizer below.
t = Okt()

# 2. Fit the tokenizer to our dataset

In [9]:
# `vectorizer` is not a standard package — presumably it ships with the
# comment_classifier repository referenced above; confirm before running.
from vectorizer import BaseVectorizer
# The vectorizer receives Okt's `morphs` callable — presumably used to split
# each sentence into tokens; confirm in vectorizer.py.
tokenizer = BaseVectorizer(t.morphs)

In [10]:
# Load the intent training data; the file is decoded as CP949 (Korean code page).
df = pd.read_csv('train_intent.csv',encoding='CP949')

In [11]:
df[0:1]

Unnamed: 0,question,intent
0,인터넷 켜줘,internet


In [12]:
tokenizer.fit(df['question'].values)

scanning was done                                        
64 terms are recognized


<vectorizer.BaseVectorizer at 0x237f8a4d708>

In [13]:
tokenizer.vocabulary_

{'_PAD_': 0,
 '_UNK_': 1,
 '_STA_': 2,
 '_EOS_': 3,
 '인터넷': 4,
 '그림': 5,
 '봐': 6,
 '켜': 7,
 '좀': 8,
 '그리고': 9,
 '나': 10,
 '뭐': 11,
 '그릴': 12,
 '그림판': 13,
 '할래': 14,
 '크롬': 15,
 '틀어': 16,
 '그': 17,
 '리자': 18,
 '싶어': 19,
 '켜줘': 20,
 '이나': 21,
 '한': 22,
 '래': 23,
 '컴퓨터': 24,
 '로': 25,
 '알아보게': 26,
 '구글': 27,
 '음': 28,
 '이': 29,
 '싶다': 30,
 '만': 31,
 '거': 32,
 '오늘': 33,
 '은': 34,
 '그려': 35,
 '보자': 36,
 '지금': 37,
 '심심한데': 38,
 '홈페이지': 39,
 '띄워': 40,
 '하자': 41,
 '번': 42,
 '할거야': 43,
 '그릴거야': 44,
 '틀어줘': 45,
 '그려야': 46,
 '징': 47,
 '그려야지': 48,
 '하고싶은데': 49,
 '하고': 50,
 '할': 51,
 '수': 52,
 '있나': 53,
 '하고싶네': 54,
 '할까': 55,
 '웹서핑': 56,
 '검색': 57,
 '하게': 58,
 '그리게': 59,
 '싶은데': 60,
 '싶네': 61,
 '아': 62,
 '그려야겠다': 63}

# 3. data preprocessing

In [14]:
# Number the distinct intents in order of first appearance, then invert that
# table so labels can be looked up in both directions.
id_to_label = dict(enumerate(df.intent.unique()))
label_to_id = {label: index for index, label in id_to_label.items()}

In [15]:
print(label_to_id)

{'internet': 0, 'paint': 1}


In [16]:
# df.intent = df.intent.map(lambda x : label_index[x])
# print(df.head(10))

In [17]:
MAX_LENGTH = 40


def tokenize_and_filter(sentences, labels):
    """Encode labelled sentences and pad them to a fixed width.

    Each sentence is turned into a list of token ids with the global
    ``tokenizer``. Pairs whose encoded sentence is longer than
    ``MAX_LENGTH`` are dropped without notice.

    Args:
        sentences: iterable of raw sentences.
        labels: iterable of intent names, aligned with ``sentences``;
            each kept name is looked up in the global ``label_to_id``.

    Returns:
        A tuple ``(padded_inputs, outputs)``: the array produced by
        ``pad_sequences`` (post-padded with the ``_PAD_`` id up to
        ``MAX_LENGTH``) and a plain list of integer label ids for the
        sentences that were kept.
    """
    kept_token_ids, kept_label_ids = [], []

    for text, intent in zip(sentences, labels):
        token_ids = tokenizer.encode_a_doc_to_list(text)

        # Over-long sentences are skipped together with their label.
        if len(token_ids) > MAX_LENGTH:
            continue

        kept_token_ids.append(token_ids)
        kept_label_ids.append(label_to_id[intent])

    # Right-pad every kept sequence with the padding id.
    pad_id = tokenizer.vocabulary_['_PAD_']
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        kept_token_ids, maxlen=MAX_LENGTH, padding='post', value=pad_id)

    return padded, kept_label_ids

In [18]:
inputs, outputs = tokenize_and_filter(df.question, df.intent)

In [19]:
print('encoded input : ', inputs[0], 'label : ', outputs[0], 'original input sentence : ', tokenizer.decode_from_list(inputs[0]))

encoded input :  [ 4 20  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0] label :  0 original input sentence :  ['인터넷', '켜줘', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_']


In [20]:
BATCH_SIZE = 16
BUFFER_SIZE = 7836

# Input pipeline over (padded token ids, label id) pairs:
# cache the elements, shuffle them, group them into batches and prefetch
# upcoming batches while the current one is being consumed.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [21]:
dataset

<PrefetchDataset shapes: ((None, 40), (None,)), types: (tf.int32, tf.int32)>

In [22]:
# Pull a single batch from the pipeline and print its contents and shapes
# as a quick sanity check.
for x, y in dataset.take(1):
    print(x, y)
    print('-----------------------------------------------')
    print(x.shape, y.shape)

tf.Tensor(
[[ 4 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4 43  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 4 51 52 53  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [57 58  4  8  7  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5 48  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [12 31 22 32 16  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5 44  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [13  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0

# 4. model design

In [23]:
print(len(label_to_id.values()))

2


In [24]:
def get_model(learning_rate=0.001, num_classes=None):
    """Build and compile the intent classifier.

    Architecture: Embedding(vocab, 64) -> Bidirectional LSTM(64) ->
    Dense(64, relu) -> Dense(num_classes, softmax).

    Args:
        learning_rate: step size handed to the Adam optimizer. Defaults to
            0.001, the value that used to be hard-coded here.
        num_classes: width of the softmax output. When None (default) it is
            taken from the global ``label_to_id`` at call time, as before.

    Returns:
        A compiled ``tf.keras.Sequential`` model using sparse categorical
        cross-entropy loss and sparse categorical accuracy, so labels are
        expected as integer ids rather than one-hot vectors.
    """
    if num_classes is None:
        num_classes = len(label_to_id)

    # NOTE(review): the Embedding is created without `mask_zero`, so the LSTM
    # also reads the trailing _PAD_ ids of each padded sentence.
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(tokenizer.n_vocabs, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=[tf.keras.metrics.sparse_categorical_accuracy])
    return model

In [25]:
model = get_model()

In [26]:
# LEARNING_RATE = 0.001

In [27]:
# model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
#               loss=tf.keras.losses.sparse_categorical_crossentropy,
#               metrics=[tf.keras.metrics.sparse_categorical_accuracy])

In [38]:
# Train for 10 passes over the batched dataset; the returned History object
# is what the cell displays.
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x23790f115c8>

In [29]:
def question_processing(sentences):
    """Encode raw sentences for inference and pad them to ``MAX_LENGTH``.

    Sentences whose encoding exceeds ``MAX_LENGTH`` are reported with a
    printed message and left out, so the result can have fewer rows than
    ``sentences`` has items.

    Args:
        sentences: iterable of raw sentences.

    Returns:
        The array produced by ``pad_sequences``: one post-padded row of
        token ids per accepted sentence.
    """
    encoded = []
    for text in sentences:
        token_ids = tokenizer.encode_a_doc_to_list(text)
        if len(token_ids) > MAX_LENGTH:
            # Too long for the model input: tell the user and skip it.
            print('입력이 너무 길어요.')
            continue
        encoded.append(token_ids)

    # Right-pad with the padding id so every row has the same width.
    pad_id = tokenizer.vocabulary_['_PAD_']
    return tf.keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=MAX_LENGTH, padding='post', value=pad_id)

In [47]:
# Encode a handful of hand-written probe sentences; the result is one padded
# row of token ids per accepted sentence.
input_sentence = question_processing(['나 인터넷 해볼래', 
                                      '나 그림 그릴래',
                                      '나는 아무것도 하기 싫어',
                                      '영화 보게 아무거나 켜봐',
                                      '뀨잉'
                                     ])

In [48]:
model.predict(input_sentence)

array([[9.7766429e-01, 2.2335680e-02],
       [4.8151938e-05, 9.9995184e-01],
       [2.4778335e-01, 7.5221664e-01],
       [9.5357037e-01, 4.6429671e-02],
       [2.5614837e-01, 7.4385160e-01]], dtype=float32)

In [49]:
# For each row, take the position of the largest class probability as the
# predicted label id.
prediction = np.argmax(model.predict(input_sentence), axis=1)
print(prediction)

[0 1 1 0 1]


In [50]:
# Translate each predicted label id back to its intent name.
for p in prediction:
    print(id_to_label[p])

internet
paint
paint
internet
paint


In [36]:
# Save the model
# NOTE(review): the recorded output of this cell ends in an ImportError about
# h5py, so the HDF5 round trip needs h5py installed in the environment.
model.save('my_model.h5')
# Load the model back
tempmodel = keras.models.load_model('my_model.h5')
tempmodel.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          4096      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 130       
Total params: 78,530
Trainable params: 78,530
Non-trainable params: 0
_________________________________________________________________


ImportError: `load_weights` requires h5py when loading weights from HDF5.

In [70]:
# Drop the first model; the cells below build and train `new_model` instead.
del model

# 5. 데이터 추가해보기 (Adding more data)

In [None]:
names = ['안중근', '이순신', '세종대왕', '김광석', '아이유', '에미넴', '이건희', '고아라', '유재석', '한석희', '최민성']
def question_generator(names):
    """Create three "who is this person?" questions for every name.

    Args:
        names: iterable of name strings.

    Returns:
        A flat list with three questions per name, grouped by name in input
        order. Each question is the name followed by one fixed suffix; the
        suffixes are appended as-is, without adapting the particle to the
        name's final sound.
    """
    suffixes = ('는 어떤 분이야?', '은 어떤 사람이니?', '이란 사람에 대해 궁금해')
    # One comprehension builds the list in a single pass; the previous
    # `question = question + [...]` copied the whole list for every name.
    return [name + suffix for name in names for suffix in suffixes]
question = question_generator(names)

In [None]:
question

In [None]:
# Give every generated question the same new intent label, '인물'.
new_data = {'question' : question, 'intent' : ['인물']*len(question)}
# Build a frame with the question/intent column names that df also uses.
add_df = pd.DataFrame(new_data, columns=('question', 'intent'))

In [None]:
add_df.head(5)

In [None]:
print(len(df), len(add_df))

In [None]:
# Stack the original and the generated rows. ignore_index=True renumbers the
# result 0..n-1; without it the rows from add_df would repeat index labels
# already used by df. Columns that exist only in df (e.g. 'Unnamed: 0') are
# filled with NaN for the added rows.
new_df = pd.concat([df, add_df], ignore_index=True)
print(len(new_df))

In [None]:
# Fit the tokenizer again, this time on the combined questions.
# NOTE(review): whether fit() rebuilds or extends vocabulary_ depends on
# BaseVectorizer — confirm that existing token ids behave as expected.
tokenizer.fit(new_df['question'].values)

In [None]:
# The label tables built earlier only contain the intents found in df. The
# combined data adds the '인물' intent, so without rebuilding them the lookup
# inside tokenize_and_filter raises KeyError, get_model() sizes its output
# layer for the old class count, and predictions cannot be mapped back.
id_to_label = dict(enumerate(new_df.intent.unique()))
label_to_id = {label: index for index, label in id_to_label.items()}

new_inputs, new_outputs = tokenize_and_filter(new_df.question, new_df.intent)

BATCH_SIZE = 16
BUFFER_SIZE = 7836

# Input pipeline over (padded token ids, label id) pairs:
# cache, shuffle, batch, then prefetch upcoming batches.
dataset = tf.data.Dataset.from_tensor_slices((new_inputs, new_outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# get_model() returns an already compiled model; compiling again here replaces
# that configuration with Adam at the smaller learning rate below (same loss
# and metric).
# NOTE(review): get_model() sizes the output layer from label_to_id — make
# sure that table includes the newly added intent before this cell runs.
new_model = get_model()
LEARNING_RATE = 0.0001
new_model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])
new_model.fit(dataset, epochs=10)

In [None]:
# Encode a new set of probe sentences for the retrained model.
# NOTE(review): several probes (weather, restaurants) have no matching intent
# among the labels trained in this notebook (internet, paint, 인물), so the
# model can only map them onto one of those classes.
input_sentence = question_processing(['서울 날씨 어때?', 
                                      '나는 전주 날씨 궁금함',
                                      '안중근 의사는 누구야?',
                                      '박소희는 어떤 사람인지 궁금해.',
                                      '명동 맛있는 음식점 있니?'
                                     ])

In [None]:
new_model.predict(input_sentence)

In [None]:
# Predicted label id per sentence = position of the highest class probability.
prediction = np.argmax(new_model.predict(input_sentence), axis=1)

In [None]:
# Show the intent name for each predicted id.
# NOTE(review): id_to_label must cover every class new_model can output,
# including the added '인물' intent — confirm it was built from new_df.
for p in prediction:
    print(id_to_label[p])