In [11]:
# Okt tagger instance; later cells call okt.pos(...) on each text to get (word, tag) pairs.
from konlpy.tag import Okt
okt = Okt()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# maxlen is passed to pad_sequences(..., maxlen=maxlen) when the test data is vectorised.
maxlen = 50
# max_words is passed to the Tokenizer as num_words.
max_words = 2000001

# The tokenizer is created without fitting here; a later cell assigns its
# word_index from a JSON file.
tokenizer = Tokenizer(num_words=max_words)

import json

In [12]:

from keras import backend as K
def recall(y_target, y_pred):
    """Recall metric built from Keras backend ops: TP / (TP + FN).

    Both tensors are clipped to the range [0, 1] and rounded, so every
    element becomes 0 (negative) or 1 (positive) before counting.

    Args:
        y_target: tensor of actual (ground-truth) values.
        y_pred: tensor of predicted values.

    Returns:
        A single tensor value holding the recall.
    """
    actual = K.round(K.clip(y_target, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))

    # The element-wise product is 1 only where both actual and predicted
    # are 1, so its sum is the number of true positives.
    true_positives = K.sum(actual * predicted)

    # Every actual positive is either a true positive or a false negative.
    actual_positives = K.sum(actual)

    # K.epsilon() is added to the denominator to avoid division by zero.
    return true_positives / (actual_positives + K.epsilon())


def precision(y_target, y_pred):
    """Precision metric built from Keras backend ops: TP / (TP + FP).

    Both tensors are clipped to the range [0, 1] and rounded, so every
    element becomes 0 (negative) or 1 (positive) before counting.

    Args:
        y_target: tensor of actual (ground-truth) values.
        y_pred: tensor of predicted values.

    Returns:
        A single tensor value holding the precision.
    """
    predicted = K.round(K.clip(y_pred, 0, 1))
    actual = K.round(K.clip(y_target, 0, 1))

    # The element-wise product is 1 only where both actual and predicted
    # are 1, so its sum is the number of true positives.
    true_positives = K.sum(actual * predicted)

    # Every predicted positive is either a true positive or a false positive.
    predicted_positives = K.sum(predicted)

    # K.epsilon() is added to the denominator to avoid division by zero.
    return true_positives / (predicted_positives + K.epsilon())


def f1score(y_target, y_pred):
    """F1 score: 2 * recall * precision / (recall + precision).

    Delegates to recall() and precision() with the same arguments.

    Args:
        y_target: tensor of actual (ground-truth) values.
        y_pred: tensor of predicted values.

    Returns:
        A single tensor value holding the F1 score.
    """
    rec = recall(y_target, y_pred)
    prec = precision(y_target, y_pred)

    numerator = 2 * rec * prec
    # K.epsilon() is added to the denominator to avoid division by zero.
    denominator = rec + prec + K.epsilon()

    return numerator / denominator


In [13]:
import json

# Category names noted by the author: abusive, slang, sexual.
# This cell loads the 'sexual' word index and model.

# Forward slashes are used in the paths: the previous backslash literals
# ('model\w...', '...\c...', '...\m...') relied on invalid escape sequences
# being passed through unchanged and only resolved on Windows.
with open('model/wordIndex_sexual.json') as json_file:
    word_index = json.load(json_file)
    # Reuse the stored vocabulary instead of fitting the tokenizer again.
    tokenizer.word_index = word_index

from keras.models import load_model
# The custom metric functions must be supplied so the saved model can be
# deserialised by name.
model = load_model(
    'model/cnn-lstm/model_sexual_cl.h5',
    custom_objects={"precision": precision, "recall": recall, "f1score": f1score},
)

In [14]:
texts = []
labels = []

# Read the test sentences and their labels in lockstep, one per line.
# The `with` block closes both files even if parsing fails part-way through.
# Forward slashes keep the relative paths valid on every platform.
with open("dataset/sexual_testing_text.txt", 'r', encoding='utf-8') as f1, \
        open("dataset/sexual_testing_label.txt", 'r', encoding='utf-8') as f2:
    for textline in f1:
        labelline = f2.readline()
        # Strip only the line terminator: slicing with [:-1] would drop a real
        # character when the final line has no trailing newline.
        texts.append(textline.rstrip('\n'))
        # int() ignores surrounding whitespace and still raises ValueError if
        # the label file runs out before the text file does.
        labels.append(int(labelline))

In [15]:
morpheme = []

# For every sentence keep only the words whose POS tag is in the list below,
# then store them as one string in which each kept word is followed by a space.
for text in texts:
    kept_words = [
        word
        for word, tag in okt.pos(text, norm=True, stem=True)
        if tag in ['Noun', 'Verb', 'VerbPrefix', 'Adjective', 'Determiner', 'Adverb', 'Exclamation', 'KoreanParticle']
    ]
    morpheme.append("".join(word + " " for word in kept_words))

In [16]:
# tokenizer.word_index already holds the vocabulary loaded from the JSON file.
# The tokenizer is deliberately NOT re-fitted here: fit_on_texts() rebuilds
# word_index from scratch and could renumber the loaded vocabulary.
sequences = tokenizer.texts_to_sequences(morpheme)
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)

# Each padded sequence must have exactly one label, otherwise the shuffled
# indexing below would silently misalign (or fail on) the two arrays.
assert data.shape[0] == labels.shape[0], "number of texts and labels differ"

# Shuffle samples and labels with the same permutation so they stay paired.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
x_test = data[indices]
y_test = labels[indices]

In [17]:
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import f1_score 


# Compile the loaded model with accuracy plus the custom precision / recall /
# f1score metrics; evaluate() then returns [loss, accuracy, precision, recall, f1score].
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', precision, recall, f1score],
)

In [18]:
value_predicted = model.predict(x_test)
# First output value of every sample, scaled to a percentage with one decimal.
labels_c1 = [
    round(value_predicted[row][0] * 100, 1)
    for row in range(len(x_test))
]

In [19]:
# evaluate() returns the loss followed by the metrics in the order given to compile().
_loss, _acc, _precision, _recall, _f1score = model.evaluate(x_test, y_test)
print(
    'loss: {:.3f}, accuracy: {:.3f}, precision: {:.3f}, recall: {:.3f}, f1score: {:.3f}'.format(
        _loss,
        _acc,
        _precision,
        _recall,
        _f1score,
    )
)

loss: 0.327, accuracy: 0.877, precision: 0.881, recall: 0.871, f1score: 0.872


In [20]:
value_evaluated = model.evaluate(x_test, y_test)
# Index 1 is the accuracy (index 0 is the loss); shown as a percentage with two decimals.
accuracy_percent = round(value_evaluated[1]*100, 2)
# Prints the Korean sentence "The accuracy is <N> %."
print("정확도는", accuracy_percent, "%입니다.")

정확도는 87.75 %입니다.
