In [1]:
from gensim import models
from gensim.models import FastText
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from konlpy.tag import Okt

In [2]:
from keras import backend as K

def recall(y_target, y_pred):
    """Recall metric: TP / (TP + FN), built from Keras backend ops.

    Both tensors are clipped to the range [0, 1] and rounded, so every
    element becomes 0 (negative) or 1 (positive) before counting.

    Args:
        y_target: tensor of ground-truth values.
        y_pred: tensor of predicted values.

    Returns:
        A single tensor value holding the recall.
    """
    # Binarize the ground truth and the predictions.
    actual_positive = K.round(K.clip(y_target, 0, 1))
    predicted_positive = K.round(K.clip(y_pred, 0, 1))

    # The element-wise product is 1 only where both are positive (true positives).
    true_positives = K.sum(actual_positive * predicted_positive)
    # Every actual positive is either a true positive or a false negative.
    all_actual_positives = K.sum(actual_positive)

    # K.epsilon() is a small constant added to avoid division by zero.
    return true_positives / (all_actual_positives + K.epsilon())


def precision(y_target, y_pred):
    """Precision metric: TP / (TP + FP), built from Keras backend ops.

    Both tensors are clipped to the range [0, 1] and rounded, so every
    element becomes 0 (negative) or 1 (positive) before counting.

    Args:
        y_target: tensor of ground-truth values.
        y_pred: tensor of predicted values.

    Returns:
        A single tensor value holding the precision.
    """
    # Binarize the predictions and the ground truth.
    predicted_positive = K.round(K.clip(y_pred, 0, 1))
    actual_positive = K.round(K.clip(y_target, 0, 1))

    # The element-wise product is 1 only where both are positive (true positives).
    true_positives = K.sum(actual_positive * predicted_positive)
    # Every predicted positive is either a true positive or a false positive.
    all_predicted_positives = K.sum(predicted_positive)

    # K.epsilon() is a small constant added to avoid division by zero.
    return true_positives / (all_predicted_positives + K.epsilon())

def f1score(y_target, y_pred):
    """F1 score: 2 * recall * precision / (recall + precision).

    Args:
        y_target: tensor of ground-truth values.
        y_pred: tensor of predicted values.

    Returns:
        A single tensor value holding the F1 score.
    """
    rec = recall(y_target, y_pred)
    prec = precision(y_target, y_pred)

    numerator = 2 * rec * prec
    # K.epsilon() is a small constant added to avoid division by zero.
    denominator = rec + prec + K.epsilon()
    return numerator / denominator

In [3]:
# Load the evaluation set: one example per line of the text file, with its
# integer label on the same line number of the label file.
examples = []
labels = []

# `with` closes both files even if a label fails to parse.
with open("800_text.txt", 'r', encoding='utf-8') as text_file, \
        open("800_label.txt", 'r', encoding='utf-8') as label_file:
    for textline in text_file:
        labelline = label_file.readline()
        # Strip only the line terminator. Slicing with [:-1] would cut off the
        # last real character when the final line has no trailing newline.
        examples.append(textline.rstrip("\n"))
        # int() ignores surrounding whitespace, so the newline needs no slicing.
        # If the label file runs out first, labelline is "" and int() raises
        # ValueError instead of silently misaligning texts and labels.
        labels.append(int(labelline))

In [4]:
import json

# Part-of-speech tags (second element of each Okt.pos() pair) whose tokens are
# kept; tokens carrying any other tag are dropped.
KEPT_POS_TAGS = ('Noun', 'Verb', 'VerbPrefix', 'Adjective', 'Determiner',
                 'Adverb', 'Exclamation', 'KoreanParticle')

okt = Okt()
ex_morpheme = []
for text in examples:
    # Analyse with normalization and stemming, keep only the whitelisted tags,
    # and rebuild a string in which every kept token is followed by one space.
    ex_morpheme.append("".join(
        word + " "
        for word, tag in okt.pos(text, norm=True, stem=True)
        if tag in KEPT_POS_TAGS
    ))

maxlen = 500          # length passed to pad_sequences
max_words = 2000001   # passed to Tokenizer as num_words
# NOTE(review): both values presumably have to match the ones used when the
# model was trained — confirm against the training notebook.
tokenizer = Tokenizer(num_words=max_words)

# Forward slashes work on every platform; the previous 'model\w...' literal
# relied on an invalid escape sequence and only resolved on Windows.
# NOTE(review): the file is opened with the platform default encoding, unlike
# the other files in this notebook (utf-8) — confirm how the JSON was written.
with open('model/wordIndex_abusive.json') as json_file:
    word_index = json.load(json_file)
    tokenizer.word_index = word_index

# Deliberately no tokenizer.fit_on_texts() call here: fitting rebuilds
# tokenizer.word_index from scratch and would overwrite the saved mapping.
# texts_to_sequences() only needs word_index to be set.
sequences = tokenizer.texts_to_sequences(ex_morpheme)
x_test = pad_sequences(sequences, maxlen=maxlen)

In [5]:
# Load the saved model. The metric functions defined in this notebook are
# passed as custom_objects so Keras can resolve them by name while loading.
# Forward slashes work on every platform; the previous backslash literal
# relied on invalid escape sequences ('\c', '\m') and only resolved on Windows.
model = load_model(
    'model/cnn-lstm/model_abusive_cl.h5',
    custom_objects={"precision": precision, "recall": recall, "f1score": f1score},
)
value_predicted = model.predict(x_test)

# Abusive-language score per example: the first output value of each
# prediction row, expressed as a percentage rounded to one decimal place.
labels_c1 = [round(row[0] * 100, 1) for row in value_predicted]


In [6]:
# Write every example with its true label and predicted score, one per line,
# as "<text> | <label> | <score>". `with` closes the file even if a write fails.
with open("test_text123.txt", 'w', encoding='utf-8') as f3:
    for text, label, score in zip(examples, labels, labels_c1):
        f3.write(text + " | " + str(label) + " | " + str(score) + "\n")


In [7]:
# Score (in percent) at or above which an example counts as predicted positive.
POSITIVE_THRESHOLD = 50

# Split the misclassified examples into two files, one line per example in the
# form "<text> | <label> | <score>":
#   - false positives: label 0 but score at or above the threshold
#   - false negatives: label 1 but score below the threshold
# `with` closes both files even if a write fails.
with open("test_false_positive.txt", 'w', encoding='utf-8') as f4, \
        open("test_false_negative.txt", 'w', encoding='utf-8') as f5:
    for text, label, score in zip(examples, labels, labels_c1):
        input_text = text + " | " + str(label) + " | " + str(score) + "\n"
        if label == 0 and score >= POSITIVE_THRESHOLD:
            f4.write(input_text)
        elif label == 1 and score < POSITIVE_THRESHOLD:
            f5.write(input_text)