<a href="https://colab.research.google.com/github/hyxxnii/Tave-6th-Project/blob/master/NIPA%20%EB%B3%B8%EC%84%A0%20-%20%EC%9C%A1%EA%B5%B0%20%EB%AF%BC%EC%9B%90%20%EB%B6%84%EB%A5%98%20%EB%AA%A8%EB%8D%B8_try2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

# Tab-separated input files used by this notebook (held-out split first, then training split).
test_path = '../data/.train/.task145/data/test.tsv'
train_path = '../data/.train/.task145/data/train.tsv'

In [None]:
# Parse the tab-delimited training file into a DataFrame and preview the first rows.
train_df = pd.read_csv(train_path, delimiter='\t')
train_df.head()

In [None]:
def read_documents(filename):
    """Read a UTF-8, tab-separated file and return its data rows.

    The first line is treated as the column header and is discarded.
    Every remaining non-empty line is split on tab characters.

    Rows are delimited by real line endings only. ``str.splitlines()`` is
    deliberately avoided because it also breaks on characters such as
    U+2028, ``\\x0b`` or ``\\x0c``, which can occur inside free-text fields
    and would tear one record into several malformed rows.

    Parameters
    ----------
    filename : str or path-like
        Path of the TSV file to read.

    Returns
    -------
    list[list[str]]
        One list of field strings per data row; empty when the file holds
        no data rows.
    """
    # An explicit encoding is required on Windows, whose default is not UTF-8.
    with open(filename, encoding="utf-8") as f:
        next(f, None)  # drop the header line (no-op for an empty file)
        rows = (line.rstrip("\n") for line in f)
        # Blank lines carry no record, so they are skipped.
        return [row.split("\t") for row in rows if row]

# Load the held-out rows and label the two fields while building the frame.
test = read_documents(test_path)
test_df = pd.DataFrame(test, columns=["comment", "tag"])
test_df.head()

In [None]:
import konlpy 
from konlpy.tag import Mecab, Kkma, Okt, Komoran
import json
import os
import re
from pprint import pprint

def text_cleaning(doc):
    """Return `doc` with every character removed except Hangul and spaces.

    Hangul consonant jamo (ㄱ-ㅎ), vowel jamo (ㅏ-ㅣ), complete syllables
    (가-힣) and the space character are kept; everything else is deleted.
    """
    non_korean = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")
    return non_korean.sub("", doc)

def define_stopwords(path):
    """Load a stop-word list (one word per line, UTF-8) into a set.

    Surrounding whitespace, including the trailing newline of each line,
    is stripped so the entries compare equal to plain tokens. Without the
    strip every entry would end in ``"\\n"`` and membership tests against
    tokens would never match. Blank lines are ignored.

    Parameters
    ----------
    path : str or path-like
        Location of the stop-word file.

    Returns
    -------
    set[str]
        The distinct, non-empty stop words found in the file.
    """
    with open(path, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return {word for word in stripped if word}

# okt.morphs(sentence)

def text_tokenizing(doc):
    """Split `doc` into morphemes with the module-level `mecab` analyzer.

    A morpheme is kept only when it is longer than one character and is
    not a member of the module-level stop-word collection `SW`.
    """
    kept = []
    for morph in mecab.morphs(doc):
        if len(morph) > 1 and morph not in SW:
            kept.append(morph)
    return kept

In [None]:
import konlpy 
from konlpy.tag import Mecab, Kkma, Okt, Komoran
import json
import os
import re
from pprint import pprint

# Load the morphological analyzers.
okt = Okt()  # kept as a fallback for environments where the Mecab install failed
mecab = Mecab()

# Stop-word collection consulted by text_tokenizing().
SW = define_stopwords("./stopwords-ko.txt")

# Strip non-Korean characters from every training comment.
# The whole column is assigned in one step instead of writing through
# train_df['comment'][i]: that chained assignment may write to a temporary
# copy (the write is lost under pandas copy-on-write) and it also assumes the
# index labels are exactly 0..n-1.
train_df['comment'] = train_df['comment'].map(text_cleaning)

## Deep Neural Network로 분류하기

In [None]:
# 필요한 라이브러리 불러오기
import numpy as np
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences 

## Set Hyperparameters

In [None]:
# Text-encoding settings
max_words = 35_000  # vocabulary cap handed to the Keras Tokenizer (acts as feature selection)
max_len = 20        # maximum document length, in tokens, after padding/truncation

# Training settings
EPOCHS = 10
batch_size = 16

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the remaining columns, then hold out 20% of
# the rows with a fixed seed so the split is reproducible.
X_features = train_df.drop(columns='tag')
y_target = train_df['tag']
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.2,
    random_state=156,
)

In [None]:
# Tokenize every comment of each split, preserving row order.
train_words = [text_tokenizing(comment) for comment in X_train['comment']]
test_words = [text_tokenizing(comment) for comment in X_test['comment']]

In [None]:
# Build the vocabulary from the training tokens only; the tokenizer must be
# fitted before it can convert texts to sequences.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_words)

# Convert token lists to integer id sequences (the input format for the LSTM).
tokenizer_train_words, tokenizer_test_words = (
    tokenizer.texts_to_sequences(docs) for docs in (train_words, test_words)
)

# Pad every sequence to max_len so all rows have the same size.
# padding='pre' fills from the front; the fill value is left at its default.
X_train, X_test = (
    pad_sequences(seqs, maxlen=max_len, padding='pre')
    for seqs in (tokenizer_train_words, tokenizer_test_words)
)

# Alternative input pipeline, currently unused (ds = data structure).
# Shuffles the training set with a 10000-element buffer; the test set needs no shuffling.
# train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(10000).batch(batch_size)
# test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

In [None]:
def _one_hot(labels, num_classes=3):
    """Return a (len(labels), num_classes) integer one-hot matrix.

    Row ``i`` has a single 1 in column ``labels[i]``. Labels outside
    ``0 .. num_classes - 1`` raise ``ValueError`` rather than being skipped,
    because a skipped row would silently misalign targets and features.
    """
    idx = np.asarray(labels, dtype=int)
    if idx.size and (idx.min() < 0 or idx.max() >= num_classes):
        raise ValueError(
            "labels must lie in [0, %d); got values in [%d, %d]"
            % (num_classes, idx.min(), idx.max())
        )
    return np.eye(num_classes, dtype=int)[idx]


# Encode each split from its own labels. The previous test loop read
# y_test.iloc[0] on every iteration, so every Y_TEST row repeated the
# encoding of the first test label.
Y_TRAIN = _one_hot(y_train)
Y_TEST = _one_hot(y_test)

## Set Model 

In [None]:
from keras import backend as K

def recall(y_target, y_pred):
    """Recall computed over every element of the given tensors.

    Both tensors are clipped to [0, 1] and rounded, so each element becomes
    0 (negative) or 1 (positive). The result is
    TP / (TP + FN + epsilon), where TP counts elements that are 1 in both
    tensors and TP + FN is the number of 1s in `y_target`. `K.epsilon()`
    guards against division by zero. Returns a single scalar tensor.
    """
    actual = K.round(K.clip(y_target, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))

    # Elements that are positive in both the target and the prediction.
    true_positives = K.sum(actual * predicted)
    # Every element that is positive in the target (TP + FN).
    actual_positives = K.sum(actual)

    return true_positives / (actual_positives + K.epsilon())


def precision(y_target, y_pred):
    """Precision computed over every element of the given tensors.

    Both tensors are clipped to [0, 1] and rounded, so each element becomes
    0 (negative) or 1 (positive). The result is
    TP / (TP + FP + epsilon), where TP counts elements that are 1 in both
    tensors and TP + FP is the number of 1s in `y_pred`. `K.epsilon()`
    guards against division by zero. Returns a single scalar tensor.
    """
    predicted = K.round(K.clip(y_pred, 0, 1))
    actual = K.round(K.clip(y_target, 0, 1))

    # Elements that are positive in both the target and the prediction.
    true_positives = K.sum(actual * predicted)
    # Every element that is positive in the prediction (TP + FP).
    predicted_positives = K.sum(predicted)

    return true_positives / (predicted_positives + K.epsilon())


def f1score(y_target, y_pred):
    """F1 score: harmonic mean of `recall` and `precision` for the same inputs.

    `K.epsilon()` is added to the denominator so the result is defined when
    both recall and precision are zero. Returns a single scalar tensor.
    """
    r = recall(y_target, y_pred)
    p = precision(y_target, y_pred)
    numerator = 2 * r * p
    return numerator / (r + p + K.epsilon())

In [None]:
# Embedding -> LSTM -> Dropout -> 3-way softmax classifier.
model = Sequential()
model.add(Embedding(max_words, 100))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[recall, precision, f1score])

# Early stopping watches the validation F1 score, so:
#   * fit() must be given validation data (validation_split below); without it
#     "val_f1score" is never produced and the callback can do nothing;
#   * mode="max" is required because F1 improves upward and the default "auto"
#     mode does not recognise this custom metric name as one to maximise;
#   * patience must be smaller than EPOCHS to ever trigger (it was 10 of 10).
earlystopper = tf.keras.callbacks.EarlyStopping(
    monitor="val_f1score",
    mode="max",
    patience=3,
    restore_best_weights=True,
    verbose=1,
)
history = model.fit(
    X_train,
    Y_TRAIN,
    epochs=EPOCHS,
    batch_size=batch_size,
    validation_split=0.1,  # hold out the last 10% of the training rows for monitoring
    callbacks=[earlystopper],
)



In [None]:
# Run inference on the held-out split and keep the class probabilities.
predict = model.predict(x=X_test)
predict

In [None]:
# Predicted class index for each row: position of the largest probability.
result = list(map(np.argmax, predict))
result

In [None]:
# Evaluate on the held-out split. The shared `batch_size` hyperparameter is
# used instead of a second hard-coded 16 so training and evaluation stay in sync.
# The returned list holds the loss followed by the metrics passed to compile().
loss_and_metrics = model.evaluate(X_test, Y_TEST, batch_size=batch_size)
print('## evaluation loss and_metrics ##')
print(loss_and_metrics)