# Import Libraries


In [None]:
import pandas as pd
import numpy as np
import pickle
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

In [None]:
import urllib.request
from collections import Counter
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# !pip install kss
# import kss

In [None]:
# !pip install imbalanced-learn
# !pip install -U scikit-learn
# !pip install -U imbalanced-learn
# !pip install delayed

In [None]:
# Mount Google Drive (Colab only) so the dataset under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

# Load Data

In [None]:
# Read the preprocessed dataset from Drive and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call — confirm).
csv_path = "/content/drive/MyDrive/쿠아이 컨퍼런스/preprocessing_final1.csv"
data = pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])

In [None]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

# Preprocessing by input shape

In [None]:
# from imblearn.under_sampling import *

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 42)

In [None]:
# Stopword definitions: morphemes removed from every tokenized text.
# Stored as a set so each membership test is O(1) and duplicates cannot occur
# (the previous list contained '듯' twice).
# NOTE: the "version 2" import cell later rebinds the name `stopwords`
# (`from nltk.corpus import stopwords`); re-run this cell before re-tokenizing.
stopwords = {
    '도', '는', '다', '의', '가', '이', '은', '한', '에', '하',
    '고', '을', '를', '인', '듯', '과', '와', '네', '들', '지',
    '임', '게', '만', '게임', '겜', '되', '음', '면',
}

In [None]:
mecab = Mecab()

# Hash-based lookup: one O(1) membership test per morpheme instead of a list scan.
_stopword_set = set(stopwords)


def _tokenize(text):
    """Split ``text`` into morphemes with Mecab and drop the stopwords."""
    return [morph for morph in mecab.morphs(text) if morph not in _stopword_set]


# `assign` returns a new DataFrame, so the frames produced by train_test_split
# are not modified in place; in-place column assignment on those frames can
# trigger pandas' SettingWithCopyWarning.
train_data = train_data.assign(tokenized=train_data['content'].apply(_tokenize))
test_data = test_data.assign(tokenized=test_data['content'].apply(_tokenize))

In [None]:
# Pull the token lists of each split out as arrays (the split itself was done earlier).

X_train = train_data['tokenized'].values
X_test= test_data['tokenized'].values


In [None]:
# Name of the target column; the models below treat it as a binary label
# (single sigmoid output trained with binary cross-entropy).
label='f_t'
y_train = train_data[label].values
y_test = test_data[label].values

In [None]:
# Integer encoding, first pass: fit an unrestricted tokenizer on the training
# tokens so that word frequencies can be inspected before limiting the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Words seen fewer than `threshold` times in the training data count as rare.
threshold = 2

word_frequencies = tokenizer.word_counts
rare_frequencies = [freq for freq in word_frequencies.values() if freq < threshold]

total_cnt = len(tokenizer.word_index)         # number of distinct words
rare_cnt = len(rare_frequencies)              # number of rare words
total_freq = sum(word_frequencies.values())   # total number of word occurrences
rare_freq = sum(rare_frequencies)             # occurrences that belong to rare words

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

In [None]:
# Vocabulary size after discarding the rare words counted above.
# The +2 presumably reserves index 0 (padding) and index 1 (the OOV token) —
# confirm against the Keras Tokenizer `num_words` documentation.
vocab_size = total_cnt - rare_cnt + 2
print('단어 집합의 크기 :',vocab_size)

In [None]:
# Second pass: refit with the reduced vocabulary; the remaining words map to 'OOV'.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(X_train)

# Encode both splits with the tokenizer fitted on the training data only.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
X_train, X_test = train_sequences, test_sequences

In [None]:
# Show the first three integer-encoded training samples.
print(X_train[:3])

In [None]:
# Padding: look at the sequence-length distribution before choosing max_len.
sample_lengths = [len(seq) for seq in X_train]

print('content의 최대 길이 :',max(sample_lengths))
print('content의 평균 길이 :',sum(sample_lengths)/len(X_train))

plt.hist(sample_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Helper for checking how many samples would be cut off by padding.
def below_threshold_len(max_len, nested_list):
  """Print the percentage of samples whose length is at most ``max_len``.

  Args:
    max_len: Candidate maximum sequence length.
    nested_list: Sequence of sequences; the length of each element is compared
      against ``max_len``.

  Returns:
    None. The percentage is only printed.
  """
  within_limit = sum(1 for sample in nested_list if len(sample) <= max_len)
  ratio = (within_limit / len(nested_list))*100
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))

In [None]:
# Candidate padded length; print the share of training samples that fit in it.
max_len = 256
below_threshold_len(max_len, X_train)

In [None]:
# Bring every sequence to exactly max_len tokens.
# NOTE(review): default pad_sequences settings are used; per the Keras docs these
# pad and truncate at the start of the sequence — confirm this is intended.
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

# 성능평가 함수

# 모델링

## version 1

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Version 1: 128-dim embedding -> bidirectional LSTM (128 units) -> one sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 128),
    Bidirectional(LSTM(128)),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Stop once validation accuracy has not improved for 4 epochs, and keep only the
# best-scoring model on disk as best_model1.h5.
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=4)
mc = ModelCheckpoint('best_model1.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [None]:
# Train version 1 as a binary classifier; 20% of the training arrays are held out
# for validation, which is what the callbacks above monitor.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=256, validation_split=0.2)

In [None]:
# Reload the checkpoint saved by ModelCheckpoint (not the last-epoch weights)
# and score the test set with it.
loaded_model1 = load_model('best_model1.h5')
predicted_version1=loaded_model1.predict(X_test)

In [None]:
# Turn each sigmoid output into a hard label with a 0.5 cut-off, then report.
p = [1 if score >= 0.5 else 0 for score in predicted_version1]
print(classification_report(y_test, p, target_names=['class 0', 'class 1']))

In [None]:
# Convert model outputs to class labels and report.
# np.argmax over a single-unit sigmoid output is always 0, which made this report
# describe an "always class 0" classifier. argmax is only meaningful when there is
# one output column per class; a single probability column is thresholded at 0.5.
scores = np.asarray(predicted_version1)
if scores.ndim > 1 and scores.shape[-1] > 1:
    z = scores.argmax(axis=-1).tolist()
else:
    z = (scores.reshape(-1) >= 0.5).astype(int).tolist()
print(classification_report(y_test, z, target_names=['class 0', 'class 1']))

In [None]:
# Distribution of the version 1 model's predicted scores on the test set.
plt.hist(predicted_version1)

## version 2

In [None]:
import os
import numpy as np
import pandas as pd
import csv
import random
import pickle
import collections
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import SimpleRNN
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing import text
from tensorflow.keras.optimizers import Adam

In [None]:
### Preprocessing variables
MODEL_BATCH_SIZE = 128
TOP_WORDS = vocab_size            # embedding input dimension (vocabulary size computed earlier)
MAX_POST_LENGTH = max_len         # padded sequence length fed to the embedding layer
EMBEDDING_VECTOR_LENGTH = 40      # used both as embedding width and as LSTM unit count below

### Learning variables
LEARNING_RATE = 0.01
DROPOUT = 0.2                     # applied as both dropout and recurrent_dropout in the LSTM
NUM_EPOCHS = 10

In [None]:
# Version 2: trainable embedding with masking of the padding index -> single LSTM
# -> one sigmoid unit.
# NOTE(review): in this section Sequential/Embedding/LSTM/Dense are imported from
# standalone `keras`, while Adam and the callbacks come from `tensorflow.keras`;
# depending on the installed versions these may not interoperate — confirm.
model = Sequential()
model.add(
    Embedding(
        TOP_WORDS,
        EMBEDDING_VECTOR_LENGTH,
        input_length=MAX_POST_LENGTH,
        # weights=[embedding_matrix],
        mask_zero=True,
        trainable=True,
    )
)
# Alternative recurrent layers, kept for reference:
# model.add(SimpleRNN(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
# model.add(GRU(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
model.add(
    LSTM(
        EMBEDDING_VECTOR_LENGTH,
        dropout=DROPOUT,
        recurrent_dropout=DROPOUT,
        activation="sigmoid",
        kernel_initializer="zeros",
    )
)
# model.add(Bidirectional(LSTM(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros')))
model.add(Dense(1, activation="sigmoid"))

# `learning_rate` is the supported argument name; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-8)

# Stop after 2 epochs without validation-loss improvement; keep only the best model.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
mc = ModelCheckpoint('best_model2.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)
model.compile(
    loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line.
model.summary()

In [None]:
# Train version 2; 20% of the training arrays are held out for validation,
# which the early-stopping and checkpoint callbacks monitor.
history = model.fit(
    X_train,
    y_train,
    batch_size=MODEL_BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_split=0.2,
    callbacks=[es, mc],
)



In [None]:
# Reload the checkpoint saved by ModelCheckpoint (not the last-epoch weights)
# and score the test set with it.
loaded_model2 = load_model('best_model2.h5')
predicted_version2=loaded_model2.predict(X_test)


In [None]:
# Turn each sigmoid output into a hard label with a 0.5 cut-off, then report.
p = [1 if score >= 0.5 else 0 for score in predicted_version2]
print(classification_report(y_test, p, target_names=['class 0', 'class 1']))

In [None]:
# Convert model outputs to class labels and report.
# np.argmax over a single-unit sigmoid output is always 0, which made this report
# describe an "always class 0" classifier. argmax is only meaningful when there is
# one output column per class; a single probability column is thresholded at 0.5.
scores = np.asarray(predicted_version2)
if scores.ndim > 1 and scores.shape[-1] > 1:
    z = scores.argmax(axis=-1).tolist()
else:
    z = (scores.reshape(-1) >= 0.5).astype(int).tolist()
print(classification_report(y_test, z, target_names=['class 0', 'class 1']))