In [None]:
import re, time, requests, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.datasets import imdb

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prepare the IMDB data.
vocab_size = 20000  # passed as num_words to imdb.load_data below

# word -> rank table shipped with the dataset.
word_to_index = imdb.get_word_index()

# Reverse table (id -> word). Ids are shifted by 3 relative to the raw ranks;
# this matches the default index_from=3 used by imdb.load_data.
index_to_word = {rank + 3: word for word, rank in word_to_index.items()}

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

In [None]:
# Load the labelled comment dataset from Drive.
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FoDoDic_ew_진짜_최초라벨링.csv')

# Draw a reproducible 25% sample of the rows.
X_test = test_data.sample(frac=0.25, random_state=42)

# Take the labels from the *same* sampled rows. Reading them from the full
# frame would give y_test four times as many entries as X_test, in a different
# row order, so features and labels would not line up.
y_test = np.array(X_test['label'])

In [None]:
# Preview the sampled rows (pandas abbreviates the display of large frames).
X_test

Unnamed: 0,number,best_comment,최종
207794,207794,"Damn, everyone’s showing up again. Shit is get...",1.0
297255,297255,TOPFlint is best boi convince me otherwise,1.0
175111,175111,"Space Boy, Studio Ghibli, and Moving a lot?! I...",1.0
352918,352918,Damn Chase is a damn good detective. If I was ...,0.0
130482,130482,Prepare for trouble and make it double. 😈,0.0
...,...,...,...
60463,60463,Awww 3:Gandharva has frozen tears on his face....,1.0
162039,162039,"TOPAh, just casualy killing hundreds ""this is ...",0.0
158210,158210,"Btw, has anyone noticed that these dudes have ...",0.0
265930,265930,TOPYALL QTIP LOOKIN FFIIIINNNEEE THIS EVENING ...,1.0


In [None]:
# Split every comment of the full dataset into tokens with NLTK's word_tokenize.
X_test = [word_tokenize(comment) for comment in test_data['best_comment']]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# ---------------------------------------------------------------------------
# 1) Word-frequency statistics of the tokenised comments (informational).
# ---------------------------------------------------------------------------
# Fit on the token lists directly. Passing str(X_test) would hand Keras one
# giant string, which it walks character by character, so every count below
# would describe characters rather than words.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_test)
threshold = 3

total_cnt = len(tokenizer.word_index)  # number of distinct words
rare_cnt = 0    # number of words that occur fewer than `threshold` times
total_freq = 0  # total number of word occurrences
rare_freq = 0   # occurrences contributed by the rare words

# word_counts maps each word to its frequency.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # The word occurs fewer than `threshold` times.
    if value < threshold:
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

# ---------------------------------------------------------------------------
# 2) Encode the comments with the IMDB vocabulary.
# ---------------------------------------------------------------------------
# The model is trained on sequences produced by imdb.load_data, so the
# comments must use the same word -> id mapping. A Tokenizer fitted on the
# comments assigns ids by *their* frequency ranking, which is unrelated to the
# IMDB ids; the embedding rows looked up at evaluation time would then belong
# to different words.
#
# Conventions of imdb.load_data (defaults): 0 = padding, 1 = start of
# sequence, 2 = out-of-vocabulary, real words are shifted by 3, and any id
# >= num_words is replaced by the out-of-vocabulary id.
IMDB_START_ID = 1
IMDB_OOV_ID = 2
IMDB_INDEX_OFFSET = 3


def encode_with_imdb_vocab(tokens, vocabulary, num_words):
    """Convert a list of string tokens into IMDB-compatible integer ids.

    Args:
        tokens: iterable of string tokens for one comment.
        vocabulary: mapping word -> raw rank, as returned by imdb.get_word_index().
        num_words: vocabulary size used for imdb.load_data; ids at or above
            this value are mapped to the out-of-vocabulary id.

    Returns:
        A list of ints starting with the start-of-sequence id.
    """
    encoded = [IMDB_START_ID]
    for token in tokens:
        # Tokens are lower-cased before the lookup.
        # NOTE(review): assumes the IMDB index keys are lower-case — confirm.
        rank = vocabulary.get(token.lower())
        word_id = IMDB_OOV_ID if rank is None else rank + IMDB_INDEX_OFFSET
        encoded.append(word_id if word_id < num_words else IMDB_OOV_ID)
    return encoded


X_test = [encode_with_imdb_vocab(tokens, word_to_index, vocab_size) for tokens in X_test]

In [None]:
# Labels for every row of the full dataset, in the same order as test_data.
y_test = np.array(test_data['label'])


In [None]:
# Bring every sequence to one common length so both sets share an input shape.
max_len = 500
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)

In [None]:
# Sanity check: the second dimension should equal max_len after padding.
X_train.shape

(25000, 500)

In [None]:
# Sanity check: one row per comment, second dimension equal to max_len.
X_test.shape

(374137, 500)

In [None]:
# Binary classifier: embedding -> GRU -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 100),       # 100-dimensional vector per word id
    GRU(128),                         # 128 recurrent units
    Dense(1, activation='sigmoid'),   # probability of the positive class
])

In [None]:
# Stop training once the validation loss has failed to improve for 4 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)

# Keep only the checkpoint with the best *validation* accuracy. Training
# accuracy ('acc') keeps rising as the network overfits, so monitoring it makes
# the file on disk track the latest, most overfitted epoch instead of the
# one that generalises best.
mc = ModelCheckpoint('GRU_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# 20% of the training data is held out to produce the val_loss / val_acc
# values that the two callbacks monitor.
history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)

Epoch 1/15

Epoch 00001: acc improved from -inf to 0.76790, saving model to GRU_model.h5
Epoch 2/15

Epoch 00002: acc improved from 0.76790 to 0.88535, saving model to GRU_model.h5
Epoch 3/15

Epoch 00003: acc improved from 0.88535 to 0.91920, saving model to GRU_model.h5
Epoch 4/15

Epoch 00004: acc improved from 0.91920 to 0.93955, saving model to GRU_model.h5
Epoch 5/15

Epoch 00005: acc improved from 0.93955 to 0.95685, saving model to GRU_model.h5
Epoch 6/15

Epoch 00006: acc improved from 0.95685 to 0.96675, saving model to GRU_model.h5
Epoch 7/15

Epoch 00007: acc improved from 0.96675 to 0.97650, saving model to GRU_model.h5
Epoch 8/15

Epoch 00008: acc improved from 0.97650 to 0.98390, saving model to GRU_model.h5
Epoch 9/15

Epoch 00009: acc improved from 0.98390 to 0.98965, saving model to GRU_model.h5
Epoch 00009: early stopping


In [None]:
# Reload the checkpoint written during training and score it on the test set.
loaded_model = load_model('GRU_model.h5')
test_scores = loaded_model.evaluate(X_test, y_test)  # [loss, acc]
print("\n 테스트 정확도: %.4f" % test_scores[1])


 테스트 정확도: 0.4680


In [None]:
# Save: write the model architecture as JSON and the weights to a separate file on Drive.
# NOTE(review): this persists the in-memory `model`, not the `loaded_model`
# checkpoint that was evaluated — confirm which one is intended.
model_json = model.to_json()
with open("/content/drive/MyDrive/imdo_to_FoDo_model.json", "w") as json_file:
    json_file.write(model_json)

# NOTE(review): the weights file ends in "_mode.h5" while the JSON ends in
# "_model.json" — possibly a typo; confirm before anything depends on the name.
model.save_weights("/content/drive/MyDrive/imdo_to_FoDo_mode.h5")