<a href="https://colab.research.google.com/github/hobin-jang/colab_test/blob/master/IMDb_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
IMDb movie review dataset: classify whether each review is positive or negative.
"""
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Load the IMDb dataset that ships with Keras; it comes back as a
# (train, test) pair, each of which unpacks into inputs (x) and labels (y).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()

In [None]:
# The dataset is already preprocessed: each integer stands for one word, and
# indices are assigned by frequency (the lower the integer, the more frequent
# the word). Show the first ten word ids of the first training review.
x_train[0][:10]

In [None]:
# Fetch the word -> index mapping from Keras and invert it into id -> word.
# Every id is shifted up by 3, which leaves ids 0-2 unassigned here.
word_index = tf.keras.datasets.imdb.get_word_index()
id_to_word = dict((rank + 3, word) for word, rank in word_index.items())

In [None]:
# Ids 0, 1 and 2 are the special tokens <pad> (padding), <sos> (start of
# sequence) and <unk> (unknown word).
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

# Decode the first ten ids of the first training review back into words.
print(" ".join(id_to_word[id_] for id_ in x_train[0][:10]))

In [None]:
import tensorflow_datasets as tfds

# Load the "imdb_reviews" dataset through TensorFlow Datasets. with_info=True
# makes the call return a metadata object alongside the datasets.
# NOTE(review): as_supervised=True presumably yields (text, label) pairs, which
# is how preprocess() consumes the elements — confirm against the tfds docs.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
# Number of examples in the training split, read from the metadata.
train_size = info.splits["train"].num_examples

In [None]:
def preprocess(x_batch, y_batch):
  """Turn a batch of raw review strings into a dense, padded batch of word tokens.

  Each review is cut to its first 300 units (tf.strings.substr; presumably
  bytes, its default unit — TODO confirm). Line-break tags and then every
  character that is not an ASCII letter are replaced by a space, and the text
  is split on whitespace. The split result is converted to a dense tensor,
  with b"<pad>" filling the shorter rows.

  NOTE(review): truncation runs before tag removal, so a tag cut in half at
  the 300 boundary is not matched by the tag pattern and its letters survive
  as a token.

  Args:
    x_batch: batch of review texts.
    y_batch: labels for the batch; passed through unchanged.

  Returns:
    A `(padded_tokens, y_batch)` tuple.
  """
  truncated = tf.strings.substr(x_batch, 0, 300)
  without_tags = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
  letters_only = tf.strings.regex_replace(without_tags, b"[^a-zA-Z]", b" ")
  tokens = tf.strings.split(letters_only)
  padded_tokens = tokens.to_tensor(default_value=b"<pad>")
  return padded_tokens, y_batch

In [None]:
from collections import Counter
# Count how often each token occurs across the preprocessed training set.
# The b"<pad>" filler is part of every padded batch, so it is counted as well.
vocabulary = Counter()
for x_batch, y_batch in datasets["train"].batch(32).map(preprocess):
  # Convert the whole batch once; each row is the token sequence of one review.
  for review_tokens in x_batch.numpy():
    vocabulary.update(review_tokens)

In [None]:
# Peek at the three most frequent tokens. most_common(n) returns the top n
# directly instead of sorting the whole vocabulary and slicing it.
vocabulary.most_common(3)

In [None]:
# Keep only the vocab_size most frequent tokens, ordered from most to least
# frequent. most_common(n) avoids sorting the entire vocabulary just to slice it.
vocab_size = 10000
truncated_vocabulary = [word for word, _count in vocabulary.most_common(vocab_size)]

In [None]:
# Build a lookup table mapping each kept word to its position in
# truncated_vocabulary. The table is created with num_oov_buckets extra
# buckets for words that are not in the vocabulary.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [None]:
# Try the table on a sample sentence, split into words and wrapped as a batch of one.
# NOTE(review): "faaaaantastic" is presumably absent from the vocabulary and
# should come back with an out-of-vocabulary id — confirm from the output.
table.lookup(tf.constant([b"This movie was faaaaantastic".split()]))

In [None]:
def encode_words(x_batch, y_batch):
  """Replace every word token in the batch with its id from the lookup table.

  Args:
    x_batch: batch of word tokens, looked up in the module-level `table`.
    y_batch: labels for the batch; returned untouched.

  Returns:
    A `(word_ids, y_batch)` tuple.
  """
  encoded = table.lookup(x_batch)
  return encoded, y_batch

# Training input pipeline: batch the raw reviews in groups of 32, tokenize
# them, convert the tokens to ids, and prefetch one batch.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [None]:
# Sentiment classifier: word embedding -> two stacked GRU layers -> one sigmoid unit.
embed_size = 128
model = tf.keras.Sequential([
    # The input dimension covers the kept vocabulary plus the OOV buckets,
    # i.e. the sizes the lookup table was built with.
    tf.keras.layers.Embedding(
        input_dim=vocab_size + num_oov_buckets,
        output_dim=embed_size,
        input_shape=[None],
    ),
    # return_sequences=True so the next GRU receives the whole sequence.
    tf.keras.layers.GRU(units=128, return_sequences=True),
    tf.keras.layers.GRU(units=128),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

In [None]:
# Print a summary of the model's layers.
model.summary()

In [None]:
# Configure training for binary classification and track accuracy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# Train for 10 epochs; the returned History object keeps the per-epoch metrics.
history = model.fit(train_set, epochs=10)

In [None]:
# Plot the per-epoch training accuracy and loss recorded by model.fit.
for metric in ("accuracy", "loss"):
  plt.plot(history.history[metric], label=metric)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel("epoch")
plt.title("Training accuracy and loss")
plt.ylim(0,1)
plt.legend(loc="lower left")
plt.show()