# [映画レビューのテキスト分類](https://www.tensorflow.org/tutorials/keras/text_classification?hl=ja)

In [None]:
import tensorflow
from tensorflow import keras

import numpy as np
import pandas as pd

# Load the IMDB review dataset bundled with Keras. `num_words=10000` caps
# the vocabulary at 10,000 entries (see the Keras docs for how words
# outside that range are encoded).
imdb = keras.datasets.imdb
train_split, test_split = imdb.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

# Data Exploration

In [None]:
# Report how many reviews each split contains.
for split in (train_data, test_data):
    print(len(split))

# Show the first review as loaded, then the lengths of the first two
# reviews.
first_review, second_review = train_data[0], train_data[1]
print(first_review)
print(f'len0: {len(first_review)}, len1:{len(second_review)}')

In [None]:
# Build the word -> integer-ID lookup. Every ID returned by
# get_word_index() is shifted up by 3 so that IDs 0-3 are free for the
# special tokens registered below.
word_index = imdb.get_word_index()
word_index = {word: idx + 3 for word, idx in word_index.items()}
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<UNUSED>'] = 3

# Inverse lookup (integer ID -> word), built directly as a dict
# comprehension instead of materialising a temporary list of tuples.
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode_review(text):
    """Convert a sequence of integer word IDs into a space-separated string.

    Each ID in ``text`` is looked up in the module-level
    ``reverse_word_index`` dictionary; an ID with no entry there is
    rendered as ``'?'``.
    """
    lookup = reverse_word_index.get
    return ' '.join(lookup(word_id, '?') for word_id in text)

# Sanity check: print the integer ID stored for the word 'this'
# (None if the word is not in the dictionary).
print(word_index.get('this', None))

# Keep this as the cell's final bare expression so the notebook displays
# the decoded text of the first training review.
decode_review(train_data[0])

# Preprocessing

In [None]:
# Bring every review to the same length (maxlen) so the model receives
# fixed-size inputs. Padding uses the <PAD> ID and is appended at the end
# of each sequence ('post').
maxlen = 256
pad_options = {
    'value': word_index['<PAD>'],
    'padding': 'post',
    'maxlen': maxlen,
}
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

# Modeling

In [None]:
# Background reading: what does TensorFlow's Embedding layer do?
# https://qiita.com/9ryuuuuu/items/e4ee171079ffa4b87424

# Size of the embedding's input vocabulary; this equals the
# num_words=10000 passed to imdb.load_data when the data was loaded.
vocab_size = 10000

# Embedding -> average over the sequence axis -> small dense hidden
# layer -> single sigmoid unit for the binary label.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data. The held-out part (X_te / y_te) is
# passed to model.fit as validation data in a later cell.
X_tr, X_te, y_tr, y_te = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
)

# Print the size of each resulting array.
for part in (X_tr, X_te, y_tr, y_te):
    print(len(part))

In [None]:
# Train for 40 epochs with a batch size of 512, using the held-out split
# as validation data. The returned object is kept in `history` for the
# plotting cells below.
history = model.fit(
    X_tr,
    y_tr,
    batch_size=512,
    epochs=40,
    validation_data=(X_te, y_te),
    verbose=1,
)

In [None]:
# Evaluate on the hold-out test set loaded by imdb.load_data and padded
# alongside the training data. X_te / y_te is the validation split that
# was already passed to model.fit as validation_data, so evaluating on it
# would only repeat the last epoch's validation metrics.
model.evaluate(test_data, test_labels, verbose=2)

# History

In [None]:
# `history` is the object returned by model.fit; its `.history` attribute
# is the dict the plotting cell reads 'loss' and 'val_loss' from.
history_dict = history.history
# Final bare expression: the notebook displays the available keys.
history_dict.keys()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Read the recorded losses first: `epochs` is derived from len(loss), so
# `loss` must be bound before the range is built (the previous order
# raised NameError on a fresh kernel).
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(loss) + 1)  # 1-based epoch numbers for the x-axis

# 'bo' draws blue dots (training); 'b' draws a solid blue line (validation).
plt.plot(epochs, loss, 'bo', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.legend()
plt.show()