# Sentiment Classification with movie reviews

* Sentiment classification with the IMDB dataset
* Reference code: [TensorFlow official tutorials](https://www.tensorflow.org/tutorials/keras/basic_text_classification?hl=en)

## Import modules

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import time

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

import tensorflow as tf
from tensorflow.keras import layers
tf.enable_eager_execution()

os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
# Report the installed TensorFlow version. This notebook calls
# tf.enable_eager_execution() and tf.train.AdamOptimizer(), so the version
# in use matters when reproducing it.
print('TensorFlow version: {}'.format(tf.__version__))

### Import data

In [None]:
# Load the IMDB training and evaluation splits bundled with tf.keras.
imdb = tf.keras.datasets.imdb

# num_words=10000 restricts the vocabulary to 10,000 words; the model's
# vocab_size later in the notebook is set to the same number.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

In [None]:
# Cast the label arrays of both splits to float64 in a single pass.
train_labels, test_labels = (
    labels.astype(np.float64) for labels in (train_labels, test_labels)
)

In [None]:
# Sanity check: the number of training reviews and of training labels.
print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))

In [None]:
# Show the first training review exactly as loaded (its encoded form;
# it is decoded back to words later in the notebook).
print(train_data[0])

In [None]:
# Lengths of the first two reviews, before any padding is applied.
len(train_data[0]), len(train_data[1])

## Prepare dataset

### Convert the integers back to words

In [None]:
# Word -> integer index lookup. Every index from the dataset is shifted up
# by 3 so that indices 0-3 stay free for the special tokens added below.
word_index = {word: idx + 3 for word, idx in imdb.get_word_index().items()}

# Reserved entries: padding, sequence start, unknown word, unused slot.
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,  # unknown
    "<UNUSED>": 3,
})

# Inverse lookup (integer index -> word), used to decode reviews.
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode_review(text):
  """Turn a sequence of integer tokens back into a space-separated string.

  Each token is looked up in the module-level ``reverse_word_index``;
  tokens that are not present there are rendered as ``'?'``.
  """
  words = (reverse_word_index.get(token, '?') for token in text)
  return ' '.join(words)

In [None]:
# The encoded first review again, for comparison with its decoded text
# produced by decode_review.
print(train_data[0])

In [None]:
# Decode the first training review back into words.
decode_review(train_data[0])

### Pad sequences

In [None]:
# Take pad_sequences from the public Keras API. The previous import went
# through `tensorflow.python.*`, a private namespace whose module layout is
# not guaranteed to stay stable between TensorFlow releases.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [None]:
# Bring every review in both splits to a fixed length of 256 tokens,
# padding at the end ('post') with the <PAD> index.
train_data_pad, test_data_pad = (
    pad_sequences(split,
                  maxlen=256,
                  padding='post',
                  value=word_index["<PAD>"])
    for split in (train_data, test_data)
)

In [None]:
# Shapes of the padded training and test arrays.
print(train_data_pad.shape)
print(test_data_pad.shape)

In [None]:
# Pick one training example and show it in three forms.
index = 0
# Decoded text of the review.
print("text: {}\n".format(decode_review(train_data[index])))
# The review as loaded (integer tokens, before padding).
print("token: {}\n".format(train_data[index]))
# The same review after pad_sequences.
print("pad: {}".format(train_data_pad[index]))

### Create a validation set

In [None]:
# Hold out the first 10,000 padded reviews, with their labels, for
# validation; everything after them remains available for training.
x_val, partial_x_train = train_data_pad[:10000], train_data_pad[10000:]
y_val, partial_y_train = train_labels[:10000], train_labels[10000:]

## Build the model

In [None]:
# input shape is the vocabulary count used for the movie reviews (10,000 words)
vocab_size = 10000

model = tf.keras.models.Sequential()
model.add(layers.Embedding(vocab_size, 16))
#model.add(layers.SimpleRNN(units=32, return_sequences=True))
model.add(layers.SimpleRNN(units=16))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

### Compile the model

In [None]:
# Configure training: Adam with its default settings, binary cross-entropy
# loss (the model ends in a single sigmoid unit), and accuracy as the
# reported metric.
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

## Train the model

In [None]:
# Train for 2 epochs in batches of 512 on the training portion, passing the
# held-out 10,000 examples as validation data. The returned object is kept
# because its `.history` dict is plotted later in the notebook.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=2,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)

## Evaluate the model

In [None]:
# Evaluate on the padded test set. The result is indexed afterwards as
# results[0] (loss) and results[1] (accuracy).
results = model.evaluate(test_data_pad, test_labels)

In [None]:
# loss
print("loss value: {:.3f}".format(results[0]))
# accuracy
print("accuracy value: {:.3f}".format(results[1]))

### Print a graph of accuracy and loss over time

In [None]:
# The per-epoch values recorded by model.fit, keyed by metric name.
history_dict = history.history
# Display the available keys; the plotting cells index this dict by name.
history_dict.keys()

In [None]:
# Pull the training/validation curves out of the recorded history.
# NOTE: the 'acc'/'val_acc' key names depend on the Keras version in use;
# verify them against history_dict.keys() if a KeyError is raised here.
acc, val_acc, loss, val_loss = (
    history_dict[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)

# Number the epochs from 1 for the x-axis.
epochs = range(1, 1 + len(acc))

# Training loss as blue dots ('bo'), validation loss as a solid blue
# line ('b').
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Start from an empty figure so nothing from an earlier plot carries over.
plt.clf()   # clear figure

# Training accuracy as blue dots ('bo'), validation accuracy as a solid
# blue line ('b'); `epochs`, `acc` and `val_acc` come from the loss-plot cell.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()