# Predicting if a review is positive or negative based on the text
A dataset is available in which a large number of movie reviews have been labelled as positive or negative. ML models do not work with words directly, so we have to transform them into numbers. The approach is to create an index in which each word gets a unique number. Even with only the 1000 most frequently used words, it is already possible to learn to recognise positive and negative texts.

In [None]:
from tensorflow.keras.datasets import imdb
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import matplotlib.pyplot as plt

# Loading the data into chunks of test and training data. 
The labels are 1 and 0; the data are word numbers below 1000, because we limit the vocabulary to the 1000 most used words. After loading the data we also print some basic numbers that describe the data, as well as some samples.

In [None]:
# Vocabulary cap, passed to imdb.load_data as num_words below.
NUM_WORDS=1000 # only use top 1000 words
# Passed to imdb.load_data as index_from; the same offset is added to the
# word index further down so that both use the same numbering.
INDEX_FROM=3   # word index offset


# Load the train/test split; each split is a (data, labels) pair.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
# Basic sanity numbers about what was loaded.
print(f'Shape of the training data {train_data.shape}')
print(f'Shape of the training labels {train_labels.shape}')
print(f'Shape of the test data {test_data.shape}')
print(f'Shape of the test labels {test_labels.shape}')
# Each review is its own sequence of word ids, so the lengths differ per review.
print(f'Number of words in first review is {len(train_data[0])}')
print(f'Number of words in the second review is {len(train_data[1])}')

# Notice about using text in machine learning
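Beware: this data set contains integers representing words. With the reverse word index built below we can turn those integers back into words.
Also note that we only kept the top 1000 words, so some words cannot be recovered. In the decoded text these show up as `<UNK>`; the `'?'` fallback is only used for a number that is missing from the index altogether.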

To understand more about reverse and INDEX_FROM, check this post: https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset#

In [None]:
# Build the word -> id mapping that matches the loaded reviews: the raw ids
# from imdb.get_word_index() are shifted by INDEX_FROM, the same offset that
# was passed to imdb.load_data.
word_index = imdb.get_word_index()
word_index = {word: raw_id + INDEX_FROM for word, raw_id in word_index.items()}
# The ids freed up by the offset are used for special tokens.
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

print(f'Using the word index, we can find the number that represents the word: "something" is "{word_index["something"]}"')
# Inverse mapping (id -> word), used to turn encoded reviews back into text.
reverse_word_index = {word_id: word for word, word_id in word_index.items()}
print(f'Using the reverse word index, we can find the word that represents the number: "142" is "{reverse_word_index.get(142)}"')

In [None]:
# Translate the ids of training review 100 back to words; any id that has no
# entry in the reverse index is rendered as "?".
decoded_review = " ".join(
    reverse_word_index.get(word_id, "?") for word_id in train_data[100]
)
print(decoded_review)

In [None]:
def vectorize_sequences(sequences, dimension=NUM_WORDS):
    """Encode each sequence of word ids as one fixed-width row of 0s and 1s.

    Row ``r`` of the result holds 1.0 in every column whose index occurs in
    ``sequences[r]`` and 0.0 everywhere else. How often or in which order an
    id occurs is not recorded.

    Args:
        sequences: sized collection of integer id sequences; every id must be
            a valid column index, i.e. smaller than ``dimension``.
        dimension: number of columns of the result (defaults to NUM_WORDS).

    Returns:
        NumPy array of shape ``(len(sequences), dimension)``.
    """
    encoded = np.zeros((len(sequences), dimension))
    for row, word_ids in enumerate(sequences):
        # Index-array assignment switches on all listed columns in one go.
        encoded[row, word_ids] = 1.
    return encoded

In [None]:
# Encode every training review as a fixed-width 0/1 vector and show the
# resulting array shape.
x_train = vectorize_sequences(train_data)
print(x_train.shape)

In [None]:
# Same encoding for the test reviews. The bare expression on the last line
# makes the notebook display the shape as the cell's output.
x_test = vectorize_sequences(test_data)
x_test.shape

In [None]:
# Labels as float32 NumPy arrays, used as training / evaluation targets.
y_train = np.asarray(train_labels).astype("float32")
y_test = np.asarray(test_labels).astype("float32")

In [None]:
# Small fully connected classifier, assembled layer by layer:
# two hidden ReLU layers of 16 units each ...
model = keras.Sequential()
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(16, activation="relu"))
# ... followed by a single sigmoid unit that yields one score per review.
model.add(layers.Dense(1, activation="sigmoid"))

In [None]:
# Configure training: RMSprop optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit) and accuracy as the reported metric.
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

In [None]:
# Hold out the first 10000 training samples for validation and train on the
# remainder; inputs and labels are cut at the same position so they stay aligned.
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]

In [None]:
# Train for 20 epochs in batches of 512 on the non-held-out samples, passing
# the held-out samples as validation data. The returned object is kept in
# `history`; its `.history` dict is read by the cells that follow.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))

In [None]:
# Inspect which series were recorded during training; the bare expression is
# displayed as the cell's output.
history_dict = history.history
history_dict.keys()

In [None]:
# Plot training vs. validation loss per epoch.
history_dict = history.history
loss_values = history_dict["loss"]
val_loss_values = history_dict["val_loss"]
# One x position per recorded loss value, counting epochs from 1.
# `epochs` is reused by the accuracy plot in the next cell.
epochs = range(1, len(loss_values) + 1)
# "bo" draws blue dots (training), "b" a solid blue line (validation).
plt.plot(epochs, loss_values, "bo", label="Training loss")
plt.plot(epochs, val_loss_values, "b", label="Validation loss")
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Plot training vs. validation accuracy per epoch.
# Clear the current figure first so nothing from an earlier plot remains.
plt.clf()
acc = history_dict["accuracy"]
val_acc = history_dict["val_accuracy"]
# `epochs` comes from the loss-plot cell, which therefore has to run first.
plt.plot(epochs, acc, "bo", label="Training acc")
plt.plot(epochs, val_acc, "b", label="Validation acc")
plt.title("Training and validation accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# Use the model to check a new review
In this part we will check with an existing review whether the model can predict if the review was positive or not.

In [None]:
def text_to_index(review_text):
    """Encode raw review text as a list of word ids using ``word_index``.

    The text is lower-cased and punctuation is replaced by spaces before the
    lookup. Without that step, tokens such as "Wow," or "This" never match a
    key of ``word_index`` and are all encoded as unknown.

    Args:
        review_text: the review as a plain string.

    Returns:
        A list of ints: the ``<START>`` id followed by one id per word. A word
        that is missing from ``word_index``, or whose id is not below
        NUM_WORDS, is encoded as the ``<UNK>`` id.
    """
    # NOTE(review): this is the default filter set of the Keras text tokenizer
    # (apostrophes are kept, so "don't" stays one word). It is assumed to match
    # how the IMDB vocabulary was built; confirm if other data is used.
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    to_spaces = str.maketrans(punctuation, " " * len(punctuation))
    # split() without an argument also drops the empty tokens that repeated
    # spaces would otherwise produce.
    words = review_text.lower().translate(to_spaces).split()

    unknown_id = word_index["<UNK>"]
    word_index_ids = [word_index["<START>"]]
    for a_word in words:
        found_index = word_index.get(a_word, unknown_id)
        # Ids outside the NUM_WORDS most frequent words are treated as unknown.
        word_index_ids.append(found_index if found_index < NUM_WORDS else unknown_id)

    return word_index_ids


In [None]:
# Alternative example review; swap the comment markers to try it instead:
# new_review = "This is the best movie ever, I just love all the actors and the plot is better than ever."
new_review = "Wow, the worst ending ever, how can you do this, incredible. Nobody should ever look at this movie."

# First step is encoding the text using the word index
new_review_ids = text_to_index(new_review)
print(new_review_ids)
# Decode the ids again to see which words were recognised.
print(" ".join([reverse_word_index.get(i, "?") for i in new_review_ids]))

# Vectorize the sentence: wrap the id list in an outer list so it forms a
# batch containing one review.
datas = np.array([new_review_ids])
new_review_vector = vectorize_sequences(datas)
# Put the found vector through the model
output = model.predict(new_review_vector)

# Check the output: one sigmoid score per review in the batch.
# NOTE(review): a score near 1 presumably means "positive" - confirm against
# the label convention of the dataset.
print(output)