# Hello RNN Sentiment Analyzer

Sentiment analysis of the IMDB movie review dataset using a GRU-based recurrent neural network.

In [1]:
# Prerequisites
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Report the interpreter and library versions this notebook was run with.
versions = {
    "Python Version: ": sys.version,
    "Numpy Version: ": np.__version__,
    "Pandas Version: ": pd.__version__,
    "TensorFlow Version: ": tf.__version__,
}
for label, version in versions.items():
    print(label, version)

Python Version:  3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
Numpy Version:  2.0.2
Pandas Version:  2.2.3
TensorFlow Version:  2.18.0


### Get IMDB Data

In [2]:
import tensorflow_datasets as tfds

# Carve a validation set out of the training split (first 90% train, last 10%
# validation) and load the test split alongside it. With as_supervised=True
# each element is a (review, label) pair.
ds_train_raw, ds_val_raw, ds_test_raw = tfds.load(
    name="imdb_reviews",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True,
)
tf.random.set_seed(42)

# Only the training pipeline is shuffled; all three are batched and prefetched.
ds_train = ds_train_raw.shuffle(5000, seed=42).batch(32).prefetch(1)
ds_val = ds_val_raw.batch(32).prefetch(1)
ds_test = ds_test_raw.batch(32).prefetch(1)

# Backward-compatible alias: this pipeline used to be called `df_test`, which
# wrongly suggested a DataFrame and broke the ds_* naming of its siblings.
df_test = ds_test

  from .autonotebook import tqdm as notebook_tqdm


[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\Jari\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m


Dl Size...: 100%|██████████| 80/80 [00:09<00:00,  8.31 MiB/s]rl]
Dl Completed...: 100%|██████████| 1/1 [00:09<00:00,  9.63s/ url]
                                                                        

[1mDataset imdb_reviews downloaded and prepared to C:\Users\Jari\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m




Look at a few example reviews and labels

In [4]:
# Preview the first 8 raw (unbatched) examples, truncating each review to 300 characters.
for review, label in ds_train_raw.take(8):
    # Decode into a local first: reusing double quotes inside an f-string
    # replacement field is a SyntaxError on Python versions before 3.12.
    review_text = review.numpy().decode("utf-8")
    print(f"REVIEW: {review_text[:300]} ...")
    print(f"LABEL:{label.numpy()}")

REVIEW: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda pi ...
LABEL:0
REVIEW: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Cons ...
LABEL:0
REVIEW: Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Mountie telling the people of Dawson City, Yukon to elect themselves a marshal (yes a marshal!) and to e ...
LABEL:0
REVIEW: This is the kind of film for 

### Vectorize

In [5]:
# Build the vocabulary from the training reviews only (labels are dropped),
# with max_tokens set to vocab_size.
vocab_size = 1000
text_vec_layer = layers.TextVectorization(max_tokens=vocab_size)
train_reviews = ds_train.map(lambda reviews, labels: reviews)
text_vec_layer.adapt(train_reviews)

### Build and Train model

In [6]:
# Baseline model: raw text -> token IDs -> embeddings -> GRU -> sigmoid output.
# The embedding layer does not mask padding here.
embed_size = 128
tf.random.set_seed(42)
baseline_layers = [
    text_vec_layer,
    layers.Embedding(input_dim=vocab_size, output_dim=embed_size),
    layers.GRU(units=128),
    layers.Dense(units=1, activation="sigmoid"),
]
model = keras.Sequential(baseline_layers)
model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [7]:
# Print a summary of the model that was just built and compiled.
model.summary()

In [8]:
# Train the baseline for two epochs, passing the validation split to fit().
history = model.fit(
    ds_train,
    epochs=2,
    validation_data=ds_val,
)

Epoch 1/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 376ms/step - accuracy: 0.4937 - loss: 0.6940 - val_accuracy: 0.5016 - val_loss: 0.6929
Epoch 2/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 295ms/step - accuracy: 0.5033 - loss: 0.6941 - val_accuracy: 0.5016 - val_loss: 0.6928


Validation accuracy is only about 50%, i.e. no better than random guessing.

### Test model

In [21]:

# Score a single hand-written review with the current model.
review_text = "This movie was awesome!"
sample_review = tf.constant([review_text])
y_pred = model.predict(sample_review)
score = float(y_pred[0][0])  # one sample, one sigmoid output
sentiment = "Positive" if score > 0.5 else "Negative"
# Print the plain string: formatting the tensor itself shows up as
# [b'...'] (bytes repr in brackets) instead of the review text.
print(f"Review: \"{review_text}\"\n → Sentiment: {sentiment} ({score:.4f})\n")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Review: "[b'This movie was awesome!']"
 → Sentiment: Positive (0.5120)



In [22]:
# Score a single hand-written review with the current model.
review_text = "This movie is the worst ever!"
sample_review = tf.constant([review_text])
y_pred = model.predict(sample_review)
score = float(y_pred[0][0])  # one sample, one sigmoid output
sentiment = "Positive" if score > 0.5 else "Negative"
# Print the plain string: formatting the tensor itself shows up as
# [b'...'] (bytes repr in brackets) instead of the review text.
print(f"Review: \"{review_text}\"\n → Sentiment: {sentiment} ({score:.4f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Review: "[b'This movie is the worst ever!']"
 → Sentiment: Negative (0.4350)



In [23]:
# Score a single hand-written review with the current model.
review_text = "I liked this movie!"
sample_review = tf.constant([review_text])
y_pred = model.predict(sample_review)
score = float(y_pred[0][0])  # one sample, one sigmoid output
sentiment = "Positive" if score > 0.5 else "Negative"
# Print the plain string: formatting the tensor itself shows up as
# [b'...'] (bytes repr in brackets) instead of the review text.
print(f"Review: \"{review_text}\"\n → Sentiment: {sentiment} ({score:.4f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Review: "[b'I liked this movie!']"
 → Sentiment: Negative (0.4441)



Not great results :-( The reviews have different lengths, so when the `TextVectorization` layer converts them to sequences of token IDs, it pads the shorter sequences with the padding token (ID 0) to make them as long as the longest sequence in the batch. As a result, most sequences end with many padding tokens—often dozens or even hundreds of them. Even though we’re using a GRU layer, which is much better than a SimpleRNN layer, its short-term memory is still not great, so by the time it has gone through all those padding tokens, it has forgotten what the review was about.

### Try Masking to ignore the padding tokens!

In [25]:
# Same architecture as before, but the embedding layer is created with
# mask_zero=True so that padding (token ID 0) is masked for the GRU.
embed_size = 128
tf.random.set_seed(42)
masked_layers = [
    text_vec_layer,
    layers.Embedding(input_dim=vocab_size, output_dim=embed_size, mask_zero=True),
    layers.GRU(units=128),
    layers.Dense(units=1, activation="sigmoid"),
]
model = keras.Sequential(masked_layers)
model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [26]:
# Train the masked model for two epochs with the same train/validation data.
history = model.fit(
    ds_train,
    epochs=2,
    validation_data=ds_val,
)

Epoch 1/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 301ms/step - accuracy: 0.6565 - loss: 0.6029 - val_accuracy: 0.8280 - val_loss: 0.4127
Epoch 2/2
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 305ms/step - accuracy: 0.8111 - loss: 0.4296 - val_accuracy: 0.8588 - val_loss: 0.3281


Now the validation accuracy is about 86% after just two epochs!

In [31]:
# Score a single hand-written review with the current model.
review_text = "This movie is good!"
sample_review = tf.constant([review_text])
y_pred = model.predict(sample_review)
score = float(y_pred[0][0])  # one sample, one sigmoid output
sentiment = "Positive" if score > 0.5 else "Negative"
# Print the plain string: formatting the tensor itself shows up as
# [b'...'] (bytes repr in brackets) instead of the review text.
print(f"Review: \"{review_text}\"\n → Sentiment: {sentiment} ({score:.4f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Review: "[b'This movie is good!']"
 → Sentiment: Positive (0.7255)



In [28]:
# Score a single hand-written review with the current model.
review_text = "This movie is the worst ever!"
sample_review = tf.constant([review_text])
y_pred = model.predict(sample_review)
score = float(y_pred[0][0])  # one sample, one sigmoid output
sentiment = "Positive" if score > 0.5 else "Negative"
# Print the plain string: formatting the tensor itself shows up as
# [b'...'] (bytes repr in brackets) instead of the review text.
print(f"Review: \"{review_text}\"\n → Sentiment: {sentiment} ({score:.4f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Review: "[b'This movie is the worst ever!']"
 → Sentiment: Negative (0.0417)



In [29]:
# Score a single hand-written review with the current model.
review_text = "I liked this movie!"
sample_review = tf.constant([review_text])
y_pred = model.predict(sample_review)
score = float(y_pred[0][0])  # one sample, one sigmoid output
sentiment = "Positive" if score > 0.5 else "Negative"
# Print the plain string: formatting the tensor itself shows up as
# [b'...'] (bytes repr in brackets) instead of the review text.
print(f"Review: \"{review_text}\"\n → Sentiment: {sentiment} ({score:.4f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Review: "[b'I liked this movie!']"
 → Sentiment: Positive (0.7182)

