## RNN w/ LSTM Layer

In [1]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

### Load IMDb Review Dataset

In [60]:
# Load the 'imdb_reviews' dataset through tensorflow_datasets.
# as_supervised=True makes every element a (text, label) pair, which is how
# the following cells unpack it; with_info=True also returns the metadata
# object alongside the splits.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

### Inspect training text / label

In [12]:
# Show the dataset's Python type, then the first four raw (text, label) pairs.
print(type(train_dataset), '\n')
for example, label in train_dataset.take(4):
    # .numpy() converts the eager tensors to plain values for printing
    # (the review text comes back as a bytes object).
    review_bytes, label_value = example.numpy(), label.numpy()
    print('text: ', review_bytes)
    print('label: ', label_value)

<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'> 

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0
text:  b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. Ho

### Generate Batches

In [61]:
# Size of the buffer used when shuffling the training split.
BUFFER_SIZE = 10_000
# Number of reviews grouped into each batch of the train and test pipelines.
BATCH_SIZE = 64

In [62]:
# NOTE: both names are rebound to their batched versions, so re-running this
# cell without first reloading the data would batch an already-batched dataset.

# Training pipeline: shuffle, group into batches, then prefetch.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Test pipeline: same batching and prefetching, but no shuffling.
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [59]:
# Pull a single batch from the training pipeline and show its labels.
# `example` and `label` stay bound after the loop; a later cell feeds
# `example` to the text encoder.
for example, label in train_dataset.take(1):
    batch_labels = label.numpy()
    print('labels: ', batch_labels)

labels:  [1 1 1 1 1 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 1 0 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0
 1 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 1 1 1 1 1 0]


In [23]:
# Maximum number of distinct tokens kept by the text encoder.
VOCAB_SIZE = 1000

# Newer TensorFlow releases expose the layer at the stable path
# `tf.keras.layers.TextVectorization`, and the `experimental.preprocessing`
# alias is not available in every release. Prefer the stable location and
# fall back to the legacy one so the cell runs on either kind of install.
try:
    TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
    TextVectorization = (
        tf.keras.layers.experimental.preprocessing.TextVectorization)

encoder = TextVectorization(max_tokens=VOCAB_SIZE)

# Build the vocabulary from the training text only: the map() drops the
# label from each (text, label) element before adapt() sees it.
encoder.adapt(train_dataset.map(lambda text, label: text))

In [24]:
# Materialize the learned vocabulary as a NumPy array and preview the first
# 20 entries. In the recorded output the first two slots are '' and '[UNK]',
# followed by ordinary words.
vocabulary_terms = encoder.get_vocabulary()
vocab = np.array(vocabulary_terms)
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [25]:
# Encode the text batch left in `example` by the batch-peek cell and keep the
# first three rows as a NumPy array of token ids. The recorded output shows
# trailing zeros on the first two rows.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

array([[101, 576, 127, ...,   0,   0,   0],
       [176, 316,   2, ...,   0,   0,   0],
       [  1,   1,   1, ..., 168,   1, 331]])

### Create and Fit Model

In [29]:
# Assemble the classifier one layer at a time, in the same order as before:
# raw text -> token ids -> embeddings -> bidirectional LSTM -> dense head.
model = tf.keras.Sequential()

# Turns raw review strings into integer token ids.
model.add(encoder)

# One 64-dimensional vector per vocabulary entry.
# mask_zero=True: use masking to handle the variable sequence lengths.
model.add(tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    mask_zero=True,
))

# A 64-unit LSTM wrapped so the sequence is read in both directions.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# Fully connected hidden layer.
model.add(tf.keras.layers.Dense(64, activation='relu'))

# Single output unit with no activation, i.e. the model emits a raw logit.
model.add(tf.keras.layers.Dense(1))

In [32]:
# The final Dense(1) layer has no activation, so the loss is configured with
# from_logits=True to consume the raw output directly.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

In [33]:
# Train for 10 passes over the batched training set. validation_steps=30
# limits each end-of-epoch validation run to 30 batches of the test
# pipeline rather than the whole split.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluation and Prediction

In [34]:
# Score the trained model on the batched test pipeline. The result unpacks
# into the loss followed by the single metric ('accuracy') set at compile time.
evaluation_results = model.evaluate(test_dataset)
test_loss, test_acc = evaluation_results



In [77]:
# Two hand-written reviews with a favourable tone.
sample_texts = (
    "It was splendid. Definitely will watch it again. So cool.",
    "Such an underrated film. I hope this movie grabs more attention it deserves.",
)

# The model ends in an unactivated Dense(1) layer, so the value printed as
# "prediction" is a raw logit, not a probability.
for position, sample_text in enumerate(sample_texts, start=1):
    prediction = model.predict(np.array([sample_text]))[0][0]
    print(f'input text: {sample_text}')
    # A blank line separates consecutive samples but does not follow the last.
    separator = '\n' if position < len(sample_texts) else ''
    print(f'prediction: {prediction}{separator}')

input text: It was splendid. Definitely will watch it again. So cool.
prediction: 1.0850709676742554

input text: Such an underrated film. I hope this movie grabs more attention it deserves.
prediction: 0.9897462129592896


In [76]:
# Two hand-written reviews with an unfavourable tone.
sample_texts = (
    "Literal trash. I don't recommend this movie at all",
    "It's a joke somebody even like this. Give yourself a favor and don't waste a time on this.",
)

# The model ends in an unactivated Dense(1) layer, so the value printed as
# "prediction" is a raw logit, not a probability.
for position, sample_text in enumerate(sample_texts, start=1):
    prediction = model.predict(np.array([sample_text]))[0][0]
    print(f'input text: {sample_text}')
    # A blank line separates consecutive samples but does not follow the last.
    separator = '\n' if position < len(sample_texts) else ''
    print(f'prediction: {prediction}{separator}')

input text: Literal trash. I don't recommend this movie at all
prediction: 0.037662800401449203

input text: It's a joke somebody even like this. Give yourself a favor and don't waste a time on this.
prediction: -2.3889565467834473
