In [None]:
# PREPARING THE MOVIE REVIEW DATA
# Dataset source: https://ai.stanford.edu/~amaas/data/sentiment/
import os
import tarfile

import numpy as np
import pandas as pd
import pyprind

# Unpack the archive into the current working directory.
# NOTE(review): assumes the archive unpacks to an `aclImdb/` folder and that
# the working directory is /content, so that it matches `basepath` below.
with tarfile.open('/content/aclImdb_v1.tar.gz', 'r:gz') as tar:
  tar.extractall()

# Preprocessing the movie dataset into a more convenient format
# change the 'basepath' to the directory of the unzipped movie dataset
basepath = '/content/aclImdb'
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

# Collect the rows in a plain list and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and re-copying the whole frame
# for every review is quadratic in the number of files.
rows = []
for split in ('test', 'train'):
  for label_name in ('pos', 'neg'):
    review_dir = os.path.join(basepath, split, label_name)
    # sorted() keeps the row order deterministic across file systems
    for file_name in sorted(os.listdir(review_dir)):
      review_path = os.path.join(review_dir, file_name)
      with open(review_path, 'r', encoding='utf-8') as infile:
        rows.append([infile.read(), labels[label_name]])
      pbar.update()

df = pd.DataFrame(rows, columns=['review', 'sentiment'])

# Storing the movie review dataset as a CSV file
# (rows are shuffled with a fixed seed before writing)
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

# Read the file back so later cells work on exactly what was stored on disk
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

In [None]:
# STEP 1: CREATE A DATASET
# TensorFlow is first needed in this cell, so import it here; otherwise a
# fresh kernel run from the top fails with NameError on `tf`.
import tensorflow as tf

# Select the columns instead of df.pop('sentiment'): pop() removes the column
# from `df`, so running this cell a second time would raise KeyError.
target = df['sentiment']
# Double brackets keep the features two-dimensional (one column per row),
# which is what the `[0]` indexing on each element below relies on.
ds_raw = tf.data.Dataset.from_tensor_slices(
    (df[['review']].values, target.values)
)

# INSPECTION
# Show the first 50 bytes of the review text and the label of three examples
for ex in ds_raw.take(3):
  print(ex[0].numpy()[0][:50], ex[1])

In [None]:
# SPLITTING THE DATASET INTO TRAINING, TESTING & VALIDATION
# After one fixed shuffle: the first `n_test` examples become the test set,
# the next `n_train` the training set, and whatever remains is validation.
n_test = 25000
n_train = 20000

tf.random.set_seed(1)
# reshuffle_each_iteration=False keeps the order identical on every pass,
# so the take()/skip() splits below never overlap.
ds_raw = ds_raw.shuffle(buffer_size=50000, reshuffle_each_iteration=False)

ds_raw_test = ds_raw.take(n_test)
ds_raw_train_valid = ds_raw.skip(n_test)
ds_raw_train = ds_raw_train_valid.take(n_train)
ds_raw_valid = ds_raw_train_valid.skip(n_train)

In [None]:
# STEP 2: FIND UNIQUE TOKENS (WORDS)
from collections import Counter
import tensorflow as tf
import tensorflow_datasets as tfds

tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()

# Only the training split contributes to the vocabulary.
# Each element is a (text, label) pair; index 0 of the text array is the review.
for review_text, review_label in ds_raw_train:
  token_counts.update(tokenizer.tokenize(review_text.numpy()[0]))

print('Vocab-size:', len(token_counts))

In [None]:
# STEP 3: ENCODING UNIQUE TOKENS TO INTEGERS
# Build the text encoder from the tokens counted on the training split
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

# Quick sanity check on a hand-written sentence
example_str = 'This is an example!'
example_ids = encoder.encode(example_str)
print(example_ids, '\n')

# STEP 3A: DEFINE THE FUNCTION FOR TRANSFORMATION
def encode(text_tensor, label):
  """Encode the review held in `text_tensor` with the global `encoder`.

  Args:
    text_tensor: object whose `.numpy()` result holds the review text at
      index 0.
    label: passed through unchanged.

  Returns:
    A `(encoded_text, label)` tuple, where `encoded_text` is whatever
    `encoder.encode` returns for the review text.
  """
  return encoder.encode(text_tensor.numpy()[0]), label

# STEP 3B: WRAP THE ENCODE FUNCTION TO A TF Op.
def encode_map_fn(text, label):
  """Run the Python-side `encode` function as a TensorFlow op.

  Intended to be passed to `Dataset.map`. `encode` calls `.numpy()` on its
  input, so it has to go through `tf.py_function`.

  Args:
    text: tensor holding the raw review text.
    label: tensor holding the sentiment label.

  Returns:
    A `(encoded_text, label)` tuple of int64 tensors with static shapes
    `[None]` (variable-length sequence) and `[]` (scalar).
  """
  encoded_text, encoded_label = tf.py_function(
      encode, inp=[text, label], Tout=(tf.int64, tf.int64))
  # tf.py_function outputs carry no static shape information; declare it
  # explicitly so downstream shape inference (batching, Keras) can rely on it.
  encoded_text.set_shape([None])
  encoded_label.set_shape([])
  return encoded_text, encoded_label
  
# Apply the integer encoding to every split
ds_train = ds_raw_train.map(map_func=encode_map_fn)
ds_valid = ds_raw_valid.map(map_func=encode_map_fn)
ds_test = ds_raw_test.map(map_func=encode_map_fn)

# looking at the shape of some examples:
tf.random.set_seed(1)
for encoded_text, encoded_label in ds_train.shuffle(1000).take(5):
  print('Sequence length:', encoded_text.shape)

In [None]:
# Making the length of the sequence equal
# Take a small subset
ds_subset = ds_train.take(8)
for subset_tokens, subset_label in ds_subset:
  print('Individual size:', subset_tokens.shape)

print('\n')
# Dividing the dataset into batches
# padded_shapes: [-1] pads each sequence to the longest one in its batch,
# [] leaves the scalar label untouched.
ds_batched = ds_subset.padded_batch(batch_size=4, padded_shapes=([-1], []))
for batch_tokens, batch_labels in ds_batched:
  print('Batch dimension:', batch_tokens.shape)

In [None]:
# Dividing the datasets into mini-batches with a batch-size of 32
# Every split is batched identically: sequences are padded to the longest one
# in the batch ([-1]) and the scalar labels are left as they are ([]).
train_data, valid_data, test_data = (
    encoded_split.padded_batch(32, padded_shapes=([-1], []))
    for encoded_split in (ds_train, ds_valid, ds_test)
)

In [None]:
embedding_dim = 20
# NOTE(review): the +2 presumably reserves the padding index 0 and one
# out-of-vocabulary id of the encoder — confirm against TokenTextEncoder.
vocab_size = len(token_counts) + 2

tf.random.set_seed(1)

# Build the model
# Layers are created in the same order in which they are stacked.
embed_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embedding_dim, name='embed-layer')
bidir_lstm_layer = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64, name='lstm-layer'), name='bidir-lstm')
hidden_layer = tf.keras.layers.Dense(64, activation='relu')
# Single sigmoid unit: the model outputs a probability, not a logit
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

bi_lstm_model = tf.keras.Sequential(
    [embed_layer, bidir_lstm_layer, hidden_layer, output_layer])
bi_lstm_model.summary()

In [None]:
# Compile and train
adam_optimizer = tf.keras.optimizers.Adam(1e-4)
# from_logits=False because the loss receives the sigmoid output configured
# in this compile call's model, i.e. probabilities
bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
bi_lstm_model.compile(
    optimizer=adam_optimizer, loss=bce_loss, metrics=['accuracy'])

# `history` keeps the per-epoch training and validation metrics
history = bi_lstm_model.fit(
    train_data, validation_data=valid_data, epochs=15)

In [None]:
# Evaluate
# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy requested in compile()
test_results = bi_lstm_model.evaluate(test_data)
test_accuracy = test_results[1]
print('Test Acc: {:.2f}%'.format(test_accuracy * 100))