In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from keras import layers
from keras import losses

In [2]:
# Download the IMDB review archive into the current working directory and unpack it
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)
# The reviews live in an 'aclImdb' folder located beside the downloaded file
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
# Show the top-level entries of the extracted dataset folder
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [4]:
# Path of the training split; list it to see which sub-folders and files it holds
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [5]:
# Remove unsupervised training data: the directory loader infers one class per
# sub-folder, so only the labelled 'neg' and 'pos' folders should remain.
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the delete so that re-running this cell (or the whole notebook once the
# folder is already gone) does not fail with FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [6]:
# Loader settings shared by the dataset cells: batch size and the split seed
batch_size = 32
seed = 42

# Training subset: with validation_split=0.2 this keeps 80% of the files under train/
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    directory='aclImdb/train/',
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [12]:
# Peek at the first three reviews (raw bytes) of one training batch and their labels
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to numpy once instead of once per printed line
    reviews, labels = text_batch.numpy(), label_batch.numpy()
    for i in range(3):
        print("Review", reviews[i])
        print("Label", labels[i])

Review b'Great movie - especially the music - Etta James - "At Last". This speaks volumes when you have finally found that special someone.'
Label 0
Review b"I am shocked. Shocked and dismayed that the 428 of you IMDB users who voted before me have not given this film a rating of higher than 7. 7?!?? - that's a C!. If I could give FOBH a 20, I'd gladly do it. This film ranks high atop the pantheon of modern comedy, alongside Half Baked and Mallrats, as one of the most hilarious films of all time. If you know _anything_ about rap music - YOU MUST SEE THIS!! If you know nothing about rap music - learn something!, and then see this! Comparisons to 'Spinal Tap' fail to appreciate the inspired genius of this unique film. If you liked Bob Roberts, you'll love this. Watch it and vote it a 10!"
Label 1
Review b'What a lovely heart warming television movie. The story tells of a little five year old girl who has lost her daddy and finds it impossible to cope. Her mother is also very distressed .

In [11]:
# Map the integer labels back to the folder names they were inferred from
class_names = raw_train_ds.class_names
print("Label 0 corresponds to", class_names[0])
print("Label 1 corresponds to", class_names[1])

Label 0 corresponds to neg
Label 1 corresponds to pos


In [13]:
# Validation subset: the 20% of train/ held back by validation_split.
# The split fraction and seed are the same as for the training subset.
validation = tf.keras.utils.text_dataset_from_directory(
    directory='aclImdb/train/',
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [15]:
# Test data: every file under test/ is loaded, no validation split is applied
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    directory='aclImdb/test/',
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.


In [16]:
def custom_standardization(input_data):
    """
    Normalize raw review text before it is tokenized.

    Lowercases the input, replaces every literal '<br />' tag with a space
    and then deletes each character listed in string.punctuation.
    """
    # Character class matching any single punctuation character
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    # Only the '<br />' tag is handled explicitly; no other markup is parsed
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

In [18]:
# Text vectorization layer: standardizes and splits each review, then maps it
# to a fixed-length sequence of integer token ids that the model can consume

# Upper bound on the vocabulary size
max_features = 100_000

# Length of every output sequence produced by the layer
sequence_len = 1024

# Custom standardization, default split function
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_mode='int',  # each token becomes an integer id
    output_sequence_length=sequence_len,  # fixed-length output
)


In [19]:
# Get only the features (review text, labels dropped) from the TRAINING data.
# The vocabulary must not be fit on raw_test_ds: doing so leaks test-set
# information into preprocessing and makes the test evaluation optimistic.
train_text = raw_train_ds.map(lambda x, y: x)

# Adapt the layer to the train data (this builds its vocabulary)
vectorize_layer.adapt(train_text)

In [20]:
# Helper that runs the vectorization layer on a (text, label) pair
def vectorize_text(text, label):
    """Return (vectorized text, label); the label is passed through unchanged."""
    expanded = tf.expand_dims(text, -1)  # add a trailing axis before calling the layer
    token_ids = vectorize_layer(expanded)
    return token_ids, label

In [22]:
# Take one training batch and vectorize its first review to inspect the result
text_batch, label_batch = next(iter(raw_train_ds))
x1 = text_batch[0]
y1 = label_batch[0]

vectorized_x1, y1 = vectorize_text(x1, y1)

print(f'Review: {x1}')
print(f'Label: {y1}')
print(f'Review after vectorization: shape: {vectorized_x1.shape}, vector: {vectorized_x1}')

Review: b"I went to see Fever Pitch with my Mom, and I can say that we both loved it. It wasn't the typical romantic comedy where someone is pining for the other, and blah blah blah... You weren't waiting for the climatic first kiss or for them to finally get together. It was more real, because you saw them through the relationship, rather than the whole movie be about them getting together. People could actually relate to the film, because it didn't seem like extraordinary circumstances, or impossible situations. It was really funny, and I think it was Jimmy Fallon's best performance. All in all... I would definitely recommend it!"
Label: 1
Review after vectorization: shape: (1, 1024), vector: [[ 10 393   6 ...   0   0   0]]


In [None]:
# To see which word an integer id stands for, index into the vocabulary: vectorize_layer.get_vocabulary()[integer_id]. Not run here because it raised a bytes-related error on Windows.

In [29]:
# Preprocess the dataset: run vectorize_text over every batch of each split
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, validation, raw_test_ds)
)

In [30]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each split and prefetch upcoming batches so input I/O does not block
# training; the prefetch buffer size is left to tf.data to tune
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)

In [31]:
# Simple classification network:
# embedding -> dropout -> average over the sequence axis -> dropout -> one output unit
embedding_dim = 16

model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))  # single output unit, no activation specified

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          1600016   
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 1,600,033
Trainable params: 1,600,033
Non-trainable params: 0
______________________________________________