In [3]:
# Sentiment classification between positive and negative movie reviews
# Use TFDS - tensorflow-datasets
# !pip install -q tensorflow-datasets

import tensorflow_datasets as tfds

# Fetch the IMDB reviews dataset through TFDS.
#   with_info=True     -> also return the dataset description object
#   as_supervised=True -> load each example as an (input, label) pair
imdb, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)

# Show the dataset description
print(info)

Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to ~\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…





HBox(children=(IntProgress(value=0, description='Generating splits...', max=3, style=ProgressStyle(description…

HBox(children=(IntProgress(value=1, bar_style='info', description='Generating train examples...', max=1, style…

HBox(children=(IntProgress(value=0, description='Shuffling ~\\tensorflow_datasets\\imdb_reviews\\plain_text\\1…

HBox(children=(IntProgress(value=1, bar_style='info', description='Generating test examples...', max=1, style=…

HBox(children=(IntProgress(value=0, description='Shuffling ~\\tensorflow_datasets\\imdb_reviews\\plain_text\\1…

HBox(children=(IntProgress(value=1, bar_style='info', description='Generating unsupervised examples...', max=1…

HBox(children=(IntProgress(value=0, description='Shuffling ~\\tensorflow_datasets\\imdb_reviews\\plain_text\\1…

Dataset imdb_reviews downloaded and prepared to ~\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.
tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='~\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supe

In [5]:
# Inspect the loaded dataset

# Show the dictionary of available splits
print(imdb)

print()
# Pull the first 2 training examples and print their contents
first_two_examples = imdb['train'].take(2)
for example in first_two_examples:
    print(example)

{Split('train'): <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>, Split('test'): <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>, Split('unsupervised'): <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit thr

In [7]:
import numpy as np

# Pick the labelled train and test splits out of the loaded dataset dict
train_data = imdb['train']
test_data = imdb['test']

# Containers for the decoded review texts and their labels
training_sentences, training_labels = [], []
testing_sentences, testing_labels = [], []

# Walk each split exactly once; for every (text, label) pair, decode the
# text bytes as UTF-8 and unwrap the label tensor into a numpy value
for split, sentences, labels in (
    (train_data, training_sentences, training_labels),
    (test_data, testing_sentences, testing_labels),
):
    for text_tensor, label_tensor in split:
        sentences.append(text_tensor.numpy().decode('utf8'))
        labels.append(label_tensor.numpy())

# Turn the label lists into numpy arrays
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [8]:
# Generate Padded Sequences

# Hyperparameters shared by the tokenizer, the padding step and the model
oov_tok = "<OOV>"      # token passed to the Tokenizer as its oov_token
vocab_size = 10000     # passed to the Tokenizer (num_words) and to the Embedding layer
embedding_dim = 16     # second argument of the Embedding layer (vector size per word)
max_length = 120       # maxlen used when padding; also the Embedding input_length
trunc_type = 'post'    # truncating mode handed to pad_sequences

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create the tokenizer with the configured vocabulary cap and OOV token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)

# Build the word index dictionary from the training sentences only
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode both splits as integer sequences using the fitted tokenizer
sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Bring every sequence to max_length, using the same options for both splits
padding_options = dict(maxlen=max_length, truncating=trunc_type)
padded = pad_sequences(sequences, **padding_options)
testing_padded = pad_sequences(testing_sequences, **padding_options)

In [11]:
# Build and compile the sentiment model
# Use an Embedding layer to represent each word in the vocabulary as a vector
# These vectors are trainable weights, so they are updated as the network learns
# Words that are most likely to appear in positive reviews will converge towards similar weights
# After the Embedding layer, flatten its output and feed it into a Dense layer
# The output is a single sigmoid neuron for the 2 classes; use binary_crossentropy as the loss function

import tensorflow as tf

# Assemble the network one layer at a time
model = tf.keras.Sequential()
# Embedding over vocab_size indices, embedding_dim values each, max_length tokens per input
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# Flatten the embedding output before the dense layers
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation='relu'))
# Single sigmoid unit for the two-class output
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Set up the training parameters
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Fit the model on the padded training sequences, validating on the padded test set
# Original note: adjust parameters to get 1 and ~83%
#   (presumably training vs. validation accuracy -- TODO confirm)

num_epochs = 10

# Train the model
model.fit(
    x=padded,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x213859b4d08>

In [17]:
# Visualize Word Embeddings (for the Tensorflow Embedding Projector)

# The embedding layer is the first layer of the model
embedding_layer = model.layers[0]

# Keep the first entry of the layer's weights
layer_weights = embedding_layer.get_weights()
embedding_weights = layer_weights[0]

# Show the shape of the embedding weights
print(embedding_weights.shape)

(10000, 16)


In [18]:
# Prepare to generate two files (written by the loop in the following cell):
# vecs.tsv -> contains the vector weights of each word in the vocabulary
# meta.tsv -> contains the words in the vocabulary

# Reverse of the word index (number -> word), taken from the tokenizer's
# index_word attribute, to quickly look up a word from a given number
reverse_word_index = tokenizer.index_word

In [20]:
# Generate files with a loop
# Loop vocab_size-1, skipping the 0 key because it is just for the padding

import io

# Write the embedding export files:
#   vecs.tsv -> one line per word, the embedding values separated by tabs
#   meta.tsv -> one line per word, the word itself
# Line k of both files refers to the same word.

# Exclusive upper bound for the loop. Index 0 is skipped (padding), and the
# loop never goes past the last index present in reverse_word_index, so a
# tokenizer fitted on fewer than vocab_size - 1 distinct words does not raise
# KeyError.
last_index = min(vocab_size, len(reverse_word_index) + 1)

# The with-block closes both files even if an error occurs while writing
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_index):
        # Get the word associated with the current index
        word_name = reverse_word_index[word_num]

        # Get the embedding weights associated with the current index
        word_embedding = embedding_weights[word_num]

        # Write the word name
        out_m.write(word_name + "\n")

        # Write the word embedding as tab-separated values
        out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

In [None]:
# https://projector.tensorflow.org/