In [0]:
!pip install tensorflow-gpu==2.0.0
!pip install -q tensorflow-hub
!pip install -q tensorflow-datasets

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from tensorflow.python.client import device_lib
import numpy as np
import matplotlib.pyplot as plt

# Keep TFDS download/prepare progress bars out of the notebook output.
tfds.disable_progress_bar()

# Report library versions and whether eager execution is active.
for template, value in (
    ("TF Version: {}", tf.__version__),
    ("Eager model: {}", tf.executing_eagerly()),
    ("TFHub Version: {}", hub.__version__),
    ("TFDS Version: {}", tfds.__version__),
):
  print(template.format(value))

# Report hardware visibility: an empty GPU list means training runs on CPU.
if tf.config.experimental.list_physical_devices("GPU"):
  gpu_status = "is"
else:
  gpu_status = "is not"
print("GPU {} available".format(gpu_status))
print("Devices: {}".format(device_lib.list_local_devices()))

# Download IMDB Datasets

List all the datasets available in TensorFlow Datasets.

In [0]:
tfds.list_builders()

Download the IMDB dataset to the local machine (default: `$HOME/tensorflow_datasets/imdb_reviews`).

Split the training dataset into two sub-datasets, training and validation, in a ratio of 6:4.

In [0]:
# Split the training set 60% / 40% into training and validation parts using
# TFDS split-slicing strings. The legacy `tfds.Split.TRAIN.subsplit([6, 4])`
# call is not available in recent TFDS releases, and `tensorflow-datasets` is
# installed without a version pin at the top of this notebook.
# The pair keeps the nested structure that the `tfds.load` call below unpacks.
train_validation_split = ("train[:60%]", "train[60%:]")

In [0]:
# Load the IMDB reviews as (text, label) pairs (`as_supervised=True`).
# The nested `split` structure is mirrored in the return value: the two
# training sub-splits come back as a pair, followed by the test split.
(train_data, val_data), test_data = tfds.load(
    name="imdb_reviews", split=(train_validation_split, tfds.Split.TEST), as_supervised=True)

## Explore the Dataset

It is easy for you to take a batch of data via `batch()`.

In [0]:
train_sentences_batch, train_lables_batch = next(iter(train_data.batch(10)))

In [0]:
train_sentences_batch

Each label indicates whether the review is positive (1) or negative (0).

In [0]:
train_lables_batch

# Building the Model

One of the best practices is to represent the text as an embedding vector. This has several advantages: there is no need to worry about text preprocessing, the model benefits from transfer learning, and the embedding vector has a fixed size.

Below, we use a pre-trained text embedding model (https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1) from TensorFlow Hub.

In [0]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

In [0]:
# Wrap the pre-trained embedding as a Keras layer that takes scalar strings
# (`input_shape=[]`, `dtype=tf.string`). `trainable=True` leaves its weights
# trainable, so they are updated when the model is fit.
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)
# Try the layer on the first three sentences of the sample batch.
hub_layer(train_sentences_batch[:3])

After creating an embedding layer for the sentences, you can create a model for classifying them.

In [0]:
def build_model(inputs):
  """Connect the sentence classifier on top of the shared hub embedding.

  The graph is: module-level `hub_layer` (sentence embedding), then a
  16-unit ReLU dense layer, then a single sigmoid unit.

  Args:
    inputs: Keras input tensor of raw review strings.

  Returns:
    The output tensor of the final sigmoid layer (one value per review).
  """
  sentence_vectors = hub_layer(inputs)
  hidden = tf.keras.layers.Dense(16, activation='relu')(sentence_vectors)
  return tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

In [0]:
inputs = tf.keras.Input(shape=[], dtype=tf.string)
outputs = build_model(inputs)
model = tf.keras.Model(inputs, outputs)
model.summary()

## Define the Loss and Metrics Functions

In [0]:
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

## Training the Model

In [0]:
# Train on shuffled batches of 512 reviews for 20 epochs; after each epoch
# Keras reports metrics on `val_data`, the 40% held out of the training split.
model.fit(train_data.shuffle(10000).batch(512), 
          epochs=20, 
          validation_data=val_data.batch(512), 
          verbose=1)

## Evaluation

In [0]:
# Score the trained model on the test split, which was not used for training
# or validation above. `evaluate` returns the loss followed by the compiled
# metrics (here: accuracy).
test_loss, test_acc = model.evaluate(test_data.batch(512), verbose=2)
print(test_loss, test_acc)

# Advanced Text Classification with Preprocessed Text

## Preprocessing Text

You can also load a version of the IMDB dataset in which the reviews are already encoded with a vocabulary of ~8K subwords.

In [0]:
# Load the "subwords8k" configuration of IMDB reviews, whose text feature is
# already encoded as integer ids. Only the train and test splits are
# requested here; `info` carries the dataset metadata (including the encoder).
(train_data_8k, test_data_8k), info = tfds.load(
    name="imdb_reviews/subwords8k", 
    split = (tfds.Split.TRAIN, tfds.Split.TEST),
    as_supervised = True,
    with_info = True    # also return the `info` structure
)

The dataset `info` includes the text encoder (`tfds.features.text.SubwordTextEncoder`).

In [0]:
encoder = info.features["text"].encoder

In [0]:
# Show the vocabulary size plus a short sample of subwords; printing the whole
# subword list would flood the cell output with thousands of entries.
print('Vocabulary Size in the Encoder: {}.'.format(encoder.vocab_size))
print('First 20 Subwords in the Encoder: {}.'.format(encoder.subwords[:20]))

The text encoder converts any string into a list of subword ids, and back again.

In [0]:
sample_str = "Hello world, Tensorflow!"

In [0]:
encoded = encoder.encode(sample_str)
encoded

In [0]:
decoded_str = encoder.decode(encoded)
decoded_str

In [0]:
assert decoded_str == sample_str

You can also decode each id on its own to see which subword it stands for.

In [0]:
for enc in encoded:
  print("{} -> {}".format(enc, encoder.decode([enc])))

## Explore Encoded Data

A quick way to get a set of data via the `batch()` method.

In [0]:
train_eg, train_lbl = next(iter(train_data_8k.batch(1)))

In [0]:
train_eg.numpy()[0][:10]

The method above returns a batch of data, but `batch()` only works when every element in the batch has the same shape, as in most image datasets. It works here only because the batch size is 1; the encoded reviews have different lengths. For text or other sequence datasets, a better way to get a small set of examples is the `take()` method.

The `take()` method returns the given number of examples one at a time, without batching.

In [0]:
# Look at the first ten examples one by one (no batching), printing the first
# ten ids of each encoded review together with its label.
# Note: `train_example` still holds the last example after the loop ends and
# is reused by the decoding cell that follows.
for train_example, train_label in train_data_8k.take(10):
  print('Encoded Text: {}'.format(train_example[:10].numpy()))
  print('Label: {}'.format(train_label.numpy()))

You can decode the encoded number into a string like the operation above.

In [0]:
encoder.decode(train_example)

Note that the `output_shapes` property of `tf.data.Dataset` is deprecated; you can get the output shapes via `tf.compat.v1.data.get_output_shapes(train_data_8k)` instead.

This model uses zero-padding without masking, so the padding is treated as part of the input and the padding length may affect the output.

In [0]:
# Shuffle with a 1000-element buffer, then form batches of 32 reviews with
# `padded_batch`, passing the dataset's own output shapes as the padded shapes
# so that variable-length reviews can be stacked into one tensor per batch.
train_batches = train_data_8k.shuffle(1000).padded_batch(32, tf.compat.v1.data.get_output_shapes(train_data_8k))

In [0]:
# Same padded batching for the test split; no shuffling is applied here.
test_batches = test_data_8k.padded_batch(32, tf.compat.v1.data.get_output_shapes(test_data_8k))

In [0]:
for example_batch, label_batch in train_batches.take(3):
  print("Batch shape {}".format(example_batch.shape))
  print("Label shape {}".format(label_batch.shape))

Each batch has the shape `(batch_size, sequence_length)`. The padding is dynamic, so the sequence length can differ from batch to batch.

## Building the Model

The output shape of the embedding layer is **(batch, sequence, embedding)**.

The GlobalAveragePooling1D() layer helps to handle input of variable length in the simplest way possible.

The last layer outputs a single float between 0 and 1, representing the confidence or probability.

In [0]:
def build_model_adv(inputs):
  """Connect the subword-id classifier to `inputs` and return its output.

  The layers, in order: a 32-dimensional embedding over the encoder's
  vocabulary (`encoder.vocab_size`, read from the notebook scope), global
  average pooling, a 10-unit dense layer followed by batch normalization and
  ReLU, and a single sigmoid output unit.

  Args:
    inputs: Keras input tensor of encoded reviews (the notebook passes an
      input of shape `[None]`, i.e. variable-length id sequences).

  Returns:
    The output tensor of the final sigmoid layer (one value per review).
  """
  # Layers are constructed in the same order in which they are applied.
  layer_stack = [
      tf.keras.layers.Embedding(input_dim=encoder.vocab_size, output_dim=32),
      tf.keras.layers.GlobalAveragePooling1D(),
      tf.keras.layers.Dense(10),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.Activation('relu'),
      tf.keras.layers.Dense(1, activation='sigmoid'),
  ]
  features = inputs
  for layer in layer_stack:
    features = layer(features)
  return features

The input shape of the model is `[None]` because the input length varies from batch to batch.

In [0]:
inputs = tf.keras.Input(shape=[None])
outputs = build_model_adv(inputs=inputs)
model_adv = tf.keras.Model(inputs, outputs)
model_adv.summary()

In [0]:
model_adv.compile(optimizer='Adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

## Training the Model

In [0]:
# Train for 10 epochs and keep the returned History object for plotting.
# NOTE(review): validation here uses batches from the *test* split, so the
# validation curves are not independent of the final test evaluation.
# `validation_steps=30` limits each validation pass to 30 batches.
his = model_adv.fit(train_batches, 
                    epochs=10, 
                    validation_data=test_batches, 
                    validation_steps=30)

Evaluate the model.

In [0]:
loss, accuracy = model_adv.evaluate(test_batches)
print("Loss: {}, Accuracy: {}".format(loss, accuracy))

## View the details in the training history

In [0]:
his_dict = his.history

In [0]:
his_dict.keys()

In [0]:
# Per-epoch training and validation curves recorded in the training history.
acc = his_dict['accuracy']
# NOTE: this rebinds `loss`, which until now held the scalar test loss
# returned by `model_adv.evaluate` in the evaluation cell.
loss = his_dict['loss']
val_loss = his_dict['val_loss']
val_acc = his_dict['val_accuracy']

In [0]:
epochs = range(1, len(acc)+1)

In [0]:
# Loss curves: training loss as blue dots, validation loss as a blue line.
fig, ax = plt.subplots()
ax.plot(epochs, loss, 'bo', label='Training Loss')
ax.plot(epochs, val_loss, 'b', label="Validation Loss")
ax.set(
    title='Training and Validation Loss',
    xlabel='Epochs',
    ylabel='Loss',
    ylim=(0, 2),
)
ax.legend()
plt.show()

In [0]:
# Accuracy curves: training accuracy as red dots, validation accuracy as a
# red line; accuracy is a fraction, hence the fixed 0-1 y-range.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'ro', label='Training Accuracy')
ax.plot(epochs, val_acc, 'r', label='Validation Accuracy')
ax.set(
    title='Training and Validation Accuracy',
    xlabel='Epochs',
    ylabel='Accuracy',
    ylim=(0, 1),
)
ax.legend(loc='lower right')
plt.show()

You can see that the model overfits. One way to avoid this is to stop training early.