In [29]:
import pathlib
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
# Remote folder holding the three translations, and the file name of each one.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Fetch every file; `text_dir` ends up holding the local path of the last one.
for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(fname=name, origin=DIRECTORY_URL + name)

# Directory containing the downloaded file; later cells join file names onto it.
parent_dir = pathlib.Path(text_dir).parent

## Load text into datasets

In [3]:
def labeler(example, index):
    """Pair one line of text with its file index, cast to an int64 label."""
    return example, tf.cast(index, tf.int64)


# One dataset per file; each element is (line, label) where the label is the
# position of the file in FILE_NAMES.
labeled_data_sets = []
for i, filename in enumerate(FILE_NAMES):
    file_path = str(parent_dir / filename)
    lines_dataset = tf.data.TextLineDataset(file_path)
    labeled_dataset = lines_dataset.map(lambda line: labeler(line, i))
    labeled_data_sets.append(labeled_dataset)

In [4]:
# Number of examples reserved for the test split (`take`/`skip` boundary).
TAKE_SIZE = 5000

# Number of examples per padded batch.
BATCH_SIZE = 64

# Size of the buffer handed to `Dataset.shuffle`.
BUFFER_SIZE = 50000

In [20]:
# Chain the per-file datasets end to end, starting from the first one.
all_labeled_data = labeled_data_sets[0]
for position in range(1, len(labeled_data_sets)):
    all_labeled_data = all_labeled_data.concatenate(labeled_data_sets[position])

# Shuffle with reshuffle_each_iteration=False, as the later `take`/`skip`
# split is applied to this same dataset.
all_labeled_data = all_labeled_data.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

## Encode text lines as numbers

__Build vocabulary__

- Iterate over each example's numpy value.
- Use tfds.features.text.Tokenizer to split it into tokens.
- Collect these tokens into a Python set, to remove duplicates.
- Get the size of the vocabulary for later use.

In [30]:
tokenizer = tfds.features.text.Tokenizer()

# Tokenize every line once and keep the distinct tokens; labels are ignored.
vocabulary_set = {
    token
    for text_tensor, _label in all_labeled_data
    for token in tokenizer.tokenize(text_tensor.numpy())
}

# Number of distinct tokens, used later to size the embedding layer.
vocab_size = len(vocabulary_set)

__Encode examples__

In [32]:
# Pass the vocabulary as a sorted list rather than the raw set: the iteration
# order of a set of strings can differ between interpreter runs, which would
# give each token a different integer id after a kernel restart and make any
# previously trained weights meaningless. Sorting makes the mapping stable.
encoder = tfds.features.text.TokenTextEncoder(vocab_list=sorted(vocabulary_set))

In [33]:
def encode(text_tensor, label):
    """Turn one eager text tensor into token ids using the notebook's `encoder`.

    Args:
        text_tensor: object exposing `.numpy()` that yields the raw text.
        label: value passed through unchanged.

    Returns:
        A `(encoded_text, label)` tuple.
    """
    raw_text = text_tensor.numpy()
    token_ids = encoder.encode(raw_text)
    return token_ids, label

Use `Dataset.map` to apply this function to each element of the dataset. `Dataset.map` runs in graph mode.


- Graph tensors do not have a value.
- In graph mode you can only use TensorFlow ops and functions.

So you can't `.map` the `encode()` function directly: you need to wrap it in a `tf.py_function`. The `py_function` will pass regular tensors (with a value and a `.numpy()` method to access it) to the wrapped Python function.

In [34]:
def decode_map_fn(text, label):
    """Graph-mode wrapper that runs `encode` on one `(text, label)` element.

    Despite the name, this function encodes: it calls `encode` through
    `tf.py_function` and returns the encoded text together with the label,
    both declared as int64.
    """
    py_outputs = tf.py_function(
        func=encode,
        inp=[text, label],
        Tout=[tf.int64, tf.int64],
    )
    token_ids, label_id = py_outputs

    # Shapes are set by hand here: a variable-length 1-D vector of token ids
    # and a scalar label.
    token_ids.set_shape([None])
    label_id.set_shape([])

    return token_ids, label_id


# Apply the wrapper to every (line, label) element.
all_encoded_data = all_labeled_data.map(decode_map_fn)

## Split the dataset into test and train datasets

In [37]:
# Training split: everything after the first TAKE_SIZE examples, shuffled and
# then grouped into batches padded to the longest sequence in each batch.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([None], []))
)

# Test split: the first TAKE_SIZE examples, batched the same way (no shuffle).
test_data = (
    all_encoded_data
    .take(TAKE_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([None], []))
)

In [39]:
# Padding introduced one extra token id (the zero used to pad batches), so the
# embedding needs one more row than there are distinct tokens.
# Recompute from the vocabulary instead of using `+= 1`, so that re-running
# this cell cannot keep inflating the value.
# NOTE(review): if the encoder can also emit an out-of-vocabulary id above
# len(vocabulary_set), `encoder.vocab_size` would be the safer size — confirm.
vocab_size = len(vocabulary_set) + 1

## Build the model

In [47]:
# Classifier: token ids -> 64-d embeddings -> bidirectional LSTM -> two ReLU
# dense layers -> 3 output units (one per file in FILE_NAMES).
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    ]
    + [tf.keras.layers.Dense(width, activation='relu') for width in (64, 64)]
    + [tf.keras.layers.Dense(3)]
)

# The last layer has no activation, so the loss is told it receives logits.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [48]:
# Print the layer-by-layer table of output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          1099456   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 195       
Total params: 1,178,115
Trainable params: 1,178,115
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [49]:
# Train for three epochs on the training batches, using the held-out test
# batches as validation data.
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f1208781ad0>