In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

import os

# Get data

In [2]:
# Base URL of the hosted text files and the three translations to fetch from it.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']  # 3 translations

# Fetch every file; get_file hands back the local path of the file it stored.
for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(fname=name, origin=DIRECTORY_URL + name)

# The directory part of the last returned path is used as the location of all three files.
parent_dir = os.path.split(text_dir)[0]
parent_dir

'C:\\Users\\kevin\\.keras\\datasets'

# Load into datasets

In [3]:
def labeler(example, index):
    """Attach a label to one example; intended for use with tf.data.Dataset.map().

    Args:
        example: the dataset element to label (passed through unchanged).
        index: the label value; it is cast to tf.int64.

    Returns:
        A tuple ``(example, label)`` where ``label`` is ``index`` as tf.int64.
    """
    label = tf.cast(index, tf.int64)
    return example, label

# One labeled dataset per translation; the label is the file's position in FILE_NAMES.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    file_path = os.path.join(parent_dir, file_name)
    lines_dataset = tf.data.TextLineDataset(file_path)
    # The lambda is handed to map() inside the loop, while `i` still refers to this file.
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets += [labeled_dataset]

In [4]:
# Merge the three labeled datasets into a single shuffled dataset.

BUFFER_SIZE = 50000  # shuffle buffer size
BATCH_SIZE = 64      # examples per padded batch
TAKE_SIZE = 5000     # number of examples set aside as the test split

# Start from the first dataset and append the remaining ones in order.
combined = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    combined = combined.concatenate(labeled_dataset)

# reshuffle_each_iteration=False: the order is not reshuffled on every pass,
# which the later take()/skip() split into test and training data relies on.
all_labeled_data = combined.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [5]:
# Peek at the first five (example, label) pairs of the combined dataset.
preview = all_labeled_data.take(5)
for example in preview:
    print(example)

(<tf.Tensor: shape=(), dtype=string, numpy=b'An idler all the day, or if by force'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'To meet his rage, for fear is on them all;'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"His helmet's crest, but, brass encount'ring brass,">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Hector hurried from the house when she had done speaking, and went down'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'blessed parents."'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)


# Encode text lines as numbers

### build vocabulary

In [6]:
# Collect every distinct token that appears anywhere in the combined dataset.
tokenizer = tfds.deprecated.text.Tokenizer()
vocabulary_set = set()

for text_tensor, _ in all_labeled_data:
    # .numpy() yields the raw line; the set keeps each token only once.
    vocabulary_set.update(tokenizer.tokenize(text_tensor.numpy()))

# Number of distinct tokens found.
vocab_size = len(vocabulary_set)
vocab_size

17178

### encode texts

In [7]:
# Build the text encoder from the vocabulary collected above.
encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set)

# Sanity check: show the first line of the dataset and its integer encoding.
first_example = next(iter(all_labeled_data))
example_text = first_example[0].numpy()
print(example_text)

encoded_example = encoder.encode(example_text)
print(encoded_example)

b'An idler all the day, or if by force'
[14395, 12438, 9520, 3916, 15117, 8190, 10178, 12784, 7598]


In [8]:
def encode(text_tensor, label):
    """Encode one text example with the module-level ``encoder``.

    Wrapped in a function so it can be used (via tf.py_function) by Dataset.map().

    Args:
        text_tensor: tensor holding the text; its ``.numpy()`` value is encoded.
        label: the example's label, returned unchanged.

    Returns:
        A tuple ``(encoded_text, label)``.
    """
    raw_text = text_tensor.numpy()
    return encoder.encode(raw_text), label

def encode_map_fn(text, label):
    """Dataset.map()-compatible wrapper around ``encode``.

    ``encode`` calls ``.numpy()`` on its input, so it is run through
    tf.py_function; both outputs are declared as tf.int64.

    Args:
        text: the text component of a dataset element.
        label: the label component of a dataset element.

    Returns:
        A tuple ``(encoded_text, label)`` with static shapes set to
        ``[None]`` and ``[]`` respectively.
    """
    token_ids, label_out = tf.py_function(
        func=encode,
        inp=[text, label],
        Tout=(tf.int64, tf.int64),
    )

    # tf.data.Datasets is more efficient if all components have a shape set,
    # so set the shapes manually.
    token_ids.set_shape([None])
    label_out.set_shape([])

    return token_ids, label_out


# Encode every (text, label) pair of the combined dataset.
all_encoded_data = all_labeled_data.map(encode_map_fn)

# Training and test set

In [9]:
# Test split: the first TAKE_SIZE encoded examples.
# padded_batch is used because not every sample has the same length of text.
test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(BATCH_SIZE)

# Training split: everything after the first TAKE_SIZE examples, shuffled and batched.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

In [10]:
# A new token encoding is added (the 0 used for padding), so the size passed to the
# embedding layer is the number of vocabulary tokens plus one.
# Derived from vocabulary_set instead of `vocab_size += 1` so that re-running this
# cell gives the same value rather than incrementing it again.
# NOTE(review): the encoder may also reserve an out-of-vocabulary id (compare with
# encoder.vocab_size) — confirm before encoding text from outside this corpus.
vocab_size = len(vocabulary_set) + 1

# Build model

In [11]:
# Layers are listed in the order they are stacked.
model = tf.keras.Sequential([
    # embedding layer: one 64-dimensional vector per token id
    tf.keras.layers.Embedding(vocab_size, 64),
    # LSTM layer helps to understand words before and after (bidirectional)
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    # one or more dense layers, edit the list to change layers
    *[tf.keras.layers.Dense(units, activation='relu') for units in [64, 64]],
    # output layer: three outputs, one per label
    tf.keras.layers.Dense(3),
])

In [12]:
# The output layer has no activation, so the loss is told to expect logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy'],
)

# Train

In [13]:
# Train for three epochs, validating against the test split.
model.fit(
    train_data,
    validation_data=test_data,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1c2745e0b50>

In [14]:
# Evaluate on the test split; evaluate() returns the loss followed by the accuracy metric.
eval_loss, eval_acc = model.evaluate(test_data)

summary_template = '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'
print(summary_template.format(eval_loss, eval_acc))


Eval loss: 0.393, Eval accuracy: 0.835
