In [1]:
import tensorflow as tf

import tensorflow_datasets as tfds
import os

In [2]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL+name)
  
parent_dir = os.path.dirname(text_dir)

parent_dir

'C:\\Users\\jdaaa\\.keras\\datasets'

In [3]:
def labeler(example, index):
    return example, tf.cast(index, tf.int64)  

labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

In [4]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [5]:
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
  
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [6]:
for ex in all_labeled_data.take(5):
    print(ex)

(<tf.Tensor: id=74, shape=(), dtype=string, numpy=b"The sage Ulysses promis'd in thy tent:">, <tf.Tensor: id=75, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=78, shape=(), dtype=string, numpy=b"Jove to Olympus, to th' abode of Gods,">, <tf.Tensor: id=79, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=82, shape=(), dtype=string, numpy=b'But what my single arm, and feet, and strength'>, <tf.Tensor: id=83, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=86, shape=(), dtype=string, numpy=b'"Whither away? what madness fills your breasts?'>, <tf.Tensor: id=87, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=90, shape=(), dtype=string, numpy=b'spears in front of them, for Ajax had given them strict orders that no'>, <tf.Tensor: id=91, shape=(), dtype=int64, numpy=2>)


In [7]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

17178

In [8]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [9]:
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

b"The sage Ulysses promis'd in thy tent:"


In [10]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[13150, 16448, 2534, 6586, 1103, 15989, 7088, 1091]


In [11]:
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label

def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

all_encoded_data = all_labeled_data.map(encode_map_fn)

In [12]:
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))

In [13]:
sample_text, sample_labels = next(iter(test_data))
sample_text[0], sample_labels[0]

W0629 19:40:07.549597  7296 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0629 19:40:07.565775  7296 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int64
W0629 19:40:07.570452 13748 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0629 19:40:07.579900 13748 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int64
W0629 19:40:07.587361  7296 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string


(<tf.Tensor: id=149240, shape=(16,), dtype=int64, numpy=
 array([13150, 16448,  2534,  6586,  1103, 15989,  7088,  1091,     0,
            0,     0,     0,     0,     0,     0,     0], dtype=int64)>,
 <tf.Tensor: id=149244, shape=(), dtype=int64, numpy=1>)

In [14]:
vocab_size += 1

In [15]:
model = tf.keras.Sequential()

In [16]:
model.add(tf.keras.layers.Embedding(vocab_size, 64))

In [17]:
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [18]:
# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(3, activation='softmax'))

In [19]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3


W0629 19:40:39.080477  5856 deprecation.py:323] From c:\users\jdaaa\pycharmprojects\textminingproject\venv\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


    169/Unknown - 32s 32s/step - loss: 1.1026 - accuracy: 0.20 - 32s 16s/step - loss: 1.1001 - accuracy: 0.29 - 32s 11s/step - loss: 1.0984 - accuracy: 0.31 - 33s 8s/step - loss: 1.0961 - accuracy: 0.3438 - 33s 7s/step - loss: 1.0936 - accuracy: 0.362 - 35s 6s/step - loss: 1.0914 - accuracy: 0.372 - 36s 5s/step - loss: 1.0887 - accuracy: 0.379 - 36s 5s/step - loss: 1.0871 - accuracy: 0.377 - 36s 4s/step - loss: 1.0861 - accuracy: 0.381 - 36s 4s/step - loss: 1.0823 - accuracy: 0.387 - 37s 3s/step - loss: 1.0786 - accuracy: 0.390 - 37s 3s/step - loss: 1.0758 - accuracy: 0.389 - 37s 3s/step - loss: 1.0734 - accuracy: 0.387 - 37s 3s/step - loss: 1.0700 - accuracy: 0.387 - 37s 2s/step - loss: 1.0680 - accuracy: 0.386 - 37s 2s/step - loss: 1.0629 - accuracy: 0.386 - 37s 2s/step - loss: 1.0559 - accuracy: 0.392 - 37s 2s/step - loss: 1.0536 - accuracy: 0.388 - 37s 2s/step - loss: 1.0490 - accuracy: 0.391 - 38s 2s/step - loss: 1.0457 - accuracy: 0.395 - 39s 2s/step - loss: 1.0419 - accuracy: 0.

    336/Unknown - 48s 281ms/step - loss: 0.7000 - accuracy: 0.62 - 48s 280ms/step - loss: 0.7004 - accuracy: 0.62 - 48s 278ms/step - loss: 0.6993 - accuracy: 0.62 - 48s 277ms/step - loss: 0.6984 - accuracy: 0.62 - 48s 276ms/step - loss: 0.6975 - accuracy: 0.62 - 48s 275ms/step - loss: 0.6965 - accuracy: 0.62 - 48s 273ms/step - loss: 0.6961 - accuracy: 0.62 - 48s 272ms/step - loss: 0.6958 - accuracy: 0.62 - 48s 271ms/step - loss: 0.6953 - accuracy: 0.62 - 48s 270ms/step - loss: 0.6949 - accuracy: 0.62 - 48s 268ms/step - loss: 0.6944 - accuracy: 0.62 - 48s 267ms/step - loss: 0.6936 - accuracy: 0.62 - 48s 266ms/step - loss: 0.6927 - accuracy: 0.62 - 48s 265ms/step - loss: 0.6920 - accuracy: 0.62 - 49s 264ms/step - loss: 0.6908 - accuracy: 0.62 - 49s 263ms/step - loss: 0.6904 - accuracy: 0.62 - 49s 261ms/step - loss: 0.6898 - accuracy: 0.62 - 49s 260ms/step - loss: 0.6907 - accuracy: 0.62 - 49s 259ms/step - loss: 0.6906 - accuracy: 0.62 - 49s 258ms/step - loss: 0.6904 - accuracy: 0.62 - 49

    503/Unknown - 57s 169ms/step - loss: 0.6167 - accuracy: 0.68 - 57s 169ms/step - loss: 0.6160 - accuracy: 0.68 - 57s 169ms/step - loss: 0.6154 - accuracy: 0.68 - 57s 168ms/step - loss: 0.6147 - accuracy: 0.68 - 57s 168ms/step - loss: 0.6143 - accuracy: 0.68 - 57s 168ms/step - loss: 0.6138 - accuracy: 0.68 - 57s 167ms/step - loss: 0.6133 - accuracy: 0.68 - 57s 167ms/step - loss: 0.6130 - accuracy: 0.68 - 58s 167ms/step - loss: 0.6127 - accuracy: 0.68 - 58s 166ms/step - loss: 0.6126 - accuracy: 0.68 - 58s 166ms/step - loss: 0.6121 - accuracy: 0.68 - 58s 166ms/step - loss: 0.6116 - accuracy: 0.68 - 58s 166ms/step - loss: 0.6111 - accuracy: 0.68 - 58s 165ms/step - loss: 0.6107 - accuracy: 0.68 - 58s 165ms/step - loss: 0.6103 - accuracy: 0.68 - 58s 165ms/step - loss: 0.6094 - accuracy: 0.68 - 58s 164ms/step - loss: 0.6089 - accuracy: 0.68 - 58s 164ms/step - loss: 0.6086 - accuracy: 0.68 - 58s 164ms/step - loss: 0.6080 - accuracy: 0.68 - 58s 163ms/step - loss: 0.6075 - accuracy: 0.68 - 58

    670/Unknown - 67s 132ms/step - loss: 0.5590 - accuracy: 0.72 - 67s 132ms/step - loss: 0.5588 - accuracy: 0.72 - 67s 132ms/step - loss: 0.5584 - accuracy: 0.72 - 67s 132ms/step - loss: 0.5582 - accuracy: 0.72 - 67s 132ms/step - loss: 0.5580 - accuracy: 0.72 - 67s 132ms/step - loss: 0.5577 - accuracy: 0.72 - 67s 131ms/step - loss: 0.5576 - accuracy: 0.72 - 67s 131ms/step - loss: 0.5575 - accuracy: 0.72 - 67s 131ms/step - loss: 0.5574 - accuracy: 0.72 - 67s 131ms/step - loss: 0.5574 - accuracy: 0.72 - 67s 131ms/step - loss: 0.5569 - accuracy: 0.72 - 67s 131ms/step - loss: 0.5566 - accuracy: 0.72 - 67s 131ms/step - loss: 0.5563 - accuracy: 0.72 - 67s 130ms/step - loss: 0.5560 - accuracy: 0.72 - 67s 130ms/step - loss: 0.5556 - accuracy: 0.72 - 68s 130ms/step - loss: 0.5552 - accuracy: 0.72 - 68s 130ms/step - loss: 0.5551 - accuracy: 0.72 - 68s 130ms/step - loss: 0.5547 - accuracy: 0.72 - 68s 130ms/step - loss: 0.5546 - accuracy: 0.72 - 68s 130ms/step - loss: 0.5545 - accuracy: 0.72 - 68

Epoch 2/3


In [None]:
print("")
eval_loss, eval_acc = model.evaluate(test_data)
print('\nEval loss: {}, Eval accuracy: {}'.format(eval_loss, eval_acc))

In [None]:
tf.__version__