Predicción del autor de una traducción --- 0:00 min
===

* Última modificación: Marzo 1, 2022 | YouTube

Importación de librerías
---

In [1]:
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import tensorflow_text as tf_text

Descarga de datos
---

In [2]:
import pathlib

DIRECTORY_URL = "https://storage.googleapis.com/download.tensorflow.org/data/illiad/"

FILE_NAMES = [
    "cowper.txt",
    "derby.txt",
    "butler.txt",
]

for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(
        name, origin=DIRECTORY_URL + name, cache_subdir="/tmp/authors"
    )

parent_dir = pathlib.Path(text_dir).parent
list(parent_dir.iterdir())

[PosixPath('/tmp/authors/derby.txt'),
 PosixPath('/tmp/authors/butler.txt'),
 PosixPath('/tmp/authors/cowper.txt')]

Carga de los datos
---

In [3]:
def labeler(example, index):
    #
    # Convierte el indice a un int64
    #
    return example, tf.cast(index, tf.int64)

In [4]:
#
# En esa lista se agregan todos los archivos
#
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    #
    # Cada línea de texto es un registro en el dataset (no el archivo como se
    # hizo antes)
    #
    lines_dataset = tf.data.TextLineDataset(str(parent_dir / file_name))

    #
    # Agrega la etiquea a cada línea de texto
    #
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))

    #
    # Adiciona las líneas etiquetadas
    #
    labeled_data_sets.append(labeled_dataset)

In [5]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
VALIDATION_SIZE = 5000

In [6]:
#
# Crea un único dataset
#
all_labeled_data = labeled_data_sets[0]

for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [7]:
for text, label in all_labeled_data.take(5):
    print("Sentence: ", text.numpy())
    print("Label:", label.numpy())

Sentence:  b"servant that was the city's herald with him. Then she saw him that was"
Label: 2
Sentence:  b'To soothe the awful Goddess? Tell me true.'
Label: 0
Sentence:  b"And from his father's vineyard captive borne:"
Label: 1
Sentence:  b'come out of the fight. Agamemnon, king of men, sacrificed a fat'
Label: 2
Sentence:  b'The onset of AEneas, swift in fight,'
Label: 1


Prepración del dataset para entrenamiento
---

In [8]:
tokenizer = tf_text.UnicodeScriptTokenizer()

In [9]:
def tokenize(text, unused_label):
    lower_case = tf_text.case_fold_utf8(text)
    return tokenizer.tokenize(lower_case)

In [10]:
tokenized_ds = all_labeled_data.map(tokenize)

In [11]:
for text_batch in tokenized_ds.take(5):
    print("Tokens: ", text_batch.numpy())

Tokens:  [b'servant' b'that' b'was' b'the' b'city' b"'" b's' b'herald' b'with'
 b'him' b'.' b'then' b'she' b'saw' b'him' b'that' b'was']
Tokens:  [b'to' b'soothe' b'the' b'awful' b'goddess' b'?' b'tell' b'me' b'true'
 b'.']
Tokens:  [b'and' b'from' b'his' b'father' b"'" b's' b'vineyard' b'captive' b'borne'
 b':']
Tokens:  [b'come' b'out' b'of' b'the' b'fight' b'.' b'agamemnon' b',' b'king' b'of'
 b'men' b',' b'sacrificed' b'a' b'fat']
Tokens:  [b'the' b'onset' b'of' b'aeneas' b',' b'swift' b'in' b'fight' b',']


In [12]:
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)


tokenized_ds = configure_dataset(tokenized_ds)

In [15]:
from collections import defaultdict

VOCAB_SIZE = 10000

vocab_dict = defaultdict(lambda: 0)

for toks in tokenized_ds.as_numpy_iterator():
    for tok in toks:
        vocab_dict[tok] += 1

vocab = sorted(
    vocab_dict.items(),
    key=lambda x: x[1],
    reverse=True,
)

vocab = [token for token, count in vocab]
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab)

print("Vocab size: ", vocab_size)
print("First five vocab entries:", vocab[:5])

Vocab size:  10000
First five vocab entries: [b',', b'the', b'and', b"'", b'of']


In [16]:
keys = vocab
values = range(2, len(vocab) + 2)  # Reserve `0` for padding, `1` for OOV tokens.

init = tf.lookup.KeyValueTensorInitializer(
    keys, values, key_dtype=tf.string, value_dtype=tf.int64
)

num_oov_buckets = 1
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets)

In [17]:
def preprocess_text(text, label):
    standardized = tf_text.case_fold_utf8(text)
    tokenized = tokenizer.tokenize(standardized)
    vectorized = vocab_table.lookup(tokenized)
    return vectorized, label

In [18]:
all_encoded_data = all_labeled_data.map(preprocess_text)

Conjuntos de entrenamiento y prueba
---

In [19]:
train_data = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)
validation_data = all_encoded_data.take(VALIDATION_SIZE)

In [20]:
train_data = train_data.padded_batch(BATCH_SIZE)
validation_data = validation_data.padded_batch(BATCH_SIZE)

In [21]:
#
# Ejemplo
#
sample_text, sample_labels = next(iter(validation_data))
print("Text batch shape: ", sample_text.shape)
print("Label batch shape: ", sample_labels.shape)
print("First text example: ", sample_text[0])
print("First label example: ", sample_labels[0])

Text batch shape:  (64, 18)
Label batch shape:  (64,)
First text example:  tf.Tensor(
[2263   23   36    3  149    5   29 1007   14   16    7   33   69  200
   16   23   36    0], shape=(18,), dtype=int64)
First label example:  tf.Tensor(2, shape=(), dtype=int64)


In [22]:
vocab_size += 2

In [23]:
train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)

Entrenamiento del modelo
---

In [25]:
def create_model(vocab_size, num_labels):

    model = tf.keras.Sequential(
        [
            tf.keras.layers.Embedding(
                vocab_size,
                64,
                mask_zero=True,
            ),
            tf.keras.layers.Conv1D(
                64,
                5,
                padding="valid",
                activation="relu",
                strides=2,
            ),
            tf.keras.layers.GlobalMaxPooling1D(),
            tf.keras.layers.Dense(num_labels),
        ]
    )

    return model

In [27]:
model = create_model(vocab_size=vocab_size, num_labels=3)

model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

history = model.fit(train_data, validation_data=validation_data, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Loss:  0.39201173186302185
Accuracy: 84.50%


Exportación del modelo
---

In [31]:
MAX_SEQUENCE_LENGTH = 250

preprocess_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=tf_text.case_fold_utf8,
    split=tokenizer.tokenize,
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

preprocess_layer.set_vocabulary(vocab)

In [34]:
export_model = tf.keras.Sequential(
    [
        preprocess_layer,
        model,
        tf.keras.layers.Activation("sigmoid"),
    ]
)

export_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=["accuracy"],
)

In [35]:
# Create a test dataset of raw strings.
test_ds = all_labeled_data.take(VALIDATION_SIZE).batch(BATCH_SIZE)
test_ds = configure_dataset(test_ds)

loss, accuracy = export_model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Loss:  0.5805197358131409
Accuracy: 77.76%


Ejecución sobre nuevos datos
---

In [36]:
inputs = [
    "Join'd to th' Ionians with their flowing robes,",  # Label: 1
    "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
    "And with loud clangor of his arms he fell.",  # Label: 0
]

predicted_scores = export_model.predict(inputs)
predicted_labels = tf.argmax(predicted_scores, axis=1)

for input, label in zip(inputs, predicted_labels):
    print("Question: ", input)
    print("Predicted label: ", label.numpy())

Question:  Join'd to th' Ionians with their flowing robes,
Predicted label:  1
Question:  the allies, and his armour flashed about him so that he seemed to all
Predicted label:  2
Question:  And with loud clangor of his arms he fell.
Predicted label:  0
