Capas de preprocesamiento de características categóricas --- 0:00 min
===

* Última modificación: Marzo 7, 2022 | YouTube

* Adaptado de: https://keras.io/api/layers/preprocessing_layers/categorical/

Importación de librerías
---

In [1]:
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import numpy as np
import tensorflow as tf

tf.__version__

'2.8.0'

Capa CategoryEncoding
---

In [2]:
#
# One-hot encoding
#
layer = tf.keras.layers.CategoryEncoding(
    num_tokens=4,
    output_mode="one_hot",
)
layer([3, 2, 0, 1])

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)>

In [3]:
#
# Multi-hot encoding
#
layer = tf.keras.layers.CategoryEncoding(
    num_tokens=4,
    output_mode="multi_hot",
)
layer([[0, 1], [0, 0], [1, 2], [3, 1]])

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[1., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 1., 0.],
       [0., 1., 0., 1.]], dtype=float32)>

In [4]:
#
# `count` mode with per-occurrence weights: each output entry is the sum of
# the weights of the occurrences of that token in the sample (token 0
# appears twice in the second sample, so its entry is 0.1 + 0.1 = 0.2).
# `np` comes from the import cell at the top of the notebook.
#
layer = tf.keras.layers.CategoryEncoding(
    num_tokens=4,
    output_mode="count",
)
count_weights = np.array(
    [
        [0.1, 0.2],
        [0.1, 0.1],
        [0.2, 0.3],
        [0.4, 0.2],
    ]
)
layer(
    [
        [0, 1],
        [0, 0],
        [1, 2],
        [3, 1],
    ],
    count_weights=count_weights,
)

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0.1, 0.2, 0. , 0. ],
       [0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0.3, 0. ],
       [0. , 0.2, 0. , 0.4]], dtype=float32)>

Capa Hashing
---

In [5]:
#
# FarmHash64: the hash used when no salt is given; every string is
# mapped to one of `num_bins` bins
#
inp = [[letter] for letter in "ABCDE"]
layer = tf.keras.layers.Hashing(num_bins=3)
layer(inp)

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[1],
       [0],
       [1],
       [1],
       [2]])>

In [6]:
#
# FarmHash64 with a mask: inputs equal to `mask_value` (here the empty
# string) are sent to bin 0
#
inp = [[text] for text in ("A", "B", "", "C", "D")]
layer = tf.keras.layers.Hashing(num_bins=3, mask_value="")
layer(inp)

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[1],
       [1],
       [0],
       [2],
       [2]])>

In [7]:
#
# SipHash64: selected by passing a salt, here a pair of integers
#
inp = [[letter] for letter in "ABCDE"]
layer = tf.keras.layers.Hashing(num_bins=3, salt=[133, 137])
layer(inp)

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[1],
       [2],
       [1],
       [0],
       [2]])>

In [8]:
#
# SipHash64 with a single integer salt, equivalent to salt=[133, 133]
#
inp = [[letter] for letter in "ABCDE"]
layer = tf.keras.layers.Hashing(num_bins=3, salt=133)
layer(inp)

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[0],
       [0],
       [2],
       [1],
       [0]])>

Capa StringLookup
---

In [9]:
#
# Layer built from a pre-existing vocabulary: vocabulary terms are
# numbered from 1, and a string outside the vocabulary ("z") maps to 0
#
vocab = ["a", "b", "c", "d"]
layer = tf.keras.layers.StringLookup(vocabulary=vocab)
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
layer(data)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[1, 3, 4],
       [4, 0, 2]])>

In [10]:
#
# Layer whose vocabulary is learned from the data through `adapt`;
# the resulting vocabulary is then inspected
#
layer = tf.keras.layers.StringLookup()
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
layer.adapt(data)
layer.get_vocabulary()

['[UNK]', 'd', 'z', 'c', 'b', 'a']

In [11]:
#
# Lookup with an adapted vocabulary: the layer is adapted to `data` and
# then applied to the same tensor, returning the index of each string
#
layer = tf.keras.layers.StringLookup()
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
layer.adapt(data)
layer(data)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[5, 3, 1],
       [1, 2, 4]])>

In [12]:
#
# Multiple OOV indices: with `num_oov_indices=2` the out-of-vocabulary
# strings ("m", "z") fall in indices 0-1 and vocabulary terms start at 2
#
vocab = ["a", "b", "c", "d"]
layer = tf.keras.layers.StringLookup(vocabulary=vocab, num_oov_indices=2)
data = tf.constant([["a", "c", "d"], ["m", "z", "b"]])
layer(data)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[2, 4, 5],
       [0, 1, 3]])>

In [13]:
#
# One-hot output: one row per input string; column 0 is used for the
# out-of-vocabulary string ("z")
#
vocab = ["a", "b", "c", "d"]
layer = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode="one_hot")
data = tf.constant(["a", "b", "c", "d", "z"])
layer(data)

<tf.Tensor: shape=(5, 5), dtype=float32, numpy=
array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)>

In [14]:
#
# Multi-hot output: one row per sample, with a 1 for every index present
# in the sample regardless of how many times it occurs
#
vocab = ["a", "b", "c", "d"]
layer = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode="multi_hot")
data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
layer(data)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0., 1., 0., 1., 1.],
       [1., 0., 1., 0., 1.]], dtype=float32)>

In [15]:
#
# Token count output: one row per sample, holding the number of
# occurrences of each index in that sample
#
vocab = ["a", "b", "c", "d"]
layer = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode="count")
data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
layer(data)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0., 1., 0., 1., 2.],
       [2., 0., 1., 0., 1.]], dtype=float32)>

In [16]:
#
# TF-IDF output: the vocabulary and its idf weights are supplied together
# through `set_vocabulary`; each entry is the token count scaled by the
# token's idf weight (e.g. "d" twice -> 2 * 0.4 = 0.8)
#
vocab = ["a", "b", "c", "d"]
idf_weights = [0.25, 0.75, 0.6, 0.4]
layer = tf.keras.layers.StringLookup(output_mode="tf_idf")
layer.set_vocabulary(vocab, idf_weights=idf_weights)
data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
layer(data)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0.  , 0.25, 0.  , 0.6 , 0.8 ],
       [1.  , 0.  , 0.75, 0.  , 0.4 ]], dtype=float32)>

In [17]:
#
# Inverse lookup: with `invert=True` the layer maps indices back to
# strings; index 0 is returned as the "[UNK]" token
#
vocab = ["a", "b", "c", "d"]
layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)
data = tf.constant([[1, 3, 4], [4, 0, 2]])
layer(data)

<tf.Tensor: shape=(2, 3), dtype=string, numpy=
array([[b'a', b'c', b'd'],
       [b'd', b'[UNK]', b'b']], dtype=object)>