# Vectorization

In [None]:
import tensorflow as tf

### Lookup tables

In [None]:
# Known categories (the vocabulary) and one int64 id per category: 0, 1, 2.
cats = tf.constant(["beach", "mountain", "desert"])
indices = tf.range(cats.shape[0], dtype=tf.int64)

In [None]:
# Initializer that pairs each category string (key) with its integer id (value).
table_init = tf.lookup.KeyValueTensorInitializer(keys=cats, values=indices)

In [None]:
# Number of extra ids reserved for out-of-vocabulary (unknown) categories.
num_oov_buckets = 1
# Immutable string -> id table; keys not in `cats` fall into an OOV bucket.
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=num_oov_buckets,
)

In [None]:
# Sanity check: 'beach' is the first vocabulary entry, so it maps to id 0.
table.lookup(keys=tf.constant(["beach"]))

In [None]:
# One-hot encode every known category. The depth is the vocabulary size plus
# the OOV buckets, so it stays correct if `num_oov_buckets` is changed
# (previously the bucket count was hard-coded as 1 here).
tf.one_hot(table.lookup(cats), depth=cats.shape[0] + num_oov_buckets)

### Representation learning 

In [None]:
# Size of the learned vector for each category.
repr_dim = 2
# One row per id the table can emit: known categories plus OOV buckets.
cat_dim = len(cats) + num_oov_buckets
# Start from uniform random values; the Variable makes the matrix trainable.
repr_init = tf.random.uniform(shape=(cat_dim, repr_dim))
representation = tf.Variable(initial_value=repr_init)

In [None]:
# Display the randomly initialised matrix of shape [cat_dim, repr_dim].
representation

In [None]:
# Map the category strings to ids, then fetch the matching rows of
# `representation` (one repr_dim-sized vector per category).
indices = table.lookup(keys=cats)
tf.nn.embedding_lookup(params=representation, ids=indices)

#### Model with representation learning

In [None]:
# Functional-API graph: 8 numeric features plus one string category per sample.
regular_inputs = tf.keras.layers.Input(shape=[8], name='numeric_inputs')
categories = tf.keras.layers.Input(shape=[], dtype=tf.string, name='categorical_inputs')

# Convert category strings to integer ids with the lookup table. The lambda
# parameter is named `keys` so it does not shadow the global `cats` tensor.
cat_indices = tf.keras.layers.Lambda(
    lambda keys: table.lookup(keys),
    name='categorical_index',
)(categories)

# Trainable embedding: one row per id (cat_dim), each of size repr_dim.
# `repr_dim` replaces the previously hard-coded 2 so both stay in sync.
cat_embed = tf.keras.layers.Embedding(
    input_dim=cat_dim,
    output_dim=repr_dim,
    name='embedding_layer',
)(cat_indices)

# Concatenate numeric features with the embedding and regress a single value.
encoded_inputs = tf.keras.layers.concatenate([regular_inputs, cat_embed], name='processed_inputs')
outputs = tf.keras.layers.Dense(1)(encoded_inputs)

In [None]:
# Wrap the graph in a Model taking both inputs and producing the Dense output.
model = tf.keras.Model(inputs=[regular_inputs, categories], outputs=[outputs])

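An embedding layer is equivalent to a dense layer applied to one-hot encoded inputs, with no activation function and no biases. However, the embedding layer implemented in Keras contains a couple of performance optimizations (for example, it looks up the matching rows directly instead of performing a matrix multiplication).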