# Libraries

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
# Display the TensorFlow version string this notebook is running against.
tf.__version__

'2.5.0-dev20201025'

# Setup

Define how long we want the vector embedding to be:

In [3]:
# Length of each word's embedding vector (number of columns in the embedding table).
EMBED_SIZE: int = 5

Define a vocabulary that maps a word to an integer:

In [4]:
# Words listed in id order: a word's position in this list is its integer id.
_vocab_words = [
    "never", "a", "good", "world", "am",
    "bye", "now", "cat", "hat", "or",
]

# word -> id lookup used to turn tokens into integers.
vocab = {word: word_id for word_id, word in enumerate(_vocab_words)}

# Show the mapping as a two-column table (word, id).
pd.DataFrame({"word": list(vocab), "id": list(vocab.values())})

Unnamed: 0,word,id
0,never,0
1,a,1
2,good,2
3,world,3
4,am,4
5,bye,5
6,now,6
7,cat,7
8,hat,8
9,or,9


In [5]:
# Number of distinct words in the vocabulary; used below as the number of rows
# in the embedding table and as the one-hot vector length.
VOCAB_SIZE: int = len(vocab)

print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 10


Define the `VOCAB_SIZE` × `EMBED_SIZE` embedding table (one row per word id, one column per embedding dimension), filled with random integers:

In [6]:
# Seed a dedicated generator so that "Restart & Run All" rebuilds the same table
# every time. NOTE: the values will differ from outputs saved by earlier, unseeded runs.
EMBEDDING_SEED = 42
embedding_rng = np.random.default_rng(EMBEDDING_SEED)

# One row per word id, one column per embedding dimension, filled with small
# integers in [0, 10) so individual rows are easy to recognise when printed.
embedding_table = embedding_rng.integers(0, 10, size=(VOCAB_SIZE, EMBED_SIZE))

# Convert to a float32 tensor so it can be matrix-multiplied with one-hot vectors.
embedding_table = tf.convert_to_tensor(embedding_table, tf.float32)
print(embedding_table)

tf.Tensor(
[[7. 3. 2. 6. 8.]
 [0. 3. 7. 9. 3.]
 [7. 7. 8. 7. 0.]
 [0. 8. 8. 6. 9.]
 [5. 6. 7. 6. 3.]
 [3. 3. 5. 8. 6.]
 [9. 3. 4. 4. 1.]
 [8. 8. 3. 5. 0.]
 [7. 1. 2. 5. 1.]
 [4. 5. 4. 3. 9.]], shape=(10, 5), dtype=float32)


# Mechanics

Define two pieces of text we would like to convert to embeddings:

In [7]:
# The two example sentences that will be turned into embeddings.
texts = ["now or never", "good bye world"]

# Echo each sentence with a 1-based label.
for i, text in enumerate(texts, start=1):
    print(f"text {i}: {text}")

text 1: now or never
text 2: good bye world


Tokenize those texts into words:

In [8]:
# Split each text into word tokens. `str.split()` with no argument splits on any
# run of whitespace and drops leading/trailing whitespace, so repeated spaces do
# not produce empty-string tokens (which would later fail the vocabulary lookup).
tokenized_texts = [text.split() for text in texts]

for i, tokens in enumerate(tokenized_texts, 1):
    print(f"text {i}: {tokens}")

text 1: ['now', 'or', 'never']
text 2: ['good', 'bye', 'world']


Map each word to its id in the vocabulary:

In [9]:
# Replace every token with its integer id; one inner list of ids per text.
# A token that is missing from `vocab` raises KeyError.
word_ids = []
for tokenized_text in tokenized_texts:
    word_ids.append([vocab[word] for word in tokenized_text])

# Show the id sequence for each text with a 1-based label.
for i, tokens in enumerate(word_ids, start=1):
    print(f"text {i}: {tokens}")

text 1: [6, 9, 0]
text 2: [2, 5, 3]


We can use a matrix to represent the sequences of word ids (one row per text):

In [10]:
# Turn the nested list of ids into a tensor with one row per text.
# Re-running this cell is harmless: converting an existing tensor returns a tensor.
word_ids = tf.convert_to_tensor(word_ids)
print(word_ids)

tf.Tensor(
[[6 9 0]
 [2 5 3]], shape=(2, 3), dtype=int32)


Each word id can be converted into a one-hot encoded vector of length `VOCAB_SIZE`:

In [11]:
# Expand every id into a vector of length VOCAB_SIZE that is 1 at the id's
# position and 0 elsewhere; this appends a new trailing axis to `word_ids`.
ohe_ids = tf.one_hot(indices=word_ids, depth=VOCAB_SIZE)
print(ohe_ids)

tf.Tensor(
[[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]], shape=(2, 3, 10), dtype=float32)


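To do the lookup, take each one-hot encoded word id and matrix multiply it with the embedding table. Because a one-hot vector is 1 only at position `id`, the product simply picks out row `id` of the table: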

In [12]:
# Multiplying a one-hot vector by the table selects the table row at the
# position of the 1, so this matmul performs the embedding lookup for every id.
text_embded_manual = tf.matmul(a=ohe_ids, b=embedding_table)
print(text_embded_manual)

tf.Tensor(
[[[9. 3. 4. 4. 1.]
  [4. 5. 4. 3. 9.]
  [7. 3. 2. 6. 8.]]

 [[7. 7. 8. 7. 0.]
  [3. 3. 5. 8. 6.]
  [0. 8. 8. 6. 9.]]], shape=(2, 3, 5), dtype=float32)


We can verify that by manually doing the lookup ourselves:

In [13]:
# just verify for text 1
expected_output = text_embded_manual[0]

text1_ids = word_ids[0]

# step 1: do the lookup against the embedding table for each id
actual_output = [embedding_table[id] for id in text1_ids]

# step 2: add a row dimension
actual_output = [tf.expand_dims(emb, 0) for emb in actual_output]

# step 3: concatenate at the row dimension
actual_output = tf.concat(actual_output, axis=0)

# verify
assert tf.experimental.numpy.array_equal(expected_output, actual_output)

# Shortcuts

Recall the content of `word_ids`:

In [14]:
# Re-display the id matrix that the shortcut methods below take as input.
print(word_ids)

tf.Tensor(
[[6 9 0]
 [2 5 3]], shape=(2, 3), dtype=int32)


## Method 1

Using `tf.nn.embedding_lookup`:

In [15]:
# Gather the table rows addressed by `word_ids` directly, without building
# one-hot vectors first.
text_embed_shortcut1 = tf.nn.embedding_lookup(params=embedding_table, ids=word_ids)
print(text_embed_shortcut1)

tf.Tensor(
[[[9. 3. 4. 4. 1.]
  [4. 5. 4. 3. 9.]
  [7. 3. 2. 6. 8.]]

 [[7. 7. 8. 7. 0.]
  [3. 3. 5. 8. 6.]
  [0. 8. 8. 6. 9.]]], shape=(2, 3, 5), dtype=float32)


In [16]:
# The one-hot matmul result and tf.nn.embedding_lookup must agree element-for-element.
assert tf.experimental.numpy.array_equal(text_embded_manual, text_embed_shortcut1)

## Method 2

Using a Keras `Embedding` layer whose weights are initialised from the same embedding table:

In [17]:
# Number of word ids per text (second axis of `word_ids`). Indexing the shape
# directly avoids binding `_`, which IPython uses for the last cell output.
SEQ_LEN = word_ids.shape[1]

In [18]:
# Initialise the layer's weights from the existing table so that its output can
# be compared directly with the manual lookup.
embeddings_init_fn = tf.keras.initializers.Constant(embedding_table)

emb_layer = tf.keras.layers.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBED_SIZE,
    input_length=SEQ_LEN,
    embeddings_initializer=embeddings_init_fn,
)

# Calling the layer on the id matrix performs the lookup.
text_embed_shortcut2 = emb_layer(word_ids)
print(text_embed_shortcut2)

tf.Tensor(
[[[9. 3. 4. 4. 1.]
  [4. 5. 4. 3. 9.]
  [7. 3. 2. 6. 8.]]

 [[7. 7. 8. 7. 0.]
  [3. 3. 5. 8. 6.]
  [0. 8. 8. 6. 9.]]], shape=(2, 3, 5), dtype=float32)


In [19]:
# The Keras Embedding layer output must match the one-hot matmul result element-for-element.
assert tf.experimental.numpy.array_equal(text_embded_manual, text_embed_shortcut2)