## Word2Vec

### Table of contents
* Skip-gram and Negative Sampling
* Setup
  * Vectorize an example sentence
  * Generate skip-grams from one sentence
  * Negative sampling for one skip-gram

In [1]:
# Two methods for learning word representations:
# 1. Continuous Bag-of-Words (CBOW) model
# 2. Continuous Skip-gram model

In [3]:
import io
import re
import string
import tensorflow as tf
import tqdm

In [4]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [5]:
%load_ext tensorboard

In [7]:
# Notebook-wide configuration constants.
# SEED: fixed seed value, presumably meant to be passed to random operations
# so that stochastic steps are repeatable across runs.
SEED = 42
# AUTOTUNE: sentinel that lets tf.data choose buffer sizes / parallelism at
# runtime (e.g. for `prefetch` or `map`) instead of hard-coding a number.
AUTOTUNE = tf.data.AUTOTUNE

In [8]:
# Tokenize the example sentence: lowercase it, then split on whitespace.
sentence = 'The wild road shimmered in the hot sun'
# str.split() already returns a new list, so no extra list(...) copy is needed.
tokens = sentence.lower().split()
print(len(tokens))

8


In [11]:
# Build a token -> integer id vocabulary for the example sentence.
# Id 0 is reserved for the '<pad>' token; real tokens are numbered from 1
# in order of first appearance.
vocab = {'<pad>': 0}
index = 1
for token in tokens:
    if token in vocab:
        continue  # repeated word: keep the id it was given the first time
    vocab[token] = index
    index += 1

vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wild': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [13]:
# Reverse lookup table (integer id -> token) for decoding ids back to words.
inverse_vocab = {word_id: word for word, word_id in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wild', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [14]:
# Encode the sentence as the vocabulary ids of its tokens, in sentence order.
example_sequence = [vocab[tok] for tok in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


### Generate skip-grams from one sentence

In [15]:
# Generate every (target, context) pair whose two words lie within
# `window_size` positions of each other in the example sentence.
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=vocab_size,
    window_size=window_size,
    # Positive pairs only: no negative samples are drawn here, so the
    # returned labels (discarded as `_`) are not needed.
    negative_samples=0,
    # skipgrams shuffles its output by default; pinning the seed keeps the
    # order of the pairs identical across Restart & Run All.
    seed=SEED,
)
print(len(positive_skip_grams))

26


In [16]:
# Preview the first five positive skip-grams, shown both as id pairs and as
# the words those ids map to.
preview_pairs = positive_skip_grams[:5]
for target, context in preview_pairs:
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(5, 3): (in, road)
(6, 7): (hot, sun)
(2, 1): (wild, the)
(7, 6): (sun, hot)
(4, 3): (shimmered, road)


In [17]:
# Take the first positive skip-gram and split it into its target and context
# ids; the trailing expression displays the pair as the cell output.
first_pair = positive_skip_grams[0]
target_word, context_word = first_pair
target_word, context_word

(5, 3)