In [1]:
import io
import itertools
import numpy as np
import os
import re
import string
import tqdm

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# seed handed to tf.random.log_uniform_candidate_sampler when drawing negative samples
seed = 42
# used as the prefetch buffer size of the tf.data pipelines below
AUTOTUNE = tf.data.experimental.AUTOTUNE

# One sentence

In [3]:
# toy sentence used to walk through skip-gram generation step by step
sentence = 'The wide road shimmered in the hot sun'

In [4]:
# lowercase the sentence and split it on whitespace into word tokens
tokens = sentence.lower().split()
len(tokens)

8

In [5]:
# map every distinct token to an integer id; id 0 is reserved for the padding token
vocab = {'<pad>': 0}
for token in tokens:
    # setdefault only inserts unseen tokens; len(vocab) is always the next free id
    vocab.setdefault(token, len(vocab))
# ids are handed out consecutively starting at 1, so the next free id equals the vocab size
index = vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [6]:
# invert the mapping so integer ids can be turned back into tokens
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [7]:
# encode the sentence as a list of integer ids by looking each token up in vocab
example_sequence = list(map(vocab.__getitem__, tokens))
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


### generate skip-grams

In [8]:
# half-width of the context window around each target word
window_size = 2
# skipgrams returns (couples, labels); only the (target, context) couples are kept here
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=0 # no negative samples for now, will use in the next section
)
len(positive_skip_grams)

26

In [9]:
# peek at the first five (target, context) pairs, shown as ids and as decoded words
for target, context in itertools.islice(positive_skip_grams, 5):
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(1, 2): (the, wide)
(4, 5): (shimmered, in)
(2, 1): (wide, the)
(1, 4): (the, shimmered)
(4, 3): (shimmered, road)


### negative skip-grams

In [10]:
# negative samples are drawn at random from the id range [0, vocab_size);
# the true context word is passed to the sampler as the positive class
# (as the printed result shows, it can still turn up among the sampled candidates)

# use the first positive skip-gram as the worked example
target_word, context_word = positive_skip_grams[0]

# number of negative samples per positive context: [5, 20] works best for small datasets, [2, 5] for larger datasets
num_ns = 4

# the sampler takes true_classes with shape (batch, num_true), here (1, 1)
context_class = tf.constant([[context_word]], dtype='int64')
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class, # class that should be sampled as positive
    num_true=1, # each positive skip-gram has 1 positive context class
    num_sampled=num_ns, # number of negative context words per sample
    unique=True, # all negative samples should be unique
    range_max=vocab_size, # pick samples from [0, vocab_size)
    seed=seed,
    name='negative_sampling'
)

print(negative_sampling_candidates)
# decode the sampled ids back to words
print([inverse_vocab[candidate_id] for candidate_id in negative_sampling_candidates.numpy()])

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['wide', 'the', 'shimmered', 'road']


### construct one training example

In [11]:
# add a trailing dimension so the negatives, shape (num_ns, 1), can be concatenated with
# context_class, shape (1, 1); a new name is used so that re-running this cell does not
# keep stacking extra axes onto the sampler output
negative_candidates_column = tf.expand_dims(negative_sampling_candidates, 1)

# concat the positive context word on top of the negative samples -> shape (num_ns + 1, 1)
context = tf.concat([context_class, negative_candidates_column], 0)

# label the first context word as positive (1) and the rest as negative (0)
label = tf.constant([1] + [0] * num_ns, dtype='int64')

# drop size-1 axes: target becomes a scalar of shape (), context and label have shape (num_ns + 1,)
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)

In [12]:
# check the sample: print the ids next to the words they decode to
# (the word lookup uses the Python int target_word, since target is now a tensor)
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 1
target_word     : the
context_indices : [2 2 1 4 3]
context_words   : ['wide', 'wide', 'the', 'shimmered', 'road']
label           : [1 0 0 0 0]


In [13]:
# a tuple of (target, context, label) tensors is one training example
# plain string literals: there is nothing to interpolate, so no f-prefix is needed
print("target  :", target)
print("context :", context)
print("label   :", label)

target  : tf.Tensor(1, shape=(), dtype=int32)
context : tf.Tensor([2 2 1 4 3], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


# Compile all steps in one function

In [14]:
# a sampling table can be used to avoid sampling frequent words (e.g. "the") too often
# entry i is the probability of sampling the i-th most common word in the dataset
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


In [15]:
# generate skip-grams with negative sampling for a list of sequences (int-encoded sentences)
# based on window size, number of negative samples, and vocab size
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Build word2vec training examples from int-encoded sentences.

    For every positive (target, context) skip-gram found in `sequences`,
    `num_ns` negative context ids are sampled and stacked below the positive one.

    Args:
        sequences: iterable of int-encoded sentences.
        window_size: window size forwarded to `skipgrams`.
        num_ns: number of negative samples drawn per positive skip-gram.
        vocab_size: vocabulary size; sizes the sampling table and bounds the sampler range.
        seed: seed forwarded to the candidate sampler.

    Returns:
        A tuple `(targets, contexts, labels)` of equally long lists:
        `targets` holds the target word ids, `contexts` holds int64 tensors of
        shape (num_ns + 1, 1) with the positive context id first, and `labels`
        holds int64 tensors of shape (num_ns + 1,) equal to [1, 0, ..., 0].
    """
    # elements of each training sample are appended to these lists
    targets, contexts, labels = [], [], []

    # build sampling table
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # the label layout is the same for every example (positive first, then num_ns negatives),
    # so the tensor is built once instead of once per skip-gram
    label = tf.constant([1] + [0] * num_ns, dtype='int64')

    for sequence in tqdm.tqdm(sequences):
        # generate positive skip-grams
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0
        )

        # iterate the positive skip-grams to generate negative training samples
        for target_word, context_word in positive_skip_grams:
            # shape (1, 1): one example with one true class
            context_class = tf.expand_dims(tf.constant([context_word], dtype='int64'), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name='negative_sampling'
            )

            # build the context vector for one target word: positive id on top of the negatives
            negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)
            context = tf.concat([context_class, negative_sampling_candidates], 0)

            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

# Prepare training data for Word2Vec

### get data

In [16]:
# fetch the Shakespeare corpus; get_file returns the path of the local copy
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [17]:
# check some data
# read with an explicit encoding so decoding does not depend on the platform default
with open(path_to_file, encoding='utf-8') as f:
    lines = f.read().splitlines()
# show the first 20 lines of the corpus
for line in lines[:20]:
    print(line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [18]:
# build a dataset with one element per line of the file, keeping only non-empty lines
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda line: tf.strings.length(line) > 0)

### vectorize sentences

In [19]:
# custom standardization function to lowercase and remove punctuation
def custom_standardization(input_data):
    """Lowercase `input_data` and delete every character found in `string.punctuation`."""
    # regex character class matching any single punctuation character
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(tf.strings.lower(input_data), punctuation_class, '')

# vocab size and number of words in a sequence
# (vocab_size is rebound here; until now it held the toy sentence's vocabulary size)
vocab_size = 4096
sequence_length = 10

# use TextVectorization layer to normalize, split, and map strings to integers
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length # pad all samples to same length
)

In [20]:
# adapt on the training set to create vocabulary
# (the text is fed to the layer in batches of 1024 lines)
vectorize_layer.adapt(text_ds.batch(1024))

In [21]:
# check the vocabulary
# inverse_vocab is rebound here: it was a dict for the toy sentence, now it is the
# layer's vocabulary, indexed by token id
inverse_vocab = vectorize_layer.get_vocabulary()
# order is based on frequency
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']


In [22]:
# vectorize the dataset

def vectorize_text(text):
    """Run `text` through `vectorize_layer` and squeeze size-1 axes from the result."""
    # add a trailing axis before handing the text to the layer
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return tf.squeeze(token_ids)

# batch the lines, map each batch through vectorize_layer, then unbatch back to one element per line
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

### obtain sequences from dataset

In [23]:
# flatten the dataset into a list of sentence vector sequences
# (as_numpy_iterator yields NumPy values, so the whole dataset is materialized in memory here)
sequences = list(text_vector_ds.as_numpy_iterator())
len(sequences)

32777

In [24]:
# show the first five encoded lines next to the words their ids decode to
for seq in itertools.islice(sequences, 5):
    print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']


### generate training examples from sequences

In [25]:
# reuse the window_size and num_ns defined earlier in the notebook instead of
# repeating their literal values, so there is a single place to change them
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=seed
)

# the three lists must have the same length: one entry per training example
print(len(targets), len(contexts), len(labels))

100%|██████████| 32777/32777 [00:27<00:00, 1200.76it/s]64866 64866 64866



In [26]:
# configure dataset
BATCH_SIZE = 1024  # examples per batch
BUFFER_SIZE = 10000  # size of the shuffle buffer

# create dataset of ((target_word, context_words), label) elements
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))

# shuffle, then batch; drop_remainder=True discards the final partial batch so every batch has BATCH_SIZE examples
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [27]:
# cache the batched dataset and prefetch upcoming batches
# NOTE(review): cache() is applied after shuffle(); confirm whether the batch order is meant
# to be the same on every pass over the cached data
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


# Build model

In [28]:
class Word2Vec(Model):
    """Skip-gram word2vec model trained with negative sampling.

    Holds two embedding tables of shape (vocab_size, embedding_dim): one looked up
    for target words and one looked up for context words. The model output is the
    flattened dot product between the target embedding and each context embedding,
    i.e. one logit per candidate context word.

    Args:
        vocab_size: number of rows in both embedding tables.
        embedding_dim: size of each embedding vector.
        num_ns: number of negative samples per positive context word; together
            with the positive one this gives the context input length
            (num_ns + 1). It is an explicit argument so the class does not
            depend on a notebook-level variable; the default matches the value
            used in this notebook.
    """

    def __init__(self, vocab_size, embedding_dim, num_ns=4):
        super().__init__()

        # target embedding layer that looks up embedding of a word when it is a target word
        # (named so it can be retrieved later with get_layer('w2v_embedding'))
        self.target_embedding = Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name='w2v_embedding'
        )
        # context embedding layer that looks up embedding of a word when it is a context word
        self.context_embedding = Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_ns+1
        )
        # dot product to obtain predictions for labels: contracts axis 3 of the context
        # embeddings with axis 2 of the target embeddings
        # NOTE(review): these axes imply a rank-4 context embedding and a rank-3 target
        # embedding at call time; confirm against the input shapes used during fit
        self.dots = Dot(axes=(3,2))
        # flatten to logits
        self.flatten = Flatten()

    def call(self, pair):
        """Return the flattened target/context dot products for a (target, context) pair."""
        target, context = pair
        we = self.target_embedding(target)    # target word embeddings
        ce = self.context_embedding(context)  # context word embeddings
        dots = self.dots([ce, we])
        return self.flatten(dots)

In [29]:
# compile model
embedding_dim = 128  # size of each word vector
word2vec = Word2Vec(vocab_size, embedding_dim)
# the model outputs raw dot products, hence from_logits=True; each label vector marks
# the positive context word with 1 and the negative samples with 0
word2vec.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [30]:
# callback for tensorboard; logs are written to the "logs" directory
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [31]:
# train for 20 epochs on the batched dataset, logging to TensorBoard
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1862192d6a0>

# Save word embedding

In [33]:
# get embedding weights: the first weight array of the target embedding layer
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

# get vocab (rebinds vocab, which held the toy sentence's dict, to the layer's vocabulary)
vocab = vectorize_layer.get_vocabulary()

In [34]:
# save weights to disk: vectors.tsv gets one tab-separated embedding per line and
# metadata.tsv gets the matching word on the same line number
# both files are opened in a with-block so they are closed even if a write fails
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue # skip 0, it is padding
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + '\n')
        out_m.write(word + '\n')