In [None]:
import io
import re
import string
import tensorflow as tf
import tqdm

In [None]:
# Load the TensorBoard notebook extension (the training run below logs to `logs`).
%load_ext tensorboard

In [None]:
# Seed handed to the negative-sampling calls below.
SEED = 42
# Sentinel that lets tf.data pick the prefetch buffer size (used in the prefetch() calls below).
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
# Toy sentence used to walk through skip-gram generation by hand.
sentence = "The wide road shimmered in the hot sun"
# Lowercase and split on whitespace; str.split() already returns a list.
tokens = sentence.lower().split()
tokens

In [None]:
len(tokens)

In [None]:
vocab, index = {}, 1
vocab['<pad>'] = 0

for token in tokens:
    if token not in vocab:
        vocab[token] = index
        index += 1

vocab_size = len(vocab)
print(vocab)

In [None]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

In [None]:
example_sequence = [vocab[word] for word in tokens]
example_sequence

In [None]:
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(example_sequence, vocab_size, window_size, 0)
print(len(positive_skip_grams))

In [None]:
positive_skip_grams

In [None]:
for target, contex in positive_skip_grams[:]:
    print("({}, {}): ({}, {})".format(target, contex, inverse_vocab[target], inverse_vocab[contex]))

In [None]:
# Take the first positive skip-gram as the worked example.
target_word, context_word = positive_skip_grams[0]

# Number of negative samples drawn per positive context word.
# NOTE: the Word2Vec class defined later in this notebook reads this global directly.
num_ns = 4

# The candidate sampler takes `true_classes` as a 2-D int64 tensor, hence the (1, 1) reshape.
context_class = tf.reshape(tf.constant(context_word, dtype='int64'), (1, 1))
context_class

In [None]:
# Draw `num_ns` distinct (unique=True) candidate indices below `range_max`;
# the sampler's other two return values are not used here.
# `vocab_size` at this point is the toy-sentence vocabulary size set above.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,
    num_true=1,
    num_sampled=num_ns,
    unique=True,
    range_max=vocab_size,
    seed=SEED,
    name='negative_sampling'
)

# Print the sampled indices, then the words they map back to.
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

In [None]:
# Add a trailing axis so the candidates can be concatenated with the (1, 1) context_class.
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)
negative_sampling_candidates

In [None]:
# Row 0 is the true context word; the remaining rows are the sampled negatives.
context = tf.concat([context_class, negative_sampling_candidates], 0)
context

In [None]:
# Matching labels: 1 for the true context word (first), 0 for each of the num_ns negatives.
label = tf.constant([1] + [0]*num_ns, dtype='int64')
label

In [None]:
# Target index of the example pair, as taken from positive_skip_grams[0].
target_word

In [None]:
# Convert the target index to a tensor with any size-1 dimensions removed.
target = tf.squeeze(target_word)
target

In [None]:
# Context tensor before squeezing (true context word followed by the negatives).
context

In [None]:
# Preview of the squeezed result; nothing is rebound in this cell.
tf.squeeze(context)

In [None]:
# Drop the size-1 axis that was added for concatenation. This rebinds `context`.
context = tf.squeeze(context)

In [None]:
label

In [None]:
# `label` was built from a flat list, so this squeeze presumably leaves it unchanged.
label = tf.squeeze(label)
label

In [None]:
# Summarize the single training example, decoding indices back to words.
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

In [None]:
# Same three tensors again, printed with str.format and without the word lookup.
print("target: {}".format(target))
print("context: {}".format(context))
print("label: {}".format(label))

In [None]:
# Preview the sampling table for size=10; generate_training_data below builds one
# sized by vocab_size and passes it to skipgrams.
tf.keras.preprocessing.sequence.make_sampling_table(size=10)

In [None]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Build skip-gram training examples with negative sampling.

    For every positive (target, context) skip-gram found in each sequence,
    draws `num_ns` negative context candidates and emits one training example.

    Args:
        sequences: iterable of integer-encoded token sequences.
        window_size: skip-gram window size passed to `skipgrams`.
        num_ns: number of negative samples drawn per positive pair.
        vocab_size: vocabulary size; sizes the sampling table and is used as
            the candidate sampler's `range_max`.
        seed: seed forwarded to the candidate sampler.

    Returns:
        A `(targets, contexts, labels)` tuple of three parallel lists with one
        entry per positive skip-gram:
          * targets: the target word index.
          * contexts: a tensor holding the true context word first, followed
            by the `num_ns` sampled negatives (one per row).
          * labels: an int64 tensor `[1, 0, ..., 0]` of length `num_ns + 1`.
    """
    targets, contexts, labels = [], [], []
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    for sequence in tqdm.tqdm(sequences):
        # negative_samples=0: only positive pairs are requested here; negatives
        # are drawn separately below. The returned labels are discarded.
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            window_size=window_size,
            negative_samples=0,
            sampling_table=sampling_table
        )

        for target_word, context_word in positive_skip_grams:
            # Shape (1, 1): one example with one true class.
            context_class = tf.expand_dims(tf.constant([context_word], dtype='int64'), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                # Use the caller-supplied seed; the global SEED was used here
                # before, which silently ignored the `seed` argument.
                seed=seed,
                name="negative_sampling"
            )

            # Give the negatives a trailing axis so they stack under the true
            # context word, which therefore always occupies row 0.
            negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)
            context = tf.concat([context_class, negative_sampling_candidates], 0)
            label = tf.constant([1] + [0]*num_ns, dtype='int64')

            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [None]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [None]:
with open(path_to_file) as f:
    lines = f.read().splitlines()
for line in lines[:20]:
    print(line)

In [None]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [None]:
def custom_standardization(input_data):
    """Lowercase the input text and delete every `string.punctuation` character.

    Args:
        input_data: string input accepted by `tf.strings.lower`.

    Returns:
        The lowercased text with all punctuation characters removed.
    """
    # Character class matching any single punctuation character, escaped for regex use.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    lowered = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowered, punctuation_pattern, '')

# Vocabulary cap and fixed per-line sequence length for the real corpus.
# NOTE: this rebinds `vocab_size`, which earlier cells set from the toy sentence.
vocab_size = 4096
sequence_length = 10

# Text -> integer-sequence layer: standardizes with custom_standardization, keeps at
# most `vocab_size` tokens, and fixes each output to `sequence_length` entries.
vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_sequence_length= sequence_length,
)

In [None]:
# Build the layer's vocabulary from the corpus, fed in batches of 1024 lines.
vectorize_layer.adapt(text_ds.batch(1024))

In [None]:
# NOTE: rebinds `inverse_vocab` (a dict for the toy sentence above) to the layer's
# vocabulary, which the cells below index by token id.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

In [None]:
# Vectorize the corpus in batches of 1024, then unbatch back to one sequence per element.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [None]:
# Materialize every vectorized sequence in memory as NumPy values.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

In [None]:
# Show the first five sequences next to their decoded tokens.
for seq in sequences[:5]:
    print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

In [None]:
targets, contexts, labels = generate_training_data(
    sequences,
    2,
    4,
    vocab_size,
    SEED
)

print(len(targets), len(contexts), len(labels))

In [None]:
print(targets[:5])
print(contexts[:5])
print(labels[:5])

In [None]:
BATCH_SIZE=1024
BUFFER_SIZE=10000

dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

print(dataset)

In [None]:
dataset = dataset.cache().prefetch(AUTOTUNE)
print(dataset)

In [None]:
class Word2Vec(tf.keras.Model):
    """Skip-gram word2vec model with separate target and context embeddings.

    The output is the dot product between each context embedding and the
    target embedding, flattened per example.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the two embedding tables and the dot/flatten layers.

        Args:
            vocab_size: number of rows in each embedding table.
            embedding_dim: width of each embedding vector.
        """
        super().__init__()

        # Named so the trained table can be fetched later via get_layer().
        self.target_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name="w2v_embedding",
        )
        # NOTE: `num_ns` is not a constructor argument; it is read from the
        # notebook's global scope and must be defined before instantiation.
        self.context_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_ns+1,
        )
        self.dots = tf.keras.layers.Dot(axes=(3,2))
        self.flatten = tf.keras.layers.Flatten()

    def call(self, pair):
        """Score each candidate context word against the target word.

        Args:
            pair: a `(target, context)` tuple of index tensors.

        Returns:
            The flattened dot products of the context and target embeddings.
        """
        target_ids, context_ids = pair
        target_vectors = self.target_embedding(target_ids)
        context_vectors = self.context_embedding(context_ids)
        scores = self.dots([context_vectors, target_vectors])
        return self.flatten(scores)

In [None]:
def custom_loss(x_logit, y_true):
    """Sigmoid cross-entropy between predicted logits and labels.

    Args:
        x_logit: predicted logits.
        y_true: ground-truth labels.

    Returns:
        The result of `tf.nn.sigmoid_cross_entropy_with_logits` for the inputs.
    """
    # NOTE(review): parameters are ordered (logits, labels); confirm the order a
    # caller expects before passing this function as a loss callable.
    return tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=x_logit)

In [None]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile('adam', loss=tf.keras.losses.CategoricalCrossentropy(True), metrics=['accuracy'])

In [None]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [None]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

In [None]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Export the embeddings as two parallel TSV files: one vector per line in
# vectors.tsv and the matching word per line in metadata.tsv.
# The context manager closes both files even if a write fails part-way.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            # Entry 0 is skipped (presumably the padding token of the
            # vectorization vocabulary).
            continue

        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + '\n')
        out_m.write(word + '\n')

In [None]:
# Offer the exported files for download when running inside Google Colab.
try:
    from google.colab import files
    files.download('vectors.tsv')
    files.download('metadata.tsv')
except ImportError:
    # Not running in Colab: nothing to download, the files stay on local disk.
    pass
except Exception as err:
    # Stay best-effort (never abort the notebook), but do not hide why it failed.
    print(f"Could not download embedding files: {err}")