In [2]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

# Load the TensorBoard notebook extension
%load_ext tensorboard

SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
def generate_cbow_batches(sequences, window_size, num_ns, vocab_size, seed):
    """
    Generates batches of data for a CBOW model using negative sampling.
    
    Arguments:
        sequences: Tensorflow sequence data in the form of a list of lists, where each inner list is a sequence of words.
        window_size: An integer indicating the window size of the CBOW model.
        num_ns: An integer indicating the number of negative samples to use for each positive sample.
        vocab_size: An integer indicating the size of the vocabulary.
        seed: A random seed used for reproducibility.
    
    Returns:
        targets: A numpy array of shape (batch_size,) containing target words.
        contexts: A numpy array of shape (batch_size, window_size*2) containing context words.
        labels: A numpy array of shape (batch_size,) containing binary labels (1 for positive, 0 for negative).
    """
    np.random.seed(seed)
    
    # Flatten the sequence data into a single list of words.
    all_words = [word for sequence in sequences for word in sequence]
    
    # Create a dictionary to map words to their indices in the vocabulary.
    word2idx = {word: idx for idx, word in enumerate(set(all_words))}
    
    # Create a list of word indices for each sequence in the input data.
    sequence_idxs = [[word2idx[word] for word in sequence] for sequence in sequences]
    
    # Create a list of (target, context) pairs for each word in each sequence.
    pairs = []
    for sequence in sequence_idxs:
        for i, target in enumerate(sequence):
            for j in range(max(0, i-window_size), min(len(sequence), i+window_size+1)):
                if i != j:
                    pairs.append((target, sequence[j]))
    
    # Create a numpy array of target words and a corresponding numpy array of context words.
    targets = np.array([pair[0] for pair in pairs], dtype=np.int32)
    contexts = np.array([[pair[1]] for pair in pairs], dtype=np.int32)
    
    # Create a list of all word frequencies.
    word_freqs = np.array([all_words.count(word) for word in word2idx.keys()], dtype=np.float32)
    
    # Compute the unigram distribution of words for use in negative sampling.
    unigram_dist = word_freqs / np.sum(word_freqs)
    
    # Generate negative samples for each positive sample.
    neg_samples = np.random.choice(vocab_size, size=(len(targets), num_ns), replace=True, p=unigram_dist)
    
    # Concatenate the positive and negative samples to create a single array of context words.
    contexts = np.concatenate([contexts, neg_samples], axis=1)
    
    # Create a numpy array of labels for each context word (1 for positive, 0 for negative).
    labels = np.zeros_like(contexts, dtype=np.int32)
    labels[:, 0] = 1
    
    return targets, contexts, labels


In [5]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for `vocab_size` tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive CBOW examples for a sequence (sentence).
        target_idx = window_size
        while target_idx < len(sequence) - window_size:
            target_word = sequence[target_idx]

            # Get context words for the target word.
            context_words = []
            for i in range(-window_size, window_size + 1):
                if i == 0:
                    continue
                context_idx = target_idx + i
                if context_idx < 0 or context_idx >= len(sequence):
                    continue
                context_words.append(sequence[context_idx])

            # Generate negative samples for the target word.
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=tf.expand_dims(tf.constant([target_word], dtype="int64"), 1),
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context_words + negative_sampling_candidates.numpy().tolist())
            labels.append([1] * len(context_words) + [0] * num_ns)

            target_idx += 1

    return targets, contexts, labels


In [None]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):
        print("sequence", sequence)

        # Generate positive CBOW examples for a sequence (sentence).
        target_idx = window_size
        while target_idx < len(sequence) - window_size:
            target_word = sequence[target_idx]
            print("target idx and target word", target_idx, target_word)
            
            # skip the padding words
            if target_word == 0:
                target_idx += 1
                continue

            # Get context words for the target word.
            context_words = []
            for i in range(-window_size, window_size + 1):
                if i == 0:
                    continue
                context_idx = target_idx + i
                if context_idx < 0 or context_idx >= len(sequence):
                    continue
                context_words.append(sequence[context_idx])
            print("target word and context", target_word, context_words)

            
            
            # Generate negative samples for the target word.
            # TODO: This doesn't actually use the num_ns value, we add one neg sample per target word 
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=tf.expand_dims(tf.constant([target_word], dtype="int64"), 1),
                num_true=1,
                num_sampled=window_size * 2,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")

            print("sampling", negative_sampling_candidates)
            # Append each element from the training example to global lists.
            # Add the positive sample
            targets.append(target_word)
            contexts.append(context_words)
            labels.append([1])
            
            # add the negative sample
            targets.append(target_word)
            contexts.append(negative_sampling_candidates.numpy().tolist())
            labels.append([0])
            print("targets", targets)
            print("contexts", contexts)
            print("labels", labels)
            

            target_idx += 1

    return targets, contexts, labels




In [6]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

with open(path_to_file) as f:
  lines = f.read().splitlines()

for line in lines[:20]:
  print(line)


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [7]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))


Metal device set to: Apple M1


2023-03-21 16:19:44.683605: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-21 16:19:44.684222: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')

In [9]:
# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

vectorize_layer.adapt(text_ds.batch(1024))

2023-03-21 16:19:49.718262: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-21 16:19:49.764413: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [10]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']


In [11]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [23]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))
print(sequences)

32777
[array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0]), array([138,  36, 982, 144, 673, 125,  16, 106,   0,   0]), array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0]), array([106, 106,   0,   0,   0,   0,   0,   0,   0,   0]), array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0]), array([   7,   41,   34, 1286,  344,    4,  200,   64,    4, 3690]), array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0]), array([1286, 1286,    0,    0,    0,    0,    0,    0,    0,    0]), array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0]), array([  89,    7,   93, 1187,  225,   12, 2442,  592,    4,    2]), array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0]), array([  36, 2655,   36, 2655,    0,    0,    0,    0,    0,    0]), array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0]), array([  72,   79,  506,   27,    3,   56,   24, 1390,   57,   40]), array([644,   9,   1,   0,   0,   0,   0,   0,   0,   0]), array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0]), array([  32,   54, 2863,  885,   72,   

In [12]:
targets, contexts, labels = generate_cbow_batches(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

KeyboardInterrupt: 

In [13]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 32777/32777 [01:31<00:00, 359.89it/s]



targets.shape: (196662,)
contexts.shape: (196662, 8)
labels.shape: (196662, 8)





In [14]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [15]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


In [16]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 8), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 8), dtype=tf.int64, name=None))>


In [17]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 8), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 8), dtype=tf.int64, name=None))>


In [18]:
num_ns = 4

class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [19]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [20]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [21]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [22]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
  1/192 [..............................] - ETA: 1:28 - loss: 8.3204 - accuracy: 0.1123

2023-03-21 16:21:46.807569: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x15dbafeb0>

In [46]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [47]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [30]:
# down here is an attempt without negative sampling, pretty slow too
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import urllib.request

# Download the Shakespeare text dataset
url = 'https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt'
urllib.request.urlretrieve(url, 'shakespeare.txt')

# Load the dataset into memory
with open('shakespeare.txt', 'r') as f:
    data = f.read()

# Tokenize the text into sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
sequences = tokenizer.texts_to_sequences([data])[0]
sequences = sequences[:40]

In [31]:
sequences

[20,
 11,
 1,
 16360,
 425,
 3471,
 3723,
 30,
 417,
 446,
 2,
 11,
 3723,
 10,
 12583,
 14,
 131,
 452,
 473,
 49,
 67,
 452,
 5,
 1,
 3879,
 2,
 377,
 16361,
 417,
 446,
 842,
 16362,
 6155,
 9,
 33,
 12,
 5118,
 10,
 1,
 1599]

In [None]:

# Create a list of unique words in the dataset
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Generate training data using a sliding window approach
WINDOW_SIZE = 2
train_inputs = []
train_labels = []
for i in range(WINDOW_SIZE, len(sequences) - WINDOW_SIZE):
    train_inputs.append(sequences[i - WINDOW_SIZE:i] + sequences[i+1:i+WINDOW_SIZE+1])
    train_labels.append(sequences[i])

print(train_inputs, train_labels)
    
# Pad sequences to have the same length
MAX_SEQ_LENGTH = 4
train_inputs = pad_sequences(train_inputs, maxlen=MAX_SEQ_LENGTH, padding='pre')
train_labels = np.array(train_labels)

# Define the model
EMBEDDING_DIM = 300
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Train the model
NUM_EPOCHS = 20
BATCH_SIZE = 512
history = model.fit(train_inputs, train_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)



In [None]:
# Save the model
model.save('shakespeare_word2vec_cbow.h5')