### Imports

In [1]:
import io
import re
import string
import tqdm

import numpy as np
import tensorflow as tf

from tensorflow.keras import layers
from Word2Vec import Word2Vec

2021-10-07 18:10:06.571188: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-07 18:10:06.571206: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Constants

In [2]:

PATH_FILE_OUT_VECTORS = './file/out_vectors.tsv'    # Output: one tab-separated embedding vector per line
PATH_FILE_OUT_VOCAB = './file/out_vocabulary.tsv'   # Output: one vocabulary token per line
PATH_FILE_IN_TXT = './file/in_shakespeare.txt'      # Input: text corpus, read line by line
PATH_DIR_LOGS = './logs'

LAYER_W2V_EMBED = 'layer_w2v_target_embed'  # Layer name used to fetch the trained embedding weights

SEED = 42   # Seed for reproducibility on pseudo randomization

SIZE_VOCAB = 4096       # Max count of words
SIZE_EPOCHS = 20        # Number of epochs given to `fit()`
SIZE_WINDOW = 2         # Window size: How many words before & after the targets will be in contexts
SIZE_SENTENCE = 10      # Normalized count of word per each sentence
SIZE_NEG_SAMPLES = 4    # Number of negative samples for each context of each sentence of training data

# TODO: 2021-10-06 - Double-check the understanding of the following constants
# (the notes below describe how each one is used further down in this notebook)

SIZE_BUF = 10000        # Buffer size given to `Dataset.shuffle()` when building the training dataset
SIZE_BATCH = 1024       # Number of elements per batch given to `Dataset.batch()`

AUTOTUNE = tf.data.AUTOTUNE # Given to `prefetch()`: lets tf.data choose the buffer size by itself
EMBED_DIMENSION = 128       # 2nd argument of `Word2Vec(...)`; presumably the length of each embedding vector - TODO confirm

# Generate Training Data

A tuple of `(target, context, label)` tensors constitutes one training example
_(in the skip-gram negative-sampling Word2Vec model)_

### Read File

In [3]:
# Plain-Python view of the corpus: every line of the file, without its line ending
with open(PATH_FILE_IN_TXT) as corpus_file:
  lines = corpus_file.read().splitlines()

# tf.data view of the same file; lines whose length is zero are filtered out
text_ds = tf.data.TextLineDataset(PATH_FILE_IN_TXT).filter(
    lambda line: tf.cast(tf.strings.length(line), bool)
)

2021-10-07 18:10:07.620181: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-07 18:10:07.620206: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-07 18:10:07.620238: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (hjpc): /proc/driver/nvidia/version does not exist
2021-10-07 18:10:07.620499: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Vectorize sentences

- The `TextVectorization` layer normalizes, splits, and maps strings to integers;
- `adapt()` updates the layer state to represent the text corpus;
- After that vocabulary can be accessed with `get_vocabulary()` _(This function returns a list of all vocabulary tokens sorted (descending) by their frequency)_;        
- After that, `vectorize_layer` can be used to generate a `tf.data.Dataset` of integer encoded sentences;
- `text_vector_ds` is a `tf.data.Dataset` with vectors for each element in `text_ds` Dataset _(lines of the file)_;
- From `text_vector_ds` we generate `sentences`;
- `sentences` is the dataset flattened as a list of int encoded sentence vectors _(It's required as we'd iterate over each sentence in the DS to produce positive & negative examples)_;

In [4]:

# Map strings to integers
def standardize_token(input_data: 'tf.Tensor') -> 'tf.Tensor':
  '''
    Normalize raw text before it is split into tokens:

    - Lowercase everything;
    - Remove every character listed in `string.punctuation`;

    Parameters
    - input_data: String tensor holding the text to normalize;

    Return
    - String tensor with the normalized text;
  '''
  lowercase = tf.strings.lower(input_data)

  # Character class matching any single punctuation character (escaped to be regex-safe)
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  return tf.strings.regex_replace(lowercase, punctuation_pattern, '')

# Layer turning each line of text into a fixed-length sequence of integer token ids
vectorize_layer = layers.TextVectorization(
    max_tokens=SIZE_VOCAB,
    standardize=standardize_token,
    output_mode='int',
    output_sequence_length=SIZE_SENTENCE,  # Every sample ends up with this same length
)

# Learn the vocabulary from the corpus, then keep the index -> token list
vectorize_layer.adapt(text_ds.batch(SIZE_BATCH))
inverse_vocab = vectorize_layer.get_vocabulary()

# Encode the corpus batch by batch, then flatten back to one int vector per sentence
text_vector_ds = (
    text_ds
    .batch(SIZE_BATCH)
    .prefetch(AUTOTUNE)
    .map(vectorize_layer)
    .unbatch()
)
sentences = list(text_vector_ds.as_numpy_iterator())

2021-10-07 18:10:07.827637: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


## Generate data

### Main Function: _Definition_

In [5]:
def generate_training_data(sentences: list, window_size: int, num_ns: int, vocab_size: int, seed: int) -> tuple:
  '''
    Generate training examples for the Word2Vec model:

    - Generates skip-gram pairs with negative sampling for a list of sentences;
    - Iterates over each word of each sentence to collect positive and negative context words;
    - Length of targets, contexts and labels is the same, representing the total number of training examples;

    Parameters
    - sentences: List of int-encoded sentences;
    - window_size: Size of the context (How many words before and after targets considered as context);
    - num_ns: Number of negative samples per each context;
    - vocab_size: How many tokens (generally words) are mapped;
    - seed: Seed for reproducibility on pseudo randomization;

    Return (tuple of three lists, one entry per training example)
    - targets: Index of the target word (a scalar);
    - contexts: int64 tensor of shape (1 + `num_ns`, 1) holding the positive context word
      followed by the `num_ns` negative samples;
    - labels: int64 tensor of shape (1 + `num_ns`,) holding 1 for the positive context word
      and 0 for each negative sample;
  '''

  targets, contexts, labels = [], [], []

  # Sampling probability per word rank, used by `skipgrams` to sub-sample the most frequent words
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # The label vector only depends on `num_ns`, so it is built once and shared by every example
  label = tf.constant([1] + [0] * num_ns, dtype="int64")

  for sequence in tqdm.tqdm(sentences):

    # Only positive (target, context) pairs are requested here; negatives are sampled below
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
        sequence,
        vocabulary_size=vocab_size,
        sampling_table=sampling_table,
        window_size=window_size,
        negative_samples=0
    )

    # For each positive skip-gram pair produce training examples with positive context word & negative samples
    for target_word, context_word in positive_skip_grams:

      # Shape (1, 1): the sampler expects `true_classes` as [batch_size, num_true]
      context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)

      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          name="negative_sampling",   # Name of this operation
          true_classes=context_class, # Class that should be sampled as 'positive'
          range_max=vocab_size,       # Pick index of the samples from [0, vocab_size)
          num_sampled=num_ns,         # Number of negative context words to sample
          unique=True,                # All the negative samples should be unique
          num_true=1,                 # Each positive skip-gram has one positive context class
          seed=seed,                  # Seed for reproducibility
      )

      # Build the context vector (for one target word): positive word first, then the negatives
      negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)
      context = tf.concat([context_class, negative_sampling_candidates], 0)

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

### Main Function: _Execution_

In [6]:
# Build the training examples (Python lists) from the int-encoded sentences
targets, contexts, labels = generate_training_data(
    sentences=sentences, window_size=SIZE_WINDOW, num_ns=SIZE_NEG_SAMPLES,
    vocab_size=SIZE_VOCAB, seed=SEED,
)

# Convert the lists to numpy arrays
targets = np.array(targets)
# Each context comes as a column; keep index 0 of the last axis to get one row per example
contexts = np.array(contexts)[:, :, 0]
labels = np.array(labels)

100%|██████████| 32777/32777 [00:07<00:00, 4413.91it/s]


## Performance tweaks

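- For performance reasons we convert our data back into the `tf.data.Dataset` format;
- It allows efficient batching of the potentially large number of training examples;
- `cache()` and `prefetch()` improve performance: `cache()` keeps the elements in memory after the first pass over the data, and `prefetch()` prepares the next batches while the current one is being consumed;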

In [7]:
# One dataset element per training example: ((target, context), label)
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))

# cache() must come BEFORE shuffle(): a cache placed after shuffle()/batch() stores the batches of the
# first epoch and replays exactly the same ones (same order, same dropped remainder) on every epoch.
dataset = dataset.cache().shuffle(SIZE_BUF).batch(SIZE_BATCH, drop_remainder=True)
dataset = dataset.prefetch(buffer_size=AUTOTUNE)

# Modeling & Training

### Description

- The Word2Vec model can be implemented as a __classifier__;
- It distinguishes between true context words _(from skip-grams)_ and false context words _(negative sampling)_;
- A dot product between the embeddings of the target and context words is used to:
    - Obtain predictions for the labels, and;
    - Compute the loss against the true labels in the dataset;

### Create Model

In [8]:
# Instantiate the project-local `Word2Vec` model (imported from the `Word2Vec` module).
# NOTE(review): its constructor is not visible here; presumably the arguments are the vocabulary
# size, the embedding dimension, the number of negative samples and the embedding layer name - confirm.
word2vec = Word2Vec(SIZE_VOCAB, EMBED_DIMENSION, SIZE_NEG_SAMPLES, LAYER_W2V_EMBED)
word2vec.compile(
    optimizer='adam',
    # The model outputs are treated as logits; each label vector is [1, 0, ..., 0] (positive context first)
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

### Train

In [9]:
# Train with dataset prepared above
word2vec.fit(dataset, epochs=SIZE_EPOCHS)
weights = word2vec.get_layer(LAYER_W2V_EMBED).get_weights()[0]
vocab = vectorize_layer.get_vocabulary()    # Vocabulary to build metadata file with 01 token per line

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Write results into files

In [10]:
# Both files are opened through context managers so they get flushed and closed
# even if a write fails half-way through the loop.
with io.open(PATH_FILE_OUT_VECTORS, 'w', encoding='utf-8') as out_vectors, \
     io.open(PATH_FILE_OUT_VOCAB, 'w', encoding='utf-8') as out_vocabulary:

  # Line N of the vectors file and line N of the vocabulary file describe the same token
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.

    vec = weights[index]
    out_vectors.write('\t'.join([str(x) for x in vec]) + "\n")
    out_vocabulary.write(word + "\n")