In [21]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [22]:
# Build a small 10-entry sampling table and print it to inspect its values.
# `generate_training_data` builds the same kind of table for the full
# vocabulary and passes it to `skipgrams` as `sampling_table`.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


In [23]:
import tqdm

def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Generate skip-gram training examples with negative sampling.

  For every positive (target, context) skip-gram pair found in `sequences`,
  `num_ns` negative context candidates are drawn, giving one training example
  per positive pair.

  Args:
    sequences: Iterable of int-encoded sentences.
    window_size: Skip-gram window size, forwarded to `skipgrams`.
    num_ns: Number of negative samples drawn per positive pair.
    vocab_size: Vocabulary size; used to build the sampling table, passed to
      `skipgrams` as `vocabulary_size` and to the sampler as `range_max`.
    seed: Seed forwarded to the negative-candidate sampler.

  Returns:
    A `(targets, contexts, labels)` tuple of three equal-length lists. Each
    `targets` entry is a target word id; each `contexts` entry is an int64
    tensor holding the true context id followed by the `num_ns` sampled ids;
    each `labels` entry is the int64 tensor `[1, 0, ..., 0]` of the same
    length.
  """
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # The label vector depends only on `num_ns`, so it is built once here
  # instead of once per positive pair inside the inner loop.
  label = tf.constant([1] + [0] * num_ns, dtype="int64")

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    # `negative_samples=0` because negatives are drawn separately below.
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
        sequence,
        vocabulary_size=vocab_size,
        sampling_table=sampling_table,
        window_size=window_size,
        negative_samples=0)

    # Turn each positive pair into one training example: the positive context
    # word followed by `num_ns` sampled candidates.
    for target_word, context_word in positive_skip_grams:
      # Shape (1, 1): one example with one true class.
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      # NOTE(review): the sampled ids are not checked against `context_word`
      # here, so a "negative" may coincide with the true context - confirm
      # whether that matters for this dataset.
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Context vector for this target: true context first, then negatives.
      context = tf.concat(
          [tf.squeeze(context_class, 1), negative_sampling_candidates], 0)

      # Append each element of the training example to the result lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [24]:
# Input text file, resolved relative to the notebook's working directory.
# Each line is a space-separated list of numeric ids (see the preview below).
path_to_file ='orders.txt'

In [25]:
# Read the whole file into memory as a list of lines (without line endings).
with open(path_to_file) as f:
  lines = f.read().splitlines()
# Preview the first 20 lines to check the input format.
for line in lines[:20]:
  print(line)


49302 11109 10246 49683 43633 13176 47209 22035
39612 19660 49235 43086 46620 34497 48679 46979
11913 18159 4461 21616 23622 32433 28842 42625 39693
20574 30391 40706 25610 27966 24489 39275
8859 19731 43654 13176 4357 37664 34065 35951 43560 9896 27509 15455 27966 47601 40396 35042 40986 1939 46313 329 30776 36695 27683 15995 27344 47333 48287 45204 24964 18117 46413 34126 9373 22935 46720 44479 790 18441 45007 20520 7461 26317 3880 36364 32463 41387 31066 17747 25659
27104 21174 41860 38273 47209 5876 29217 9047 4549 22425 11776
18394 37766 13176 6236 5077 8153 43772 25591 34582 49593 15093 43841 21137 40354 17794 11182 39190
1194 5578 38159 10305 38557
28199 24852 29883 28427 7754 39947 47307 36291 39275 1940 2040 20711 47501
33000 11361 27695 47672 45633 38015 36968 30830 5115 11520 25715
12078 6184 32403 19828 12341 16797 13424 30591
18196 34229 48118 14992 35365 31506 4972 34591 47626
20082 24852 47144 36441 12206 4034 30573 42404
21137 27344 42265 12211 29740 31717 13829 34358 4

In [26]:
# Stream the file line by line and drop empty lines: a string length of 0
# casts to False, so the filter keeps only non-empty lines.
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [27]:
from keras import layers

def custom_standardization(input_data):
  """Lowercase `input_data` and remove every `string.punctuation` character.

  Used as the `standardize` callable of the `TextVectorization` layer.
  """
  # Character class matching any single punctuation character, with regex
  # metacharacters escaped.
  punctuation_class = '[%s]' % re.escape(string.punctuation)
  lowered = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowered, punctuation_class, '')


# Define the vocabulary size and the number of words in a sequence.
# `vocab_size` caps the vocabulary built by the layer (`max_tokens`) and is
# reused later for negative sampling and for the embedding tables.
vocab_size = 10000
# Every vectorized line ends up with exactly this many ids: shorter lines are
# padded with 0 and longer lines are cut off (visible in the decoded preview
# further down).
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [28]:
# Build the layer's vocabulary from the text dataset, fed in batches of 1024
# lines.
vectorize_layer.adapt(text_ds.batch(1024))

In [29]:
# Save the created vocabulary for reference.
# `inverse_vocab[i]` is the token string for integer id `i`; it is used below
# to decode vectorized sequences back into tokens.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', '24852', '13176', '21137', '21903', '47626', '47766', '47209', '16797', '26209', '27966', '39275', '27845', '30391', '45007', '22935', '24964', '4920', '46979']


In [30]:
# `tf.data.AUTOTUNE` is the non-experimental name of the constant formerly
# exposed as `tf.data.experimental.AUTOTUNE`; it lets tf.data pick buffer
# sizes dynamically.
AUTOTUNE = tf.data.AUTOTUNE

# Vectorize the data in text_ds: batch the lines, map each batch through the
# vectorization layer, then unbatch back to one id sequence per line.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [31]:
# Materialize every vectorized line into a Python list so that
# `generate_training_data` can iterate over it; the printed count is the
# number of sequences (one per non-empty input line).
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

124364


In [32]:
# Sanity check: show the first five id sequences next to their decoded tokens.
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[   1 1696  138   48 7858    3    8   64    0    0] => ['[UNK]', '11109', '10246', '49683', '43633', '13176', '47209', '22035', '', '']
[6231   60   41  265 5293 1763   42   19    0    0] => ['39612', '19660', '49235', '43086', '46620', '34497', '48679', '46979', '', '']
[2513 2407  486   32  602  376   68  254 1945    0] => ['11913', '18159', '4461', '21616', '23622', '32433', '28842', '42625', '39693', '']
[ 662   14   20 4575   11  116   12    0    0    0] => ['20574', '30391', '40706', '25610', '27966', '24489', '39275', '', '', '']
[ 165 2232 1429    3 1687 2581 6109   58    1 7033] => ['8859', '19731', '43654', '13176', '4357', '37664', '34065', '35951', '[UNK]', '9896']


In [33]:
# Generate all (target, context, label) examples for the corpus and convert
# each of the three returned lists to a NumPy array in one step.
# This is the slow cell of the notebook: it iterates over every sequence.
targets, contexts, labels = map(
    np.array,
    generate_training_data(
        sequences=sequences,
        window_size=2,
        num_ns=4,
        vocab_size=vocab_size,
        seed=42,
    ),
)

# Report the resulting array shapes (after a blank gap below the progress bar).
print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

 23%|██▎       | 28404/124364 [00:51<02:52, 556.45it/s]


KeyboardInterrupt: 

In [None]:
# Number of examples per training batch and size of the shuffle buffer.
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

# Pair each (target, context) input with its label, shuffle, and group into
# fixed-size batches; the final partial batch is dropped.
dataset = (
    tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [None]:
# Add caching and prefetching (buffer size chosen by AUTOTUNE) to the batched
# dataset, then print the resulting dataset spec.
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [None]:
class Word2Vec(tf.keras.Model):
  """Skip-gram model that scores candidate contexts against a target word.

  Holds two embedding tables of identical shape: `target_embedding` (layer
  name "w2v_embedding"), looked up with target ids, and `context_embedding`,
  looked up with context ids.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create both `vocab_size` x `embedding_dim` embedding layers."""
    super().__init__()
    self.target_embedding = layers.Embedding(
        vocab_size, embedding_dim, name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size, embedding_dim)

  def call(self, pair):
    """Return the dot product of the target vector with each context vector.

    Args:
      pair: A `(target, context)` pair. `target` is `(batch,)`, or
        `(batch, 1)` with a dummy axis that is squeezed away; `context` is
        `(batch, context)`.

    Returns:
      A `(batch, context)` tensor of dot-product scores.
    """
    target, context = pair
    # Older inputs may carry a dummy second axis on `target` (it doesn't
    # exist in TF2.7+); remove it so `target` is always (batch,).
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    target_vecs = self.target_embedding(target)      # (batch, embed)
    context_vecs = self.context_embedding(context)   # (batch, context, embed)
    # Contract over the embedding axis: one score per (example, candidate).
    return tf.einsum('be,bce->bc', target_vecs, context_vecs)

In [None]:
def custom_loss(x_logit, y_true):
  """Element-wise sigmoid cross-entropy between logits and labels.

  Args:
    x_logit: Raw (pre-sigmoid) scores.
    y_true: Labels, same shape as `x_logit`.

  Returns:
    A tensor of per-element losses with the same shape as `x_logit`.

  NOTE(review): the parameter order here is (logits, labels), whereas Keras
  invokes loss callables as `fn(y_true, y_pred)` - confirm the order before
  passing this function to `compile(loss=...)`.
  """
  return tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=x_logit)

In [None]:
# Width of each embedding vector.
embedding_dim = 100

# The model returns raw dot-product scores (hence `from_logits=True`), and
# each label vector is one-hot over the candidates: 1 for the true context,
# 0 for the negative samples.
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [None]:
# TensorBoard logging callback; logs go to the relative directory "logs".
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
# Train for 30 epochs over the batched dataset, logging through the
# TensorBoard callback.
word2vec.fit(dataset, epochs=30, callbacks=[tensorboard_callback])

Epoch 1/30


[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 67ms/step - accuracy: 0.2734 - loss: 1.5578
Epoch 2/30
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 67ms/step - accuracy: 0.4787 - loss: 1.2914
Epoch 3/30
[1m1026/1026[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 66ms/step - accuracy: 0.5757 - loss: 1.0975
Epoch 4/30
[1m 303/1026[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m45s[0m 62ms/step - accuracy: 0.6352 - loss: 0.9753

KeyboardInterrupt: 

In [None]:
# Fetch the weight matrix of the layer named "w2v_embedding" (the target
# embedding) and the vocabulary list of the vectorization layer.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Target-embedding matrix fetched via the model attribute. `target_embedding`
# is the layer named "w2v_embedding", so this is the same matrix as `weights`.
embs_arr = word2vec.target_embedding.get_weights()[0]

In [None]:
import numpy as np
from scipy.spatial import distance

class NearestNeighbor:
    """Brute-force nearest-neighbour lookup over a fixed set of embeddings.

    Attributes:
        embeddings: 2-D array-like with one embedding per row, stored as given.
        measure: Distance selector. 'cosine' selects cosine distance; any
            other value selects Euclidean distance.
    """

    def __init__(self, embeddings, measure='cosine'):
        self.embeddings = embeddings
        self.measure = measure

    def find_nearest_neighbors(self, vector, k=2):
        """Return the row indices of the `k` embeddings closest to `vector`.

        Args:
            vector: 1-D query vector with the same dimensionality as the rows
                of `embeddings`.
            k: Maximum number of neighbours to return. Must not be negative;
                if it exceeds the number of rows, all rows are returned.

        Returns:
            A 1-D integer array of row indices ordered from closest to
            farthest. Rows at equal distance are ordered by ascending index.

        Raises:
            ValueError: If `k` is negative.
        """
        # A negative k would otherwise be interpreted by the slice below as
        # "all but the last |k|", silently returning the wrong neighbours.
        if k is not None and k < 0:
            raise ValueError(f"k must be non-negative, got {k}")

        # Anything other than 'cosine' falls back to Euclidean distance.
        metric = 'cosine' if self.measure == 'cosine' else 'euclidean'
        # cdist expects 2-D inputs, so wrap the query as a one-row matrix and
        # take the single resulting row of distances.
        distances = distance.cdist([vector], self.embeddings, metric)[0]

        # A stable sort makes the order of equidistant rows deterministic.
        nearest_indices = np.argsort(distances, kind='stable')[:k]
        return nearest_indices


In [None]:
# Example to find and display similar items
nn = NearestNeighbor(embs_arr, 'cosine')

# Number of neighbours to retrieve per query.
k = 5

# NOTE(review): the labels below index `vocab` with row positions of
# `embs_arr`, which assumes the vocabulary has one entry per embedding row -
# confirm len(vocab) == embs_arr.shape[0].
# The query rows are taken from `embs_arr` itself, so each query is also a
# candidate and is expected to appear in its own neighbour list.

# Find the top k most similar items to the first item in the embeddings
first_vector = embs_arr[0]
nearest_indices = nn.find_nearest_neighbors(first_vector, k)

# Display the most similar items
print(f"Most similar items to '{vocab[0]}'")
for i, idx in enumerate(nearest_indices):
    print(f"{i+1}: {vocab[idx]}")

# Find the top k most similar items to the last item in the embeddings
last_vector = embs_arr[-1]
nearest_indices = nn.find_nearest_neighbors(last_vector, k)

# Display the most similar items (this loop was missing: only the header was
# printed for the second query).
print(f"\nMost similar items to '{vocab[-1]}'")
for i, idx in enumerate(nearest_indices):
    print(f"{i+1}: {vocab[idx]}")
