In [1]:
import io
import re
import string
import tensorflow as tf
import tqdm

from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import gutenberg
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [3]:
def train_tokenizer():
    text = ""
    for file_id in gutenberg.fileids():
        text += gutenberg.raw(file_id)

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(' '.join(text))
    return PunktSentenceTokenizer(trainer.get_params())
    
tokenizer = train_tokenizer()

In [4]:
tokens, sequences = [], []
categories = ['alt.atheism', 'talk.politics.guns', 'sci.space', 'rec.autos']

newsgroups = fetch_20newsgroups(categories=categories)

for newsgroup in newsgroups.data:
    for lines in newsgroup.split('\n'):
        line = re.sub(r"[^a-zA-Z]+", ' ',lines).lower().strip()
        words = line.split()
        if len(words) != 0:
            tokens.extend(words)

    tokenized_sentences = tokenizer.tokenize(newsgroup)    
    for sentence in tokenized_sentences:
        sentence = re.sub(r"[^a-zA-Z]+", ' ',sentence).lower().strip()
        if len(sentence.split()) >= 2:
            sequences.append(sentence)

In [5]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(f'vocab size: {vocab_size}')

vocab size: 30243


In [6]:
sequences[1:3]

['no brent that would be alt sex bondage holly silva goofy anti semite if you were smarter you d have these opinions',
 'from reb hprnd rose hp com ralph bean subject re saturn pricing blatherings article i d']

In [7]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=SEED,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [8]:
text_ds = tf.data.Dataset.from_tensor_slices(sequences)
text_ds

<TensorSliceDataset shapes: (), types: tf.string>

In [9]:
# We create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Define the vocabulary size and number of words in a sequence.
vocab_size = vocab_size
sequence_length = len(sequences)

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [10]:
vectorize_layer.adapt(text_ds.batch(1024))

In [11]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'to', 'of', 'a', 'and', 'in', 'i', 'is', 'that', 'it', 'you', 'for', 'edu', 's', 'from', 'on', 'be', 'this']


In [12]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [13]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

37767


In [14]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)
print(len(targets), len(contexts), len(labels))

100%|██████████| 37767/37767 [03:46<00:00, 166.52it/s]580808 580808 580808



In [15]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [16]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [17]:
class Word2Vec(Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=3+1)
    self.dots = Dot(axes=(3, 2))
    self.flatten = Flatten()

  def call(self, pair):
    target, context = pair
    we = self.target_embedding(target)
    ce = self.context_embedding(context)
    dots = self.dots([ce, we])
    return self.flatten(dots)

In [18]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [19]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [20]:
word2vec.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc15906a490>

In [21]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [22]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [36]:
from collections import OrderedDict
from sklearn.metrics.pairwise import cosine_similarity

def closest(word, n):
    vocab_index = vocab.index(word)
    distances = {}
    for i, w in enumerate(weights):
        distances[i] = cosine_similarity([weights[i]], [weights[vocab_index]])
    d_sorted = OrderedDict(sorted(distances.items(), key=lambda x:x[1], reverse=True))
    s_words = [vocab[v] for v in list(d_sorted.keys()) if v < len(vocab)]
    print('{} -> {}\n'.format(s_words[0], ' '.join(s_words[1:n+1])))    

In [37]:
examples = ['politician', 'god', 'technology', 'music']

for keyword in examples:
    closest(keyword, 20)

politician -> continuum conducive cleanly setup competitors adrenalizing inconvinient pointer occurance subcontractor ease hooked delight looker hooligan paramout scramble firm aerostar newbie

god -> undefined uncorruptable vital essence omniscience harmed regretable sceptical sooner suspicion dispite limiting strickly parentage rebuffed companys overpriced bury cruder fable

technology -> aeronautics polytechnical computing alberta meson photography nsw cruz publications geologists catalina planetary readership disciplines electrostatic qic demonstrator interpersonal methyl astronautical

music -> amplified triumf erich scots loud nonfiction beer checkout salesdroid ucis diablo classical topics heavies tri remotest troubled dalhousie flares ridgeview

