<a href="https://colab.research.google.com/github/kristinazk/HomeWorkWeek18/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import numpy as np
import tensorflow as tf

import gensim
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import random
from collections import deque
from itertools import chain

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
with open('corpus_100k', 'r', encoding='utf-8', errors='replace') as f:
  sentences = [s.strip() for s in f.readlines()]

print(f'Number of sentences: {len(sentences)}')
use_first_n = 300000
sentences = sentences[:use_first_n]
print(f'Using: {len(sentences)}')

Number of sentences: 1163126
Using: 300000


## Performing initial data clearance, creating a vocabulary and mapping indices to words

In [68]:
def data_preprocessing(data):
  # convert into lowercase
  # remove '.', ',', etc.
  translation_table = str.maketrans('', '', "«»()+-=-,՝.․։՜՛֊՟՚")
  sentences = [(sentence).lower().translate(translation_table) for sentence in data]

  return sentences

data = data_preprocessing(sentences)

def create_vocab(sentences):
  i = 0
  word2idx ={}
  idx2word = {}
  unique = set()
  freqs = {}

  for sentence in sentences:
    for word in sentence.split(' '):
      if (word not in unique) and (len(word.strip()) != 0):
        word2idx[word] = i
        idx2word[i] = word
        i += 1
        unique.add(word)
        freqs[word] = 1
      elif word in unique:
        freqs[word] += 1
  return unique, word2idx, idx2word, freqs

vocab, word2idx, idx2word, freqs = create_vocab(data)

# Altering the distribution to perform negative sampling
totalWords = sum([freq**(3/4) for freq in freqs.values()])
wordProb = {word:(freq)**(3/4)/totalWords for word, freq in freqs.items()}

In [69]:
def skipgram_data_generator(sentences, window_size, batch_size, vocab):
    for sentence in sentences:
        # Pad the sentence with None values at the beginning and end
        padded_sentence = [None] * (window_size) + sentence + [None] * (window_size)
        # Use a deque to efficiently slide the window over the sentence
        window = deque(maxlen=2 * window_size + 1)
        for word in padded_sentence:
            if len(window) == 2 * window_size + 1:
                # Generate positive word pairs (target, context)
                target_word = window[window_size]
                context_words = list(chain(list(window)[:window_size], list(window)[window_size + 1:]))
                for context_word in context_words:
                    if target_word in vocab and context_word in vocab:
                        yield target_word, context_word


            window.append(word)

        # Check if the current sentence is completed
        if None in window:
            yield None, None  # Signal the end of the current sentence


data = [sentence.split() for sentence in data ]

window_size = 2
batch_size = 32


batches = []
current_batch = []

for target, context in skipgram_data_generator(data[:10], window_size, batch_size, vocab):
    if target is not None and context is not None:
        current_batch.append((target, context))
    if len(current_batch) == batch_size:
        batches.append(current_batch)
        current_batch = []

# Handle the last batch if it's not complete
if current_batch:
    batches.append(current_batch)


values = list(wordProb.keys())
probabilities = list(wordProb.values())


neg_samples = []

num_neg_samples = 10

targets = []
contexts = []
labels = []

# Print the batches
for i, batch in enumerate(batches):
    for target, context in batch:
      context_arr = []
      targets.append(word2idx[target])
      context_arr.append(word2idx[context])
      i = 0
      while(i < num_neg_samples):
          word = random.choices(values, weights=probabilities, k=1)[0]
          if word != target and word != context:
            context_arr.append(word2idx[word])
            i += 1

      label_arr = np.zeros(len(context_arr), dtype=int)
      label_arr[0] = 1
      contexts.append(context_arr)
      labels.append(list(label_arr))


## Defining and Training the model

In [72]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim=200):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.vocab_size = vocab_size
    self.hidden_weights = tf.random.uniform(
    shape=(self.vocab_size, embedding_dim),
    minval=-0.5 / embedding_dim,
    maxval=0.5 / embedding_dim,
)
    self.output_weights = tf.random.uniform(
    shape=(embedding_dim, self.vocab_size),
    minval=-0.5 / embedding_dim,
    maxval=0.5 / embedding_dim,
)

  def forward(self, data):
    hidden_layer = tf.transpose(self.hidden_weights) @ data
    output_init = tf.transpose(self.output_weights) @ hidden_layer

    y = self.softmax(output_init)

    return y, hidden_layer, output_init

  def training(self, targ, cont, lab, num_iter=10000): # loss value does not go down
    for _ in range(num_iter):
      for i, word_idx in enumerate(targ):
        cont_pos = cont[i][0]
        context_indices_neg = cont[i][1:]
        with tf.GradientTape() as tape:
          loss = tf.cast(-tf.math.log(self.sigmoid(tf.reduce_sum(tf.multiply(self.output_weights[:, word_idx], self.hidden_weights[cont_pos])))), tf.float32) - tf.cast(tf.reduce_sum(tf.math.log(self.sigmoid(-tf.Variable([tf.reduce_sum(tf.multiply(self.output_weights[:, idx], self.hidden_weights[cont_pos])) for idx in context_indices_neg])))), tf.float32)

        gradients = tape.gradient(loss, self.trainable_variables)

        optimizer = tf.optimizers.Adam(learning_rate=1)

        # Update weights using the optimizer
        optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        print(loss)

  # Obtain the embeddings easily
  def __call__(self):
    return self.hidden_weights

  def onehot(self, idx):
    output = np.zeros(self.vocab_size)
    output[idx] = 1
    return output

  @staticmethod
  def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

  @staticmethod
  def sigmoid(x):
    return 1/(1 + np.exp(-x))


In [None]:
word2vec = Word2Vec(len(vocab))

word2vec.training(targets, contexts, labels)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tf.Tensor(7.6246505, shape=(), dtype=float32)
tf.Tensor(7.6245923, shape=(), dtype=float32)
tf.Tensor(7.624596, shape=(), dtype=float32)
tf.Tensor(7.6245723, shape=(), dtype=float32)
tf.Tensor(7.6246176, shape=(), dtype=float32)
tf.Tensor(7.624647, shape=(), dtype=float32)
tf.Tensor(7.6246796, shape=(), dtype=float32)
tf.Tensor(7.624647, shape=(), dtype=float32)
tf.Tensor(7.6246476, shape=(), dtype=float32)
tf.Tensor(7.6246977, shape=(), dtype=float32)
tf.Tensor(7.624637, shape=(), dtype=float32)
tf.Tensor(7.6245923, shape=(), dtype=float32)
tf.Tensor(7.6246405, shape=(), dtype=float32)
tf.Tensor(7.6246514, shape=(), dtype=float32)
tf.Tensor(7.624579, shape=(), dtype=float32)
tf.Tensor(7.624552, shape=(), dtype=float32)
tf.Tensor(7.624625, shape=(), dtype=float32)
tf.Tensor(7.6246295, shape=(), dtype=float32)
tf.Tensor(7.6246505, shape=(), dtype=float32)
tf.Tensor(7.6245604, shape=(), dtype=float32)
tf.Tensor(7.624593, sh