In [66]:
import numpy as np
import pandas as pd
import tqdm
import jax
import jax.numpy as jnp
import string
import tensorflow as tf
import time
from tqdm import tqdm

In [55]:
data = pd.read_csv('./data/raw_data.csv', header=0, names=['text'], usecols=[1])
print(f'Data Shape: {data.shape}')
data.head()

Data Shape: (13368, 1)


Unnamed: 0,text
0,"Sally Forrest, an actress-dancer who graced th..."
1,A middle-school teacher in China has inked hun...
2,A man convicted of killing the father and sist...
3,Avid rugby fan Prince Harry could barely watch...
4,A Triple M Radio producer has been inundated w...


In [56]:
# remove punctuation
punctuations = string.punctuation
def remove_punctuation(txt):
    for char in punctuations:
        if char in txt:
            txt = txt.replace(char, "")
    return txt

# change to lower caps
data['text'] = data['text'].str.lower()

# remove punctuations
data['text'] = data['text'].apply(remove_punctuation)

In [57]:
# split each row into list of words
data_lst = data['text'].apply(lambda txt: txt.split(" "))

# select number of rows to be used as training data
nrows = 200
random_indices = np.random.randint(low=0, high=len(data_lst), size=nrows)
data_lst = data_lst[random_indices].reset_index(drop=True)

print(f'Number of rows of data: {len(data_lst)}')
data_lst[:5]

Number of rows of data: 200


0    [in, a, video, recorded, just, before, he, att...
1    [the, nfl, will, gear, up, for, the, landmark,...
2    [the, names, of, two, missouri, police, comman...
3    [when, kelly, lynn, miller, appeared, at, a, p...
4    [neymar, has, been, preparing, for, the, bigge...
Name: text, dtype: object

In [70]:
# vocab dict
vocab, index = {}, 1
vocab['<pad>'] = 0
for line in data_lst:
    for word in line:
        if word not in vocab:
            vocab[word] = index
            index += 1

# inverse_vocab dict
inverse_vocab = {}
for word, index in vocab.items():
    inverse_vocab[index] = word

print(f'Vocab size: {len(vocab)}')

Vocab size: 15206


In [71]:
# sequences
sequences = []
for line in data_lst:
    vectorized_line = [vocab[word] for word in line]
    sequences.append(vectorized_line)

In [17]:
# create function that generates the skip gram samples
# def generate_training_data(sequences, window_size):
#     skip_grams = []
#     # for each sentence
#     for sequence in sequences:
#         for center_word_pos in range(len(sequence)):
#             # for each window position
#             for w in range(-window_size, window_size + 1):
#                 context_word_pos = center_word_pos + w
#                 # make soure not jump out sentence
#                 if context_word_pos < 0 or context_word_pos >= len(sequence) or center_word_pos == context_word_pos:
#                     continue
#                 context_word_idx = sequence[context_word_pos]
#                 skip_grams.append((sequence[center_word_pos], context_word_idx))

#     skip_grams = np.array(skip_grams) # it will be useful to have this as numpy array
#     return skip_grams

In [21]:
# skip_grams = generate_training_data(sequences, 5)
# for target, context in skip_grams[:5]:
#     print(f'({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})')

(0, 1): (staff, at)
(0, 2): (staff, a)
(0, 3): (staff, piggery)
(0, 4): (staff, have)
(0, 5): (staff, been)


In [72]:
# function to generate samples
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0,
          shuffle=True)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.reshape(tf.constant([context_word], dtype="int64"), (1,1))
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [73]:
# generate training data
window_size = 5
num_ns = 4
vocab_size = len(vocab)
seed = 4212

targets, contexts, labels = generate_training_data(sequences=sequences,
                                                 window_size=window_size,
                                                 num_ns=num_ns,
                                                 vocab_size=vocab_size,
                                                 seed=seed)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print(f'targets shape: {targets.shape}')
print(f'contexts shape: {contexts.shape}')
print(f'labels shape: {labels.shape}')

100%|██████████| 200/200 [03:28<00:00,  1.04s/it]


targets shape: (423979,)
contexts shape: (423979, 5)
labels shape: (423979, 5)


In [74]:
# see what the data looks like
print(f'Example (1 data point)\nTargets: {targets[0]}')
print(f'Contexts: {contexts[0]}')
print(f'Labels: {labels[0]}')

# size of data
print(f'Total number of data: {len(targets)}')

Example (1 data point)
Targets: 199
Contexts: [  69   39 1613 1624 1086]
Labels: [1 0 0 0 0]
Total number of data: 423979


In [46]:
# function that generates One-Hot-Encoding vector
def get_input_OHE(i):
    e_i = np.zeros(v)
    e_i[i] = 1.
    return e_i

# function that takes in the output layer
# spits out a vector of same dimension as output layer
# sum of probabilites to 1
def softmax(z_c):
    return tf.nn.softmax(z_c).numpy()

def forward_pass(V_embedding, U_embedding, target, context, window_size):
    # OHE input vector and output vector
    x_c = get_input_OHE(target)
    pass

# sigmoid function
def sigmoid(x):
    """Inputs a real number, outputs a real number"""
    return 1 / (1 + jnp.exp(-x))

# 
def local_loss(target, context, label, V_embedding, U_embedding):
    """
    Input (example)
    target = (188,)
    context = (93, 40, 1648, 1659, 1109)
    label = (1, 0, 0, 0, 0)
    V_embedding: matrix of dim (n x |v|)
    U_embedding: matrix of dim (|v| x n)
    where n = embedding dimension, |v| = vocab size

    Outputs the local_loss -> real number
    """
    v_t = V_embedding.T[target] # shape (300,)
    u_pos = U_embedding[context[0]] # shape(300,)
    u_neg = U_embedding[context[1:]] # shape(4, 300)

    return -jnp.log(sigmoid(-jnp.dot(u_pos.T, v_t))) - jnp.sum(jnp.log(sigmoid(jnp.dot(u_neg, v_t))))


In [None]:
# set up
n = 100
v = len(vocab)
V = np.random.normal(0, 1, size=(n, v)) / np.sqrt(v)
U = np.random.normal(0, 1, size=(v, n)) / np.sqrt(n)

print(f'V shape: {V.shape}')
print(f'U shape: {U.shape}')