In [1]:
# import relevant libraries
import numpy as np
import pandas as pd
import tqdm
import jax
import jax.numpy as jnp
import string
import tensorflow as tf
import time
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

## Data Preprocessing

In [2]:
# read data
data = pd.read_csv('./data/raw data/raw_data.csv', header=0, names=['text'], usecols=[1])
print(f'Data Shape: {data.shape}')
data.head()

Data Shape: (13368, 1)


Unnamed: 0,text
0,"Sally Forrest, an actress-dancer who graced th..."
1,A middle-school teacher in China has inked hun...
2,A man convicted of killing the father and sist...
3,Avid rugby fan Prince Harry could barely watch...
4,A Triple M Radio producer has been inundated w...


In [3]:
# remove punctuation
punctuations = string.punctuation
def remove_punctuation(txt):
    for char in punctuations:
        if char in txt:
            txt = txt.replace(char, "")
    return txt

# change to lower caps
data['text'] = data['text'].str.lower()

# remove punctuations
data['text'] = data['text'].apply(remove_punctuation)

In [4]:
data.head()

Unnamed: 0,text
0,sally forrest an actressdancer who graced the ...
1,a middleschool teacher in china has inked hund...
2,a man convicted of killing the father and sist...
3,avid rugby fan prince harry could barely watch...
4,a triple m radio producer has been inundated w...


In [5]:
# print first entry
print(data['text'][0])

sally forrest an actressdancer who graced the silver screen throughout the 40s and 50s in mgm musicals and films such as the 1956 noir while the city sleeps died on march 15 at her home in beverly hills california forrest whose birth name was katherine feeney was 86 and had long battled cancer her publicist judith goffin announced the news thursday scroll down for video  actress sally forrest was in the 1951 ida lupinodirected film hard fast and beautiful left and the 1956 fritz lang movie while the city sleeps a san diego native forrest became a protege of hollywood trailblazer ida lupino who cast her in starring roles in films including the critical and commercial success not wanted never fear and hard fast and beautiful some of forrests other film credits included bannerline son of sinbad and excuse my dust according to her imdb page the page also indicates forrest was in multiple climax and rawhide television episodes forrest appeared as herself in an episode of the ed sullivan sho

In [6]:
# split each row into list of words
data_lst = data['text'].apply(lambda txt: txt.split(" "))

# select number of rows to be used as training data
nrows = 200
random_indices = np.random.randint(low=0, high=len(data_lst), size=nrows)
data_lst = data_lst[random_indices].reset_index(drop=True)

print(f'Number of rows of data: {len(data_lst)}')
data_lst[:5]

Number of rows of data: 200


0    [a, host, of, celebrities, including, kelly, o...
1    [tony, pulis, called, for, the, introduction, ...
2    [to, kill, a, mockingbird, author, harper, lee...
3    [university, of, houston, officials, have, sus...
4    [fresh, security, fears, have, been, raised, i...
Name: text, dtype: object

## Split Train and Test Sets

In [7]:
# split into train and test sets
n_test = 20
test_indices = np.random.randint(low=0, high=len(data_lst), size=n_test)
train_indices = [i for i in range(len(data_lst)) if i not in test_indices]

train_data = data_lst[train_indices]
test_data = data_lst[test_indices]

print(f'Number of rows of training data: {len(train_data)}')
print(f'Number of rows of test data: {len(test_data)}')

Number of rows of training data: 180
Number of rows of test data: 20


In [34]:
# vocab dict
vocab, index = {}, 1
vocab['<pad>'] = 0
for line in data_lst:
    for word in line:
        if word not in vocab:
            vocab[word] = index
            index += 1

# inverse_vocab dict
inverse_vocab = {}
for word, index in vocab.items():
    inverse_vocab[index] = word

print(f'Vocab size: {len(vocab)}')

Vocab size: 15063


In [37]:
# save vocab and inverse_vocab in processed_data folder
import json
with open('./data/processed_data/vocab.json', 'w') as f:
    json.dump(vocab, f)

with open('./data/processed_data/inverse_vocab.json', 'w') as f:
    json.dump(inverse_vocab, f)

In [35]:
# vocab_train dict
vocab_train, index = {}, 1
vocab_train['<pad>'] = 0
for line in train_data:
    for word in line:
        if word not in vocab_train:
            vocab_train[word] = index
            index += 1

# inverse_vocab dict
inverse_vocab_train = {}
for word, index in vocab_train.items():
    inverse_vocab_train[index] = word

print(f'Vocab size: {len(vocab_train)}')

Vocab size: 14119


In [9]:
# save vocab and inverse_vocab in processed_data folder
import json
with open('./data/processed_data/vocab_train.json', 'w') as f:
    json.dump(vocab_train, f)

with open('./data/processed_data/inverse_vocab_train.json', 'w') as f:
    json.dump(inverse_vocab_train, f)

In [36]:
# vocab_test dict
vocab_test, index = {}, 1
vocab_test['<pad>'] = 0
for line in test_data:
    for word in line:
        if word not in vocab_test:
            vocab_test[word] = index
            index += 1

# inverse_vocab dict
inverse_vocab_test = {}
for word, index in vocab_test.items():
    inverse_vocab_test[index] = word

print(f'Vocab size: {len(vocab_test)}')

Vocab size: 3382


In [11]:
# save vocab and inverse_vocab in processed_data folder
import json
with open('./data/processed_data/vocab_test.json', 'w') as f:
    json.dump(vocab_test, f)

with open('./data/processed_data/inverse_vocab_test.json', 'w') as f:
    json.dump(inverse_vocab_test, f)

In [12]:
# train sequences
train_sequences = []
for line in train_data:
    vectorized_line = [vocab_train[word] for word in line]
    train_sequences.append(vectorized_line)

# test sequences
test_sequences = []
for line in test_data:
    vectorized_line = [vocab_test[word] for word in line]
    test_sequences.append(vectorized_line)

In [13]:
# function to generate samples
def generate_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0,
          shuffle=True)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.reshape(tf.constant([context_word], dtype="int64"), (1,1))
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [27]:
# generate training data
window_size = 3
num_ns = 5
vocab_size = len(vocab_train)
seed = 4212

targets, contexts, labels = generate_data(sequences=train_sequences,
                                          window_size=window_size,
                                          num_ns=num_ns,
                                          vocab_size=vocab_size,
                                          seed=seed)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print(f'targets shape: {targets.shape}')
print(f'contexts shape: {contexts.shape}')
print(f'labels shape: {labels.shape}')

100%|██████████| 180/180 [01:50<00:00,  1.63it/s]


targets shape: (227036,)
contexts shape: (227036, 6)
labels shape: (227036, 6)


In [28]:
# generate testing data
window_size = 3
num_ns = 5
vocab_size = len(vocab_test)
seed = 4212

targets_test, contexts_test, labels_test = generate_data(sequences=test_sequences,
                                                         window_size=window_size,
                                                         vocab_size=vocab_size,
                                                         num_ns=num_ns,
                                                         seed=seed)

targets_test = np.array(targets_test)
contexts_test = np.array(contexts_test)
labels_test = np.array(labels_test)

print(f'targets_test shape: {targets_test.shape}')
print(f'contexts_test shape: {contexts_test.shape}')
print(f'labels_test shape: {labels_test.shape}')

100%|██████████| 20/20 [00:08<00:00,  2.31it/s]


targets_test shape: (16876,)
contexts_test shape: (16876, 6)
labels_test shape: (16876, 6)


In [30]:
# save training and testing data in data/processed/train and data/processed/test
np.save('./data/processed_data/train/num_ns_5/targets.npy', targets)
np.save('./data/processed_data/train/num_ns_5/contexts.npy', contexts)
np.save('./data/processed_data/train/num_ns_5/labels.npy', labels)

np.save('./data/processed_data/test/num_ns_5/targets.npy', targets_test)
np.save('./data/processed_data/test/num_ns_5/contexts.npy', contexts_test)
np.save('./data/processed_data/test/num_ns_5/labels.npy', labels_test)

**Sanity Check on quality of training and testing data**

In [31]:
# training data
print(f"target_index    : {targets[0]}")
print(f"target_word     : {inverse_vocab_train[targets[0]]}")
print(f"context_indices : {contexts[0]}")
print(f"context_words   : {[inverse_vocab_train[c] for c in contexts[0]]}")
print(f"label           : {labels[0]}")

print("target  :", targets[0])
print("context :", contexts[0])
print("label   :", labels[0])

target_index    : 196
target_word     : etc
context_indices : [ 195   38 1524 1534 1029  139]
context_words   : ['via', 'commend', 'event', 'moneymaking', 'becoming', 'lowkey']
label           : [1 0 0 0 0 0]
target  : 196
context : [ 195   38 1524 1534 1029  139]
label   : [1 0 0 0 0 0]


In [32]:
# testing data
print(f"target_index    : {targets_test[0]}")
print(f"target_word     : {inverse_vocab_test[targets_test[0]]}")
print(f"context_indices : {contexts_test[0]}")
print(f"context_words   : {[inverse_vocab_test[c] for c in contexts_test[0]]}")
print(f"label           : {labels_test[0]}")

print("target  :", targets[0])
print("context :", contexts[0])
print("label   :", labels[0])

print("target  :", targets_test[0])
print("context :", contexts_test[0])
print("label   :", labels_test[0])

target_index    : 64
target_word     : end
context_indices : [ 29  21 508 511 364  66]
context_words   : ['of', 'boxing', 'far', 'serious', 'says', 'in']
label           : [1 0 0 0 0 0]
target  : 196
context : [ 195   38 1524 1534 1029  139]
label   : [1 0 0 0 0 0]
target  : 64
context : [ 29  21 508 511 364  66]
label   : [1 0 0 0 0 0]
