<a href="https://colab.research.google.com/github/iannwtf19/iannwtf19/blob/main/Week10/Homework10_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import os
import tqdm
import datetime

# Data preparation
## Load file
First, let's mount Google Drive, where we keep our input file (the Bible).
If you are working locally and don't need Google Drive, you may skip this cell.
If the input file does not exist yet, we will download it in the next step.

In [3]:
# Mount Google Drive (Colab only; skip this cell when running locally)
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Use the absolute path of the mounted folder: a relative chdir only works while
# the current directory is still /content, so re-running this cell would fail.
os.chdir("/content/drive/MyDrive/tensorflow")

Mounted at /content/drive


Next, we download the file if it doesn't already exist.
Finally, open and read it.

In [4]:
import urllib.request

# Download the corpus once; later runs reuse the local copy.
filename = 'bible.txt'
corpus_url = 'https://raw.githubusercontent.com/iannwtf19/iannwtf19/main/Week10/bible.txt'
if not os.path.isfile(filename):
    # Download to a temporary name and rename afterwards, so an interrupted
    # download never leaves a truncated file that passes the isfile() check.
    partial_filename = filename + '.part'
    urllib.request.urlretrieve(corpus_url, partial_filename)
    os.replace(partial_filename, filename)

In [5]:
# Open the input file; the context manager closes the handle once reading is done,
# and the explicit encoding keeps the result independent of the platform default.
with open("bible.txt", encoding="utf-8") as text_file:
    # Read the text file as a string (a sequence of characters)
    corpus = text_file.read()

Let's see what the text looks like

In [6]:
# Preview the beginning of the corpus
preview_length = 200
print(corpus[:preview_length])

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the


## Tokenization
Convert the corpus into a sequence of word tokens. The default settings filter out punctuation and lowercase every token. We set `vocab_size` to 10000, so only the most common words (those with an index below 10000) are kept when the text is converted to index sequences.

In [None]:
# Upper bound on the word indices the tokenizer will emit
vocab_size = 10000
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size, lower=True, split=' ', char_level=False, analyzer=None
)
# Build the vocabulary from the corpus (passed as a single document)
tokenizer.fit_on_texts([corpus])
# Mapping "word -> index"
vocab = tokenizer.word_index
# Mapping "index -> word", used to turn indices back into readable words
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
# The words on their own, in the order they appear in the vocabulary
word_tokens = [token for token in vocab]

In [None]:
# Show the first few "word: index" entries of the vocabulary
first_entries = list(vocab.items())[:10]
print(f'First 10 elements of vocabulary: {first_entries}')

First 10 elements of vocabulary: [('the', 1), ('and', 2), ('of', 3), ('to', 4), ('that', 5), ('in', 6), ('he', 7), ('shall', 8), ('unto', 9), ('for', 10)]


## Generate skip-gram pairs

In [None]:
# Shorthand for the Keras sequence helpers used below
sequence_utils = tf.keras.preprocessing.sequence

# Convert the corpus (a single document) into a list of word indices
corpus_sequences = tokenizer.texts_to_sequences([corpus])
window_size = 2
# Sampling table used by skipgrams() to subsample words
sampling_table = sequence_utils.make_sampling_table(vocab_size)
# Build [target_word_index, context_word_index] pairs. With negative_samples=0
# only positive pairs are produced, so the labels (second return value) are discarded.
positive_skip_grams, _ = sequence_utils.skipgrams(
    corpus_sequences[0],
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=0,
    sampling_table=sampling_table,
)

In [None]:
# Print some positive samples
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(128, 3220): (o, sir)
(26, 511): (was, commandment)
(210, 21): (saw, it)
(2861, 194): (star, time)
(410, 97): (prophets, let)


## Generate dataset

In [None]:
batch_size = 256
skipgram_array = np.array(positive_skip_grams)
# Split the pairs column-wise: first column holds the target word indices,
# second column the matching context word indices
target_index_slices, context_index_slices = skipgram_array[:, 0], skipgram_array[:, 1]
# Each dataset element is one (target_word_index, context_word_index) tuple
skipgram_ds = tf.data.Dataset.from_tensor_slices((target_index_slices, context_index_slices))
# Input pipeline: cache -> shuffle -> fixed-size batches -> prefetch
skipgram_ds = skipgram_ds.cache()
skipgram_ds = skipgram_ds.shuffle(10000)
skipgram_ds = skipgram_ds.batch(batch_size, drop_remainder=True)
skipgram_ds = skipgram_ds.prefetch(tf.data.AUTOTUNE)

# Creating the model
## NCE Layer
First we define a layer that will compute the NCE loss. This layer will be called to get the loss.

In [None]:
class NCELayer(tf.keras.layers.Layer):
    """Layer that owns the NCE output weights/biases and computes the NCE loss.

    Args:
        vocab_size: Number of classes (rows of the NCE weight matrix).
        embedding_size: Dimensionality of the embeddings fed into the layer.
        num_neg_samples: Number of negative classes sampled per batch.
        **kwargs: Standard Keras layer arguments (e.g. ``name``), forwarded to
            the base class.
    """

    def __init__(self, vocab_size, embedding_size, num_neg_samples, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_neg_samples = num_neg_samples

    def build(self, input_shape):
        """Create the NCE weight matrix and bias vector."""
        self.nce_weights = self.add_weight(
            name="nce_weights",
            shape=(self.vocab_size, self.embedding_size),
            initializer="glorot_normal",
        )

        self.nce_bias = self.add_weight(
            name="nce_biases", shape=(self.vocab_size,), initializer="zeros"
        )
        super().build(input_shape)

    def call(self, data):
        """Compute the mean NCE loss for one batch.

        Args:
            data: Tuple ``(true_classes, embeddings)``; ``true_classes`` holds the
                context word indices and ``embeddings`` the embedded target words.

        Returns:
            Scalar tensor with the NCE loss averaged over the batch. The same
            value is also registered through ``add_loss``.
        """
        true_classes, embeddings = data

        # Keyword arguments guard against silently swapping the six positional
        # parameters of tf.nn.nce_loss.
        per_example_loss = tf.nn.nce_loss(
            weights=self.nce_weights,
            biases=self.nce_bias,
            # nce_loss expects labels with one column per true class
            labels=tf.reshape(true_classes, (-1, 1)),
            inputs=embeddings,
            num_sampled=self.num_neg_samples,
            num_classes=self.vocab_size,
        )
        loss = tf.reduce_mean(per_example_loss)

        self.add_loss(loss)

        return loss

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "embedding_size": self.embedding_size,
            "num_neg_samples": self.num_neg_samples,
        })
        return config

## NLP Model
The full model with an embedding layer and an NCE layer. The embedding layer will hold the word embeddings.

In [None]:
class NLPModel(tf.keras.Model):
    """Skip-gram model: an embedding layer trained through an NCE loss layer.

    Args:
        vocab_size: Number of rows of the embedding matrix and of the NCE weights.
        embedding_size: Dimensionality of each word embedding.
        num_neg_samples: Number of negative samples used by the NCE loss.
        optimizer: Optimizer used by ``train_step`` to apply the gradients.
    """

    def __init__(self, vocab_size, embedding_size, num_neg_samples, optimizer):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_neg_samples = num_neg_samples
        self.optimizer = optimizer

        self.metrics_list = [tf.keras.metrics.Mean(name="loss")]

        # `input_length` is deprecated and is not needed for the lookup itself,
        # so it is no longer passed.
        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.nce_layer = NCELayer(vocab_size, embedding_size, num_neg_samples)

    def call(self, inputs):
        """Return the embedding vectors for the given word indices."""
        return self.embedding_layer(inputs)

    @tf.function
    def train_step(self, data):
        """Run one optimization step on a batch of (target, context) index pairs.

        Returns:
            Dict mapping each metric name to its current result.
        """
        target_word_indices, context_word_indices = data
        with tf.GradientTape() as tape:
            embeddings = self.embedding_layer(target_word_indices)
            loss = self.nce_layer((context_word_indices, embeddings))

        # Calculate & apply gradients, update metrics
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        self.metrics[0].update_state(loss)

        return {metric.name: metric.result() for metric in self.metrics}

    def validate_step(self, val_indices):
        """Cosine similarity between the given words and every embedding row.

        Args:
            val_indices: Word indices to compare against the embedding matrix.

        Returns:
            Tensor with one row per entry of ``val_indices`` and one column per
            row of the embedding matrix.
        """
        # Get embeddings for the given validation words
        val_embeddings = self.embedding_layer(val_indices)
        normalized_val_embeddings = tf.nn.l2_normalize(val_embeddings, axis=1)

        # The full embedding matrix is the weight variable of the embedding layer;
        # reading it directly avoids the numpy round-trip of get_weights().
        embeddings = self.embedding_layer.embeddings
        normalized_embeddings = tf.nn.l2_normalize(embeddings, axis=1)

        # With unit-length rows the dot product equals the cosine similarity
        similarities = tf.matmul(normalized_val_embeddings, normalized_embeddings, transpose_b=True)

        return similarities

    @property
    def metrics(self):
        """Metrics tracked during training."""
        return self.metrics_list

    def reset_metrics(self):
        """Reset all tracked metrics (called at the end of every epoch)."""
        for metric in self.metrics:
            # reset_state() replaces the deprecated reset_states()
            metric.reset_state()

# Training Loop

In [None]:
def training_loop(ds, model, epochs, test_indices, summary_writer, k=5, index_to_word=None):
    """Train the model and print the nearest neighbours of the test words per epoch.

    Args:
        ds: Iterable of training batches accepted by ``model.train_step``.
        model: Model providing ``train_step``, ``validate_step``, ``metrics``
            and ``reset_metrics``.
        epochs: Number of passes over ``ds``.
        test_indices: Word indices whose closest neighbours are printed.
        summary_writer: Summary writer that receives the epoch metrics.
        k: Number of neighbours printed per test word.
        index_to_word: Mapping from word index to word; defaults to the
            notebook-level ``inverse_vocab``.
    """
    if index_to_word is None:
        index_to_word = inverse_vocab

    # The validation indices never change, so build the tensor once
    test_data = tf.constant(test_indices, dtype=tf.int32)

    for epoch in range(epochs):
        print(f"Epoch #{epoch}: ")

        # Stays empty if the dataset yields no batches
        metrics = {}
        for data in tqdm.tqdm(ds):
            metrics = model.train_step(data)

        # Log once per epoch: the metrics are running means and the step is the
        # epoch number, so writing them on every batch only duplicates points.
        with summary_writer.as_default():
            for metric in model.metrics:
                tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

        # Print training metrics for the epoch
        print([f"{key}: {value.numpy()}" for (key, value) in metrics.items()])

        model.reset_metrics()

        # Calculate cosine similarities
        similarities = model.validate_step(test_data).numpy()
        for i, word_index in enumerate(test_indices):
            word = index_to_word[word_index]
            # Negate to rank the indices by descending similarity
            ranked_indices = np.argsort(-similarities[i])
            top_k_words = []
            for candidate in ranked_indices:
                candidate = int(candidate)
                # Skip the word itself and indices without a word (e.g. the
                # reserved index 0), which would otherwise raise a KeyError
                if candidate == word_index or candidate not in index_to_word:
                    continue
                top_k_words.append(index_to_word[candidate])
                if len(top_k_words) == k:
                    break
            print(f'Closest neighbors of {word}: {top_k_words}')

In [None]:
def create_summary_writer(config_name):
    """Create a summary file writer logging to a timestamped run directory.

    Args:
        config_name: Name of the configuration, used as a sub-directory of
            ``logs/skipgram``.

    Returns:
        The writer returned by ``tf.summary.create_file_writer``.
    """
    now = datetime.datetime.now()
    current_time = now.strftime("%Y%m%d-%H%M%S")
    return tf.summary.create_file_writer(f"logs/skipgram/{config_name}/{current_time}")

# Training

In [None]:
# create a tf data set with the indices of our validation words
test_words = ["holy", "father", "wine", "poison", "love", "night", "day"]
test_indices = [vocab[test_word] for test_word in test_words]

vocabulary_size = len(vocab)
embedding_size = 64
epochs = 15
num_negative_samples = 4

optimizer = tf.optimizers.Adam(0.001)
summary_writer = create_summary_writer(config_name=f'RUN')

model = NLPModel(optimizer=optimizer,
                 embedding_size=embedding_size,
                 vocab_size=vocabulary_size,
                 num_neg_samples=num_negative_samples)

training_loop(skipgram_ds, model, epochs, test_indices, summary_writer)

Epoch #0: 


  0%|          | 0/1407 [00:00<?, ?it/s]2023-02-13 20:37:40.774881: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
100%|██████████| 1407/1407 [00:21<00:00, 64.98it/s] 


['loss: 21.354496002197266']
Closest neighbors of holy: ['also', 'fire', 'men', 'moses', 'they']
Closest neighbors of father: ['these', '27', 'neither', 'my', '8']
Closest neighbors of wine: ['through', 'but', 'heaven', 'cloud', 'who']
Closest neighbors of poison: ['own', 'throne', 'before', 'caused', 'sought']
Closest neighbors of love: ['let', '13', 'on', 'thine', 'that']
Closest neighbors of night: ['cry', 'fear', 'hast', 'faith', 'our']
Closest neighbors of day: ['12', 'thy', 'this', 'an', '36']
Epoch #1: 


100%|██████████| 1407/1407 [00:13<00:00, 101.94it/s]


['loss: 18.73204231262207']
Closest neighbors of holy: ['also', 'jews', 'wilderness', 'fire', 'side']
Closest neighbors of father: ['fathers', '27', 'these', '8', '33']
Closest neighbors of wine: ['through', 'cloud', 'offered', 'hundred', 'length']
Closest neighbors of poison: ['afar', 'selah', 'caesar', 'recorder', 'chamber']
Closest neighbors of love: ['own', 'let', 'cast', 'old', 'yet']
Closest neighbors of night: ['faith', 'fear', 'even', 'offered', 'strength']
Closest neighbors of day: ['earth', 'people', 'name', 'king', 'court']
Epoch #2: 


100%|██████████| 1407/1407 [00:13<00:00, 104.63it/s]


['loss: 16.837984085083008']
Closest neighbors of holy: ['jews', 'wilderness', 'side', 'sword', 'border']
Closest neighbors of father: ['fathers', 'burnt', 'own', '33', 'silver']
Closest neighbors of wine: ['through', 'five', 'offered', 'offerings', 'length']
Closest neighbors of poison: ["badgers'", 'selah', 'pound', 'recorder', 'afar']
Closest neighbors of love: ['own', 'prosper', 'wherewith', 'going', 'destroy']
Closest neighbors of night: ['strength', 'fear', 'faith', 'trees', 'offered']
Closest neighbors of day: ['land', 'law', 'earth', 'king', 'people']
Epoch #3: 


100%|██████████| 1407/1407 [00:12<00:00, 109.88it/s]


['loss: 15.496509552001953']
Closest neighbors of holy: ['angel', 'east', 'border', 'wilderness', 'jews']
Closest neighbors of father: ['fathers', 'chariots', 'strength', 'own', 'righteousness']
Closest neighbors of wine: ['through', 'five', 'offered', 'silver', 'beast']
Closest neighbors of poison: ["badgers'", 'weapons', 'pound', 'selah', 'perdition']
Closest neighbors of love: ['wherewith', 'nor', 'prosper', 'lovingkindness', 'yet']
Closest neighbors of night: ['shields', 'trees', 'fear', 'strength', 'hittite']
Closest neighbors of day: ['law', 'king', 'earth', 'name', 'door']
Epoch #4: 


100%|██████████| 1407/1407 [00:13<00:00, 107.16it/s]


['loss: 14.040384292602539']
Closest neighbors of holy: ['east', 'jews', 'other', 'river', 'poor']
Closest neighbors of father: ['strength', 'chariots', 'fathers', 'anger', 'righteousness']
Closest neighbors of wine: ['oil', 'pharisees', 'five', 'beast', 'spoil']
Closest neighbors of poison: ["badgers'", 'gourd', 'bani', 'mary', 'notwithstanding']
Closest neighbors of love: ['wherewith', 'lovingkindness', 'also', 'but', 'or']
Closest neighbors of night: ['shields', 'even', 'trees', 'order', 'belly']
Closest neighbors of day: ['law', 'month', 'wilderness', 'door', 'morning']
Epoch #5: 


100%|██████████| 1407/1407 [00:14<00:00, 95.84it/s] 


['loss: 12.850244522094727']
Closest neighbors of holy: ['fire', 'wine', 'other', 'evening', 'canaanites']
Closest neighbors of father: ['strength', 'wrath', 'mother', 'fathers', 'brother']
Closest neighbors of wine: ['beast', 'oil', 'pharisees', 'ass', 'houses']
Closest neighbors of poison: ['bani', "badgers'", 'gourd', 'carved', 'unfruitful']
Closest neighbors of love: ['also', 'wherewith', 'caused', 'but', 'forget']
Closest neighbors of night: ['order', 'even', 'young', 'trees', 'knowledge']
Closest neighbors of day: ['waters', 'morning', 'gentiles', 'lord', 'law']
Epoch #6: 


100%|██████████| 1407/1407 [00:13<00:00, 106.45it/s]


['loss: 11.992533683776855']
Closest neighbors of holy: ['wine', 'fire', 'goodness', 'corn', 'reubenites']
Closest neighbors of father: ['brother', 'mother', 'wrath', 'strength', 'face']
Closest neighbors of wine: ['beast', 'ass', 'houses', 'oil', 'pharisees']
Closest neighbors of poison: ['bani', 'gall', 'ittai', 'loss', 'gourd']
Closest neighbors of love: ['also', 'thought', 'caused', 'scorn', 'forget']
Closest neighbors of night: ['order', 'famine', 'giving', 'young', 'knowledge']
Closest neighbors of day: ['time', 'morning', 'lord', 'gentiles', 'waters']
Epoch #7: 


100%|██████████| 1407/1407 [00:13<00:00, 102.76it/s]


['loss: 11.064269065856934']
Closest neighbors of holy: ['fast', 'trees', 'wine', 'lamps', 'corn']
Closest neighbors of father: ['mother', 'brother', 'wrath', 'neighbour', 'servant']
Closest neighbors of wine: ['shemaiah', 'ass', 'pharisees', 'women', 'beast']
Closest neighbors of poison: ['bani', 'unfruitful', 'encouraged', 'declaration', 'gileadites']
Closest neighbors of love: ['also', 'thought', 'caused', 'testify', 'wherewith']
Closest neighbors of night: ['famine', 'twilight', 'order', 'giving', 'remained']
Closest neighbors of day: ['time', 'chaldeans', 'lord', 'morning', 'waters']
Epoch #8: 


100%|██████████| 1407/1407 [00:13<00:00, 105.50it/s]


['loss: 10.462347030639648']
Closest neighbors of holy: ['fleshhooks', 'lamps', 'babylonians', 'coupling', 'firepans']
Closest neighbors of father: ['mother', 'brother', 'neighbour', 'wrath', 'servant']
Closest neighbors of wine: ['honey', 'fleshhooks', 'shemaiah', 'women', 'ass']
Closest neighbors of poison: ['bani', 'cage', 'lentiles', 'organ', 'keys']
Closest neighbors of love: ['also', 'thought', 'caused', 'wherewith', 'formed']
Closest neighbors of night: ['order', 'twilight', 'assyrian', 'remained', 'famine']
Closest neighbors of day: ['time', 'chaldeans', 'stranger', 'place', 'lord']
Epoch #9: 


100%|██████████| 1407/1407 [00:13<00:00, 104.00it/s]


['loss: 9.547640800476074']
Closest neighbors of holy: ['fleshhooks', 'spoiler', 'abomination', 'jezreelitess', 'queen']
Closest neighbors of father: ['mother', 'brother', 'neighbour', 'wrath', 'servants']
Closest neighbors of wine: ['honey', 'trumpeters', 'soothsayers', 'shemaiah', 'dry']
Closest neighbors of poison: ['keys', 'lentiles', 'organ', 'grape', 'shimri']
Closest neighbors of love: ['also', 'thought', 'wherewith', 'caused', 'direct']
Closest neighbors of night: ['twilight', 'fly', 'due', 'gardens', 'hedges']
Closest neighbors of day: ['time', 'testimony', 'chaldeans', 'hill', 'word']
Epoch #10: 


100%|██████████| 1407/1407 [00:13<00:00, 106.35it/s]


['loss: 9.301558494567871']
Closest neighbors of holy: ['spoiler', 'abomination', 'fleshhooks', 'queen', 'windows']
Closest neighbors of father: ['brother', 'mother', 'servants', 'wrath', 'head']
Closest neighbors of wine: ['honey', 'trumpeters', 'carpenters', 'soothsayers', 'passengers']
Closest neighbors of poison: ['keys', 'grape', 'revenue', 'bani', 'argob']
Closest neighbors of love: ['also', 'wherewith', 'caused', 'thought', 'acknowledge']
Closest neighbors of night: ['due', 'fly', 'remained', 'diligence', 'twilight']
Closest neighbors of day: ['time', 'word', 'testimony', 'stranger', 'chaldeans']
Epoch #11: 


100%|██████████| 1407/1407 [00:14<00:00, 97.78it/s] 


['loss: 8.841202735900879']
Closest neighbors of holy: ['spoiler', 'abomination', 'disobedient', 'windows', 'gardens']
Closest neighbors of father: ['mother', 'brother', 'servants', 'neighbour', 'name']
Closest neighbors of wine: ['honey', 'carpenters', 'soothsayers', 'trumpeters', 'wizards']
Closest neighbors of poison: ['keys', 'pen', 'barrel', 'barachel', 'fowler']
Closest neighbors of love: ['also', 'wherewith', 'acknowledge', 'thought', 'caused']
Closest neighbors of night: ['fly', 'due', 'dwellest', 'hedges', 'remained']
Closest neighbors of day: ['time', 'testimony', 'stranger', 'word', 'place']
Epoch #12: 


100%|██████████| 1407/1407 [00:14<00:00, 99.48it/s] 


['loss: 8.26272201538086']
Closest neighbors of holy: ['spoiler', 'abomination', 'poor', 'windows', 'disobedient']
Closest neighbors of father: ['mother', 'brother', 'servants', 'neighbour', 'name']
Closest neighbors of wine: ['honey', 'carpenters', 'trumpeters', 'soothsayers', 'wizards']
Closest neighbors of poison: ['keys', 'eliam', 'jawbone', 'barrel', 'barachel']
Closest neighbors of love: ['also', 'command', 'hear', 'formed', 'acknowledge']
Closest neighbors of night: ['famine', 'hedges', 'due', 'fly', 'dwellest']
Closest neighbors of day: ['time', 'testimony', 'word', 'stranger', 'place']
Epoch #13: 


100%|██████████| 1407/1407 [00:14<00:00, 98.62it/s] 


['loss: 7.806337356567383']
Closest neighbors of holy: ['spoiler', 'poor', 'abomination', 'windows', 'vintage']
Closest neighbors of father: ['brother', 'mother', 'servants', 'neighbour', 'name']
Closest neighbors of wine: ['wizards', 'carpenters', 'honey', 'ashdod', 'trumpeters']
Closest neighbors of poison: ['eliam', 'jawbone', 'keys', 'barrel', 'errors']
Closest neighbors of love: ['also', 'command', 'hear', 'acknowledge', 'wherewith']
Closest neighbors of night: ['gardens', 'hedges', 'dwellest', 'bethel', 'famine']
Closest neighbors of day: ['time', 'testimony', 'word', 'chaldeans', 'trusteth']
Epoch #14: 


100%|██████████| 1407/1407 [00:14<00:00, 97.56it/s] 

['loss: 7.411426067352295']
Closest neighbors of holy: ['spoiler', 'vintage', 'abomination', 'youngest', 'poor']
Closest neighbors of father: ['brother', 'mother', 'servants', 'name', 'servant']
Closest neighbors of wine: ['wizards', 'madness', 'households', 'lud', 'carpenters']
Closest neighbors of poison: ['fowler', 'jawbone', 'eliam', 'wedge', 'navy']
Closest neighbors of love: ['also', 'command', 'acknowledge', 'tempt', 'hear']
Closest neighbors of night: ['dwellest', 'bethel', 'gardens', 'hedges', 'twilight']
Closest neighbors of day: ['time', 'testimony', 'word', 'are', 'remaineth']



