# Assignment 4: Word Embedding Network
In this assignment you will practice how to create a Word Embedding Network in Tensorflow 2.0. First, you will finish some functions to parse the data, build the corpus and construct the skip pair. Then, you will construct a word embedding network by follow the specific requirements and architectures. Finally, you will train the network and visualize the result.

In [1]:
import operator
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from tensorflow.keras.layers import Dense, Flatten, Conv2D, BatchNormalization
from tensorflow.keras import Model

# split_sentence
1. Remove the special characters from the sentence
2. Filter the short sentence

You can rewrite this function or add more filter conditions

In [2]:
def split_sentence(sentences):
    new_sentence = list()
    for sentence in sentences:
        sentence = sentence.replace("\n", " ")
        sentence = sentence.replace(",", "")
        sentence = sentence.replace("\'", " ")
        sentence = sentence.replace("?", "")
        sentence = sentence.replace("!", "")
        sentence = sentence.replace(";", "")
        sentence = sentence.lower()
        if sentence.count(" ") <= 3:
            continue
        new_sentence.append(sentence)
    return new_sentence

Extract all the sentences from the input file. Split the input into each sentence by calling the "split_sentence" function.

- test_doc_short: Small dataset. You can use it to debug your code.

- test_doc_long: Large dataet. You should use it to get the final result.

In [3]:
file = open("test_doc_short.txt",'r')
raw_data_1 = file.read()
file.close()
sentences = raw_data_1.split(".")
print (len(sentences))
corpus_raw = list()                  
corpus_raw = split_sentence(sentences)
print (corpus_raw[:3])

7
['he is the king ', '  he is the royal king ', '  the king is royal ']


# build_dictionary (10 points)
1. Extract the word from the input. 
2. Build a non-duplicate word dictionary.

In [4]:
def build_dictionary(corpus_raw):
    words = []
    # TO DO
    
    # larger set that contains all the non-repeated words so far
    tempset = set([])
    
    for line in corpus_raw:
        temp = line.split(" ")
        for word in temp:
            # single set that only contains the current word
            singleset = set([])
            if word != "":
                singleset.add(word)
                # if single set is a subset of the larger set, means repeated, ignore
                if singleset.issubset(tempset):
                    continue
                else:
                    tempset.add(word)
                    words.append(word)

    # END TO DO
    return set(words)

In [5]:
corpus_dict = build_dictionary(corpus_raw)

- The number of word in "test_doc_long" dataset is around 7.
- The number of word in "test_doc_long" dataset is around 1831.

In [6]:
print (len(corpus_dict))

7


# one_hot_encoding (10 points)
1. Every word is represented as a vector containing 1 at its position in the vocabulary.

In [7]:
def one_hot_encoding(data_point_index, vocab_size):
    # TO DO

    # only the word index position is 1
    temp = np.zeros(vocab_size, dtype = int)
    temp[data_point_index] = 1


    # END TO DO
    return temp

# build_word_index_mapping (10 points)
1. Given a word, the function should return the index of this word in dictionary.
2. Given an index, the function should retrieve the word.

In [8]:
def build_word_index_mapping(corpus_dict):   
    # TO DO

    ind_2_word = []
    value_list = []
    index = 0
    
    # save words in ind_2_word, value_list is the list of indexes
    for word in corpus_dict:
        ind_2_word.append(word)
        value_list.append(index)
        index = index + 1
    
    # key is the word, value is the index
    word_2_ind = dict((key, value) for (key, value) in zip(ind_2_word, value_list))


    # END TO DO
    return word_2_ind, ind_2_word

In [9]:
word_2_ind, ind_2_word = build_word_index_mapping(corpus_dict)
print(corpus_dict)

{'the', 'royal', 'she', 'is', 'he', 'king', 'queen'}


Example Output:
    
1831
1831
1504

In [10]:
print (len(word_2_ind))
print (len(ind_2_word))
print (word_2_ind['he'])

7
7
4


In [11]:
sentences = corpus_raw

In [12]:
print(sentences[:3])

['he is the king ', '  he is the royal king ', '  the king is royal ']


In [13]:
WINDOW_SIZE = 2

# build_skip_pair (10 points)
1. Build the word pair with given window size.

In [14]:
def build_skip_pair(window_size, sentences):
    # TO DO

    data = []
    
    for line in sentences:
        
        words = []
        temp = line.split(" ")
        
        # words in one sentence
        for word in temp:
            if word != "":
                words.append(word)
                
        # build pairs for one sentence
        length = len(words)
        for i in range(length):
            pair = []
            for j in range(window_size):
                if (i-(window_size-j)) >= 0:
                    pair.append(words[i])
                    pair.append(words[i-(window_size-j)])
                    data.append(pair)
                    pair = []
            
            for j in range(window_size):
                if (i+(j+1)) < length:
                    pair.append(words[i])
                    pair.append(words[i+(j+1)])
                    data.append(pair)
                    pair = []


    # END TO DO
    return data

Example Input:

He is the king .

Example Output: 

[['he', 'is'], ['he', 'the'], ['is', 'he'], ['is', 'the'], ['is', 'king']]

In [15]:
data = build_skip_pair(WINDOW_SIZE, sentences)
print (len(data))
print (data[:5])

68
[['he', 'is'], ['he', 'the'], ['is', 'he'], ['is', 'the'], ['is', 'king']]


# build_train_data_label
1. Iterate all the word pairs in data
2. Construct the train and label data

In [16]:
def build_train_data_label(data, word_2_ind, vocab_size):
    x_train = [] 
    y_train = [] 
    for data_word in data:
        x_train.append(one_hot_encoding(word_2_ind[ data_word[0] ], vocab_size))
        y_train.append(one_hot_encoding(word_2_ind[ data_word[1] ], vocab_size))
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    return x_train, y_train

In [17]:
x_train, y_train = build_train_data_label(data, word_2_ind, len(corpus_dict))

Preprocess the dataset with batch size 8.

In [18]:
print(x_train.shape, y_train.shape)

dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))

dataset = dataset.shuffle(100).batch(8)

(68, 7) (68, 7)


# MyEmbeddingModel (20 points)
1. init: Define all the layers you will use in the embedding network.
2. call: Define the network layer connectivity:
           - Fully connected with embedding_size/2 hidden neurons
           - Batchnormalization (optional)
           - Relu activation (optional)
           - Fully connected with embedding_size hidden neurons (This should be the word embedding output)
           - Batchnormalization (optional)
           - Relu activation (optional)
           - Fully connected that map to vocab_size output classes
           - Softmax (This should be the classification output)

In [19]:
class MyEmbeddingModel(Model):
  def __init__(self, embedding_size, vocab_size):
    super(MyEmbeddingModel, self).__init__()
    #Example:
    
    # TO DO
    
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.d2 = Dense(embedding_size)
    self.d3 = Dense(vocab_size, activation = 'softmax')
    
    # build embeddings
    self.embeddings = tf.Variable(tf.random.uniform([vocab_size, embedding_size], -1.0, 1.0))

    # END TO DO

  def call(self, x):
    #Example:
        
    # TO DO
    
    x_2 = self.d2(x)
    x_3 = self.d3(x_2)

    # END TO DO
    return x_2, x_3 
    

In [20]:
embedding_size = 300
vocab_size = len(corpus_dict)
model = MyEmbeddingModel(embedding_size, vocab_size)
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=False)

# Optimizer (10 points)
- Implement the SGD optimizer
- Implement the RMSprop optimizer
- Implement the Adagrad optimizer
- Implement the Adadelta optimizer
- Implement the Adam optimizer (Use the Adam optimizer for training)

In [21]:
# Example: 
# optimizer = tf.keras.optimizers.Adamax()

# optimizer = #SGD optimizer
# optimizer = tf.keras.optimizers.SGD()

# optimizer = #RMSprop optimizer
# optimizer = tf.keras.optimizers.RMSprop()

# optimizer = #Adagrad optimizer
# optimizer = tf.keras.optimizers.Adagrad()

# optimizer = #Adadelta optimizer
# optimizer = tf.keras.optimizers.Adadelta()

# optimizer = #Adam optimizer
optimizer = tf.keras.optimizers.Adam()

Define the training step. Calculate the loss and optimize the weights.

In [22]:
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:

        _, predictions = model(inputs, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss = tf.reduce_mean(loss)

    return train_loss, labels, predictions

Example Output:

    Epoch 0, Loss: 6.393890857696533
    Epoch 1, Loss: 5.391612529754639
    Epoch 2, Loss: 4.996224403381348
    Epoch 3, Loss: 4.692948341369629
    Epoch 4, Loss: 4.473527908325195
    Epoch 5, Loss: 4.335629940032959
    Epoch 6, Loss: 4.251341342926025
    Epoch 7, Loss: 4.205460071563721
    Epoch 8, Loss: 4.172143936157227
    Epoch 9, Loss: 4.1499714851379395
    Epoch 10, Loss: 4.129685878753662
    ......

In [23]:
EPOCHS = 50

for epoch in range(EPOCHS):
    batch_loss = 0.0
    num_batch = 0
    for (batch, (inputs, labels)) in enumerate(dataset):
        train_loss, labels, predictions = train_step(inputs, labels)
        batch_loss += train_loss
        num_batch += 1
    template = 'Epoch {}, Loss: {}'
    print(template.format(epoch, batch_loss/num_batch))

Epoch 0, Loss: 1.9169862270355225
Epoch 1, Loss: 1.8538099527359009
Epoch 2, Loss: 1.783963918685913
Epoch 3, Loss: 1.7223995923995972
Epoch 4, Loss: 1.6854405403137207
Epoch 5, Loss: 1.6859817504882812
Epoch 6, Loss: 1.6435712575912476
Epoch 7, Loss: 1.5880597829818726
Epoch 8, Loss: 1.5867924690246582
Epoch 9, Loss: 1.5509724617004395
Epoch 10, Loss: 1.5349841117858887
Epoch 11, Loss: 1.5314093828201294
Epoch 12, Loss: 1.5248355865478516
Epoch 13, Loss: 1.537987232208252
Epoch 14, Loss: 1.486838459968567
Epoch 15, Loss: 1.4981067180633545
Epoch 16, Loss: 1.4918311834335327
Epoch 17, Loss: 1.459812879562378
Epoch 18, Loss: 1.4931857585906982
Epoch 19, Loss: 1.4732184410095215
Epoch 20, Loss: 1.4576481580734253
Epoch 21, Loss: 1.4823317527770996
Epoch 22, Loss: 1.486828088760376
Epoch 23, Loss: 1.4974586963653564
Epoch 24, Loss: 1.4479212760925293
Epoch 25, Loss: 1.4379048347473145
Epoch 26, Loss: 1.4544243812561035
Epoch 27, Loss: 1.4782058000564575
Epoch 28, Loss: 1.4226057529449463


# build_embedding_dict (10 points)
1. Iterate the corpus_dict and generate the embedding for each word.
2. Use the trained model to generate the word embedding with given one-hot embedding word.
3. Store the word and embedding in a dictionary. The key should be the word. The value should be the embedding vector.

In [24]:
def build_embedding_dict(model, corpus_dict):
    embeddings = dict()
    # TO DO
    
    for word in corpus_dict:
        word_id = word_2_ind[word]
        # get embeddings for the current word
        embed = model.embeddings[word_id]
        # save it in the dictionary
        embeddings[word] = embed

    # example (use the trained model to generate the word embedding with given one-hot embedding word): sample_embedding, _ = model(sample_one_hot)

    # END TO DO
    return embeddings

# euclidean_dist_np (10 points)
1. Calculate the Euclidean distance between two input vectors.

In [25]:
def euclidean_dist_np(vec1, vec2):
    dist = 0.0
    # TO DO

    # calculate the euclidean distance
    dist = np.linalg.norm(vec1-vec2)

    # END TO DO
    return dist

# find_closest (10 points)
1. Calculate the euclidean distance between the given word and all the words in embedding dictionary.
2. Sort the dictionary by value in ascending order.
3. Return the first three closet words.

In [26]:
def find_closest(word, embeddings):
    result = dict()
    # TO DO
    
    # save the euclidean distance between the current word's embeds and the target word's embeds in the dictionary
    for key in embeddings:
        result[key] = euclidean_dist_np(embeddings[key], embeddings[word])
    
    # sort the dictionary by value
    result_list = sorted(result.items(), key=lambda x: x[1]) 
    
    # END TO DO
    return result_list[:3]

Example Output: 

[('she', 0.0), ('he', 5.3993783), ('they', 5.7223315)]

In [27]:
embedding_dict = build_embedding_dict(model, corpus_dict)
print(find_closest('she', embedding_dict))

[('she', 0.0), ('king', 14.313944), ('he', 14.318885)]


# visualize_cluster
1. Visualize the word embedding in 2D space

In [None]:
def visualize_cluster(embedding_dict): 
    labels = []
    tokens = []
    for w in embedding_dict.keys():
        labels.append(w)
        tokens.append(embedding_dict[w])
    tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
    plt.show()

In [None]:
visualize_cluster(embedding_dict)