# Evaluate Models

**Outline**

1. Read the top 2 word embeddings only
2. Plot epoch_losses, train_accuracies, train_mses, test_accuracies, test_mses
3. Using the best model, select 2 pairs of most similar words
    * plot t-SNE
    * plot PCA
4. Pass 2 target words through 1 forward pass of the skip-gram model
    * select top 6 context words

| Model Used | Num samples used | Num of Neg Samples | Train Accuracy | Test Accuracy | Train MSE | Test MSE |
|:-----------|:-----------------|:-------------------|:---------------|:--------------|:----------|:---------|
| SGNS_1 | 1M - 100 epochs, 10,000 iter | 5 | 

In [None]:
import numpy as np
import pandas as pd
import tqdm
import jax
import jax.numpy as jnp
import string
import tensorflow as tf
import time
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

# import libraries relevant to plotting (t-SNE)
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

#### Define all the functions

In [None]:
# define cosine similarity scores between 2 word vectors
def similarity_score(target_word_embedding, context_word_embedding):
    """Return the cosine similarity between two embedding vectors.

    Args:
        target_word_embedding: embedding vector of the first word.
        context_word_embedding: embedding vector of the second word.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms, or 0.0 when that product of norms is zero.
    """
    norm_product = (
        np.linalg.norm(target_word_embedding)
        * np.linalg.norm(context_word_embedding)
    )
    if norm_product == 0:
        # Cosine similarity is undefined for a zero vector. Returning 0.0
        # instead of dividing by zero avoids nan scores (and a RuntimeWarning)
        # that would make any ranking built on these scores unreliable.
        return 0.0
    return np.dot(target_word_embedding, context_word_embedding) / norm_product

# define a function that finds the words most similar to a given word
def most_similar_words(word, V, n=5):
    """Rank the vocabulary by cosine similarity to ``word``.

    Args:
        word: key into the module-level ``vocab`` mapping.
        V: 2-D embedding matrix indexed as ``V[:, word_index]`` (one column
            per word).
        n: maximum number of results to return.

    Returns:
        Up to ``n`` ``(word, score)`` tuples, highest score first. The query
        word itself and the ``'<pad>'`` token are excluded.
    """
    query_idx = vocab[word]
    ranked = [
        (inverse_vocab[col], similarity_score(V[:, query_idx], V[:, col]))
        for col in range(V.shape[1])
        if col != query_idx and inverse_vocab[col] != '<pad>'
    ]
    # order by the score component, best match first
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
# compute a forward pass through the skip-gram model
# define function that takes in an index and vocab size and returns the one-hot encoding
def getOneHot(index, vocab_size):
    """Return a float vector of length ``vocab_size`` that is 1 at ``index`` and 0 elsewhere."""
    encoding = np.zeros(shape=vocab_size, dtype=np.float64)
    encoding[index] = 1.0
    return encoding

# define softmax function
def softmax(x):
    """Compute softmax values for each set of scores in x.

    The normalisation runs along axis 0, so a 1-D input is treated as one set
    of scores and each column of a 2-D input is normalised independently.

    Args:
        x: array-like of scores.

    Returns:
        Array of the same shape as ``x`` whose entries along axis 0 sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalise; np.max would raise on an empty array
        return np.exp(x)
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and the ratio from
    # becoming nan) when the scores are large.
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)

# define the forward pass function
def net(V, U, target_word_idx):
    """Run one skip-gram forward pass for a single target word.

    Args:
        V: embedding matrix; column ``target_word_idx`` is used as the target
            word's embedding (presumably shaped (embedding_dim, vocab_size) —
            confirm against the training code).
        U: matrix multiplied onto the selected embedding (presumably shaped
            (vocab_size, embedding_dim) — confirm).
        target_word_idx: column index of the target word in ``V``.

    Returns:
        ``softmax(U @ V[:, target_word_idx])``.
    """
    # Multiplying V by a one-hot vector only selects one column, and
    # `U @ V @ onehot` is evaluated left to right, so it first materialises
    # the whole U @ V product. Indexing the column directly gives the same
    # scores without that intermediate matrix.
    return softmax(U @ V[:, target_word_idx])

def predict(word, V, U, n=5):
    """Return the words the model scores highest for a target word.

    Args:
        word: key into the module-level ``vocab`` mapping.
        V: embedding matrix passed through to ``net``.
        U: output matrix passed through to ``net``.
        n: how many words to return (defaults to 5, the previous fixed value).

    Returns:
        List of up to ``n`` words from ``inverse_vocab``, highest score first.
    """
    target_word_idx = vocab[word]
    # y_hat is net()'s softmax output: one score per output index
    y_hat = net(V, U, target_word_idx)
    # argsort is ascending, so reverse it and keep the first n indices;
    # reversing before slicing also behaves sensibly for n == 0
    top_n = np.argsort(y_hat)[::-1][:n]
    return [inverse_vocab[i] for i in top_n]

In [None]:
# read vocab and inverse vocab json files from data/processed_data
import json
with open('./data/processed_data/vocab.json', encoding='utf-8') as f:
    vocab = json.load(f)

with open('./data/processed_data/inverse_vocab.json', encoding='utf-8') as f:
    inverse_vocab = json.load(f)

# JSON object keys are always strings, but the functions in this notebook look
# words up by integer index (inverse_vocab[i]). Restore integer keys when the
# file holds an object; a JSON array is already indexable by int.
if isinstance(inverse_vocab, dict):
    inverse_vocab = {int(idx): token for idx, token in inverse_vocab.items()}

# read top 2 word embeddings
# NOTE(review): later cells use V_trained and U_trained, but nothing visible
# in this notebook assigns them — they presumably need to be loaded here.

# read their corresponding results

# check shape of V and U and results

In [None]:
# select n random words from the vocab list
n = 5
random_words = np.random.choice(list(vocab.keys()), n)
print(f'Random words: {random_words}')

# get the vocab indices of the random words
random_words_idx = [vocab[word] for word in random_words]

# find the most similar words to each random word
# most_similar_words() expects the word itself (it does the vocab[word] lookup
# internally), so pass the word rather than its index
for word in random_words:
    print(f"Most similar word to '{word}': {most_similar_words(word, V_trained)}")

In [None]:
# plot t-SNE
# V_trained is transposed so that each row handed to t-SNE is one word
# (the rest of the notebook indexes words as columns: V[:, idx])
tsne = TSNE(n_components=2, random_state=0)
V_trained_tsne = tsne.fit_transform(V_trained.T)

# plot only selected words in the list
target_words = ['produced', 'lie', 'chromecast', 'evolved', 'closed']
# most_similar_words() takes the word (not its index) and returns
# (word, score) pairs sorted best first; keep only the top match's word
similar_words = [most_similar_words(word, V_trained, n=1)[0][0] for word in target_words]

# look up the 2-D t-SNE coordinates of each target word and of its nearest neighbour
target_indices = [vocab[word] for word in target_words]
similar_indices = [vocab[word] for word in similar_words]
V_trained_tsne_target_words = V_trained_tsne[target_indices]
V_trained_tsne_similar_words = V_trained_tsne[similar_indices]


In [None]:
# plot the t-SNE graph, labelling the data points with the words
# target words are drawn in blue, their most similar words in red
plt.figure(figsize=(5, 5))
plt.scatter(V_trained_tsne_target_words[:, 0], V_trained_tsne_target_words[:, 1], color='blue')
plt.scatter(V_trained_tsne_similar_words[:, 0], V_trained_tsne_similar_words[:, 1], color='red')

# label the data points with its corresponding words slightly above each data point
# (iterate target_words — the list the coordinate arrays were built from)
for i, word in enumerate(target_words):
    x, y = V_trained_tsne_target_words[i]
    plt.text(x, y+0.1, word, fontsize=9)

    x, y = V_trained_tsne_similar_words[i]
    plt.text(x, y+0.1, similar_words[i], fontsize=9)

In [None]:
# inspect a 20-word slice of the vocab: positions 20-39 in the dict's order
# (note: this skips the first 20 entries)
test_words = list(vocab)[20:40]
test_words

In [None]:
# draw a single word at random from the vocab keys
word = np.random.choice(list(vocab))
# forward pass for that word; y_hat holds net()'s output for inspection
y_hat = net(V_trained, U_trained, vocab[word])
print(f'Word: {word}')
# show the words predict() ranks highest for the sampled word
predict(word, V_trained, U_trained)

**Sanity Check for Quality of Embeddings**

* Cosine Similarity Check
    * Select a few words from the dictionary
    * Find the most similar word (use cosine similarity)

* t-SNE plot
    * e.g. king and queen vs cars and trucks
    * expect king and queen to be closer together and cars and trucks to be closer together