In [20]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
# NOTE(review): on a kernel where the extension is already loaded this line
# only prints a notice suggesting `%reload_ext autoreload`.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before
# executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import pickle
import os
import numpy as np
import pandas as pd
import torch
import sys
import tqdm as notebook_tqdm
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from word2vec import *

## Loading Model and Vocabulary

In [22]:
# --- Configuration: which trained artefacts to inspect -------------------
model_class = "SkipGramModel"  # other option: "CBOWModel"
ds_name = "wikitext-2-v1"  # other option: "wikitext-103-v1"

# Artefacts live in "<model_class>_<ds_name>_data/".
dir_path = f"{model_class}_{ds_name}_data"
model_path, vocab_path = (
    os.path.join(dir_path, artefact) for artefact in ("model.pt", "vocab.pkl")
)

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SECURITY: torch.load and pickle.load both unpickle arbitrary Python
# objects; only point these paths at files you produced or trust.
model = torch.load(model_path, map_location=device)
with open(vocab_path, "rb") as vocab_file:
    vocab = pickle.load(vocab_file)


## Getting Embeddings

In [23]:
# The first parameter tensor yielded by the model is used as the embedding
# matrix; bring it to host memory as a plain NumPy array.
first_param = next(iter(model.parameters()))
embeddings = first_param.detach().cpu().numpy()

# Row-wise L2 norm, kept as a column so it broadcasts across each row.
norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** (1 / 2)
embeddings_norm = embeddings / norms
embeddings_norm.shape

(4072, 300)

# Visualization with t-SNE

In [24]:
# t-SNE is stochastic: without a fixed seed every run produces a different
# layout, so the saved visualization could not be reproduced.
TSNE_SEED = 42

# Input to t-SNE is the raw (un-normalised) embedding matrix, one row per token.
embeddings_df = pd.DataFrame(embeddings)

# Project to 2-D; columns 0 and 1 of the result are the plot coordinates.
tsne = TSNE(n_components=2, random_state=TSNE_SEED)
embeddings_df_trans = pd.DataFrame(
    tsne.fit_transform(embeddings_df),
    # Label each row with its token; relies on vocab.idx_to_word being in
    # the same order as the rows of `embeddings`.
    index=vocab.idx_to_word,
)

# Boolean mask: True where the token consists only of numeric characters.
is_numeric = embeddings_df_trans.index.str.isnumeric()

In [25]:
# Numeric tokens are drawn in green, all other tokens in black.
color = np.where(is_numeric, "green", "black")

# Text-only scatter: every token is written at its 2-D t-SNE position.
token_trace = go.Scatter(
    x=embeddings_df_trans[0],
    y=embeddings_df_trans[1],
    mode="text",
    text=embeddings_df_trans.index,
    textposition="middle center",
    textfont={"color": color},
)

fig = go.Figure()
fig.add_trace(token_trace)

# Write a standalone HTML page next to the model artefacts.
fig.write_html(f"./{dir_path}/word2vec_visualization.html")

# Find Similar Words

In [26]:
def get_top_similar(word: str, topN: int = 10):
    """Return the ``topN`` vocabulary words most similar to ``word``.

    Similarity is the dot product between the query's row of the module-level
    ``embeddings_norm`` matrix and every other row (i.e. cosine similarity
    when the rows are unit length).

    Args:
        word: Query token, looked up in ``vocab.word_to_idx``.
        topN: Maximum number of neighbours to return. Values <= 0 yield an
            empty result.

    Returns:
        Dict mapping neighbour word -> similarity, ordered from most to least
        similar. The query word itself is never included. If ``word`` is not
        in the vocabulary (or maps to index 0), a message is printed and an
        empty dict is returned.
    """
    word_id = vocab.word_to_idx.get(word, 0)
    # Index 0 doubles as the "not found" marker.
    # NOTE(review): presumably index 0 is the unknown-token slot; confirm
    # against the vocabulary builder.
    if word_id == 0:
        print("Out of vocabulary word")
        return {}

    word_vec = np.reshape(embeddings_norm[word_id], (-1, 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()

    ranked_ids = np.argsort(-dists)
    # Remove the query by id instead of assuming it is ranked first: with
    # duplicate or tied vectors another row can take slot 0, which would
    # return the query as its own neighbour and drop a real one.
    ranked_ids = ranked_ids[ranked_ids != word_id][: max(topN, 0)]

    return {vocab.idx_to_word[idx]: dists[idx] for idx in ranked_ids}

In [27]:
# List the nearest neighbours of "god", one "word: similarity" line each,
# with the similarity shown to three decimals.
god_neighbours = get_top_similar("god")
for neighbour, score in god_neighbours.items():
    print("{}: {:.3f}".format(neighbour, score))

goddess: 0.565
shiva: 0.535
divine: 0.517
deity: 0.496
khandoba: 0.496
allah: 0.491
spirit: 0.490
arabic: 0.462
kitsune: 0.459
holy: 0.457


# Vector Equations

In [28]:
# Analogy query on the raw embeddings: king - man + woman ~ ?
TOP_K = 10  # number of nearest neighbours to print
analogy_words = ("king", "man", "woman")

# Fail loudly on unknown words. Silently falling back to row 0 would still
# print a ranking, but for a meaningless vector. Index 0 is the same
# "not found" marker that get_top_similar uses.
missing_words = [w for w in analogy_words if vocab.word_to_idx.get(w, 0) == 0]
if missing_words:
    raise KeyError(f"Out of vocabulary word(s): {missing_words}")

emb1, emb2, emb3 = (embeddings[vocab.word_to_idx[w]] for w in analogy_words)

# Combine, then scale to unit length so the dot products below are cosine
# similarities against the row-normalised matrix.
emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = emb4 / emb4_norm

emb4 = np.reshape(emb4, (len(emb4), 1))
dists = np.matmul(embeddings_norm, emb4).flatten()

# Ids of the TOP_K most similar rows, best first. The query words are not
# filtered out, so they can appear in the listing.
top_ids = np.argsort(-dists)[:TOP_K]

for word_id in top_ids:
    print("{}: {:.3f}".format(vocab.idx_to_word[word_id], dists[word_id]))

king: 0.617
woman: 0.495
daughter: 0.414
goddess: 0.378
maria: 0.370
khandoba: 0.369
shiva: 0.353
son: 0.353
saint: 0.348
edward: 0.335


In [26]:
# Number of tokens in the vocabulary (sanity check; expected to equal the
# number of rows in the embedding matrix).
len(vocab.idx_to_word)

4072