In [1]:
import torch
import torch.nn as nn
import os

from torch.utils.data import DataLoader
from torchtext.datasets import WikiText2

from torchtext.vocab import vocab
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import get_tokenizer

from functools import partial

import numpy as np

import sys
sys.path.append("../utils")
import constants as CONSTANTS
import dataset
import models
import helper

import importlib
importlib.reload(CONSTANTS)
importlib.reload(models)
importlib.reload(dataset)
importlib.reload(helper)

  from .autonotebook import tqdm as notebook_tqdm


<module 'helper' from '/Users/khalid/personal_nlp_playground/word2vec/notebooks/../utils/helper.py'>

# Load everything

In [3]:

model_name = "cbow"
path = f"../weights/{model_name}/model.pth"
train_loader,vocab = dataset.get_data_loader_and_vocab(model_name,"train",batch_size=32,shuffle=True,vocab=None)
model = helper.get_model_by_name(model_name,vocab_size = len(vocab))

In [4]:
model.load_state_dict(torch.load(path))

<All keys matched successfully>

# get the embeddings

In [34]:
embeds = list(model.parameters())[0].detach().clone()

# normalization
norms = (embeds ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeds / norms
embeddings_norm.shape

torch.Size([2175, 300])

# get most simillar words

In [37]:
def get_most_n_similar_words(word,n=5):
    word_idx = vocab[word]
    if word_idx == 0:
        raise Exception("Out of vocabulary word, try another one")
    #
    word_vec = embeds[word_idx]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    sim_matrix = np.matmul(embeds,embeds.t())
    
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : n + 1]

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = vocab.lookup_token(sim_word_id)
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [56]:
get_most_n_similar_words("center",n=5)

{'units': tensor(4.4426),
 'part': tensor(3.8911),
 'club': tensor(3.8699),
 'tower': tensor(3.6878),
 'production': tensor(3.5750)}

# vector equations

In [58]:
emb1 = embeds[vocab["king"]]
emb2 = embeds[vocab["man"]]
emb3 = embeds[vocab["woman"]]

emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = emb4 / emb4_norm

emb4 = np.reshape(emb4, (len(emb4), 1))
dists = np.matmul(embeddings_norm, emb4).flatten()

top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(vocab.lookup_token(word_id), dists[word_id]))

king: 0.617
woman: 0.528
poetry: 0.205
henry: 0.195
location: 0.187
