Implement the most_similar() function from the chapter 9 code, and use it to run the six examples in the notebook. Include the output of these calls in your notebook.
 
IMPORTANT NOTE: the most_similar() function operates over actual words, whereas the embeddings you computed in problem 1 operate over transformer tokens. That is, each English word may consist of one or more tokens. To aggregate token embeddings into word embeddings, implement the following algorithm:
1. Take the glove_vocabulary.txt file and tokenize all the words in this file using the same tokenizer you used in the previous problem.
2. Compute a word embedding for all words in this file by averaging the corresponding token embeddings.

In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# hook tqdm progress bars into pandas
tqdm.pandas()

# flip to False to stay on the cpu even when a gpu is available
use_gpu = True

# choose the device the model will run on
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# seed shared by every random number generator (None skips seeding)
seed = 2024

# seed python, numpy and torch so runs are repeatable
if seed is not None:
    print(f'random seed: {seed}')
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

device: cuda
random seed: 2024


In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModel, AutoConfig

# checkpoint used for both the tokenizer and the encoder
transformer_name = "Tejas3/distillbert_base_uncased_80_equal"

# fast tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

# load the encoder with its own configuration and move it to the selected device
config = AutoConfig.from_pretrained(transformer_name)
model = AutoModel.from_pretrained(transformer_name, config=config).to(device)


In [4]:
# vocabulary file: every line becomes one row of the 'word' column
file = 'glove.6B.300d-vocabulary.txt'

# read through a context manager so the file handle is always closed, and
# decode explicitly as UTF-8: the vocabulary contains accented words and the
# platform default encoding is not UTF-8 everywhere
with open(file, encoding='utf-8') as vocab_file:
    vocabs = vocab_file.read().splitlines()

# one-column frame, indexed by the word's position in the file
df = pd.DataFrame(vocabs, columns=['word'])
df

Unnamed: 0,word
0,the
1,","
2,.
3,of
4,to
...,...
399995,chanty
399996,kronik
399997,rolonda
399998,zsombor


In [5]:
from datasets import Dataset, DatasetDict

# wrap the vocabulary frame in a DatasetDict with a single 'word' split
ds = DatasetDict({'word': Dataset.from_pandas(df)})
ds

DatasetDict({
    word: Dataset({
        features: ['word'],
        num_rows: 400000
    })
})

In [6]:
def tokenize(batch):
    """Tokenize the ``word`` entries of ``batch`` with the module-level tokenizer.

    Sequences are padded to a common length and truncated where needed; the
    tokenizer output is requested as PyTorch tensors and returned unchanged.
    """
    words = batch['word']
    encoded = tokenizer(words, padding=True, truncation=True, return_tensors='pt')
    return encoded

In [7]:
# tokenize the 'word' split; batched=True hands tokenize() many rows per call
tokenized = ds['word'].map(tokenize, batched=True)

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

In [8]:
type(tokenized)

datasets.arrow_dataset.Dataset

In [9]:
tokenized.to_pandas()

Unnamed: 0,word,input_ids,attention_mask
0,the,"[101, 1996, 102, 0, 0, 0]","[1, 1, 1, 0, 0, 0]"
1,",","[101, 1010, 102, 0, 0, 0]","[1, 1, 1, 0, 0, 0]"
2,.,"[101, 1012, 102, 0, 0, 0]","[1, 1, 1, 0, 0, 0]"
3,of,"[101, 1997, 102, 0, 0, 0]","[1, 1, 1, 0, 0, 0]"
4,to,"[101, 2000, 102, 0, 0, 0]","[1, 1, 1, 0, 0, 0]"
...,...,...,...
399995,chanty,"[101, 16883, 2100, 102, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
399996,kronik,"[101, 1047, 4948, 5480, 102, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
399997,rolonda,"[101, 20996, 7811, 2850, 102, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
399998,zsombor,"[101, 1062, 25426, 12821, 102, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"


In [10]:
from torch.nn.functional import normalize

# embeddings collected by process(): one 1-D tensor per word, in call order
word_embeddings = []


def process(batch):
    """Embed the word(s) in ``batch['word']`` and append them to ``word_embeddings``.

    Works with ``ds.map(process)``, where ``batch['word']`` is a single string,
    and with ``ds.map(process, batched=True)``, where it is a list of strings.
    Each word is tokenized, run through the model without gradient tracking,
    and represented by the L2-normalized average of the last-hidden-state
    vectors of its non-padding tokens.

    Returns None; results are accumulated in the module-level list.
    """
    words = batch['word']
    # an unbatched map() call passes one string; treat it as a batch of one
    if isinstance(words, str):
        words = [words]

    tokenized_words = tokenizer(
        words, padding=True, truncation=True, return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        output = model(**tokenized_words)

    # contextualized token embeddings for every sequence in the batch
    hidden_state = output.last_hidden_state

    # word embedding = average of its contextualized token embeddings; the
    # attention mask zeroes out padding so it cannot dilute the average
    # NOTE(review): the mask also covers the special tokens the tokenizer adds
    # around each word (ids 101/102 in the tokenized output), so they remain
    # part of the average as in the unbatched computation - confirm intended
    mask = tokenized_words['attention_mask'].unsqueeze(-1).to(hidden_state.dtype)
    token_sum = (hidden_state * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)
    word_embs = token_sum / token_count

    # normalize each word vector to unit length
    word_embs = normalize(word_embs, p=2, dim=1)

    # store one 1-D tensor per word so the list can later be stacked
    word_embeddings.extend(word_embs.unbind(dim=0))

In [11]:
import os

# cache file for the stacked embeddings, so they are not recomputed on every run
embeddings_path = "word_embeddingsss.pt"

if os.path.exists(embeddings_path):
    # reuse the embeddings saved by an earlier run; map_location lets a file
    # saved from the gpu be loaded on whichever device is selected now
    word_embeddings = torch.load(embeddings_path, map_location=device)
else:
    # process() appends the embedding of every word to the word_embeddings list
    ds.map(process)

    # word_embeddings is a list of 1-D tensors--stack them into one 2-D tensor
    word_embeddings = torch.stack(word_embeddings)

    # save as output file to avoid running it multiple times
    torch.save(word_embeddings, embeddings_path)


In [12]:
# drop autograd tracking, move to host memory, and convert to a numpy array
word_embeddings = word_embeddings.detach().cpu().numpy()

In [13]:
# row position -> word, taken from the 'word' column of the dataset
index_to_key = dict(enumerate(ds['word']['word']))

# word -> row position (a repeated word keeps its last position)
key_to_index = {word: i for i, word in index_to_key.items()}

In [14]:
import numpy as np

def most_similar(word, topn=10):
    """Return the ``topn`` vocabulary words most similar to ``word``.

    Similarity is the dot product between the row of ``word_embeddings`` for
    ``word`` and every other row (this equals cosine similarity when the rows
    are L2-normalized). The query word itself is excluded from the result.

    Returns a list of ``(word, similarity)`` pairs, best match first.
    Raises KeyError if ``word`` is not in ``key_to_index``.
    """
    query_id = key_to_index[word]
    scores = word_embeddings @ word_embeddings[query_id]
    # row indices ordered from the highest score to the lowest
    ranked = scores.argsort()[::-1]
    # drop the query word itself, then keep the best topn candidates
    neighbours = ranked[ranked != query_id][:topn]
    return [(index_to_key[i], scores[i]) for i in neighbours]

In [15]:
most_similar("cactus")

[('cactuses', 0.91599166),
 ('pear', 0.9150654),
 ('spineflower', 0.910865),
 ('juniper', 0.90248454),
 ('shrub', 0.9016392),
 ('acacia', 0.8984697),
 ('shrubs', 0.89702),
 ('orchid', 0.89677685),
 ('pinaster', 0.8966114),
 ('berries', 0.8954787)]

In [16]:
most_similar('cake')

[('cakes', 0.94355893),
 ('cakebread', 0.92435),
 ('dessert', 0.9212095),
 ('pattycake', 0.90699303),
 ('cupcake', 0.904183),
 ('mooncake', 0.90096116),
 ('mooncakes', 0.8971993),
 ('desserts', 0.8965075),
 ('cakewalk', 0.8943194),
 ('pancakes', 0.89414)]

In [17]:
most_similar("angry")

[('frustrated', 0.94961965),
 ('enraged', 0.94812036),
 ('agitated', 0.9381641),
 ('irritated', 0.9356283),
 ('annoyed', 0.92857295),
 ('furious', 0.9282503),
 ('angering', 0.9281014),
 ('impatient', 0.9263562),
 ('ugly', 0.9241725),
 ('complaining', 0.92405325)]

In [18]:
most_similar("quickly")

[('immediately', 0.89024985),
 ('rapidly', 0.8540477),
 ('swiftly', 0.84985775),
 ('quick', 0.82023597),
 ('rún', 0.81379414),
 ('run', 0.81379414),
 ('prepare', 0.8123312),
 ('ensure', 0.81138396),
 ('hurry', 0.80761504),
 ('grab', 0.80675554)]

In [19]:
most_similar("between")

[('betweens', 0.8975253),
 ('in-between', 0.89026386),
 ('after', 0.8871184),
 ('thé', 0.87822294),
 ('thế', 0.87822294),
 ('the', 0.87822294),
 ('tō', 0.8777739),
 ('tổ', 0.8777739),
 ('tộ', 0.8777739),
 ('tô', 0.8777739)]

In [20]:
most_similar("the")

[('thé', 0.99999976),
 ('thế', 0.99999976),
 ('a', 0.93305206),
 ('à', 0.93305206),
 ('å', 0.93305206),
 ('ą', 0.93305206),
 ('á', 0.93305206),
 ('ã', 0.93305206),
 ('ă', 0.93305206),
 ('ä', 0.93305206)]

In [21]:
most_similar("hungry")

[('starving', 0.9208219),
 ('thirsty', 0.91829467),
 ('feral', 0.89709157),
 ('desperate', 0.8959713),
 ('starved', 0.88797283),
 ('rotting', 0.8857002),
 ('impatient', 0.8796371),
 ('sick', 0.8785124),
 ('unwilling', 0.8770848),
 ('needy', 0.87336016)]

In [22]:
most_similar("angry")

[('frustrated', 0.94961965),
 ('enraged', 0.94812036),
 ('agitated', 0.9381641),
 ('irritated', 0.9356283),
 ('annoyed', 0.92857295),
 ('furious', 0.9282503),
 ('angering', 0.9281014),
 ('impatient', 0.9263562),
 ('ugly', 0.9241725),
 ('complaining', 0.92405325)]

In [23]:
most_similar("ryang")

[('yung', 0.9123292),
 ('kwang', 0.9063474),
 ('seang', 0.9040199),
 ('jongg', 0.8955262),
 ('chayng', 0.89248216),
 ('boonsong', 0.89116025),
 ('jeng', 0.8898934),
 ('deliang', 0.8889395),
 ('leong', 0.8877564),
 ('alving', 0.88761723)]

In [24]:
most_similar("queen")

[('king', 0.92228246),
 ('monarch', 0.9065795),
 ('queenan', 0.89665383),
 ('princess', 0.89159346),
 ('consort', 0.88092244),
 ('princessa', 0.8798687),
 ('kings', 0.8774574),
 ('kinga', 0.8680315),
 ('queen-consort', 0.86543465),
 ('queening', 0.86539793)]

In [25]:
most_similar("king")

[('queen', 0.92228246),
 ('kinga', 0.9129591),
 ('kings', 0.908782),
 ('kingii', 0.87946886),
 ('monarch', 0.8785908),
 ('kinglets', 0.8720195),
 ('kingo', 0.86991453),
 ('kingsize', 0.86771184),
 ('prince', 0.8656887),
 ('queenan', 0.8655345)]

In [26]:
most_similar("prince")

[('princess', 0.93490076),
 ('princes', 0.92972887),
 ('princesa', 0.88537294),
 ('princelings', 0.88534784),
 ('princessa', 0.8757585),
 ('king', 0.8656887),
 ('princeling', 0.86075485),
 ('babys', 0.8541628),
 ('baby', 0.85275245),
 ('beau', 0.8513527)]