# Word Embeddings
- numerical representation of words

### GloVe: Global Vectors for Word Representation|
- An unsupervised learning algorithm for obtaining vector representations for words
- "The training objective of GloVe is to learn word vectors such that their dot product equals the logarithm of the words’ probability of co-occurrence." - taken from the GloVe project page.
- First we need a cooccurrence matrix X from the corpus/text -> matrix X such that X_{ij} shows us number of times i appears in context of j => n-gram sliding window

# Generating Cooccurence Matrix

In [144]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import nltk
from collections import Counter
from nltk.util import ngrams
from sklearn.preprocessing import normalize
import string

In [124]:
# Load the Genesis dataset and tokenize it
corpus = nltk.corpus.genesis.words()[0:300]
window_size = 5  # The size of the context window

# Generate co-occurrence matrix
def create_cooccurrence_matrix(corpus, window_size, vocab_size):
    corpus = [word.lower() for word in corpus if word not in string.punctuation]
    word_to_idx = {word: idx for idx, word in enumerate(set(corpus))}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    vocab_size = len(word_to_idx)
    cooccurrence_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)

    
    for i in range(len(corpus)):
        target_word = corpus[i]
        target_idx = word_to_idx[target_word]
        
        # Define the window context range
        start = max(i - window_size, 0)
        end = min(i + window_size, len(corpus))
        
        # Count co-occurrences in the context window
        for j in range(start, end):
            if i != j:
                context_word = corpus[j]
                context_idx = word_to_idx[context_word]
                cooccurrence_matrix[target_idx, context_idx] += 1
    
    return cooccurrence_matrix, word_to_idx, idx_to_word

cock_mat, word_to_idx, idx_to_word = create_cooccurrence_matrix(corpus, window_size, vocab_size)


# Define Pytorch Model

In [130]:
vocab_size = cock_mat.shape[0]
embedding_dim = 100  # Dimension of word embeddings
learning_rate = 0.1
epochs = 20
batch_size = 1024

class GloVeModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim,  alpha=0.75, x_max=100):
        super(GloVeModel, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.bias_wi = nn.Embedding(vocab_size, 1)
        self.bias_wj = nn.Embedding(vocab_size, 1)
        self.alpha = alpha
        self.x_max = x_max

    def forward(self, wi, wj):
        wi_embed = self.word_embeddings(wi)
        wj_embed = self.context_embeddings(wj)
        wi_bias = self.bias_wi(wi)  # [batch_size, 1]
        wj_bias = self.bias_wj(wj)  # [batch_size, 1]
        return torch.sum(wi_embed * wj_embed, dim=1) + wi_bias.squeeze() + wj_bias.squeeze() #Dot product 
        
    def loss(self, cock_mat, predictions):
        logXij = torch.where(cock_mat > 0, torch.log(cock_mat), torch.zeros_like(cock_mat))
        #weight f(xij) as defined in paper
        weight_term = torch.where(cock_mat < self.x_max, torch.pow(cock_mat / self.x_max, self.alpha), torch.ones_like(cock_mat))

        loss = weight_term * torch.pow(predictions - logXij, 2)
        return torch.sum(loss)

In [131]:
# Create the GloVe model
model = GloVeModel(vocab_size, embedding_dim)

# Adagrad optimizer
optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

# Training the model
for epoch in range(epochs):
    total_loss = 0
    for target_idx in range(vocab_size):
        for context_idx in range(vocab_size):
            cooc = cock_mat[target_idx, context_idx]
            if cooc > 0:  # Only process pairs with non-zero co-occurrence
                wi = torch.tensor([target_idx], dtype=torch.long)
                wj = torch.tensor([context_idx], dtype=torch.long)
                cooc = torch.tensor([cooc], dtype=torch.float)
                
                optimizer.zero_grad()
                
                # Forward pass
                predictions = model(wi, wj)
                
                # Calculate loss
                loss = model.loss(cooc, predictions)
                total_loss += loss.item()
                
                # Backward pass and optimize
                loss.backward()
                optimizer.step()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss}")

Epoch 1/20, Loss: 5045.591790393148
Epoch 2/20, Loss: 1128.8292956238424
Epoch 3/20, Loss: 445.07131927004826
Epoch 4/20, Loss: 231.5430280246482
Epoch 5/20, Loss: 132.1775304393899
Epoch 6/20, Loss: 79.60237106857802
Epoch 7/20, Loss: 49.69493308339192
Epoch 8/20, Loss: 31.855333866887353
Epoch 9/20, Loss: 20.84425056023504
Epoch 10/20, Loss: 13.868710841579349
Epoch 11/20, Loss: 9.35738230683056
Epoch 12/20, Loss: 6.389841038519024
Epoch 13/20, Loss: 4.409705105416914
Epoch 14/20, Loss: 3.0720452898829893
Epoch 15/20, Loss: 2.1585620649652704
Epoch 16/20, Loss: 1.5286779960898715
Epoch 17/20, Loss: 1.090509531907611
Epoch 18/20, Loss: 0.7832397443558189
Epoch 19/20, Loss: 0.5661462359258103
Epoch 20/20, Loss: 0.41169094105608744


We can use the trained embeddings to get the top similar words


In [143]:

def get_similar_words(word, top_n=5):
    # Get the word index
    word_idx = word_to_idx.get(word.lower())
    if word_idx is None:
        print(f"'{word}' not found in vocabulary.")
        return []

    # Get the embedding of the word
    word_embedding = model.word_embeddings(torch.tensor([word_idx], dtype=torch.long))
    word_bias = model.bias_wi(torch.tensor([word_idx], dtype=torch.long))

    # Compute cosine similarity for all words
    similarities = []
    for idx in range(vocab_size):
        # Check if the index exists in idx_to_word
        if idx not in idx_to_word:
            continue  # Skip invalid indices
        
        # Get the embedding of each word
        context_embedding = model.word_embeddings(torch.tensor([idx], dtype=torch.long))
        context_bias = model.bias_wi(torch.tensor([idx], dtype=torch.long))

        # Calculate cosine similarity between the word and the context word
        similarity = torch.cosine_similarity(word_embedding, context_embedding, dim=1)
        similarity = similarity.item()  # Convert to scalar

        similarities.append((idx_to_word[idx], similarity, context_bias.item()))
    
    # Sort the similar words based on similarity
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Get top_n most similar words
    similar_words = [(word, sim, bias) for word, sim, bias in similarities[:top_n]]
    return similar_words

similar_words = get_similar_words("god", top_n=5)
for word, sim, bias in similar_words:
    print(f"Word: {word}, Similarity: {sim:.4f}, Bias: {bias:.4f}")

Word: god, Similarity: 1.0000, Bias: 0.4564
Word: night, Similarity: 0.1613, Bias: 0.0191
Word: darkness, Similarity: 0.1409, Bias: 0.0385
Word: moved, Similarity: 0.1212, Bias: -0.7573
Word: earth, Similarity: 0.1166, Bias: -0.1654


# References
1. Pennington, J., Socher, R., & Manning, C. (2014). Glove: Global vectors for word representation. Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP). https://doi.org/10.3115/v1/d14-1162 - https://nlp.stanford.edu/pubs/glove.pdf
2. https://www.foldl.me/2014/glove-python
3. https://github.com/hans/glove.py/blob/master/glove.py
4. https://nlp.stanford.edu/projects/glove/
5 .https://github.com/noaRricky/pytorch-glove
6. ChatGPT