## Word Embeddings

Goal: Implement a simple word embedding in Python (from scratch) and use it to find the most similar words to a given word. Come up with a dataset and evaluation metrics to evaluate the word embeddings.

In [7]:
import jax
import jax.numpy as jnp
import numpy as np
from matplotlib import pyplot as plt

Importing NLTK's stopwords

In [9]:
import nltk
import string

nltk.download("stopwords")
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\okk15\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using Brown Corpus

In [10]:
nltk.download("brown")
from nltk.corpus import brown

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\okk15\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [11]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [14]:
corpus = brown.sents(categories="news")

In [40]:
# lower case all words, remove all punctuations and stop words
def preprocess_sentence(sentence):
    return [word.lower() for word in sentence if word.lower() not in stopwords and word not in punctuation]

corpus_preprocessed = [preprocess_sentence(sentence) for sentence in corpus]
corpus_preprocessed

[['fulton',
  'county',
  'grand',
  'jury',
  'said',
  'friday',
  'investigation',
  "atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'evidence',
  "''",
  'irregularities',
  'took',
  'place'],
 ['jury',
  'said',
  'term-end',
  'presentments',
  'city',
  'executive',
  'committee',
  'over-all',
  'charge',
  'election',
  '``',
  'deserves',
  'praise',
  'thanks',
  'city',
  'atlanta',
  "''",
  'manner',
  'election',
  'conducted'],
 ['september-october',
  'term',
  'jury',
  'charged',
  'fulton',
  'superior',
  'court',
  'judge',
  'durwood',
  'pye',
  'investigate',
  'reports',
  'possible',
  '``',
  'irregularities',
  "''",
  'hard-fought',
  'primary',
  'mayor-nominate',
  'ivan',
  'allen',
  'jr.'],
 ['``',
  'relative',
  'handful',
  'reports',
  'received',
  "''",
  'jury',
  'said',
  '``',
  'considering',
  'widespread',
  'interest',
  'election',
  'number',
  'voters',
  'size',
  'city',
  "''"],
 ['jury',
  'said',
  'f

### Building Co-occurrence Matrix

In [70]:
from collections import defaultdict
import numpy as np

# Build vocabulary index
def build_vocab_idx(corpus_preprocessed):
    vocab_count = defaultdict(int)
    for sentence in corpus_preprocessed:
        for word in sentence:
            vocab_count[word] += 1
    return {word: idx for idx, (word, _) in enumerate(vocab_count.items())}, vocab_count

vocab_idx, vocab_count = build_vocab_idx(corpus_preprocessed)
vocab_size = len(vocab)

# Window size for Co-occurrence matrix - 2 words before, 2 words after
window_size = 2

# Build co-occurrence matrix
def build_cooccurrence_matrix(corpus_preprocessed, vocab_idx, window_size):
    cooccurrence_matrix = defaultdict(lambda: defaultdict(float))
    
    for sentence in corpus_preprocessed:
        sentence_length = len(sentence)
        for i, word in enumerate(sentence):
            word_idx = vocab_idx[word]
            # Context window
            start = max(0, i - window_size)
            end = min(sentence_length, i + window_size + 1)
            
            for j in range(start, end):
                # Skip target word
                if i != j:
                    context_word = sentence[j]
                    context_word_idx = vocab_idx[context_word]
                    # Increment the co-occurrence count with inverse distance weighting
                    cooccurrence_matrix[word_idx][context_word_idx] += 1.0 / abs(i - j)
    return cooccurrence_matrix

cooccurrence_matrix = build_cooccurrence_matrix(corpus_preprocessed, vocab, window_size)
cooccurrence_matrix[0]

defaultdict(float,
            {1: 6.0,
             2: 0.5,
             3: 0.5,
             33: 1.0,
             34: 2.0,
             35: 1.5,
             65: 1.0,
             66: 2.0,
             67: 0.5,
             75: 0.5,
             28: 1.0,
             76: 0.5,
             141: 0.5,
             142: 1.0,
             143: 0.5,
             130: 0.5,
             150: 1.0,
             151: 0.5,
             163: 0.5,
             164: 1.0,
             165: 0.5,
             171: 0.5,
             14: 1.0,
             172: 1.0,
             116: 0.5,
             73: 1.0,
             173: 1.0,
             231: 0.5,
             235: 1.0,
             230: 0.5,
             12: 0.5,
             257: 0.5,
             100: 1.5,
             258: 1.0,
             259: 0.5,
             265: 0.5,
             266: 1.0,
             267: 1.0,
             277: 0.5,
             271: 1.0})

### GloVe: Global Vectors for Word Representation