## Word Embeddings

Goal: Implement a simple word embedding in Python (from scratch) and use it to find the most similar words to a given word. Come up with a dataset and evaluation metrics to evaluate the word embeddings.

In [2]:
import jax
import jax.numpy as jnp
import numpy as np
from matplotlib import pyplot as plt

### Downloading NLTK's stopwords

In [33]:
import nltk
# nltk.download("stopwords") # Uncomment this line to download the stopwords
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
print(stopwords)

{"she's", 'herself', 'further', 're', 'whom', 'because', 'didn', 'such', 'his', 'out', 'those', 'doesn', 'm', 'all', 'be', 'each', 'theirs', 'the', 'while', 'and', "it's", 'been', 'them', 'under', 't', 'just', 'him', 'does', 'there', 'they', "hadn't", 'itself', 'o', 'over', 'below', 'than', 'if', 'what', 's', 'against', "hasn't", 'during', 'mightn', 'can', 'have', 'hadn', 'ma', 'don', 'through', 'once', 'in', 'only', "couldn't", 'why', 'yourself', 'between', "won't", 'himself', 'more', 'hasn', 'into', 'with', 'shan', 'when', 'up', 'where', "needn't", 'who', 'about', 'should', 'doing', 'to', 'both', 'he', 'off', 'some', "mustn't", 'needn', 'aren', 'so', 'an', 'had', 'y', 'few', "haven't", 'down', "shouldn't", 'ours', 'own', 'i', 'me', "didn't", 'now', 'too', 'then', 'from', 'its', 'again', "weren't", 'shouldn', 'd', 'which', 'ourselves', 'wasn', 'hers', 'or', 'will', 've', "should've", 'very', 'but', 'her', 'at', 'our', 'it', 'wouldn', 'for', 'ain', 'until', 'their', 'a', 'my', 'that', 

### Downloading Brown Corpus

The Brown Corpus was the first million-word electronic corpus of English, created in 1961 at Brown University. This corpus contains text from 500 sources, and the sources have been categorized by genre, such as news, editorial, and so on.

In [6]:
# nltk.download("brown") # Uncomment this line to download the brown corpus
from nltk.corpus import brown

In [34]:
# Show the categories of the brown corpus 
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [35]:
# Get the sentences from the "fiction" category
sentences = brown.sents(categories="fiction")

# Number of sentences
print(f"Number of sentences: {len(sentences)}")

# Print the first 5 sentences
for i in range(5):
    print(f"Sentence {i}: {sentences[i]}")

Number of sentences: 4249
Sentence 0: ['Thirty-three']
Sentence 1: ['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.']
Sentence 2: ['His', 'parents', 'talked', 'seriously', 'and', 'lengthily', 'to', 'their', 'own', 'doctor', 'and', 'to', 'a', 'specialist', 'at', 'the', 'University', 'Hospital', '--', 'Mr.', 'McKinley', 'was', 'entitled', 'to', 'a', 'discount', 'for', 'members', 'of', 'his', 'family', '--', 'and', 'it', 'was', 'decided', 'it', 'would', 'be', 'best', 'for', 'him', 'to', 'take', 'the', 'remainder', 'of', 'the', 'term', 'off', ',', 'spend', 'a', 'lot', 'of', 'time', 'in', 'bed', 'and', ',', 'for', 'the', 'rest', ',', 'do', 'pretty', 'much', 'as', 'he', 'chose', '--', 'provided', ',', 'of', 'course', ',', 'he', 'chose', 'to', 'do', 'nothing', 'too', 'exciting', 'or', 'too', 'debilitating', '.']
Sentence 3: ['His', 'teacher', 'and', 'his', 'school', 'principal', 'were', 'conferred', 'with', 'and', 'everyone', 'agreed', 'that', ',', 'if', 'he', 'kept', 'up', 'with', '

In [36]:
# Filter out non-alphanumeric words and convert to lowercase
filtered_sentences = []

for sent in sentences:
    filtered_sent = [word.lower() for word in sent if word.isalnum()]
    filtered_sentences.append(filtered_sent)

# Print the first 5 sentences
for i in range(5):
    print(f"Filtered sentence {i}: {filtered_sentences[i]}")

Filtered sentence 0: []
Filtered sentence 1: ['scotty', 'did', 'not', 'go', 'back', 'to', 'school']
Filtered sentence 2: ['his', 'parents', 'talked', 'seriously', 'and', 'lengthily', 'to', 'their', 'own', 'doctor', 'and', 'to', 'a', 'specialist', 'at', 'the', 'university', 'hospital', 'mckinley', 'was', 'entitled', 'to', 'a', 'discount', 'for', 'members', 'of', 'his', 'family', 'and', 'it', 'was', 'decided', 'it', 'would', 'be', 'best', 'for', 'him', 'to', 'take', 'the', 'remainder', 'of', 'the', 'term', 'off', 'spend', 'a', 'lot', 'of', 'time', 'in', 'bed', 'and', 'for', 'the', 'rest', 'do', 'pretty', 'much', 'as', 'he', 'chose', 'provided', 'of', 'course', 'he', 'chose', 'to', 'do', 'nothing', 'too', 'exciting', 'or', 'too', 'debilitating']
Filtered sentence 3: ['his', 'teacher', 'and', 'his', 'school', 'principal', 'were', 'conferred', 'with', 'and', 'everyone', 'agreed', 'that', 'if', 'he', 'kept', 'up', 'with', 'a', 'certain', 'amount', 'of', 'work', 'at', 'home', 'there', 'was', 

In [37]:
# Filter out stopwords and non-alphanumeric words and convert to lowercase
filtered_stopwords_sentences = []

for sent in filtered_sentences:
    filtered_sent = [word.lower() for word in sent if word.isalnum() and word.lower() not in stopwords]
    filtered_stopwords_sentences.append(filtered_sent)

# Print the first 5 sentences
for i in range(5):
    print(f"Filtered sentence {i}: {filtered_stopwords_sentences[i]}")

Filtered sentence 0: []
Filtered sentence 1: ['scotty', 'go', 'back', 'school']
Filtered sentence 2: ['parents', 'talked', 'seriously', 'lengthily', 'doctor', 'specialist', 'university', 'hospital', 'mckinley', 'entitled', 'discount', 'members', 'family', 'decided', 'would', 'best', 'take', 'remainder', 'term', 'spend', 'lot', 'time', 'bed', 'rest', 'pretty', 'much', 'chose', 'provided', 'course', 'chose', 'nothing', 'exciting', 'debilitating']
Filtered sentence 3: ['teacher', 'school', 'principal', 'conferred', 'everyone', 'agreed', 'kept', 'certain', 'amount', 'work', 'home', 'little', 'danger', 'losing', 'term']
Filtered sentence 4: ['scotty', 'accepted', 'decision', 'indifference', 'enter', 'arguments']


### Word2Vec Word Embedding

#### Skip-gram model

##### Skip-Gram with Negative Sampling (SGNS)

#####  Skip-Gram with Hierarchical Softmax

#### Continuous Bag of Words (CBOW) model

#####  CBOW with Negative Sampling


##### CBOW with Hierarchical Softmax