In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import requests
import re

In [None]:
# Load the Tiny Shakespeare dataset from Andrej Karpathy's char-rnn GitHub repository
def load_shakespeare_corpus(
    url="https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
    timeout=30,
):
    """Download the Tiny Shakespeare text and return it as a list of word tokens.

    Args:
        url: Address of the raw text file to download. Defaults to the
            tinyshakespeare input file in the char-rnn repository.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        list[str]: The text lowercased, with every character that is neither
        a word character nor whitespace removed, split on whitespace.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error reply instead of tokenising an error page.
    response.raise_for_status()

    text = response.text.lower()  # Lowercase for simplicity
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text.split()

In [None]:
# Load and preprocess the Shakespeare corpus: the helper downloads the text,
# lowercases it, strips punctuation and splits it into a list of word tokens
corpus = load_shakespeare_corpus()

In [None]:
# Check what the dataset looks like: display the first ten tokens
corpus[:10]

['first',
 'citizen',
 'before',
 'we',
 'proceed',
 'any',
 'further',
 'hear',
 'me',
 'speak']

In [None]:
# Word2Vec has two model architectures, continuous bag of words and skip-grams
# Here we use skip-grams
def generate_training_data(corpus, window_size, vocab_size):
    """Build (context, target) id pairs for skip-gram training.

    Every position in ``corpus`` is treated as a target word; each neighbour
    at most ``window_size`` positions to its left or right contributes one
    (context, target) pair.

    Args:
        corpus: Sequence of word tokens.
        window_size: Number of neighbours taken on each side of the target.
        vocab_size: Accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(contexts, targets, word_to_id, id_to_word)`` where the first
        two are int32 NumPy arrays of equal length and the last two are the
        word->id and id->word dictionaries (ids follow sorted word order).
    """
    vocabulary = sorted(set(corpus))
    word_to_id = dict(zip(vocabulary, range(len(vocabulary))))
    id_to_word = dict(enumerate(vocabulary))

    encoded = [word_to_id[token] for token in corpus]
    n_tokens = len(encoded)

    # Pairs are emitted target by target, neighbours from left to right,
    # skipping the target position itself.
    pairs = [
        (encoded[neighbour], encoded[centre])
        for centre in range(n_tokens)
        for neighbour in range(
            max(0, centre - window_size), min(n_tokens, centre + window_size + 1)
        )
        if neighbour != centre
    ]

    contexts = np.array([context_id for context_id, _ in pairs], dtype=np.int32)
    targets = np.array([target_id for _, target_id in pairs], dtype=np.int32)

    return contexts, targets, word_to_id, id_to_word

In [None]:
# Build the skip-gram dataset: two neighbours on each side of every word,
# with the vocabulary size taken from the number of distinct tokens
window_size = 2
vocab_size = len(set(corpus))

contexts, targets, word_to_id, id_to_word = generate_training_data(
    corpus, window_size, vocab_size
)

# Peek at the first ten pairs, first as raw ids, then decoded back to words
preview_contexts = contexts[:10]
preview_targets = targets[:10]

print("Contexts (encoded):", preview_contexts)
print("Targets (encoded):", preview_targets)
print("Contexts (decoded):", [id_to_word[word_id] for word_id in preview_contexts])
print("Targets (decoded):", [id_to_word[word_id] for word_id in preview_targets])

Contexts (encoded): [ 1953   936  4176   936 12339  4176  1953 12339  8513  1953]
Targets (encoded): [ 4176  4176  1953  1953  1953   936   936   936   936 12339]
Contexts (decoded): ['citizen', 'before', 'first', 'before', 'we', 'first', 'citizen', 'we', 'proceed', 'citizen']
Targets (decoded): ['first', 'first', 'citizen', 'citizen', 'citizen', 'before', 'before', 'before', 'before', 'we']


In [None]:
# Word2Vec is essentially a one-layer network using an embedding layer to
# learn the context of context-target pairs
def create_word2vec_model(vocab_size, embedding_dim):
    """Assemble and compile the embedding -> softmax network used as word2vec.

    Args:
        vocab_size: Number of distinct word ids; sets both the number of rows
            in the embedding table and the width of the softmax output.
        embedding_dim: Length of each learned word vector.

    Returns:
        A compiled ``keras.Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss).
    """
    # One trainable vector per word id.
    embedding_layer = keras.layers.Embedding(vocab_size, embedding_dim)
    # Force a flat (embedding_dim,) vector per sample before the classifier.
    # NOTE(review): assumes each sample is a single word id -- confirm.
    flatten_layer = keras.layers.Reshape((embedding_dim,))
    # Probability distribution over the whole vocabulary.
    output_layer = keras.layers.Dense(vocab_size, activation="softmax")

    word2vec = keras.Sequential([embedding_layer, flatten_layer, output_layer])
    word2vec.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
    return word2vec

In [None]:
# (Training) parameters for our word2vec model
embedding_dim = 10  # length of each learned word vector
epochs = 200  # full passes over the (context, target) pairs
batch_size = 1024  # pairs per gradient step

# Seed Python, NumPy and the Keras backend before the model is built so that
# weight initialisation and batch shuffling -- and therefore the learned
# embeddings -- are repeatable across "Restart & Run All" (as far as the
# backend's kernels are deterministic).
seed = 42
keras.utils.set_random_seed(seed)

model = create_word2vec_model(vocab_size, embedding_dim)

In [None]:
# Train the model: each context word id is the input and the paired target
# word id is the class label for the sparse categorical cross-entropy loss
model.fit(contexts, targets, epochs=epochs, batch_size=batch_size)

Epoch 1/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 8.5967
Epoch 2/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 7.0392
Epoch 3/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 6.8597
Epoch 4/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 6.7861
Epoch 5/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 6.7336
Epoch 6/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 6.6896
Epoch 7/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 6.6685
Epoch 8/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 6.6381
Epoch 9/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 6.6187
Epoch 10/200
[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms

<keras.src.callbacks.history.History at 0x7c0e7dd56cd0>

In [None]:
# Get word embeddings by extracting the embedding matrix: the first layer of
# the model is the Embedding layer, and its first weight array is indexed by
# word id further below
embeddings = model.layers[0].get_weights()[0]

In [None]:
# Now that we have trained the embedding model, we can check the most similar words
# for a given input word
def get_most_similar(word, embeddings, word_to_id, id_to_word, top_n=5):
    """Return the ``top_n`` vocabulary words closest to ``word`` by cosine similarity.

    Args:
        word: Query word.
        embeddings: 2-D array with one embedding row per word id.
        word_to_id: Mapping from word to its row index in ``embeddings``.
        id_to_word: Inverse mapping from row index to word.
        top_n: Maximum number of neighbours to return.

    Returns:
        list[str]: Up to ``top_n`` words ordered from most to least similar,
        never containing ``word`` itself. If ``word`` is unknown, the string
        ``"Word not in vocabulary."`` is returned instead.
    """
    word_id = word_to_id.get(word)
    if word_id is None:
        return "Word not in vocabulary."

    word_embedding = embeddings[word_id]
    dot_products = np.dot(embeddings, word_embedding)
    norm_products = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(word_embedding)

    # A zero-length vector has no direction, so its cosine similarity is
    # undefined (0/0 -> NaN). NaN would sort to the *front* of a reversed
    # argsort and push real neighbours out, so rank such rows last instead.
    similarities = np.full(dot_products.shape, -np.inf, dtype=np.float64)
    np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)

    # Drop the query word by id rather than assuming it sorts first: another
    # row with an identical direction can tie with (or edge out) it.
    ranked_ids = np.argsort(-similarities, kind="stable")
    ranked_ids = ranked_ids[ranked_ids != word_id][:top_n]
    return [id_to_word[int(i)] for i in ranked_ids]

In [None]:
# Similarly, we can check how similar two given words are (highest similarity possible is 1)
def word_similarity(word1, word2, embeddings, word_to_id):
    """Compute the cosine similarity between the embeddings of two words.

    Args:
        word1: First word.
        word2: Second word.
        embeddings: Array with one embedding row per word id.
        word_to_id: Mapping from word to its row index in ``embeddings``.

    Returns:
        The cosine similarity of the two embedding vectors, or the string
        ``"One or both words not in vocabulary."`` when either word is
        missing from ``word_to_id``.
    """
    ids = [word_to_id.get(word1), word_to_id.get(word2)]
    # Compare against None explicitly: id 0 is a valid (falsy) index.
    if any(word_id is None for word_id in ids):
        return "One or both words not in vocabulary."

    first_vec, second_vec = embeddings[ids[0]], embeddings[ids[1]]
    norm_product = np.linalg.norm(first_vec) * np.linalg.norm(second_vec)
    return np.dot(first_vec, second_vec) / norm_product

In [None]:
# Example: Get the embedding for a given word
sample_word = "king"

# Direct dictionary indexing raises KeyError if the word is not in the vocabulary
sample_word_id = word_to_id[sample_word]
sample_embedding = embeddings[sample_word_id]  # row of the embedding matrix for that id
print(f"Embedding for '{sample_word}': {sample_embedding}")

Embedding for 'king': [ 0.53898597  1.6234968  -0.42619652 -1.0969392  -0.34923983  1.3874028
 -1.3035297   0.60685015 -0.7991151  -0.16748612]


In [None]:
# Example of getting the most similar words for a handful of query words.
# Looping over the queries keeps a single call site instead of five
# copy-pasted print statements; the printed lines are unchanged.
for query_word in ("king", "love", "romeo", "juliet", "caesar"):
    neighbours = get_most_similar(query_word, embeddings, word_to_id, id_to_word)
    print(f"Most similar words to '{query_word}': {neighbours}")

Most similar words to 'king': ['exeter', 'northumberland', 'percy', 'warwick', 'parkcorner']
Most similar words to 'love': ['assistant', 'friend', 'beggar', 'pronounced', 'tutord']
Most similar words to 'romeo': ['juliet', 'godo', 'unsatisfied', 'villain', 'mercutio']
Most similar words to 'juliet': ['romeo', 'pilgrim', 'godo', 'unsatisfied', 'mercutio']
Most similar words to 'caesar': ['hidest', 'smutchd', 'commitst', 'goddess', 'tumult']


In [None]:
# Word arithmetic example: how close is "king" - "man" + "woman" to "queen"?
analogy_words = ("king", "man", "woman", "queen")

if not all(w in word_to_id for w in analogy_words):
    print(
        "One or more of the words 'king', 'man', 'woman', or 'queen' not found in vocabulary."
    )
else:
    # Look up the four vectors in the same order as analogy_words
    king_embedding, man_embedding, woman_embedding, queen_embedding = (
        embeddings[word_to_id[w]] for w in analogy_words
    )
    result_embedding = king_embedding - man_embedding + woman_embedding

    # Cosine similarity between the arithmetic result and the "queen" vector
    norm_product = np.linalg.norm(result_embedding) * np.linalg.norm(queen_embedding)
    similarity_to_queen = np.dot(result_embedding, queen_embedding) / norm_product
    print(f"Similarity of ('king' - 'man' + 'woman') to 'queen': {similarity_to_queen}")

Similarity of ('king' - 'man' + 'woman') to 'queen': 0.7571030259132385


In [None]:
# Contrast check for the word arithmetic: compare the same
# "king" - "man" + "woman" vector against "peasant" instead of "queen"
contrast_words = ["king", "man", "woman", "peasant"]
missing_words = [w for w in contrast_words if w not in word_to_id]

if missing_words:
    print(
        "One or more of the words 'king', 'man', 'woman', or 'peasant' not found in vocabulary."
    )
else:
    king_embedding = embeddings[word_to_id["king"]]
    man_embedding = embeddings[word_to_id["man"]]
    woman_embedding = embeddings[word_to_id["woman"]]
    peasant_embedding = embeddings[word_to_id["peasant"]]

    result_embedding = king_embedding - man_embedding + woman_embedding

    # Cosine similarity between the arithmetic result and the "peasant" vector
    norm_product = np.linalg.norm(result_embedding) * np.linalg.norm(peasant_embedding)
    similarity_to_peasant = np.dot(result_embedding, peasant_embedding) / norm_product
    print(
        f"Similarity of ('king' - 'man' + 'woman') to 'peasant': {similarity_to_peasant}"
    )

Similarity of ('king' - 'man' + 'woman') to 'peasant': 0.12192746251821518


In [None]:
# A few more examples (just make sure both words are in the training corpus)
word1 = "romeo"
word2 = "juliet"

# word_similarity returns a message string for unknown words, so check
# membership up front to guarantee a numeric result below.
assert (
    word1 in word_to_id and word2 in word_to_id
), "One or both words not in vocabulary."

# Similarity: reuse the word_similarity helper defined earlier in this
# notebook instead of repeating the cosine formula inline.
similarity = word_similarity(word1, word2, embeddings, word_to_id)
print(f"Similarity between '{word1}' and '{word2}': {similarity}")

Similarity between 'romeo' and 'juliet': 0.9851592183113098
