### Continuous Bag of Words(CBOW)

Preprocess the Text

In [7]:
import tensorflow as tf
import numpy as np
import re
from collections import Counter

In [14]:
# Load the Sherlock Holmes text
with open('sherlock_holmes.txt', 'r', encoding="utf-8") as file:
    text = file.read().lower()

# Clean and tokenize the text
def preprocess(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    return words

tokens = preprocess(text)

# Create a vocabulary and word-to-index mapping
vocab = Counter(tokens)
vocab_size = len(vocab)
print(vocab_size)

# Map words to indices and vice versa
word2idx = {word: idx for idx, word in enumerate(vocab)}
# print(word_to_idx)
idx2word = {idx: word for idx, word in enumerate(vocab)}
# print(idx_to_word)
# print("Vocabulary:", vocab)
# print("Word to Index:", word_to_idx)


8699


Prepare Training Data

In [13]:
# Define context window size
window_size = 2

# Generate context-target pair
data = []
for i in range(window_size, len(tokens) - window_size):
    context = [tokens[i - j] for j in range(1, window_size + 1)]
    context += [tokens[i + j] for j in range(1, window_size + 1)]
    target = tokens[i]
    data.append((context, target))

# print("Context-Target Pairs:", data)

Prepare Training Data

In [15]:
# Define the context window size
window_size = 2

def generate_context_target_pairs(words, window_size):
    data = []
    for i in range(window_size, len(words) - window_size):
        context = words[i - window_size:i] + words[i + 1:i + window_size + 1]
        target = words[i]
        data.append((context, target))
    return data

data = generate_context_target_pairs(tokens, window_size)

# Convert words to their indices
def words_to_indices(data, word2idx):
    context_indices = []
    target_indices = []
    for context, target in data:
        context_indices.append([word2idx[word] for word in context])
        target_indices.append(word2idx[target])
    return np.array(context_indices), np.array(target_indices)

context_indices, target_indices = words_to_indices(data, word2idx)


Build the CBOW Model

In [19]:
embedding_dim = 100  # You can change the embedding size

# Define the model
class CBOW(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.dense = tf.keras.layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        x = self.embeddings(inputs)
        x = tf.reduce_mean(x, axis=1)  # Average the context embeddings
        return self.dense(x)

model = CBOW(vocab_size, embedding_dim)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


Train the Model

In [20]:
# Train the model
model.fit(context_indices, target_indices, epochs=10, batch_size=128)


Epoch 1/10
[1m840/840[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 249ms/step - accuracy: 0.0565 - loss: 7.6789
Epoch 2/10
[1m840/840[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 237ms/step - accuracy: 0.0816 - loss: 6.2904
Epoch 3/10
[1m840/840[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 203ms/step - accuracy: 0.0978 - loss: 6.0069
Epoch 4/10
[1m840/840[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 299ms/step - accuracy: 0.1148 - loss: 5.7807
Epoch 5/10
[1m840/840[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 273ms/step - accuracy: 0.1350 - loss: 5.5385
Epoch 6/10
[1m840/840[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 221ms/step - accuracy: 0.1509 - loss: 5.3590
Epoch 7/10
[1m840/840[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 213ms/step - accuracy: 0.1682 - loss: 5.1639
Epoch 8/10
[1m840/840[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 229ms/step - accuracy: 0.1817 - loss: 5.0294
Epoch 9/

<keras.src.callbacks.history.History at 0x232bed3d650>

Test the Model with New Inputs

In [25]:
def predict_target(context_words, model, word2idx, idx2word):
    context_indices = np.array([[word2idx[word] for word in context_words]])
    prediction = model.predict(context_indices)
    predicted_word_idx = np.argmax(prediction)
    return idx2word[predicted_word_idx]

# Test example: predict the word in the middle of the context
test_context = ['brow', 'dog', 'quick', 'over'] # ['quick', 'brown', 'fox', 'jumps']  # Example context
predicted_word = predict_target(test_context, model, word2idx, idx2word)
print(f"Predicted word: {predicted_word}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step
Predicted word: his
