<a href="https://colab.research.google.com/github/jayesh2409/Deep-Learning/blob/main/Continuous%20Bag%20Of%20Words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample corpus
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great pets",
    "the mat is soft and warm"
]

# Preprocess text: Tokenization and lowercasing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1  # +1 for padding

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(corpus)


In [2]:
## Stage b: Generate Training Data
def generate_training_data(sequences, window_size=2):
    contexts = []
    targets = []

    for sequence in sequences:
        for i in range(window_size, len(sequence) - window_size):
            context = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
            target = sequence[i]
            contexts.append(context)
            targets.append(target)

    return np.array(contexts), np.array(targets)

X, y = generate_training_data(sequences)

# Pad sequences for consistent input shape
X = pad_sequences(X, maxlen=4)  # Adjust maxlen based on context size


In [4]:
## Stage c: Train Model
# Define CBOW model architecture
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=10, input_length=4))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))  # Average embeddings
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=100)




Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.8377
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.0000e+00 - loss: 2.8345
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.0000e+00 - loss: 2.8313
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.0000e+00 - loss: 2.8281
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.0000e+00 - loss: 2.8249
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.1250 - loss: 2.8217
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.2500 - loss: 2.8185
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.2500 - loss: 2.8154
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c0fb4723e00>

In [5]:
## Stage d: Output
# Get word embeddings from the trained model
word_embeddings = model.layers[0].get_weights()[0]

# Create a mapping of words to their embeddings
word_index = tokenizer.word_index


print('Vocabulary Size:', len(word_index))
print('Vocabulary Sample:', list(word_index.items())[:10],"\n\n")


embeddings_dict = {word: word_embeddings[idx] for word, idx in word_index.items()}

# Output the embeddings for each word in a structured format
print("{:<10} | {}".format("Word", "Embedding"))
print("-" * 40)
for word, embedding in embeddings_dict.items():
    print("{:<10} | {}".format(word, np.round(embedding, 3)))


Vocabulary Size: 16
Vocabulary Sample: [('the', 1), ('sat', 2), ('on', 3), ('mat', 4), ('and', 5), ('cat', 6), ('dog', 7), ('log', 8), ('cats', 9), ('dogs', 10)] 


Word       | Embedding
----------------------------------------
the        | [ 0.23   0.066 -0.079 -0.001  0.193 -0.031  0.032 -0.303  0.218 -0.257]
sat        | [ 0.13  -0.173 -0.126 -0.167  0.136 -0.146 -0.137 -0.181  0.172 -0.159]
on         | [ 0.179  0.21  -0.185  0.164  0.147  0.219  0.103 -0.149  0.195 -0.134]
mat        | [ 0.08  -0.144  0.105  0.07   0.062 -0.119 -0.19  -0.227 -0.073 -0.179]
and        | [-0.064  0.104  0.003  0.066 -0.167 -0.117  0.058 -0.012 -0.059 -0.187]
cat        | [ 0.139  0.018 -0.146 -0.011  0.168 -0.002 -0.019 -0.175  0.165 -0.159]
dog        | [ 0.121  0.016 -0.104 -0.008  0.188 -0.063 -0.039 -0.096  0.145 -0.129]
log        | [ 0.14  -0.112 -0.137 -0.062  0.08  -0.096 -0.098 -0.105  0.135 -0.157]
cats       | [-0.135  0.062  0.148  0.079  0.076 -0.08   0.117  0.086  0.065 -0.136]
dogs  