In [None]:
# a. Data Preparation
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances

#for building CBOW model
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Embedding, Input, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical, pad_sequences

In [None]:
# Toy corpus: eight short English sentences about NLP and Word2Vec.
# Each string is one document; the later cells tokenize these sentences
# and train the CBOW model on them.
data = [
    "Natural Language Processing is a field of Artificial Intelligence.",
    "Word embeddings help computers understand human language.",
    "The CBOW model is a part of Word2Vec technique.",
    "CBOW predicts the target word using surrounding context words.",
    "Skip Gram is another architecture of Word2Vec.",
    "Word2Vec is widely used in NLP applications.",
    "Embedding layers in deep learning are used to represent words.",
    "CBOW is faster and works better with frequent words."
]

In [None]:
# Tokenize the corpus and build the vocabulary lookups
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)

# Copy the index so that adding the padding entry does not mutate the
# tokenizer's own word_index.
word2id = dict(tokenizer.word_index)
word2id['PAD'] = 0   # padding token; real words start at id 1

# Build the reverse lookup in ascending id order so that position == id.
# 'PAD' is appended to word2id last, so a plain inversion would place id 0 at
# the END of id2word and shift any positional use of id2word.values() by one
# (e.g. labelling embedding rows with the wrong word).
id2word = {idx: word for word, idx in sorted(word2id.items(), key=lambda item: item[1])}

# Convert sentences into sequences of IDs
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]

vocab_size = len(word2id)  # includes the PAD entry
embed_size = 100
window_size = 2  # context window size

print("Vocabulary Size:", vocab_size)
print("Sample Vocabulary:", list(word2id.items())[:10])

Vocabulary Size: 50
Sample Vocabulary: [('is', 1), ('of', 2), ('cbow', 3), ('word2vec', 4), ('words', 5), ('language', 6), ('a', 7), ('word', 8), ('the', 9), ('used', 10)]


In [None]:
# Generate training data (context -> target)
def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one (context, target) training pair for every word in the corpus.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of neighbouring words taken on each side of the
            target word.
        vocab_size: number of classes passed to ``to_categorical`` for the
            one-hot target.

    Yields:
        Tuples ``(x, y)`` where ``x`` is the result of ``pad_sequences`` on a
        single context list (``maxlen = 2 * window_size``) and ``y`` is the
        result of ``to_categorical`` on the single target id.
    """
    context_length = 2 * window_size
    for sentence in corpus:
        n_words = len(sentence)
        for position, target in enumerate(sentence):
            # Collect the neighbours on both sides of the target, skipping the
            # target itself and any position that falls outside the sentence.
            neighbours = []
            for offset in range(-window_size, window_size + 1):
                j = position + offset
                if offset == 0 or j < 0 or j >= n_words:
                    continue
                neighbours.append(sentence[j])

            # Batch of one: short contexts (sentence edges) are padded to a
            # fixed length, the target becomes a one-hot row.
            x = pad_sequences([neighbours], maxlen=context_length)
            y = to_categorical([target], vocab_size)
            yield (x, y)

In [None]:
# Preview the first five training pairs whose context needed no padding
i = 0
for x, y in generate_context_word_pairs(wids, window_size, vocab_size):
    if 0 in x[0]:  # padded context (id 0 present) - skip it
        continue
    # Map ids back to words for a readable preview
    context_tokens = [id2word[token_id] for token_id in x[0]]
    target_token = id2word[np.argmax(y[0])]
    print("Context (X):", context_tokens, "-> Target (Y):", target_token)
    i += 1
    if i == 5:
        break

Context (X): ['natural', 'language', 'is', 'a'] -> Target (Y): processing
Context (X): ['language', 'processing', 'a', 'field'] -> Target (Y): is
Context (X): ['processing', 'is', 'field', 'of'] -> Target (Y): a
Context (X): ['is', 'a', 'of', 'artificial'] -> Target (Y): field
Context (X): ['a', 'field', 'artificial', 'intelligence'] -> Target (Y): of


In [None]:
# Build the CBOW model: embed the context word ids, average the context
# embeddings, then predict the target word with a softmax over the vocabulary.
cbow = Sequential()
# Declare the input shape with an explicit Input layer instead of passing
# input_shape to Embedding, which makes Keras emit a warning.
cbow.add(Input(shape=(window_size * 2,)))
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size))
# Mean over the context axis collapses (batch, context, embed) to (batch, embed)
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation="softmax"))
cbow.compile(loss="categorical_crossentropy", optimizer="adam")

# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
cbow.summary()

  super().__init__(**kwargs)


None


In [None]:
# Train the model: nine passes over the corpus (kept small for the demo),
# one (context, target) pair per training step.
for epoch in range(1, 10):
    loss = 0.
    i = 0  # number of pairs seen in this epoch
    pair_stream = generate_context_word_pairs(wids, window_size, vocab_size)
    for i, (x, y) in enumerate(pair_stream, start=1):
        # Accumulate the per-step loss reported by train_on_batch
        loss += cbow.train_on_batch(x, y)
    print("Epoch:", epoch, "Loss:", loss)

Epoch: 1 Loss: 262.29654
Epoch: 2 Loss: 261.1473
Epoch: 3 Loss: 259.29797
Epoch: 4 Loss: 257.23203
Epoch: 5 Loss: 254.8435
Epoch: 6 Loss: 252.0664
Epoch: 7 Loss: 248.87202
Epoch: 8 Loss: 245.26855
Epoch: 9 Loss: 241.29422


In [None]:
# Extract the trained word embeddings (first weight array of the model) and
# preview them as a table. Nothing is written to disk here.
weights = cbow.get_weights()[0]
weights = weights[1:]  # drop row 0, the PAD embedding; row k now belongs to word id k + 1
print(weights.shape)

# Label each row by looking its word id up explicitly. Relying on the order of
# id2word.values() is wrong when 'PAD' (id 0) was inserted last: the labels
# would be shifted by one word.
pd.DataFrame(weights, index=[id2word[word_id] for word_id in range(1, len(weights) + 1)]).head()

(49, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,-0.112843,-0.076374,0.103728,-0.074037,0.022961,0.229079,0.081067,0.232911,-0.130624,-0.044777,...,0.123222,0.002553,0.136428,0.101167,-0.101709,0.059215,0.126466,-0.02219,0.153879,-0.110381
cbow,-0.106229,-0.044315,-0.026503,-0.217257,0.089675,0.126166,0.194157,-0.00208,0.026144,-0.100506,...,0.034981,0.016064,0.132949,0.129231,-0.107518,0.199874,-0.053173,0.159456,0.060617,0.035335
word2vec,-0.065577,-0.09274,-0.073537,0.057295,-0.084892,0.138809,-0.062212,0.196231,-0.19489,-0.083713,...,0.138474,0.005089,-0.123522,0.070336,-0.006558,-0.037518,-0.077625,-0.062837,-0.108105,-0.168468
words,-0.032096,0.073591,0.012096,-0.039886,0.18101,-0.055978,0.191558,-0.063688,-0.02868,-0.100517,...,0.157514,0.150699,-0.045216,0.091428,0.097726,-0.052283,0.008223,0.035641,0.03183,-0.151587
language,-0.099747,0.151102,-0.151527,0.13246,-0.023889,-0.040634,0.111453,-0.116119,0.043751,-0.034269,...,0.023968,-0.076894,-0.154938,-0.10019,0.103279,0.001107,-0.104661,0.011534,-0.149296,-0.041034


In [None]:
# Find the five nearest neighbours of a few query words by Euclidean distance
distance_matrix = euclidean_distances(weights)

similar_words = {}
for search in ["deep", "cbow"]:
    # weights had the PAD row removed, so word id k lives in row k - 1
    distances = distance_matrix[word2id[search] - 1]
    # Position 0 of the sorted order is the query itself (distance 0); take the
    # next five rows and shift back from row index to word id.
    nearest_ids = distances.argsort()[1:6] + 1
    similar_words[search] = [id2word[idx] for idx in nearest_ids]

print("Similar Words:", similar_words)

Similar Words: {'deep': ['embedding', 'used', 'learning', 'applications', 'in'], 'cbow': ['model', 'skip', 'gram', 'part', 'target']}
