<h1>New</h1>

In [1]:
# Fixed minimal CBOW example
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
import tensorflow.keras.backend as K


In [2]:
# Small corpus
sentences = ["i love deep learning", "deep learning is fun", "i love fun", "i love funny peoples","i love deep learning so much" ]



In [3]:
# Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word2idx = tokenizer.word_index
vocab_size = len(word2idx) + 1   # +1 for padding/index-0 (not used here)
print("Vocabulary:", word2idx)


Vocabulary: {'i': 1, 'love': 2, 'deep': 3, 'learning': 4, 'fun': 5, 'is': 6, 'funny': 7, 'peoples': 8, 'so': 9, 'much': 10}


In [4]:
# Generate training pairs: for each center word, every surrounding context word -> (context, target)
window = 2
contexts = []
targets = []

for sentence in sentences:
    tokens = sentence.split()
    for idx, center_word in enumerate(tokens):
        start = max(idx - window, 0)
        end = min(idx + window, len(tokens) - 1)
        for i in range(start, end + 1):
            if i == idx:
                continue
            context_word = tokens[i]
            contexts.append(word2idx[context_word])   # context token id (input)
            targets.append(word2idx[center_word])     # target token id (label)

# Convert to arrays
X = np.array(contexts)        # shape (N,)
y = np.array(targets)         # shape (N,)

# IMPORTANT: Embedding expects 2D input (batch, sequence_length)
# we used sequence_length=1, so reshape X -> (N,1)
X = X.reshape(-1, 1)          # now shape (N,1)

# One-hot the targets
Y = to_categorical(y, num_classes=vocab_size)  # shape (N, vocab_size)

print("X shape:", X.shape, "Y shape:", Y.shape)


X shape: (54, 1) Y shape: (54, 11)


In [5]:
# Build model
embedding_dim = 5
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1))
# average over sequence axis (here sequence length = 1, so this just squeezes embedding dim)
model.add(Lambda(lambda x: K.mean(x, axis=1)))   # result shape (batch, embedding_dim)
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train (small epochs is fine for tiny corpus)
model.fit(X, Y, epochs=200, batch_size=8, verbose=0)

# Inspect embedding for a word
word = "deep"
idx = word2idx[word]
emb_matrix = model.get_weights()[0]
print(f"Embedding for '{word}' (index {idx}):\n", emb_matrix[idx])




Embedding for 'deep' (index 3):
 [-0.48143798  0.62404233 -0.21433535  0.6715141  -0.5054749 ]


In [6]:
# Train (small epochs is fine for tiny corpus)
model.fit(X, Y, epochs=200, batch_size=8, verbose=0)


<keras.src.callbacks.history.History at 0x26844f39880>

In [7]:
# Inspect embedding for a word
word = "deep"
idx = word2idx[word]
emb_matrix = model.get_weights()[0]
print(f"Embedding for '{word}' (index {idx}):\n", emb_matrix[idx])

Embedding for 'deep' (index 3):
 [-0.59979194  0.87316525 -0.24417084  0.78998166 -0.5522912 ]


In [8]:
# print some (context -> target) readable pairs
id2word = {v:k for k,v in word2idx.items()}
for i in range(min(10, X.shape[0])):
    context_id = X[i,0]
    target_id = y[i]
    print(f"Context -> Target: {id2word[context_id]}  ->  {id2word[target_id]}")


Context -> Target: love  ->  i
Context -> Target: deep  ->  i
Context -> Target: i  ->  love
Context -> Target: deep  ->  love
Context -> Target: learning  ->  love
Context -> Target: i  ->  deep
Context -> Target: love  ->  deep
Context -> Target: learning  ->  deep
Context -> Target: love  ->  learning
Context -> Target: deep  ->  learning


In [9]:
def predict_similar(word):
    idx = word2idx.get(word)
    if idx is None:
        print("word not in vocab")
        return
    
    # X must be 2D with a padded context (context length = 1 here)
    x = np.array([[idx]])
    
    pred = model.predict(x, verbose=0)[0]    # softmax vector
    best = np.argmax(pred)
    print("Input word:", word)
    print("Most likely predicted word:", id2word[best])


In [10]:
predict_similar("deep")
predict_similar("learning")


Input word: deep
Most likely predicted word: learning
Input word: learning
Most likely predicted word: deep
