In [None]:
# Import libraries:
# - numpy/pandas: array and table utilities
# - tensorflow/keras: tokenization, sequences, one-hot labels, and model layers
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda


In [None]:
# Create a small sample corpus (text) and split it into tokens (words).
# You can replace this with your own sentences.
# `data` holds the raw text as one multi-line string.
data = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. 
Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
"""
# str.split() with no arguments splits on runs of whitespace, so dl_data is a
# flat list of tokens (strings). Punctuation stays attached to the tokens
# (e.g. "learning.", "(also") and sentence boundaries are not kept.
dl_data = data.split()

In [None]:
# Tokenize the corpus and prepare mappings between words and ids
# Tokenizer builds a vocabulary and assigns an id to each unique token
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dl_data)

# Work on a copy so that adding PAD below does not mutate the tokenizer's
# own word_index dictionary
word2id = dict(tokenizer.word_index)

# Add a PAD token (id=0) to pad context windows to a fixed length
word2id['PAD'] = 0
id2word = {v: k for k, v in word2id.items()}

# Regroup the whitespace tokens of dl_data into sentences.
# Encoding each token as its own "document" would give sequences of only one
# or two words, so every CBOW context window would be (almost) entirely PAD
# and the model could not learn anything from context.
sentences = []
current_tokens = []
for token in dl_data:
    current_tokens.append(token)
    if token.endswith(('.', '!', '?')):  # token closes a sentence
        sentences.append(' '.join(current_tokens))
        current_tokens = []
if current_tokens:  # trailing text without a sentence terminator
    sentences.append(' '.join(current_tokens))

# Convert each sentence to a list of word ids (one inner list per sentence)
wids = [[word2id[w] for w in text_to_word_sequence(sentence)] for sentence in sentences]

# Hyperparameters
vocab_size = len(word2id)
embed_size = 100
window_size = 2

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])


Vocabulary Size: 75
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [None]:
# Build (context, target) training pairs for CBOW
# For each center word, take 'window_size' words on the left and right as context
def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one CBOW training sample per word position in ``corpus``.

    Args:
        corpus: iterable of sequences of word ids (one sequence per document).
        window_size: number of neighbours taken on each side of the center word.
        vocab_size: number of classes used to one-hot encode the target.

    Yields:
        (x, y) where ``x`` is the result of ``pad_sequences`` on the single
        context list, padded/truncated to ``2 * window_size`` ids, and ``y`` is
        the result of ``to_categorical`` on the center word id.
    """
    padded_width = 2 * window_size
    for sentence in corpus:
        n_words = len(sentence)
        for center, target in enumerate(sentence):
            # Clamp the window to the sentence so no bounds test is needed per index
            first = max(0, center - window_size)
            stop = min(n_words, center + window_size + 1)
            neighbours = [sentence[pos] for pos in range(first, stop) if pos != center]
            # Fixed-length context ids and one-hot target
            x = pad_sequences([neighbours], maxlen=padded_width)
            y = to_categorical([target], vocab_size)
            yield (x, y)

# Preview: show the first 10 (context, target) pairs whose context contains no PAD (id 0)
pair_stream = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
i = 0  # number of pairs printed so far
for x, y in pair_stream:
    # Skip samples whose context window had to be padded
    if 0 in x[0]:
        continue
    context_labels = [id2word[w] for w in x[0]]
    target_label = id2word[np.argmax(y[0])]
    print('Context (X):', context_labels, '-> Target (Y):', target_label)
    i += 1
    if i == 10:
        break


In [None]:
# Define a simple CBOW model:
# - Embedding: look up dense vectors for each context word id
# - Lambda(mean): average the context embeddings to get one vector
# - Dense + softmax: predict the target word distribution over the vocabulary
# (`tf` comes from the imports cell at the top of the notebook.)

cbow = Sequential()
# `input_length` is intentionally not passed: it is deprecated in Keras 3, and
# the input shape is supplied by the explicit build() call below.
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size))
# Average over axis 1 (the context positions) -> one embed_size vector per sample
cbow.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))

# Train with categorical crossentropy and RMSProp optimizer
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# Build with (batch, 2 * window_size) context ids and show the model summary
cbow.build(input_shape=(None, window_size * 2))
cbow.summary()


In [None]:
# Fit the CBOW model by hand: several passes over the corpus, one update per pair
n_epochs = 5
for epoch in range(1, n_epochs + 1):
    loss = 0.0  # sum of the per-pair losses seen in this epoch
    i = 0       # number of pairs processed in this epoch
    # Fresh generator each epoch: it streams (context, target) pairs lazily
    pair_stream = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    for i, (x, y) in enumerate(pair_stream, start=1):
        # Single gradient step on this one-sample batch
        loss += cbow.train_on_batch(x, y)
        # Progress message, only reached on very large corpora
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 430.84808

Epoch: 2 	Loss: 430.3429

Epoch: 3 	Loss: 428.4553

Epoch: 4 	Loss: 426.678

Epoch: 5 	Loss: 425.2011



In [None]:
# Extract the learned word embeddings: the first weight array of the model
# belongs to the Embedding layer, one row per word id
embedding_matrix = cbow.get_weights()[0]

# Remove the PAD row (index 0) so that row k holds the vector of word id k + 1
weights = embedding_matrix[1:]
print(weights.shape)

# Label each row by looking its id up explicitly. Do not rely on the order of
# id2word.values(): PAD is inserted into the mapping last, so slicing off the
# first value would drop the word with id 1 and shift every label by one row.
row_labels = [id2word[word_id] for word_id in range(1, len(weights) + 1)]

# Show a small table of embeddings: rows=words, columns=embedding dimensions
pd.DataFrame(weights, index=row_labels).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,-0.053811,-0.021965,-0.031946,0.025899,0.02587,-0.011971,0.017166,-0.033597,0.037855,-0.028029,...,0.028481,0.059729,-0.01522,0.031792,0.011434,0.030792,4.7e-05,0.011492,0.003695,-0.038793
networks,0.012211,0.022778,-0.026541,0.045527,0.051432,0.03709,-0.026177,-0.049971,0.034592,-0.008757,...,-0.005389,0.01232,-0.004715,-0.009161,-0.020322,0.061598,0.06492,0.016656,0.00816,-0.036231
neural,0.007575,-0.005314,0.006753,0.020443,-0.004717,-0.004553,0.046599,-0.002272,0.002667,-0.004326,...,-0.012735,0.042071,-0.033299,0.045449,-0.000771,0.009265,-0.014977,0.001748,0.041288,-0.025355
and,0.013518,-0.02483,-0.029084,-0.00671,-0.045961,0.029382,-0.013573,0.010873,-0.038445,-0.035782,...,-0.027289,-0.013834,-0.040606,-0.040892,-0.010782,0.004321,-0.002082,0.031735,0.020708,0.030787
as,-0.04972,0.025355,0.039353,0.017494,-0.044821,-0.004506,0.032198,-0.041671,0.014883,0.009515,...,-0.03163,0.021036,0.012921,-0.013501,0.013961,0.013314,-0.003683,-0.030884,-0.006832,0.021592


In [None]:
# (Empty cell) – use this space for extra experiments or notes