In [1]:
from tensorflow.keras.preprocessing.text import one_hot


In [2]:
# Toy corpus: seven short sentences used as input for the rest of the notebook.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [3]:
# Echo the corpus so the list contents are visible in the cell output.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
## Vocabulary size: passed to one_hot() and used as the Embedding input dimension
voc_size = 10_000


In [5]:
## One-hot (index) representation
# Instead of a full vector per word (a single 1 and zeros everywhere else across
# the vocabulary), each word is represented by the index at which its 1 would sit.
# NOTE(review): one_hot derives indices by hashing, so two different words may share
# an index and the numbers may differ between kernel sessions — confirm in the Keras docs.
one_hot_rep = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
one_hot_rep


[[3937, 3714, 6083, 147],
 [3937, 3714, 6083, 5889],
 [3937, 7167, 6083, 9761],
 [1085, 5446, 9290, 5445, 7213],
 [1085, 5446, 9290, 5445, 5452],
 [2223, 3937, 9275, 6083, 7619],
 [4085, 8623, 4911, 5445]]

In [6]:
## word embedding representation

from tensorflow.keras.layers import Embedding 
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential


In [7]:
import numpy as np

# All sentences are brought to one fixed length, because the number of inputs
# must be the same for every sequence (one input per time step).
set_len = 8

# 'pre' padding prepends zeros so that shorter sentences reach set_len entries.
embedded_docs = pad_sequences(
    one_hot_rep,
    maxlen=set_len,
    padding='pre',
)
embedded_docs



array([[   0,    0,    0,    0, 3937, 3714, 6083,  147],
       [   0,    0,    0,    0, 3937, 3714, 6083, 5889],
       [   0,    0,    0,    0, 3937, 7167, 6083, 9761],
       [   0,    0,    0, 1085, 5446, 9290, 5445, 7213],
       [   0,    0,    0, 1085, 5446, 9290, 5445, 5452],
       [   0,    0,    0, 2223, 3937, 9275, 6083, 7619],
       [   0,    0,    0,    0, 4085, 8623, 4911, 5445]], dtype=int32)

In [8]:
## Feature representation
# dim (dimension): number of features, i.e. vector components, learned per word.
dim = 10


In [15]:
# Minimal model: a single Embedding layer that maps each of the voc_size possible
# word indices to a trainable vector with dim components.
# `input_length` is deprecated in Keras 3 and is not needed here: the sequence
# length is already supplied through model.build() below.
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=dim))
model.build(input_shape=(None, set_len))  # (batch, set_len) integer word indices
# No fit() call follows in this cell; compile only attaches the optimizer and loss.
model.compile(optimizer='adam', loss='mse')

In [16]:
# Show the layer-by-layer summary of the model assembled in the previous cell.
model.summary()

In [None]:
# 🔹 Embedding Layer in RNN 🔹
# - Like starting with a notebook full of random scribbles (the word vectors are randomly initialised).
# - As the RNN trains on YOUR dataset (say movie review classification),
#   the notebook gets rewritten so that useful words move closer together.
#   Example: "great", "awesome", "fantastic" end up close in meaning.
# - It only learns what helps YOUR task, so it’s very focused.
# - Downside: it won’t understand broad language meanings outside your dataset.
# - Analogy: Your personal exam notes → perfect for YOUR exam, but not for everyone else.

# 🔹 CBOW Word2Vec 🔹
# - Like a GIANT dictionary built from reading millions of books.
# - Learns by predicting missing words from surrounding context
#   (e.g., "the cat ___ on the mat" → "sat").
# - Captures general word relationships:
#   "doctor" ≈ "nurse", "king - man + woman ≈ queen".
# - It’s general-purpose and reusable across many tasks.
# - Analogy: Oxford Dictionary → not tied to your exam, but useful for many situations.

# 🔹 Why the instructor did NOT use Word2Vec in the RNN 🔹
# - The instructor wanted to show how an Embedding layer can learn from scratch
#   directly on the given dataset.
# - Using Word2Vec here would mean loading PRETRAINED vectors, which requires extra steps:
#   (download big embeddings, align vocab, load into Keras).
# - For teaching/demo purposes, it’s simpler to just use Keras’ Embedding layer,
#   which automatically learns task-specific embeddings during training.
# - Also, if the dataset is large enough, the RNN’s own embeddings are often good enough,
#   so there is no strict need to start with pretrained Word2Vec.

# ✅ Both Embedding Layer and Word2Vec are based on the same core idea:
# - Represent each word as a dense vector (instead of sparse one-hot).
# - Have a big weight matrix of size (vocab_size × embedding_dim).
# - Adjust that matrix during training so that words get meaningful vectors.

# 🔹 BUT the difference lies in HOW and WHY they are trained:
# - Word2Vec (CBOW/Skip-gram):
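#   → Trains ONLY on a word-prediction objective
#     (CBOW: predict the center word from its context; Skip-gram: predict the context from the center word).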
#   → Goal = build a universal "dictionary of meanings".
#   → Result = general-purpose embeddings (usable anywhere).
#
# - Embedding Layer inside RNN:
#   → Trains along with the main task (e.g., sentiment classification).
#   → Goal = adjust word vectors so they help minimize YOUR loss.
#   → Result = task-specific embeddings (good for your dataset/task,
#              may not generalize outside it).



In [17]:
# Pass the padded index matrix through the embedding layer. No training has been run
# in the cells shown, so the output below holds the layer's initial vectors:
# one vector of dim values for each token position of each sentence.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step


array([[[ 0.01364317,  0.04685089, -0.01951307,  0.01807373,
          0.02826028,  0.036609  , -0.02721768, -0.03781943,
         -0.01946548,  0.03209307],
        [ 0.01364317,  0.04685089, -0.01951307,  0.01807373,
          0.02826028,  0.036609  , -0.02721768, -0.03781943,
         -0.01946548,  0.03209307],
        [ 0.01364317,  0.04685089, -0.01951307,  0.01807373,
          0.02826028,  0.036609  , -0.02721768, -0.03781943,
         -0.01946548,  0.03209307],
        [ 0.01364317,  0.04685089, -0.01951307,  0.01807373,
          0.02826028,  0.036609  , -0.02721768, -0.03781943,
         -0.01946548,  0.03209307],
        [ 0.00782322, -0.01799371, -0.00418096, -0.04039834,
          0.00074538,  0.0371852 , -0.01971728, -0.04057891,
          0.03323558, -0.00467954],
        [ 0.03180038,  0.01359086, -0.03352475, -0.02035583,
         -0.01909379, -0.040486  ,  0.01094516, -0.03321411,
         -0.03610901,  0.02524419],
        [ 0.0035449 , -0.00838593,  0.02310022,  0.0