# nn.Embedding in PyTorch
## nn.Embedding is a lookup table that maps discrete indices (e.g., words, tokens, or categories) to continuous vector representations (embeddings). It is commonly used in NLP and recommendation systems.

### nn.Embedding(num_embeddings, embedding_dim, padding_idx=None, ...)

### Parameters
num_embeddings: Total number of possible indices (e.g., vocabulary size).

embedding_dim: Size of each embedding vector.

padding_idx (optional): Specifies an index (e.g., 0) reserved for padding. Its embedding vector is initialized to all zeros and receives no gradient updates, so it remains zero during training.

In [1]:
import torch
import torch.nn as nn

In [2]:
# Example vocabulary: every word is mapped to its position (0-5) in the word list
vocab = {
    word: index
    for index, word in enumerate(["hello", "world", "this", "is", "a", "test"])
}

# Define an embedding layer with vocab size 6 and embedding dimension 3

In [3]:
# Embedding table with one 3-dimensional vector for each of the 6 vocabulary entries
embedding_layer = nn.Embedding(
    embedding_dim=3,   # length of each embedding vector
    num_embeddings=6,  # number of rows in the lookup table (vocabulary size)
)

# Sentence_1 = "this is a test"
# Convert words to indices

In [4]:
# Sentence / sequence
# Sentence_1 = "this is a test"
# Each word is replaced by its vocabulary index:
#   "this" -> 2, "is" -> 3, "a" -> 4, "test" -> 5
# torch.tensor infers a 64-bit integer dtype for Python ints; it is spelled
# out here because embedding lookups expect integer indices.
Setence_1_indices = torch.tensor([2, 3, 4, 5], dtype=torch.long)

# Get the embeddings for these words

In [5]:
# Pass the index tensor through the embedding layer: every index selects its
# row of the lookup table, giving one embedding vector per index.
embeddings = embedding_layer(input=Setence_1_indices)

# Display the looked-up vectors
print(embeddings)

tensor([[-1.2431, -1.0259, -1.0153],
        [ 0.3792,  0.7361,  0.4511],
        [-1.1624, -0.3854, -0.2508],
        [-0.8085, -0.2324,  1.5288]], grad_fn=<EmbeddingBackward0>)


# Repeat for Sentence_2 = "hello world"

In [6]:
# Sentence or sequence
# Sentence_2 = "hello world"
# Each word is replaced by its vocabulary index: "hello" -> 0, "world" -> 1
# (torch.tensor infers a 64-bit integer dtype for Python ints; spelled out here).
Setence_2_indices = torch.tensor([0, 1], dtype=torch.long)

# Look up one embedding vector per index using the same embedding layer
embeddings = embedding_layer(input=Setence_2_indices)

# Display the looked-up vectors
print(embeddings)

tensor([[ 1.2121, -1.1920, -0.4230],
        [ 0.0727,  0.8886,  1.2703]], grad_fn=<EmbeddingBackward0>)


# Example vocabulary including a padding token (index 0)
# The padding token will be represented by index 0 in the vocabulary.

In [7]:
# Example vocabulary that reserves index 0 for the padding token:
# the real words are numbered from 1, and "<pad>" is added last with index 0.
vocab = {
    word: index
    for index, word in enumerate(
        ["hello", "world", "this", "is", "a", "test"], start=1
    )
}
vocab["<pad>"] = 0

# Define an embedding layer with vocab size 7 (6 words + 1 padding token) and embedding dimension 3

In [8]:
# Embedding table with 7 rows (6 words + 1 padding token), 3 values per row.
# Row 0 is declared as the padding row via `padding_idx`.
embedding_layer = nn.Embedding(
    padding_idx=0,     # index of the <pad> token
    embedding_dim=3,   # length of each embedding vector
    num_embeddings=7,  # number of rows in the lookup table
)

# Define two sequences of different lengths

In [9]:
# Two index sequences of different lengths (4 tokens and 2 tokens)
sequence_1, sequence_2 = (
    [3, 4, 5, 6],  # Corresponds to ["this", "is", "a", "test"]
    [1, 2],        # Corresponds to ["hello", "world"]
)

# Define the fixed length for padding
# Pad the sequences to the fixed length using the <pad> token (index 0)

In [10]:
# Common length that every sequence should have after padding
fixed_length = 5

# Right-pad each sequence with the <pad> index (0) until it reaches
# `fixed_length`. A sequence that is already that long or longer is not
# truncated: multiplying a list by a non-positive count yields an empty list,
# so nothing is appended.
padded_sequence_1, padded_sequence_2 = (
    seq + [0] * (fixed_length - len(seq))
    for seq in (sequence_1, sequence_2)
)

In [11]:
# Show both padded sequences, one per line (each line starts with a space)
print(f" {padded_sequence_1}", f" {padded_sequence_2}", sep="\n")

 [3, 4, 5, 6, 0]
 [1, 2, 0, 0, 0]


# Convert sequences to tensors

In [12]:
# Turn each padded Python list into a tensor, in order:
# indices_1 comes from padded sequence 1, indices_2 from padded sequence 2.
indices_1, indices_2 = map(torch.tensor, (padded_sequence_1, padded_sequence_2))

# Get the embeddings for these padded sequences

In [13]:
# Run each padded index tensor through the embedding layer, in order:
# embeddings_1 belongs to sequence 1, embeddings_2 to sequence 2.
embeddings_1, embeddings_2 = map(embedding_layer, (indices_1, indices_2))

# Print the embeddings

In [14]:
# Display each result under its own heading; the second heading begins with
# a newline so a blank line separates the two reports.
print("Embeddings for sequence 1:", embeddings_1, sep="\n")

print("\nEmbeddings for sequence 2:", embeddings_2, sep="\n")

Embeddings for sequence 1:
tensor([[ 1.5508, -1.2233, -1.7642],
        [ 0.3967, -0.3848,  0.1251],
        [ 1.7091,  0.1736, -1.7899],
        [ 0.1929, -0.0116, -0.7650],
        [ 0.0000,  0.0000,  0.0000]], grad_fn=<EmbeddingBackward0>)

Embeddings for sequence 2:
tensor([[-0.8736,  0.6329,  1.5281],
        [ 0.0240,  0.3783,  1.7073],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000]], grad_fn=<EmbeddingBackward0>)
