In [1]:
import torch
import torch.nn as nn

In [2]:
import random
SEED = 24  # single seed shared by every RNG used in this notebook
random.seed(SEED)  # seed Python's built-in `random` module
torch.manual_seed(SEED)  # seed PyTorch's default generator; the returned Generator is this cell's output

<torch._C.Generator at 0x7f96343a38d0>

In [3]:
# Set print options: no scientific notation, 4 decimal places
torch.set_printoptions(sci_mode=False, precision=4)

# Machine Translation

In [4]:
# Step 1: Tokenizer and Vocabulary Creation (Add more words if needed)
# Parallel sentences; tokens are separated by whitespace so str.split() can tokenize them.
sentence_en = "I love Dhoni ."
# sentence_fr = "J' adore l'IA ."
sentence_te = "నాకు ధోని అంటే ఇష్టం ."

# Token -> integer id. Ids follow list order, so "<pad>" is id 0 in both vocabularies.
word_map_en = {word: idx for idx, word in enumerate(["<pad>", "I", "love", "Dhoni", "."])}
word_map_te = {word: idx for idx, word in enumerate(["<pad>", "నాకు", "ధోని", "అంటే", "ఇష్టం", "."])}

In [5]:
# Tokenizing sentences
def tokenize(sentence, word_map):
    """Convert a whitespace-separated sentence into a 1-D tensor of token ids.

    Args:
        sentence: Text whose tokens are separated by whitespace.
        word_map: Mapping from token string to integer id.

    Returns:
        A ``torch.long`` tensor of shape ``(num_tokens,)``.

    Raises:
        KeyError: If a token of ``sentence`` is not a key of ``word_map``.
    """
    token_ids = [word_map[word] for word in sentence.split()]
    # Pin the dtype: without it an empty sentence would produce a float32
    # tensor, which cannot be used as indices for nn.Embedding.
    return torch.tensor(token_ids, dtype=torch.long)

In [6]:
# Tokenize the input and target sentences; unsqueeze(0) adds a leading batch dimension
input_tensor = tokenize(sentence_en, word_map_en).unsqueeze(0)  # Shape (1, 4)
target_tensor = tokenize(sentence_te, word_map_te).unsqueeze(0)  # Shape (1, 5): the Telugu sentence splits into 5 tokens

# Task: Implement the PositionalEncoding class

## Define the PositionalEncoding Class
Create a class that inherits from nn.Module.

Initialize the embedding dimension (d_model) and the maximum sequence length (max_sequence_length).

## Initialize the Positional Encoding Matrix
Create a tensor of shape (max_sequence_length, d_model), filled with zeros.

This will store the sinusoidal positional encodings.

## Compute the Denominators for Sine and Cosine Functions
Generate indices i for half the embedding dimension (d_model//2).

Compute the denominator $10000^{\frac{2i}{d_{model}}}$ that appears in both positional-encoding formulas:
$$
PE_{(pos, 2i)} = \sin\bigg(\frac{pos}{10000^{\frac{2i}{d_{model}}}}\bigg)
$$

$$
PE_{(pos, 2i+1)} = \cos\bigg(\frac{pos}{10000^{\frac{2i}{d_{model}}}}\bigg)
$$

## Generate Token Positions
Create a column vector of token positions (token_pos) with shape (max_sequence_length, 1).

## Apply Sine and Cosine Functions
Sine values are assigned to even indices (0, 2, 4, ...).

Cosine values are assigned to odd indices (1, 3, 5, ...).

## Add a Batch Dimension
Reshape the positional encoding to support batch processing.

## Implement the forward Method
Extract the relevant positional encodings for the input sequence (x).

Add the positional encodings to the input sequence (x).

Return the modified input.

In [7]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    The table follows
        PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
        PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
    and is stored in ``self.PE`` with shape ``(1, max_sequence_length, d_model)``.

    Args:
        d_model: Embedding dimension (even or odd).
        max_sequence_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        ### BEGIN SOLUTION
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

        # `even_index` is the "2i" of the formula. It has ceil(d_model / 2)
        # entries so that an odd d_model gets one extra sine column.
        even_index = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, even_index / d_model)
        token_pos = torch.arange(max_sequence_length).reshape(max_sequence_length, 1)
        angles = token_pos / denominator  # (max_sequence_length, ceil(d_model / 2))

        encoding = torch.zeros(max_sequence_length, d_model)
        encoding[:, 0::2] = torch.sin(angles)
        # Odd columns number d_model // 2, one fewer than the even ones when d_model is odd.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so .to(device) / .double() / .cuda() carry the
        # table along with the module. persistent=False keeps it out of
        # state_dict, so checkpoints look the same as with a plain attribute.
        self.register_buffer("PE", encoding.unsqueeze(0), persistent=False)  # add batch dimension
        ### END SOLUTION

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``; ``seq_len`` must
                not exceed ``max_sequence_length``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        ### BEGIN SOLUTION
        # Slice to the actual sequence length; the leading size-1 batch
        # dimension broadcasts over the batch of x.
        pos_enc = self.PE[:, :x.size(1)]
        final_output = x + pos_enc
        ### END SOLUTION
        return final_output

In [8]:
### BEGIN HIDDEN TESTS

def test_positional_encoding():
    """Smoke-test PositionalEncoding: shapes, a non-zero encoding, batching, short inputs."""
    embed_dim, seq_cap = 6, 10
    encoder = PositionalEncoding(embed_dim, seq_cap)

    # NOTE: the torch.randn calls below advance the global RNG, so their order
    # and shapes are kept exactly as they were.

    # Test 1: a full-length sequence comes back with the same shape
    full_input = torch.randn(1, seq_cap, embed_dim)
    assert encoder(full_input).shape == full_input.shape, "Test 1 Failed: Output shape mismatch!"

    # Test 2: an all-zero input must come back changed, i.e. something was added
    blank_input = torch.zeros(1, seq_cap, embed_dim)
    assert not torch.allclose(encoder(blank_input), blank_input), "Test 2 Failed: Positional encoding was not added!"

    # Test 3: a batch of two sequences is accepted
    paired_input = torch.randn(2, seq_cap, embed_dim)
    assert encoder(paired_input).shape == paired_input.shape, "Test 3 Failed: Batch processing failed!"

    # Test 4: a sequence shorter than the maximum length is accepted
    short_input = torch.randn(1, 6, embed_dim)
    assert encoder(short_input).shape == short_input.shape, "Test 4 Failed: Positional encoding does not support shorter sequences!"

    print("All tests passed successfully! ")

# Execute the checks
test_positional_encoding()

### END HIDDEN TESTS

All tests passed successfully! 


# Task: Implement a Transformer model that combines word embeddings with positional encoding to capture both word meaning and order in a sequence

## Initialize Embedding Layer

Use nn.Embedding to map words (tokens) to dense vector representations of size d_model.

## Integrate Positional Encoding

Create a PositionalEncoding instance to incorporate position information.

## Define the Forward Pass

Convert input tokens into embeddings using self.embedding(src).

Pass the embeddings through self.pos_encoder to add positional information.

In [9]:
class Transformer(nn.Module):
    """Embed token ids and add positional encodings to the embeddings.

    Only the input stage is implemented here: an ``nn.Embedding`` lookup
    followed by ``PositionalEncoding``.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding dimension.
        max_len: Number of positions handed to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, d_model, max_len=10):
        super().__init__()
        ### BEGIN SOLUTION
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.pos_encoder = PositionalEncoding(d_model=d_model, max_sequence_length=max_len)
        ### END SOLUTION

    def forward(self, src):
        """Look up embeddings for ``src`` and return them with positions added."""
        ### BEGIN SOLUTION
        input_embedding = self.pos_encoder(self.embedding(src))
        ### END SOLUTION
        return input_embedding

In [10]:
### BEGIN HIDDEN TESTS

def test_transformer():
    """Check that Transformer returns (batch, seq_len, d_model) for several inputs."""
    vocab_size = 20  # Example vocabulary size
    d_model = 6  # Embedding dimension
    max_len = 10  # Maximum sequence length

    model = Transformer(vocab_size, d_model, max_len)

    # (token ids, expected output shape, message shown on failure)
    cases = [
        # Single sequence of token indices
        ([[1, 2, 3, 4, 5]],
         (1, 5, d_model),
         "Test Case 1 Failed: Output shape mismatch"),
        # Multiple sequences (batch processing)
        ([[3, 4, 5, 6, 7], [2, 3, 4, 5, 6]],
         (2, 5, d_model),
         "Test Case 2 Failed: Output shape mismatch"),
        # Edge case - using the highest index in vocabulary
        ([[vocab_size - 1, vocab_size - 2, vocab_size - 3]],
         (1, 3, d_model),
         "Test Case 3 Failed: Output shape mismatch"),
    ]
    for token_ids, expected_shape, failure_message in cases:
        assert model(torch.tensor(token_ids)).shape == expected_shape, failure_message

    print("All test cases passed!")

# Execute the checks
test_transformer()

### END HIDDEN TESTS

All test cases passed!


In [11]:
# Model hyperparameters
vocab_size_en = len(word_map_en)  # number of entries in word_map_en, including "<pad>"
vocab_size_te = len(word_map_te)  # number of entries in word_map_te; not used by the cells shown here
d_model = 8  # embedding dimension
max_len = 10  # number of positions passed to the model's positional encoder

In [12]:
# Build the embedding + positional-encoding model sized for the English vocabulary
transformer = Transformer(vocab_size_en, d_model, max_len)

In [13]:
# Embed the tokenized English sentence and add positional encodings
output = transformer(input_tensor)

In [14]:
# Display the result: shape (1, 4, 8) = (batch, tokens, d_model)
output

tensor([[[ 0.7510, -1.0548,  0.1096, -0.5704,  1.9079, -0.6084, -1.2071,
           1.0335],
         [ 0.1277,  0.1987, -0.0825, -0.2313, -0.9619,  1.6389,  0.1670,
           2.0120],
         [ 1.3652,  0.1752,  1.2235,  1.0922,  2.6103,  1.8005, -1.2510,
           1.1712],
         [ 0.7820, -0.5253,  0.3549,  1.3561,  1.0577,  2.1537, -0.0899,
           1.3271]]], grad_fn=<AddBackward0>)