# Demonstrating Scaled Dot-Product Attention & Self-Attention in Transformers

![Scaled dot-product attention diagram](Scaled_Dot_Product_Attention.png)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import random
# Fix the seeds so randomly drawn inputs and layer initialisations repeat across runs
random.seed(42)  # Python random seed
torch.manual_seed(42)  # PyTorch seed (CPU)

<torch._C.Generator at 0x7fed7f73f970>

In [3]:
# Set print options: no scientific notation, 4 decimal places
torch.set_printoptions(sci_mode=False, precision=4)

# Task: Implementing Self-Attention
In this task, you will implement a Self-Attention mechanism, a fundamental building block of the Transformer model. Self-attention allows a model to weigh the importance of different words in a sequence when encoding contextual information.

Your goal is to define a `SelfAttention` class in PyTorch, which will:

1. **Initialize Linear Layers:** Learnable transformations for the query (Q), key (K), and value (V) projections.

2. **Compute Attention Scores:** Measure how much focus each word should give to the others using dot-product attention, $QK^T$.

3. **Scale and Apply Softmax:** Divide the scores by $\sqrt{d_k}$ and normalize them with softmax to get the attention weights.

4. **Generate Contextual Output:** Multiply the attention weights with the value vectors to obtain the final representation, $\mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$.

In [4]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    The module computes ``softmax(Q @ K^T / sqrt(d_k)) @ V``, where ``Q``,
    ``K`` and ``V`` are linear projections of the three inputs.

    Args:
        d_model: Size of the last (feature) dimension of every input. The
            query, key and value projections all map ``d_model -> d_model``.
    """

    def __init__(self, d_model):
        super(SelfAttention, self).__init__()
        self.d_model = d_model
        self.d_k = d_model
        self.d_v = d_model

        ### BEGIN SOLUTION
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        ### END SOLUTION

    def forward(self, q_input, k_input, v_input):
        """Apply attention of the queries over the keys/values.

        Args:
            q_input: Tensor of shape ``(..., len_q, d_model)``.
            k_input: Tensor of shape ``(..., len_k, d_model)``.
            v_input: Tensor of shape ``(..., len_k, d_model)``.

        Inputs must have at least two dimensions; any leading dimensions are
        treated as batch dimensions by ``torch.matmul``.

        Returns:
            Tensor of shape ``(..., len_q, d_model)``.
        """
        Q = self.query(q_input)
        K = self.key(k_input)
        V = self.value(v_input)

        ### BEGIN SOLUTION
        # Swap only the last two dims of K: `K.T` reverses *all* dims, which is
        # only correct for 2-D tensors and breaks batched inputs.
        # The scale is a plain Python float, so no extra tensor is allocated
        # and the result stays on the device/dtype of Q and K.
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        # Normalise over the key axis so each query's weights sum to 1.
        attn_weights = F.softmax(attn_scores, dim=-1)
        attention_output = torch.matmul(attn_weights, V)
        ### END SOLUTION

        return attention_output

In [5]:
### BEGIN HIDDEN TESTS

import torch
import torch.nn as nn
import torch.nn.functional as F

def test_self_attention():
    """Smoke-test SelfAttention: check output shape and numerical sanity."""
    seq_len, feat_dim = 2, 16  # sequence length, feature dimension

    # Build the module first, then draw the three inputs (query, key, value),
    # each of shape (seq_len, feat_dim).
    attn = SelfAttention(feat_dim)
    q, k, v = (torch.randn(seq_len, feat_dim) for _ in range(3))

    out = attn(q, k, v)

    # The output must keep the input layout and contain only finite numbers.
    expected_shape = (seq_len, feat_dim)
    assert out.shape == expected_shape, "Output shape mismatch!"
    assert not torch.isnan(out).any(), "NaN values in output!"
    assert torch.isfinite(out).all(), "Non-finite values in output!"

    print("All test cases passed!")

# Run the test
test_self_attention()

### END HIDDEN TESTS

All test cases passed!


In [6]:
# Toy input: 4 token vectors with 6 features each, drawn from N(0, 1) and scaled by 10
d_model = 6
max_sequence_length = 4
src_tokens = 10.0 * torch.randn(max_sequence_length, d_model)

In [7]:
# Self-attention: the same tokens serve as query, key and value.
# Call the module instance instead of .forward() so nn.Module.__call__ runs
# (this is what dispatches registered hooks).
self_attention = SelfAttention(d_model)
result = self_attention(src_tokens, src_tokens, src_tokens)

In [8]:
result

tensor([[ -2.4043,  -1.2668,   3.1681,   5.2007,  -4.8781,  -3.4479],
        [ -1.9338,   3.2367,  -1.3393,  -3.8062,  -0.8477,   4.8810],
        [ -2.4011,  -1.2671,   3.1729,   5.2080,  -4.8893,  -3.4525],
        [ -0.8060,  -1.1678,   5.2952,   8.3120, -10.1812,  -5.2827]],
       grad_fn=<MmBackward0>)