In [None]:
"""
Question 1: Compute Scaled Dot-Product Attention (Python)

This implementation computes scaled dot-product attention given Q, K, and V matrices.
Formula: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k))V
"""

import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Any number of leading batch dimensions is accepted (including none), as
    long as they are broadcast-compatible between Q, K and V.

    Args:
        Q: Query array of shape (..., seq_len_q, d_k)
        K: Key array of shape (..., seq_len_k, d_k)
        V: Value array of shape (..., seq_len_k, d_v); there must be exactly
           one value row per key row.

    Returns:
        A tuple, in this order:
        attention_weights: Array of shape (..., seq_len_q, seq_len_k); each
            row sums to 1.
        context: Array of shape (..., seq_len_q, d_v), the attention-weighted
            sum of the value rows.
    """
    # The scaling factor is derived from the key/query feature dimension.
    d_k = K.shape[-1]

    # Raw similarity scores Q K^T. Swapping only the last two axes (instead
    # of a fixed 3-D permutation) keeps every leading batch axis in place.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2))  # (..., seq_len_q, seq_len_k)

    # Divide by sqrt(d_k) as in the attention formula.
    scaled_scores = scores / np.sqrt(d_k)

    # Normalise over the key axis (the last one) to get the weights.
    attention_weights = softmax(scaled_scores)

    # Weighted sum of the value rows.
    context = np.matmul(attention_weights, V)  # (..., seq_len_q, d_v)

    # NOTE: weights come first, then context; existing callers unpack in
    # exactly this order.
    return attention_weights, context


def softmax(x):
    """
    Numerically stable softmax over the final axis.

    Args:
        x: Input array of any shape

    Returns:
        Array of the same shape as the input whose entries along the last
        axis are non-negative and sum to 1.
    """
    # Shifting every row by its own maximum does not change the result but
    # makes the largest exponent exactly 0, so np.exp cannot overflow.
    row_max = np.max(x, axis=-1, keepdims=True)
    unnormalised = np.exp(x - row_max)

    # keepdims=True lets the per-row total broadcast back over the row.
    row_total = np.sum(unnormalised, axis=-1, keepdims=True)
    return unnormalised / row_total


# ============= EXAMPLE USAGE AND TESTING =============

# Demo driver: runs scaled_dot_product_attention on three randomly generated
# inputs (single sequence, batch, cross-attention) and prints the shapes.
if __name__ == "__main__":
    print("="*60)
    print("Testing Scaled Dot-Product Attention")
    print("="*60)

    # Seed NumPy's global RNG once. Q, K and V for all three examples are
    # drawn from it in sequence, so reordering any draw changes the numbers
    # printed below.
    np.random.seed(42)

    # Example 1: a single sequence attending to itself (Q and K share seq_len)
    print("\n--- Example 1: Single Sequence ---")
    batch_size = 1
    seq_len = 4
    d_k = 8  # feature dimension of keys/queries
    d_v = 8  # feature dimension of values

    Q = np.random.randn(batch_size, seq_len, d_k)
    K = np.random.randn(batch_size, seq_len, d_k)
    V = np.random.randn(batch_size, seq_len, d_v)

    # The function returns the weights first and the context second.
    attention_weights, context = scaled_dot_product_attention(Q, K, V)

    print(f"Input shapes:")
    print(f"  Q: {Q.shape}")
    print(f"  K: {K.shape}")
    print(f"  V: {V.shape}")
    print(f"\nOutput shapes:")
    print(f"  Attention weights: {attention_weights.shape}")
    print(f"  Context vector: {context.shape}")
    print(f"\nAttention weights (first sequence):")
    print(attention_weights[0])
    # Sanity check on the softmax: every query row should sum to 1.
    print(f"\nSum of attention weights per row (should be ~1.0):")
    print(np.sum(attention_weights[0], axis=-1))

    # Example 2: several sequences processed together along the batch axis
    print("\n" + "="*60)
    print("--- Example 2: Batch of Sequences ---")
    batch_size = 3
    seq_len = 5
    d_k = 16
    d_v = 16

    Q = np.random.randn(batch_size, seq_len, d_k)
    K = np.random.randn(batch_size, seq_len, d_k)
    V = np.random.randn(batch_size, seq_len, d_v)

    attention_weights, context = scaled_dot_product_attention(Q, K, V)

    print(f"Input shapes:")
    print(f"  Q: {Q.shape}")
    print(f"  K: {K.shape}")
    print(f"  V: {V.shape}")
    print(f"\nOutput shapes:")
    print(f"  Attention weights: {attention_weights.shape}")
    print(f"  Context vector: {context.shape}")

    # Example 3: cross-attention, where the query length differs from the
    # key/value length. K and V are built with the same seq_len_k.
    print("\n" + "="*60)
    print("--- Example 3: Cross-Attention (Q and K have different lengths) ---")
    batch_size = 2
    seq_len_q = 3  # Query sequence length
    seq_len_k = 5  # Key/Value sequence length
    d_k = 8
    d_v = 8

    Q = np.random.randn(batch_size, seq_len_q, d_k)
    K = np.random.randn(batch_size, seq_len_k, d_k)
    V = np.random.randn(batch_size, seq_len_k, d_v)

    attention_weights, context = scaled_dot_product_attention(Q, K, V)

    print(f"Input shapes:")
    print(f"  Q: {Q.shape} (queries from decoder)")
    print(f"  K: {K.shape} (keys from encoder)")
    print(f"  V: {V.shape} (values from encoder)")
    print(f"\nOutput shapes:")
    print(f"  Attention weights: {attention_weights.shape}")
    print(f"  Context vector: {context.shape}")

    print("\n" + "="*60)
    print("All tests completed successfully!")
    print("="*60)

Testing Scaled Dot-Product Attention

--- Example 1: Single Sequence ---
Input shapes:
  Q: (1, 4, 8)
  K: (1, 4, 8)
  V: (1, 4, 8)

Output shapes:
  Attention weights: (1, 4, 4)
  Context vector: (1, 4, 8)

Attention weights (first sequence):
[[0.08431243 0.25513027 0.51521078 0.14534652]
 [0.64059204 0.1332861  0.01664257 0.2094793 ]
 [0.47006414 0.08789379 0.11121405 0.33082801]
 [0.17794451 0.49185018 0.20052305 0.12968226]]

Sum of attention weights per row (should be ~1.0):
[1. 1. 1. 1.]

--- Example 2: Batch of Sequences ---
Input shapes:
  Q: (3, 5, 16)
  K: (3, 5, 16)
  V: (3, 5, 16)

Output shapes:
  Attention weights: (3, 5, 5)
  Context vector: (3, 5, 16)

--- Example 3: Cross-Attention (Q and K have different lengths) ---
Input shapes:
  Q: (2, 3, 8) (queries from decoder)
  K: (2, 5, 8) (keys from encoder)
  V: (2, 5, 8) (values from encoder)

Output shapes:
  Attention weights: (2, 3, 5)
  Context vector: (2, 3, 8)

All tests completed successfully!


In [None]:
"""
Question 2: Implement Simple Transformer Encoder Block (PyTorch)

This implementation includes:
- Multi-head self-attention layer
- Feed-forward network (2 linear layers with ReLU)
- Add & Norm layers (residual connections + layer normalization)

Parameters: d_model = 128, num_heads = 8
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class MultiHeadSelfAttention(nn.Module):
    """
    Multi-head self-attention mechanism.

    Projects the input to queries, keys and values, splits each projection
    into ``num_heads`` heads of width ``d_k = d_model // num_heads``, applies
    scaled dot-product attention per head, then concatenates the heads and
    applies a final output projection.
    """
    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Width of the input and output features.
            num_heads: Number of attention heads; must divide ``d_model``.
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # Dimension per head

        # Linear projections for Q, K, V
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Output projection applied after the heads are concatenated
        self.W_o = nn.Linear(d_model, d_model)

    def split_heads(self, x):
        """
        Split the last dimension into (num_heads, d_k).

        Args:
            x: Tensor of shape (batch_size, seq_len, d_model)

        Returns:
            Tensor of shape (batch_size, num_heads, seq_len, d_k)
        """
        batch_size, seq_len, _ = x.size()
        # Carve d_model into heads, then move the head axis ahead of seq_len
        x = x.view(batch_size, seq_len, self.num_heads, self.d_k)
        return x.transpose(1, 2)  # (batch_size, num_heads, seq_len, d_k)

    def combine_heads(self, x):
        """
        Combine heads back together (inverse of ``split_heads``).

        Args:
            x: Tensor of shape (batch_size, num_heads, seq_len, d_k)

        Returns:
            Tensor of shape (batch_size, seq_len, d_model)
        """
        batch_size, _, seq_len, _ = x.size()
        # transpose() yields a non-contiguous view; view() needs contiguous
        # memory, hence the explicit contiguous() copy.
        x = x.transpose(1, 2).contiguous()  # (batch_size, seq_len, num_heads, d_k)
        return x.view(batch_size, seq_len, self.d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """
        Compute scaled dot-product attention.

        Args:
            Q, K, V: Tensors of shape (batch_size, num_heads, seq_len, d_k)
            mask: Optional tensor broadcastable to
                (batch_size, num_heads, seq_len, seq_len). Positions where
                the mask equals 0 are excluded from attention.

        Returns:
            context: Tensor of shape (batch_size, num_heads, seq_len, d_k)
            attention_weights: Tensor of shape (batch_size, num_heads, seq_len, seq_len)
        """
        # Compute attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Apply mask if provided
        if mask is not None:
            # Fill with the most negative finite value of the score dtype
            # instead of a fixed -1e9: a fixed constant overflows float16 and
            # can be *larger* than genuine scores of very large magnitude,
            # which would let masked positions receive attention.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)

        # Normalise over the key axis
        attention_weights = F.softmax(scores, dim=-1)

        # Weighted sum of the values
        context = torch.matmul(attention_weights, V)

        return context, attention_weights

    def forward(self, x, mask=None):
        """
        Forward pass of multi-head attention.

        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model)
            mask: Optional attention mask, passed through to
                ``scaled_dot_product_attention``

        Returns:
            output: Tensor of shape (batch_size, seq_len, d_model)
        """
        # Linear projections, each (batch_size, seq_len, d_model)
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        # Split into multiple heads: (batch_size, num_heads, seq_len, d_k)
        Q = self.split_heads(Q)
        K = self.split_heads(K)
        V = self.split_heads(V)

        # Apply attention; the weights are not needed by this layer's output
        context, _ = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads
        context = self.combine_heads(context)  # (batch_size, seq_len, d_model)

        # Final linear projection
        output = self.W_o(context)

        return output


class FeedForwardNetwork(nn.Module):
    """
    Position-wise feed-forward network.

    A linear layer mapping d_model -> d_ff, a ReLU, and a second linear layer
    mapping d_ff -> d_model. Both layers act on the last dimension only.
    """
    def __init__(self, d_model, d_ff):
        """
        Args:
            d_model: Width of the input and output features.
            d_ff: Width of the hidden layer between the two projections.
        """
        super().__init__()
        # Expansion to the hidden width, then projection back to d_model.
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """
        Apply Linear -> ReLU -> Linear.

        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model)

        Returns:
            Output tensor of shape (batch_size, seq_len, d_model)
        """
        hidden = self.linear1(x)
        activated = F.relu(hidden)
        return self.linear2(activated)


class TransformerEncoderBlock(nn.Module):
    """
    One transformer encoder block.

    Two sub-layers are applied in sequence, multi-head self-attention and a
    position-wise feed-forward network. Each sub-layer's output goes through
    dropout, is added to that sub-layer's input (residual connection), and
    the sum is layer-normalised.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        """
        Args:
            d_model: Width of the input and output features.
            num_heads: Number of attention heads.
            d_ff: Hidden width of the feed-forward network.
            dropout: Dropout probability applied to each sub-layer output.
        """
        super().__init__()

        # Sub-layers
        self.attention = MultiHeadSelfAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, d_ff)

        # One LayerNorm per residual connection
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # A single Dropout module is reused after both sub-layers
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Run the block: attention -> Add & Norm -> feed-forward -> Add & Norm.

        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model)
            mask: Optional attention mask, forwarded to the attention layer

        Returns:
            Output tensor of shape (batch_size, seq_len, d_model)
        """
        # Sub-layer 1: self-attention, then residual add and normalisation
        attended = self.dropout(self.attention(x, mask))
        hidden = self.norm1(x + attended)

        # Sub-layer 2: feed-forward on the normalised result, same pattern
        transformed = self.dropout(self.ffn(hidden))
        return self.norm2(hidden + transformed)


# ============= TESTING AND VERIFICATION =============

# Demo driver: builds a TransformerEncoderBlock with d_model=128 and
# num_heads=8, prints its structure and parameter count, and checks that the
# output shape equals the input shape for several batch/sequence sizes.
if __name__ == "__main__":
    print("="*70)
    print("Testing Transformer Encoder Block")
    print("="*70)

    # Initialize dimensions as specified
    d_model = 128
    num_heads = 8
    d_ff = 512  # Typical: 4 * d_model
    dropout = 0.1

    print(f"\nModel Configuration:")
    print(f"  d_model: {d_model}")
    print(f"  num_heads: {num_heads}")
    print(f"  d_ff: {d_ff}")
    print(f"  d_k (per head): {d_model // num_heads}")

    # Create the encoder block
    encoder_block = TransformerEncoderBlock(d_model, num_heads, d_ff, dropout)

    # Print model architecture (nn.Module's repr lists the registered sub-modules)
    print(f"\n{'='*70}")
    print("Model Architecture:")
    print(f"{'='*70}")
    print(encoder_block)

    # Count parameters: total number of scalar elements over all parameter tensors
    total_params = sum(p.numel() for p in encoder_block.parameters())
    print(f"\n{'='*70}")
    print(f"Total Parameters: {total_params:,}")
    print(f"{'='*70}")

    # Test with batch of 32 sentences, each with 10 tokens
    print(f"\n{'='*70}")
    print("Verification Test: Batch of 32 sentences, 10 tokens each")
    print(f"{'='*70}")

    batch_size = 32
    seq_len = 10

    # Create random input (simulating token embeddings)
    x = torch.randn(batch_size, seq_len, d_model)

    print(f"\nInput shape: {x.shape}")
    print(f"  Batch size: {batch_size}")
    print(f"  Sequence length: {seq_len}")
    print(f"  Model dimension: {d_model}")

    # Set model to evaluation mode for testing; this turns the block's
    # Dropout layer into a no-op. The mode stays set for the later loop too.
    encoder_block.eval()

    # Forward pass; no_grad() because only the output is inspected here
    with torch.no_grad():
        output = encoder_block(x)

    print(f"\nOutput shape: {output.shape}")

    # Verify shape: the block must map (batch, seq, d_model) to the same shape
    expected_shape = (batch_size, seq_len, d_model)
    assert output.shape == expected_shape, f"Shape mismatch! Expected {expected_shape}, got {output.shape}"

    print(f"\n✓ Shape verification passed!")
    print(f"  Expected: {expected_shape}")
    print(f"  Got: {output.shape}")

    # Additional statistics
    print(f"\n{'='*70}")
    print("Output Statistics:")
    print(f"{'='*70}")
    print(f"  Mean: {output.mean().item():.6f}")
    print(f"  Std: {output.std().item():.6f}")
    print(f"  Min: {output.min().item():.6f}")
    print(f"  Max: {output.max().item():.6f}")

    # Test individual components. These are freshly constructed modules with
    # their own random weights, not the sub-modules held by encoder_block.
    print(f"\n{'='*70}")
    print("Testing Individual Components:")
    print(f"{'='*70}")

    # Test Multi-Head Attention
    attention = MultiHeadSelfAttention(d_model, num_heads)
    with torch.no_grad():
        attn_output = attention(x)
    print(f"\n✓ Multi-Head Attention output shape: {attn_output.shape}")

    # Test Feed-Forward Network
    ffn = FeedForwardNetwork(d_model, d_ff)
    with torch.no_grad():
        ffn_output = ffn(x)
    print(f"✓ Feed-Forward Network output shape: {ffn_output.shape}")

    # Test with different batch sizes and sequence lengths
    print(f"\n{'='*70}")
    print("Testing with Various Input Sizes:")
    print(f"{'='*70}")

    # Each entry is (batch size, sequence length)
    test_cases = [
        (1, 5),      # Single sentence, 5 tokens
        (16, 20),    # 16 sentences, 20 tokens
        (64, 50),    # 64 sentences, 50 tokens
    ]

    # Only the resulting shape is printed; nothing is asserted in this loop.
    for batch, seq in test_cases:
        test_input = torch.randn(batch, seq, d_model)
        with torch.no_grad():
            test_output = encoder_block(test_input)
        print(f"  Input: ({batch}, {seq}, {d_model}) -> Output: {test_output.shape} ✓")

    print(f"\n{'='*70}")
    print("All tests completed successfully!")
    print(f"{'='*70}")

Testing Transformer Encoder Block

Model Configuration:
  d_model: 128
  num_heads: 8
  d_ff: 512
  d_k (per head): 16

Model Architecture:
TransformerEncoderBlock(
  (attention): MultiHeadSelfAttention(
    (W_q): Linear(in_features=128, out_features=128, bias=True)
    (W_k): Linear(in_features=128, out_features=128, bias=True)
    (W_v): Linear(in_features=128, out_features=128, bias=True)
    (W_o): Linear(in_features=128, out_features=128, bias=True)
  )
  (ffn): FeedForwardNetwork(
    (linear1): Linear(in_features=128, out_features=512, bias=True)
    (linear2): Linear(in_features=512, out_features=128, bias=True)
  )
  (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

Total Parameters: 198,272

Verification Test: Batch of 32 sentences, 10 tokens each

Input shape: torch.Size([32, 10, 128])
  Batch size: 32
  Sequence length: 10
  Model dimension: 128

Outp