In [3]:
# Build an Attention Neural Network using PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math

In [18]:
class AttentionNeuralNet(nn.Module):
  """Multi-head scaled dot-product self-attention over a single input sequence.

  The input is projected into queries, keys and values, split into
  ``num_heads`` heads of size ``d_model // num_heads``, attended per head,
  merged back together and passed through a final output projection.

  Args:
    d_model: embedding dimension of the input and output (e.g., 512).
    num_heads: number of attention heads (e.g., 8); must evenly divide
      ``d_model``.

  Raises:
    ValueError: if ``num_heads`` is not positive or does not evenly divide
      ``d_model``.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    # Fail fast: with a non-divisible pair the integer division below would
    # silently truncate and forward() would later die on an opaque view() error.
    if num_heads <= 0:
      raise ValueError(f"num_heads must be positive, got {num_heads}")
    if d_model % num_heads != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")

    self.d_model = d_model    # embedding dimension (e.g., 512)
    self.num_heads = num_heads  # number of attention heads (e.g., 8)
    self.head_dim = d_model // num_heads  # dimension per head (e.g., 64)

    # Create the Q, K, V projection layers
    self.q_proj = nn.Linear(d_model, d_model)
    self.k_proj = nn.Linear(d_model, d_model)
    self.v_proj = nn.Linear(d_model, d_model)

    # Final output projection
    self.out_proj = nn.Linear(d_model, d_model)

  # scaled dot product attention
  def attention(self, Q, K, V):
    """
    Compute softmax(Q @ K^T / sqrt(d_k)) @ V.

    Q, K, V are expected to be of shape:
      [batch_size, seq_len, d_k]
    or possibly
      [batch_size, num_heads, seq_len, d_k]
    if you're already doing multi-head splitting.

    Returns:
      (output, attention_weights): ``output`` has the same leading dimensions
      as Q with V's last dimension; ``attention_weights`` has shape
      [..., seq_len, seq_len] and each row sums to 1 (softmax over the last
      dimension).
    """
    d_k = K.shape[-1]
    # Similarity of every query with every key, scaled by sqrt(d_k).
    scores = Q @ K.transpose(-2, -1)
    scores = scores / math.sqrt(d_k)
    attention_weights = F.softmax(scores, dim=-1)
    output = attention_weights @ V

    return output, attention_weights

  def transopse_akv(self, output, attention_weights, batch_size, seq_len, d_model):
    """Merge the per-head outputs and average the attention weights over heads.

    The method name keeps its original (misspelled) form so existing callers
    continue to work.

    Args:
      output: per-head context vectors, [batch_size, num_heads, seq_len, head_dim].
      attention_weights: per-head weights, [batch_size, num_heads, seq_len, seq_len].
      batch_size, seq_len, d_model: target dimensions of the merged output.

    Returns:
      (output, attention_weights) with shapes [batch_size, seq_len, d_model]
      and [batch_size, seq_len, seq_len].
    """
    # re-order dimensions back to original: [batch, seq, heads, head_dim]
    output = torch.permute(output, (0, 2, 1, 3))
    # reshape the dimensions to "combine" the attention heads outputs
    # (reshape, not view, because the permuted tensor is not contiguous)
    output = output.reshape(batch_size, seq_len, d_model)
    # attention_weights has shape [batch_size, num_heads, seq_len, seq_len]
    # Average across the heads dimension (dim=1)
    attention_weights = attention_weights.mean(dim=1)

    return output, attention_weights

  def forward(self, x):
    """Apply multi-head self-attention to ``x``.

    Args:
      x: input tensor of shape [batch_size, seq_len, d_model].

    Returns:
      (output, attention_weights): ``output`` is [batch_size, seq_len, d_model];
      ``attention_weights`` is [batch_size, seq_len, seq_len], averaged over
      the heads.
    """
    batch_size, seq_len, d_model = x.shape
    Q = self.q_proj(x)
    K = self.k_proj(x)
    V = self.v_proj(x)

    # Reshape to separate the heads
    Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim)
    K = K.view(batch_size, seq_len, self.num_heads, self.head_dim)
    V = V.view(batch_size, seq_len, self.num_heads, self.head_dim)

    # re-order dimensions to be compatible with attention method
    Q = torch.permute(Q, (0, 2, 1, 3))
    K = torch.permute(K, (0, 2, 1, 3))
    V = torch.permute(V, (0, 2, 1, 3))

    output, attention_weights = self.attention(Q, K, V)
    output, attention_weights = self.transopse_akv(output, attention_weights, batch_size, seq_len, d_model)

    output = self.out_proj(output)

    return output, attention_weights


In [21]:
def test_attention_shapes(batch_size=32, seq_len=10, d_model=512, num_heads=8):
    """Check that one forward pass yields tensors of the expected shapes.

    The output must match the input shape, and the attention weights must
    form one seq_len x seq_len matrix per batch element.
    """
    net = AttentionNeuralNet(d_model=d_model, num_heads=num_heads)
    dummy_input = torch.randn(batch_size, seq_len, d_model)

    result, weights = net(dummy_input)

    expected_output_shape = (batch_size, seq_len, d_model)
    expected_weights_shape = (batch_size, seq_len, seq_len)
    assert result.shape == expected_output_shape
    assert weights.shape == expected_weights_shape

def test_attention_weights_sum_to_one(batch_size=32, seq_len=10, d_model=512, num_heads=8):
  """Check that every row of the returned attention weights sums to 1."""
  net = AttentionNeuralNet(d_model=d_model, num_heads=num_heads)
  dummy_input = torch.randn(batch_size, seq_len, d_model)

  _, weights = net(dummy_input)

  # Sum over the last (key) dimension: one total per query position.
  row_totals = weights.sum(dim=-1)
  assert torch.allclose(row_totals, torch.ones_like(row_totals))

In [25]:
# Run both sanity checks with their default sizes; each raises AssertionError on failure.
test_attention_shapes()
test_attention_weights_sum_to_one()

In [28]:
def create_sequence_dataset(num_sequences=1000, seq_length=10, d_model=512):
    """Build a synthetic dataset whose targets mix copying and summing.

    Inputs are standard-normal random sequences. The target equals the input
    everywhere except at positions 2, 5, 8, ..., where it is the sum of the
    two preceding input tokens.

    Returns:
        (X, y): two tensors of shape [num_sequences, seq_length, d_model].
    """
    X = torch.randn(num_sequences, seq_length, d_model)
    y = X.clone()

    # Every third position starting at 2 gets X[pos-1] + X[pos-2]; the three
    # strided slices line up element-for-element with those positions.
    y[:, 2::3] = X[:, 1:-1:3] + X[:, :-2:3]

    return X, y

# Let's test the dataset creation
def test_dataset():
    """Print a small dataset sample so the summing pattern can be eyeballed.

    Shows the tensor shapes, then the first two dimensions of tokens 0 and 1
    of the first input sequence next to target token 2, which should be
    their sum.
    """
    inputs, targets = create_sequence_dataset(num_sequences=5, seq_length=10, d_model=4)
    print("Input shape:", inputs.shape)
    print("Target shape:", targets.shape)

    # Verify the pattern for first sequence
    print("\nFirst sequence, first few dimensions:")
    print("Position 2 should equal sum of positions 0 and 1:")
    first_token = inputs[0, 0]
    second_token = inputs[0, 1]
    summed_target = targets[0, 2]
    print(f"X[0, 0]: {first_token[:2]}")
    print(f"X[0, 1]: {second_token[:2]}")
    print(f"y[0, 2]: {summed_target[:2]}")

# Training loop
def train_attention_model(model, num_epochs=10):
    """Train ``model`` on the synthetic sequence dataset with Adam and MSE loss.

    Each epoch is one full-batch gradient step over the whole dataset. The
    loss is printed every second epoch. The model is updated in place and
    nothing is returned.

    Args:
        model: module whose forward pass returns ``(output, attention_weights)``.
        num_epochs: number of full-batch optimisation steps to run.
    """
    # Build the dataset with the model's own embedding size so models created
    # with a d_model other than 512 can be trained too; fall back to the
    # dataset default when the model does not expose ``d_model``.
    d_model = getattr(model, "d_model", 512)
    X_train, y_train = create_sequence_dataset(d_model=d_model)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()

    for epoch in range(num_epochs):
        optimizer.zero_grad()
        output, _ = model(X_train)
        loss = criterion(output, y_train)
        loss.backward()
        optimizer.step()

        if epoch % 2 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

In [33]:
# Experiment hyperparameters.
batch_size=32  # not used by any call in this cell
seq_len=10  # not used by any call in this cell
d_model=512
num_heads=8
model = AttentionNeuralNet(d_model=d_model, num_heads=num_heads)
# 50 full-batch training steps; the loss is printed every second epoch.
train_attention_model(model, num_epochs=50)

Epoch 0, Loss: 1.3141
Epoch 2, Loss: 1.2966
Epoch 4, Loss: 1.2799
Epoch 6, Loss: 1.2622
Epoch 8, Loss: 1.2419
Epoch 10, Loss: 1.2172
Epoch 12, Loss: 1.1863
Epoch 14, Loss: 1.1490
Epoch 16, Loss: 1.1069
Epoch 18, Loss: 1.0641
Epoch 20, Loss: 1.0253
Epoch 22, Loss: 0.9915
Epoch 24, Loss: 0.9599
Epoch 26, Loss: 0.9270
Epoch 28, Loss: 0.8917
Epoch 30, Loss: 0.8549
Epoch 32, Loss: 0.8181
Epoch 34, Loss: 0.7823
Epoch 36, Loss: 0.7476
Epoch 38, Loss: 0.7134
Epoch 40, Loss: 0.6789
Epoch 42, Loss: 0.6442
Epoch 44, Loss: 0.6099
Epoch 46, Loss: 0.5767
Epoch 48, Loss: 0.5448


In [36]:
def analyze_model(model, seq_length=10, d_model=512):
    """Evaluate ``model`` on one fresh random sequence and print diagnostics.

    Prints the MSE between the prediction and the copy/sum target, followed
    by the attention weights that each summed position (2, 5, 8, ...) assigns
    to the two preceding tokens and to itself.
    """
    summed_positions = range(2, seq_length, 3)

    # One random test sequence with the same target rule used for training.
    sample = torch.randn(1, seq_length, d_model)
    target = sample.clone()
    for idx in summed_positions:
        target[:, idx] = sample[:, idx - 1] + sample[:, idx - 2]

    # Inference only: no gradients needed.
    with torch.no_grad():
        prediction, weights = model(sample)

    criterion = nn.MSELoss()
    test_error = criterion(prediction, target).item()
    print(f"Test MSE: {test_error:.4f}")

    print("\nAttention patterns for summed positions:")
    for idx in summed_positions:
        print(f"\nPosition {idx} attention weights:")
        # Weights from position idx onto tokens idx-2, idx-1 and idx.
        window = weights[0, idx, idx - 2:idx + 1]
        print(window)

def analyze_predictions(model, seq_length=10, d_model=512):
    """Print expected sums next to the model's predictions at summed positions.

    A fresh random sequence of shape [1, seq_length, d_model] is fed through
    ``model``. For each position 2, 5, 8, ... the first five dimensions of
    the expected value (sum of the two preceding input tokens) and of the
    model's prediction are printed.

    Args:
        model: callable returning ``(prediction, attention_weights)``.
        seq_length: length of the random test sequence.
        d_model: embedding dimension of the random test sequence.
    """
    X_test = torch.randn(1, seq_length, d_model)

    # Inference only: no gradients needed.
    with torch.no_grad():
        pred, _ = model(X_test)

    # Compare predictions with expected sums. The expected value is computed
    # directly from the inputs, so no separate target tensor is needed.
    for pos in range(2, seq_length, 3):
        expected_sum = X_test[0, pos-2] + X_test[0, pos-1]
        print(f"\nPosition {pos}:")
        print(f"Expected sum: {expected_sum[:5]}")  # Show first 5 dimensions
        print(f"Prediction:   {pred[0, pos][:5]}")

In [37]:
# Report the test MSE and the attention weights at the summed positions for the trained model.
analyze_model(model)

Test MSE: 0.9548

Attention patterns for summed positions:

Position 2 attention weights:
tensor([0.0551, 0.0303, 0.6893])

Position 5 attention weights:
tensor([0.0646, 0.0256, 0.6734])

Position 8 attention weights:
tensor([0.0313, 0.0477, 0.5942])


In [38]:
# Print expected sums next to the trained model's predictions at the summed positions.
analyze_predictions(model)


Position 2:
Expected sum: tensor([-3.8203, -0.2196, -0.1141, -0.7116,  0.2415])
Prediction:   tensor([-0.2672, -0.2304,  0.3185,  0.7512,  1.3425])

Position 5:
Expected sum: tensor([ 0.1566, -0.0233,  0.6566, -1.4544, -2.7014])
Prediction:   tensor([-1.0759, -1.0394,  0.6194, -1.0045,  0.4681])

Position 8:
Expected sum: tensor([ 0.1154, -0.6889,  1.8166, -0.6379, -1.6967])
Prediction:   tensor([-0.3267, -0.0108, -0.0664, -0.3468, -0.2812])


# NOTES

The input embedding dimension (`d_model`) is the length of each token's embedding vector as it enters the attention layer. -> Text is split into tokens, and each token is converted to an embedding vector that numerically represents the token in a learned embedding space.

Attention weights have a different dimension than the outputs.

For the attention weights:

They come from the scores calculation: scores = Q @ K.transpose(-2, -1)

* Q shape: [batch_size, num_heads, seq_len, head_dim]
* K.transpose shape: [batch_size, num_heads, head_dim, seq_len]
* When you multiply these, you get: [batch_size, num_heads, seq_len, seq_len]

The key difference is that attention weights represent how much each token attends to every other token.

Attention Weights:

These tell you HOW MUCH each token should pay attention to every other token
They are probabilities (sum to 1) showing the relative importance of each token relationship
Shape: [batch_size, num_heads, seq_len, seq_len]
Example: If token 1's attention weights are [0.7, 0.2, 0.1], it means it's paying 70% attention to token 1, 20% to token 2, and 10% to token 3

Context Vectors:

These are the actual NEW REPRESENTATIONS of each token after applying the attention
They contain the weighted combination of information from all tokens based on the attention weights
Shape: [batch_size, num_heads, seq_len, head_dim]
Example: If V contains token representations [v1, v2, v3], and attention weights are [0.7, 0.2, 0.1], the context vector would be `0.7*v1 + 0.2*v2 + 0.1*v3`

In simpler terms:

Attention weights tell you "what to focus on"
Context vectors are "what you learned" after focusing on those things

Think of it like reading a book:

Attention weights are like highlighting parts of text (70% highlighted here, 20% there, etc.)
Context vectors are the actual information you extracted after considering all those highlighted parts together


Seq2Seq models use encoders and decoders. The encoder processes each item in the input sequence and compiles the information it captures into a vector (called the context). After processing the entire input sequence, the encoder sends the context over to the decoder, which begins producing the output sequence item by item.

Word embeddings turn words/tokens into a vector that capture a lot of the meaning/semantic information of the words.


# notes from Dive Into Deep Learning Chapter on Attention & Transformers

https://d2l.ai/chapter_attention-mechanisms-and-transformers/index.html

In sequence-to-sequence models, some input sequences may be longer than others. For this reason, we pad the shorter sequences with padding tokens so that all inputs in a batch have the same length. To keep the padding from receiving attention, we can use a "masked softmax", where positions beyond the valid length of each sequence are masked so that their attention weights are zero.


Multihead attention -> All heads operate on the same input in parallel but with different learned weights. This parallelism makes the model more expressive, as it’s learning multiple attention “patterns” at once.

Had a question regarding how Q, K, and V matrices are actually learned. Answer from Claude:

The key insight is that the Q, K, V matrices aren't directly supervised (we never tell them "this is what a good query looks like"). Instead, they learn useful transformations because:

* If Q and K matrices learn to produce vectors that give high dot products for related tokens
* And V matrices learn to produce useful value representations
* Then the final output will better predict the next token
* Which reduces the loss

## Notes from the Illustrated Transformer Blog

https://jalammar.github.io/illustrated-transformer/

What is the difference between self attention and encoder-decoder attention?

From Grok3:

Self-Attention: Focuses on relationships within one sequence (input for the encoder, output for the decoder). It’s about internal context. In the decoder, it’s masked to enforce sequential generation.

Encoder-Decoder Attention: Focuses on relationships between two sequences—the input (from the encoder) and the output (being generated by the decoder). It’s about alignment and translation relevance.

### Attention:
**As the model processes each word (each position in the input sequence), self attention allows it to look at other positions in the input sequence for clues that can help lead to a better encoding for this word.**
* What are Query, Key, and Value vectors? They are just abstractions that are useful for calculating and thinking about attention.
* Calculating an Attention Score is scoring each word of the input sequence against the current word the model is analyzing
* The score determines how much emphasis to put on other parts of the input sentence as we encode a word at a certain position
* Score is calculated by taking dot product of the **query vector** and **key vector** of the respective word we're scoring
    * based on previous reading, during training the model learns to adjust the query and key matrices so that this operation returns a result that will tell it to attend to parts of the sentence that help it predict the correct answer!
    * Given the input "Thinking machines!" -> the score for how much "Thinking" should attend to itself is `q1 · k1`, and the score for how much it should attend to "Machines" is `q1 · k2`
    * From Grok: **The dot product measures similarity: a higher dot product means the query and key vectors are more aligned, indicating that the word associated with that key is more relevant to the word being processed.**
    * **Through gradient descent, the model tweaks these matrices so that the dot products (q · k) yield higher scores for word pairs that are contextually relevant and lower scores for irrelevant ones.**

* Next step is to divide the scores by the square root of the dimension of the key vectors (`sqrt(d_k)`). This scaling keeps the scores from growing too large, which leads to more stable gradients.
* Then, we pass the results through a softmax function to ensure they are all positive and sum to 1.
* After scaling (to avoid large values) and softmax normalization, these scores become the attention weights.
* The softmax score determines how much each word will be expressed at this position
* Next step is to multiply each value vector by its softmax score. We want to drown out the values of irrelevant words by multiplying them by very small numbers (close to zero)
* Next step is to sum up the values of the weighted vectors. This produces the output of the self-attention layer at this position (for the first word).
* This resulting vector is the output of the self-attention mechanism and can then be passed onward to the feed forward layer.

### Matrix Calculation of Self-Attention

1. Pack embeddings into a matrix. Multiply this matrix by the Q, K, V matrices to produce an output matrix of the results.
2. Matrix multiply the result of the Q layer by the transpose of the result of the K layer. Divide the result by the square root of the key dimension (`sqrt(d_k)`) and pass it through a softmax function. Then matrix multiply that result by the output of the V layer.

Wrote this as one step because this is considered one step in the blog post since we are dealing with matrices. Can easily do this in a single line of code if needed.

### Multi-Head Attention

With multi-headed attention, we maintain separate Q/K/V weight matrices for each head, so each head produces its own Q/K/V matrices. The purpose is to expand the model's ability to focus on different positions. In the example above, the result from the attention mechanism contains a little information from other words (through the query-key scores for those words), but it can be dominated by the word itself. **It gives the attention layer multiple “representation subspaces”.**

The result from this is going to be a Z matrix for every head. The feed forward network is only expecting one matrix. We need a way to combine these matrices into one in a way that still contains all the information and context present when they are separate matrices. 

The blog suggests concatenating them and multiplying the matrix by another W matrix, that is trained as well. The result would be the Z matrix that captures information
from all the attention heads. We can send this forward to the FFNN.

### Positional Encodings

We are missing positional encodings in this description. These allow the model to take into account where in the input each word occurs. We use positional encoding vectors and add them to the input embeddings to generate new vectors that take position into account. This gives the model a sense of the order of the words.

**The intuition here is that adding these values to the embeddings provides meaningful distances between the embedding vectors once they’re projected into Q/K/V vectors and during dot-product attention.**

The blog does not go into detail on how positional encodings are derived. Might have to research that myself.

### Decoder Side

The encoder starts by processing input embeddings. There can be multiple encoders in a transformer architecture. The decoder takes the output of the last encoder as input, which is an attention weighted embedding vector for each word.

The decoder then has to use its encoder-decoder attention mechanism to deconstruct this into K and V matrices. It does this by passing this input into its own K/V matrices with weights that have been trained. The decoder itself provides the Q values.

1. Input to K and V: The encoder’s output (let’s call it H_enc) is passed into the encoder-decoder attention mechanism.
2. Linear Projections: Inside this attention layer, H_enc is transformed into K and V using two separate learned weight matrices:
    * K = H_enc * W_K (where W_K is the key projection matrix).
    * V = H_enc * W_V (where W_V is the value projection matrix).
3. Query from Decoder: The decoder generates its own Q vectors from its current internal state (the output of the masked self-attention layer), using a third weight matrix: Q = H_dec * W_Q.
4. Attention Computation: The attention scores are computed as Q · K^T, normalized with softmax, and used to weight the V vectors, producing the final output of this layer.

In the decoder, the self-attention layer is slightly different from the one in the encoder. It is only allowed to attend to earlier positions in the output sentence. This is done by masking future positions (setting them to -inf) before the softmax step in the self-attention calculation.

The output of the final decoder produces a vector of floats, but we need to translate this into a word. That’s the job of the final Linear layer which is followed by a Softmax Layer.

The linear layer is a FFNN that converts the decoder vector into a much, much larger vector called a logits vector. The logits vector is the same length as the output vocabulary. For example, if the output vocab is 10,000 unique English words, then the logits vector would be of length 10,000 - with each cell in the vector corresponding to a unique word in the vocab. The softmax layer then turns those scores into probabilities (all positive, all add up to 1.0). The cell with the highest probability is chosen, and the word associated with it is produced as the output for this time step.

### Steps:
1. Word Embeddings -> Convert input words into embeddings, or vectors, that represent numerically what the word is.
2. The embeddings for each word are passed through each of the two layers of the Encoder (Self-Attention -> Feed Forward)
    * each word flows through the exact same network individually
    * each word embedding follows its own path through the encoder

3. 