In [1]:
from typing import Dict, List, Union, Tuple, Optional, Any
import numpy as np
import matplotlib.pyplot as plt

# Import our custom tokenizer and embedding from previous notebook
from utils.tokenizer import Tokenizer
from utils.embedding_pc import Embedding,get_positional_encoding

## Redo tokenization and embedding to reinforce them in my memory

In [2]:
# Sample data for the tokenizer: a single-sentence corpus
texts = [
    "the quick brown fox jumps over the lazy dog",
]

# Whitespace-split words of the sentence; reused below for the index lookup
tokens = texts[0].split()

# Create the tokenizer and fit it on the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert texts to sequences (kept for reference; not consumed in this cell)
sequences = tokenizer.texts_to_sequences(texts)

d_model = 64  # Larger dimension for demonstration
embedding = Embedding(
    vocab_size=tokenizer.vocab_size,
    d_model=d_model,
    padding_idx=tokenizer.word_to_index[tokenizer.pad_token],
)

# Look up each word's vocabulary index, falling back to the unknown-token index
unk_idx = tokenizer.word_to_index[tokenizer.unk_token]
token_idxs = [tokenizer.word_to_index.get(word, unk_idx) for word in tokens]
token_embeddings = np.array([embedding(token_idx) for token_idx in token_idxs])
# 9 words -> 9 tokens, each embedded in d_model (= 64) dimensions
print(f'Dimension of senetence one in texts is {token_embeddings.shape}')

# Generate positional encodings, one per token (seq_length = number of tokens)
seq_length: int = len(token_embeddings)
pos_encodings: np.ndarray = get_positional_encoding(seq_length, d_model=d_model)

# Add positional encodings to token embeddings (element-wise sum)
token_pos_embeddings: np.ndarray = token_embeddings + pos_encodings

print(f'Dimension of after positional encoding is {token_pos_embeddings.shape}')

Dimension of senetence one in texts is (9, 64)
The shape of the positional encoding is (9, 64)
Dimension of after positional encoding is (9, 64)


## Self-attention

In [3]:
import numpy as np

# SelfAttention class implementation
class SelfAttention:
    """
    Multi-head scaled dot-product self-attention for a single, unbatched
    sequence, implemented with NumPy.

    The layer projects the input into queries, keys and values, computes
    softmax(Q K^T / sqrt(d_head)) V per head, concatenates the heads, applies
    an output projection and adds the input back as a residual connection.
    """

    def __init__(self, d_model: int, num_heads: int = 8):
        """
        Initialize a self-attention layer.

        Args:
            d_model: Dimensionality of the input embeddings
            num_heads: Number of attention heads (defaults to 8)

        Raises:
            ValueError: If num_heads > 1 and d_model is not divisible by
                num_heads (the head split would otherwise fail later with an
                obscure reshape error).
        """
        if num_heads > 1 and d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.d_model = d_model  # Dimension of model embeddings
        self.num_heads = num_heads  # Number of attention heads
        # Dimension of each head; a single head spans the whole model dimension
        self.d_head = d_model // num_heads if num_heads > 1 else d_model

        # Random initialization of the Q, K, V and output projections.
        # A private RandomState seeded with 42 yields the same values as
        # np.random.seed(42) followed by np.random.randn, but leaves the
        # global NumPy random state untouched.
        rng = np.random.RandomState(42)
        self.W_q = rng.randn(d_model, d_model) * 0.1  # Shape: (d_model, d_model)
        self.W_k = rng.randn(d_model, d_model) * 0.1  # Shape: (d_model, d_model)
        self.W_v = rng.randn(d_model, d_model) * 0.1  # Shape: (d_model, d_model)
        self.W_o = rng.randn(d_model, d_model) * 0.1  # Shape: (d_model, d_model)

    def split_heads(self, x: np.ndarray) -> np.ndarray:
        """
        Split the last dimension into (num_heads, d_head)

        Args:
            x: Input tensor of shape (seq_len, d_model)

        Returns:
            Tensor of shape (num_heads, seq_len, d_head)
        """
        seq_len = x.shape[0]  # No batch dimension in this implementation

        # (seq_len, d_model) -> (seq_len, num_heads, d_head) -> (num_heads, seq_len, d_head)
        return x.reshape(seq_len, self.num_heads, self.d_head).transpose(1, 0, 2)

    def combine_heads(self, x: np.ndarray) -> np.ndarray:
        """
        Combine heads back to original shape

        Args:
            x: Input tensor of shape (num_heads, seq_len, d_head)

        Returns:
            Tensor of shape (seq_len, d_model)
        """
        # (num_heads, seq_len, d_head) -> (seq_len, num_heads, d_head)
        x = x.transpose(1, 0, 2)

        # Merge the last two dimensions: (seq_len, num_heads * d_head) == (seq_len, d_model)
        return x.reshape(x.shape[0], self.d_model)

    def _attend(
        self,
        q: np.ndarray,
        k: np.ndarray,
        v: np.ndarray,
        mask: Optional[np.ndarray] = None,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Scaled dot-product attention over the last two axes.

        Works for a single head (2-D inputs) and for stacked heads (3-D inputs
        with a leading num_heads axis); a (seq_len, seq_len) mask broadcasts
        across the head axis.

        Args:
            q: Queries, shape (..., seq_len, d_head)
            k: Keys, shape (..., seq_len, d_head)
            v: Values, shape (..., seq_len, d_head)
            mask: Optional mask of shape (seq_len, seq_len)

        Returns:
            Tuple of (attended values with the same shape as v,
            attention weights of shape (..., seq_len, seq_len))
        """
        # Similarity of every query with every key, scaled by sqrt(d_head)
        scores = np.matmul(q, np.swapaxes(k, -1, -2)) / np.sqrt(self.d_head)

        # The mask is multiplied by -1e9 and added, so nonzero entries push
        # the corresponding scores to a large negative value before softmax
        if mask is not None:
            scores = scores + (mask * -1e9)

        weights = self._softmax(scores)  # Each row sums to 1
        return np.matmul(weights, v), weights

    def forward(
        self, x: np.ndarray, mask: Optional[np.ndarray] = None
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Apply self-attention to input with residual connection.

        Args:
            x: Input tensor of shape (seq_len, d_model)
            mask: Optional mask tensor of shape (seq_len, seq_len); nonzero
                entries mark positions that should receive near-zero attention

        Returns:
            Tuple (output, weights):
                output: tensor of shape (seq_len, d_model)
                weights: attention weights, shape (num_heads, seq_len, seq_len)
                    when num_heads > 1, otherwise (seq_len, seq_len)
        """
        # Linear projections
        q = np.dot(x, self.W_q)  # Shape: (seq_len, d_model)
        k = np.dot(x, self.W_k)  # Shape: (seq_len, d_model)
        v = np.dot(x, self.W_v)  # Shape: (seq_len, d_model)

        if self.num_heads > 1:
            # Attend in every head at once, then concatenate the heads
            context, weights = self._attend(
                self.split_heads(q),
                self.split_heads(k),
                self.split_heads(v),
                mask,
            )  # context: (num_heads, seq_len, d_head)
            context = self.combine_heads(context)  # Shape: (seq_len, d_model)
        else:
            # Single head: attend over the full model dimension
            context, weights = self._attend(q, k, v, mask)

        # Final linear projection followed by the residual connection
        output = np.dot(context, self.W_o) + x  # Shape: (seq_len, d_model)

        return output, weights  # Attention weights are returned for visualization

    def _softmax(self, x: np.ndarray, axis: int = -1) -> np.ndarray:
        """
        Compute softmax values for each set of scores in x.

        Args:
            x: Input array
            axis: Axis along which to apply softmax

        Returns:
            Softmax values
        """
        # Subtract max for numerical stability
        x_max = np.max(x, axis=axis, keepdims=True)
        e_x = np.exp(x - x_max)
        return e_x / np.sum(e_x, axis=axis, keepdims=True)


In [4]:
## Logic
## 1. Project the input sequence (the token embeddings) into Q, K and V with three separate matrix multiplications.
## 2. Split Q, K and V into multiple heads; d_model must be divisible by num_heads.
## 3. Each head has shape (seq_len, d_head), where d_head == d_model / num_heads.
## 4. Compute the attention weights for each head.
## 5. The raw attention scores are the matrix product Q @ K.T (scaled by 1/sqrt(d_head)); shape is (seq_len, seq_len).
## 6. Apply the mask to the attention scores if one is set: add a large negative value (like -1e9) before the softmax, which effectively results in near-zero attention after the softmax is applied.
## 7. Softmax the attention scores so they lie between 0 and 1; each value is the relation/attention between one query and one key.
## 8. attention weights (seq_len, seq_len) @ V (seq_len, d_head) -> the head's attention output (seq_len, d_head).
## 9. V is the second operand so that the result keeps the d_head dimension.
## 10. Concatenate the list of per-head attention outputs into (seq_len, d_model).
## 11. Apply a linear projection (fully connected layer), giving (seq_len, d_model).
## 12. Add the residual to the output; the residual is the embedding with positional encoding.
## 13. Return the attention weights of the current attention block along with the output.

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
# Build the self-attention layer
num_heads = 4
attention = SelfAttention(d_model=d_model, num_heads=num_heads)

# Forward pass over the position-aware token embeddings
output, attention_weights = attention.forward(token_pos_embeddings)
print(f"Self-attention output shape: {output.shape}")

# Show how much attention the word "fox" pays to every token
fox_idx = tokens.index("fox")
print("\nAttention highlights for 'fox':")
if num_heads > 1:
    # Multi-head: the leading axis of the weights indexes the heads
    for head_number, head_weights in enumerate(attention_weights, start=1):
        print(f"Head {head_number}:")
        attention_scores = head_weights[fox_idx]
        for token, score in zip(tokens, attention_scores):
            print(f"  {token}: {score:.4f}")
else:
    # Single head: the weights are a plain (seq_len, seq_len) matrix
    attention_scores = attention_weights[fox_idx]
    for token, score in zip(tokens, attention_scores):
        print(f"  {token}: {score:.4f}")

Self-attention output shape: (9, 64)

Attention highlights for 'fox':
Head 1:
  the: 0.0926
  quick: 0.0902
  brown: 0.0975
  fox: 0.1177
  jumps: 0.1210
  over: 0.1277
  the: 0.1292
  lazy: 0.1129
  dog: 0.1111
Head 2:
  the: 0.1165
  quick: 0.1157
  brown: 0.1187
  fox: 0.1242
  jumps: 0.1270
  over: 0.1112
  the: 0.1040
  lazy: 0.0911
  dog: 0.0915
Head 3:
  the: 0.1211
  quick: 0.1147
  brown: 0.1273
  fox: 0.1234
  jumps: 0.1186
  over: 0.1133
  the: 0.1040
  lazy: 0.0895
  dog: 0.0882
Head 4:
  the: 0.0832
  quick: 0.0633
  brown: 0.0757
  fox: 0.1078
  jumps: 0.1477
  over: 0.1609
  the: 0.1453
  lazy: 0.1156
  dog: 0.1005
