In [1]:
from typing import Dict, List, Union, Tuple, Optional, Any
import numpy as np
import matplotlib.pyplot as plt

# Import our custom tokenizer and embedding from previous notebook
from utils.tokenizer import Tokenizer
from utils.embedding_pc import Embedding,get_positional_encoding

## Redo tokenization and embedding to reinforce them in my memory

In [2]:
# Sample data for tokenizer: a single nine-word sentence is enough for this demo
texts = [
    "the quick brown fox jumps over the lazy dog",
]

# Create tokenizer
tokenizer = Tokenizer()

# Fit tokenizer on texts
tokenizer.fit_on_texts(texts)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)

d_model = 6  # Small dimension for demonstration
embedding = Embedding(
    vocab_size=tokenizer.vocab_size,
    d_model=d_model,
    padding_idx=tokenizer.word_to_index[tokenizer.pad_token]
)

# Map every word of the first sentence to its vocabulary index. Words missing
# from the vocabulary fall back to the index of the unknown token, which is
# looked up once instead of once per word.
unk_idx = tokenizer.word_to_index[tokenizer.unk_token]
token_idxs = [tokenizer.word_to_index.get(word, unk_idx)
              for word in texts[0].split()]
token_embeddings = np.array([embedding(token_idx) for token_idx in token_idxs])
# 9 words -> 9 tokens; expected shape is (9, d_model), one row per token
print(f'Dimension of senetence one in texts is {token_embeddings.shape}')

# Generate positional encodings, one per token (seq_length = number of tokens)
seq_length: int = len(token_embeddings)
pos_encodings: np.ndarray = get_positional_encoding(seq_length, d_model=d_model)

# Guard the element-wise sum below: a shape mismatch could otherwise be hidden
# by NumPy broadcasting instead of failing loudly.
assert token_embeddings.shape == pos_encodings.shape, (
    f"token embeddings {token_embeddings.shape} and positional encodings "
    f"{pos_encodings.shape} must have the same shape"
)

# Add positional encodings to token embeddings
token_pos_embeddings: np.ndarray = token_embeddings + pos_encodings

print(f'Dimension of after positional encoding is {token_pos_embeddings.shape}')

Dimension of senetence one in texts is (9, 5)
The shape of the positional encoding is (9, 5)
Dimension of after positional encoding is (9, 5)


## Self-attention

In [None]:
# Q/K/V self-attention implemented in plain NumPy
class SelfAttention:
    """
    Scaled dot-product self-attention (single- or multi-head) in plain NumPy.

    The layer operates on one unbatched sequence of shape (seq_len, d_model).
    Inputs are projected to queries, keys and values, split into ``num_heads``
    heads of width ``d_head = d_model // num_heads``, attended per head,
    recombined, and passed through a final output projection.

    Attributes:
        d_model: Width of the input/output embeddings.
        num_heads: Number of attention heads.
        d_head: Width of each head (``d_model // num_heads``).
        W_q, W_k, W_v, W_o: Projection matrices, each (d_model, d_model).
        last_attention_weights: Attention weights of every head from the most
            recent ``forward`` call, shape (num_heads, seq_len, seq_len), or
            ``None`` before the first call.
    """

    def __init__(self, d_model: int, num_heads: int = 2, seed: Optional[int] = 42):
        """
        Initialize a self-attention layer.

        Args:
            d_model: Dimensionality of the input embeddings
            num_heads: Number of attention heads (defaults to 2 for basic implementation)
            seed: Seed passed to ``np.random.seed`` before the weights are
                drawn (default 42, for reproducibility). Note that this
                re-seeds NumPy's *global* random state. Pass ``None`` to leave
                the global state untouched.

        Raises:
            ValueError: If ``num_heads`` is smaller than 1 or ``d_model`` is
                not divisible by ``num_heads``.
        """
        if num_heads < 1:
            raise ValueError(f"num_heads must be >= 1, got {num_heads}")
        # Fail early with a clear message instead of a reshape error in forward().
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_head = d_model // num_heads

        # Initialize weights for Q, K, V and output projections with small
        # random values. The draw order (q, k, v, o) is kept fixed so that a
        # given seed always yields the same weights.
        if seed is not None:
            np.random.seed(seed)
        self.W_q = np.random.randn(d_model, d_model) * 0.1
        self.W_k = np.random.randn(d_model, d_model) * 0.1
        self.W_v = np.random.randn(d_model, d_model) * 0.1
        self.W_o = np.random.randn(d_model, d_model) * 0.1

        # Per-head attention weights of the latest forward pass.
        self.last_attention_weights: Optional[np.ndarray] = None

    def split_heads(self, x: np.ndarray) -> np.ndarray:
        """
        Split the last dimension into (num_heads, d_head)

        Args:
            x: Input tensor of shape (seq_len, d_model)

        Returns:
            Tensor of shape (num_heads, seq_len, d_head)
        """
        seq_len = x.shape[0]
        # (seq_len, d_model) -> (seq_len, num_heads, d_head) -> (num_heads, seq_len, d_head)
        return x.reshape(seq_len, self.num_heads, self.d_head).transpose(1, 0, 2)

    def combine_heads(self, x: np.ndarray) -> np.ndarray:
        """
        Combine heads back to original shape (inverse of ``split_heads``)

        Args:
            x: Input tensor of shape (num_heads, seq_len, d_head)

        Returns:
            Tensor of shape (seq_len, d_model)
        """
        seq_len = x.shape[1]
        # (num_heads, seq_len, d_head) -> (seq_len, num_heads, d_head) -> (seq_len, d_model)
        return x.transpose(1, 0, 2).reshape(seq_len, self.d_model)

    def _attend(
        self,
        q: np.ndarray,
        k: np.ndarray,
        v: np.ndarray,
        mask: Optional[np.ndarray],
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Scaled dot-product attention for a single head.

        Args:
            q, k, v: Arrays of shape (seq_len, d_head)
            mask: Optional (seq_len, seq_len) array; non-zero entries mark
                positions that must not be attended to

        Returns:
            Tuple ``(output, weights)`` with shapes (seq_len, d_head) and
            (seq_len, seq_len)
        """
        # Scale by sqrt(d_head) to keep the logits in a range where softmax
        # does not saturate.
        scores = np.dot(q, k.T) / np.sqrt(self.d_head)

        # Masked positions get a large negative logit, so their softmax
        # weight underflows to zero.
        if mask is not None:
            scores = scores + (mask * -1e9)

        weights = self._softmax(scores)
        return np.dot(weights, v), weights

    def forward(
        self, x: np.ndarray, mask: Optional[np.ndarray] = None
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Apply self-attention to input.

        Args:
            x: Input tensor of shape (seq_len, d_model)
            mask: Optional mask tensor of shape (seq_len, seq_len). Entries
                equal to 1 (or True) are masked out; entries equal to 0 are
                left untouched.

        Returns:
            Tuple ``(output, weights)``:
              * ``output``: tensor of shape (seq_len, d_model)
              * ``weights``: attention weights of the *last* head, shape
                (seq_len, seq_len), returned for visualization. The weights
                of all heads are stored in ``self.last_attention_weights``
                with shape (num_heads, seq_len, seq_len).

        Raises:
            ValueError: If ``x`` is not a non-empty 2-D array whose second
                dimension equals ``d_model``.
        """
        x = np.asarray(x)
        if x.ndim != 2 or x.shape[1] != self.d_model:
            raise ValueError(
                f"x must have shape (seq_len, {self.d_model}), got {x.shape}"
            )
        if x.shape[0] == 0:
            raise ValueError("x must contain at least one token (seq_len >= 1)")
        if mask is not None:
            mask = np.asarray(mask)

        # Linear projections followed by the head split:
        # (seq_len, d_model) -> (num_heads, seq_len, d_head).
        # With num_heads == 1 the split is a no-op view, so single- and
        # multi-head attention share one code path.
        q = self.split_heads(np.dot(x, self.W_q))
        k = self.split_heads(np.dot(x, self.W_k))
        v = self.split_heads(np.dot(x, self.W_v))

        # Process each head independently
        head_outputs = []
        head_weights = []
        for head_q, head_k, head_v in zip(q, k, v):
            head_output, weights = self._attend(head_q, head_k, head_v, mask)
            head_outputs.append(head_output)
            head_weights.append(weights)

        # Keep every head's attention map available for inspection.
        self.last_attention_weights = np.stack(head_weights)

        # Concatenate heads: (num_heads, seq_len, d_head) -> (seq_len, d_model)
        output = self.combine_heads(np.stack(head_outputs))

        # Final linear projection
        output = np.dot(output, self.W_o)  # (seq_len, d_model)

        # The last head's weights are returned to stay compatible with
        # existing callers that expect a single 2-D matrix.
        return output, head_weights[-1]

    def __call__(
        self, x: np.ndarray, mask: Optional[np.ndarray] = None
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Alias for ``forward`` so the layer can be called like a function."""
        return self.forward(x, mask)

    def _softmax(self, x: np.ndarray, axis: int = -1) -> np.ndarray:
        """
        Compute softmax values for each set of scores in x.

        Args:
            x: Input array
            axis: Axis along which to apply softmax

        Returns:
            Softmax values (same shape as ``x``; sums to 1 along ``axis``)
        """
        # Subtract max for numerical stability (avoids overflow in exp)
        x_max = np.max(x, axis=axis, keepdims=True)
        e_x = np.exp(x - x_max)
        return e_x / np.sum(e_x, axis=axis, keepdims=True)