In [1]:
from typing import Dict, List, Union, Tuple, Optional, Any
import numpy as np
import matplotlib.pyplot as plt

# Import our custom tokenizer and embedding from previous notebook
from utils.tokenizer import Tokenizer
from utils.embedding_pc import Embedding,get_positional_encoding

## Re-do tokenization and embedding again to emphaisis it in my memory

In [14]:
texts = [
    "the quick brown fox jumps over the lazy dog",
    "sentence two"
]

tokens = ''.join(texts).split() 
# Create tokenizer
tokenizer = Tokenizer()

# Fit tokenizer on texts
tokenizer.fit_on_texts(texts)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(texts)

In [16]:


d_model = 64  # Larger dimension for demonstration
embedding = Embedding(
    vocab_size=tokenizer.vocab_size,
    d_model=d_model,
    padding_idx=tokenizer.word_to_index[tokenizer.pad_token]
)

token_idxs = [tokenizer.word_to_index.get(word,\
                    tokenizer.word_to_index[tokenizer.unk_token]) 
              for word in tokens]
token_embeddings = np.array([embedding(token_idx) for token_idx in token_idxs])
# 9 words/9 tokens with 5 embedding dimension
print(f'Dimension of senetence one in texts is {token_embeddings.shape}')

# Generate positional encodings, aka noumber of tokens
seq_length: int = len(token_embeddings)
pos_encodings: np.ndarray = get_positional_encoding(seq_length, d_model=d_model)

# Add positional encodings to token embeddings
token_pos_embeddings: np.ndarray = token_embeddings + pos_encodings

print(f'Dimension of after positional encoding is {token_pos_embeddings.shape}')

Dimension of senetence one in texts is (10, 64)
The shape of the positional encoding is (10, 64)
Dimension of after positional encoding is (10, 64)


## Self-attention

In [4]:
## Logic
## 1. Input sequence (Embedding of the tokens) perform matrix multiplication Q,v,K individually/.
## 2. Split into multiple heads, d_model required to be divisible by num_heads
## 3. Each head has dimension (seq_length,d_head) -> d_head == d_model/num_heads
## 4. Compute attention weights for each head;
## 5. attention weights are matrix muplification by Q * K.t, shape is (seq_len,seq_len)
## 6. at mask to attnetion weights if set. add a large negative value (like -1e9) before softmax, which effectively results in near-zero attention after softmax is applied
## 7. Softmax the attnetion weights to become 0 to 1, which mean relation/attention between each Q to each key
## 8. attention weight * Value(seq_len,d_head) -> head's attention score (seq_len,d_head)
## 9. Value in 2nd position since d_head need to return.
## 10. Concat the list of attention scors to (seq_len,d_model)
## 11. Linear projection by fully connected layer to (seq_len,d_model)
## 12. Added residual to the output, residual is the embedding with positional enocoding
## 13. output the attention weight for current attention block

In [5]:
# Create self-attention layer
num_heads = 4
attention = SelfAttention(d_model=d_model, num_heads=num_heads)

# Run self-attention
output, attention_weights = attention.forward(token_pos_embeddings)
print(f"Self-attention output shape: {output.shape}")

Self-attention output shape: (9, 64)


In [6]:
class SelfAttention:
    def __init__(self, d_model: int, num_heads: int = 8):
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_head = d_model // num_heads
        assert (num_heads > 1), "Number of heads should be greater than 1"
        assert (d_model % num_heads == 0), "d_model must be divisible by num_heads"
        np.random.seed(42)  # For reproducibility
        self.W_q = np.random.randn(d_model,d_model) * 0.1 #(d_model,d_dmodel)
        self.W_k = np.random.randn(d_model, d_model) * 0.1  # Shape: (d_model, d_model)
        self.W_v = np.random.randn(d_model, d_model) * 0.1  # Shape: (d_model, d_model)
        self.W_o = np.random.randn(d_model, d_model) * 0.1  # Shape: (d_model, d_model)
    
    def split_heads(self, x: np.array) ->np.array:
        # x from (batch,seq_len, d_model) to (batch_size, num_head,seq_len,d_head)
    
        batch_size, seq_len , _ = x.shape
        # reshape from (batch,seq_len, d_model) to (batch,seq_len, num_heads, d_heads)
        x = x.reshape(batch_size,seq_len,self.num_heads,self.d_head)

        return x.transpose(0,2,1,3)
    def combine_head(self, x: np.array)->np.array:
        # Combine all hteads back to d_model
        # From (batch_size,num_heads,seq_len,d_head)
        # to (batch_size,seq_len,d_model)
    
        batch_size , _ , seq_len , _ =x.shape
        # (seq_len,num_head,d_head)
        x = x.transpose(0,2,1,3)
        return x.reshape(batch_size,seq_len,self.d_model)
    def forward(self,x:np.array) -> np.array:
        """
        Apply self-attention to input with residual connection.
        
        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model)
            
        Returns:
            Output tensor of shape (batch_size, seq_len, d_model)
        """
        assert (len(x.shape)==3), 'Dimension of input need to be 3. (Batch_size,seq_len,d_model)'
        residual = x.copy()
        batch_size,seq_len,_ = x.shape

        # linear projects
        # (batch_size, seq_len, d_model) * (d_model,d_model)
        q = np.dot(x,self.W_q)
        k = np.dot(x,self.W_k)
        v = np.dot(x,self.W_v)
        # (batch_size, seq_len, d_model) 
        q = self.split_heads(q)  # Shape: (num_heads, seq_len, d_head)
        k = self.split_heads(k)  # Shape: (num_heads, seq_len, d_head)
        v = self.split_heads(v)  # Shape: (num_heads, seq_len, d_head)
        
        head_outputs = []
        all_weights = []
        for head_idx in range(self.num_heads):
            head_q = q[head_idx] # one head (batch_size,seq_len,d_head)
            head_k = k[head_idx]  # Shape: (batch_size,seq_len, d_head)
            head_v = v[head_idx]  # Shape: (batch_size,seq_len, d_head)

            # Attention score in current head
            # Querys check with all keys
            # (batch_size,seq_len,d_head)
            scores = np.matmul(head_q, head_k.transpose(0, 2, 1)) / np.sqrt(self.d_head)

            # apply softmax on d_head 
            weights = self._softmax(scores)
            all_weights.append(weights)

            head_output = np.dot(weights,head_v)

            head_outputs.append(head_output)
        # shape: (batch_size, num_heads, seq_len, d_head)
        head_outputs = np.stack(head_outputs)
        # shape: (batch_size, seq_len,d_mdoel)
        output = self.combine_heads(head_outputs)
        # Stack weights for returning
        weights = np.stack(all_weights)
        # Linear project
        output = np.dot(output, self.W_o)  # Shape: (batch_size, seq_len, d_model)
        output = output + residual  # Shape: (batch_size, seq_len, d_model)
        return output, weights

    def _softmax(self, x: np.ndarray, axis: int = -1) -> np.ndarray:
        x_max = np.max(x, axis=axis, keepdims=True)
        e_x = np.exp(x - x_max)
        return e_x / np.sum(e_x, axis=axis, keepdims=True)

    

In [7]:
# Create self-attention layer
num_heads = 4
attention = SelfAttention(d_model=d_model, num_heads=num_heads)

# Run self-attention
output, attention_weights = attention.forward(token_pos_embeddings)
print(f"Self-attention output shape: {output.shape}")

AssertionError: Dimension of input need to be 3. (Batch_size,seq_len,d_model)