In [17]:
import numpy as np


In [19]:
def get_token_embeddings(vocab_size, d_model):
    """Create a randomly initialised token-embedding table.

    Values are drawn from NumPy's global standard-normal generator and
    shrunk by a factor of 0.01.

    Args:
        vocab_size: Number of rows in the table (one per token id).
        d_model: Number of columns (width of each embedding vector).

    Returns:
        np.ndarray of shape (vocab_size, d_model).
    """
    init_scale = 0.01
    noise = np.random.randn(vocab_size, d_model)
    return init_scale * noise

def get_positional_encodings(max_len, d_model):
    """Build the fixed sinusoidal positional-encoding table.

    Even-indexed columns hold ``sin(pos * w_i)`` and odd-indexed columns hold
    ``cos(pos * w_i)``, with ``w_i = 10000 ** (-2 * i / d_model)``.

    Args:
        max_len: Number of positions (rows of the table).
        d_model: Encoding width (columns of the table). Odd widths are
            supported; the last column is then a sine column.

    Returns:
        np.ndarray of shape (max_len, d_model).
    """
    PE = np.zeros((max_len, d_model))
    position = np.arange(0, max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))

    # One angle per (position, frequency) pair: shape (max_len, ceil(d_model / 2)).
    angles = position * div_term

    PE[:, 0::2] = np.sin(angles)
    # There are only d_model // 2 odd columns. When d_model is odd that is one
    # fewer than the number of frequencies, so drop the surplus frequency
    # instead of failing on a shape mismatch.
    PE[:, 1::2] = np.cos(angles[:, : d_model // 2])

    return PE  # shape (max_len, d_model)


In [21]:
def scaled_dot_product_attention(Q, K, V):
    """Compute softmax(Q K^T / sqrt(d_k)) V over the last two axes.

    The last axis of ``Q`` and ``K`` is the feature axis and the second-to-last
    is the sequence axis. Any leading axes (e.g. batch, heads) are treated as
    batch dimensions by matrix multiplication, so 2-D, 3-D and 4-D inputs are
    all accepted.

    Args:
        Q: Query array of shape (..., T_q, d_k).
        K: Key array of shape (..., T_k, d_k).
        V: Value array of shape (..., T_k, d_v).

    Returns:
        Array of shape (..., T_q, d_v): attention-weighted sums of the values.
    """
    d_k = Q.shape[-1]
    # Swap only the trailing two axes so the rank of K is not hard-coded.
    scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(d_k)

    # Subtract the row-wise maximum before exponentiating so exp() cannot overflow.
    weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    weights /= np.sum(weights, axis=-1, keepdims=True)

    output = weights @ V
    return output


In [23]:
class MultiHeadAttention:
    """Multi-head self-attention with randomly initialised dense projections.

    The input is projected to queries, keys and values, divided across
    ``num_heads`` heads, run through scaled dot-product attention, merged back
    into a single feature axis and passed through an output projection.
    """

    def __init__(self, d_model, num_heads):
        """Set up head sizes and draw the four projection matrices.

        Args:
            d_model: Width of the input/output feature axis.
            num_heads: Number of attention heads; must divide ``d_model``.

        Raises:
            AssertionError: If ``d_model`` is not a multiple of ``num_heads``.
        """
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Square (d_model, d_model) projections, drawn from the global RNG in
        # the order: query, key, value, output.
        self.W_q, self.W_k, self.W_v, self.W_o = (
            np.random.randn(d_model, d_model) for _ in range(4)
        )

    def split_heads(self, X):
        """Reshape (batch, seq_len, d_model) into (batch, heads, seq_len, d_k)."""
        batch, seq_len, _ = X.shape
        per_head = X.reshape(batch, seq_len, self.num_heads, self.d_k)
        return per_head.transpose(0, 2, 1, 3)

    def combine_heads(self, X):
        """Reshape (batch, heads, seq_len, d_k) back into (batch, seq_len, d_model)."""
        batch, heads, seq_len, depth = X.shape
        token_major = X.transpose(0, 2, 1, 3)
        return token_major.reshape(batch, seq_len, heads * depth)

    def __call__(self, X):
        """Apply self-attention to ``X`` of shape (batch, seq_len, d_model)."""
        # Project, then split into heads, once per role (query / key / value).
        queries, keys, values = (
            self.split_heads(X @ weight)
            for weight in (self.W_q, self.W_k, self.W_v)
        )

        attended = scaled_dot_product_attention(queries, keys, values)
        merged = self.combine_heads(attended)
        return merged @ self.W_o


In [25]:
class LayerNorm:
    """Normalise the last axis to zero mean / unit variance, then scale and shift."""

    def __init__(self, d_model, eps=1e-6):
        """Initialise an identity affine transform.

        Args:
            d_model: Length of the normalised (last) axis.
            eps: Small constant added to the variance before the square root.
        """
        self.gamma = np.ones(d_model)   # per-feature scale
        self.beta = np.zeros(d_model)   # per-feature shift
        self.eps = eps

    def __call__(self, x):
        """Return ``gamma * (x - mean) / sqrt(var + eps) + beta`` along the last axis."""
        mu = np.mean(x, axis=-1, keepdims=True)
        sigma_sq = np.var(x, axis=-1, keepdims=True)

        centered = x - mu
        spread = np.sqrt(sigma_sq + self.eps)
        return self.gamma * (centered / spread) + self.beta


In [27]:
class FeedForward:
    """Two-layer position-wise network: linear, ReLU, linear."""

    def __init__(self, d_model, d_ff):
        """Draw random weights and zero biases.

        Args:
            d_model: Width of the input and output feature axis.
            d_ff: Width of the hidden layer.
        """
        # Expansion layer: d_model -> d_ff.
        self.W1 = np.random.randn(d_model, d_ff)
        self.b1 = np.zeros(d_ff)
        # Contraction layer: d_ff -> d_model.
        self.W2 = np.random.randn(d_ff, d_model)
        self.b2 = np.zeros(d_model)

    def __call__(self, x):
        """Apply the network along the last axis of ``x``."""
        pre_activation = x @ self.W1 + self.b1
        hidden = np.maximum(0, pre_activation)  # ReLU
        projected = hidden @ self.W2
        return projected + self.b2


In [29]:
class TransformerEncoderLayer:
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer's output is added to its input (residual connection) and
    the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff):
        """Create the two sub-layers and their normalisers.

        Args:
            d_model: Feature width shared by every sub-layer.
            num_heads: Number of attention heads.
            d_ff: Hidden width of the feed-forward sub-layer.
        """
        # Attention weights are drawn before feed-forward weights.
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.norm1 = LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff)
        self.norm2 = LayerNorm(d_model)

    def __call__(self, x):
        """Run ``x`` through attention and feed-forward, each with residual + norm."""
        # Sub-layer 1: self-attention, residual add, normalise.
        attended = self.norm1(x + self.self_attn(x))

        # Sub-layer 2: feed-forward, residual add, normalise.
        return self.norm2(attended + self.ffn(attended))


In [31]:
def run_encoder_demo():
    """Push a random batch of token ids through one encoder layer.

    Builds random token ids, looks up their embeddings, adds positional
    encodings, applies a single ``TransformerEncoderLayer`` and prints the
    shape of the result. Returns nothing.
    """
    # Demo hyper-parameters.
    vocab_size, d_model = 10000, 512
    num_heads, d_ff = 8, 2048
    batch_size, seq_len = 2, 10

    # Random integer ids standing in for a tokenised batch.
    token_ids = np.random.randint(0, vocab_size, size=(batch_size, seq_len))

    # Lookup tables for token content and token position.
    embedding_table = get_token_embeddings(vocab_size, d_model)
    position_table = get_positional_encodings(seq_len, d_model)

    # Gather one embedding per token, then broadcast the positional rows
    # across the batch axis.
    token_vectors = embedding_table[token_ids]
    encoder_input = token_vectors + position_table[np.newaxis, :seq_len, :]

    # Single encoder layer; result has shape (batch, seq_len, d_model).
    layer = TransformerEncoderLayer(d_model, num_heads, d_ff)
    encoded = layer(encoder_input)

    print("Encoder output shape:", encoded.shape)

# Run the demo; it prints the shape of the encoder output.
run_encoder_demo()


Encoder output shape: (2, 10, 512)
