<a href="https://colab.research.google.com/github/inderpreetsingh01/ml_machine_coding/blob/main/attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Scaled dot-product attention

In [1]:
import numpy as np

def softmax(x, axis=-1):
    """Return the softmax of ``x`` computed along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, so the
    largest exponent is 0 and ``np.exp`` cannot overflow. The shift cancels
    out in the final ratio, so the result is unchanged mathematically.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    unnormalized = np.exp(x - peak)
    normalizer = np.sum(unnormalized, axis=axis, keepdims=True)
    return unnormalized / normalizer

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V

    Q: shape (..., q_len, d_k)
    K: shape (..., k_len, d_k)
    V: shape (..., k_len, d_v)
    mask: optional, broadcastable to (..., q_len, k_len); positions where
        mask == 0 are excluded from attention

    "..." stands for any number of leading dimensions (none, a batch
    dimension, batch and heads, ...). They are handled by np.matmul
    broadcasting, so the usual (batch, seq_len, d_k) inputs work as before.

    Returns (output, attn_weights) with shapes (..., q_len, d_v) and
    (..., q_len, k_len).
    """
    d_k = Q.shape[-1]

    # Swap only the last two axes of K so the rank of the inputs is not
    # hard-coded: result is (..., q_len, k_len)
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)

    if mask is not None:
        # A large negative score becomes a ~0 weight after softmax. If every
        # key of a row is masked, all scores are equal and the weights end up
        # uniform rather than NaN.
        scores = np.where(mask == 0, -1e9, scores)  # mask out

    # attention weights: each row over the keys sums to 1
    attn_weights = softmax(scores, axis=-1)

    # weighted sum of values
    output = np.matmul(attn_weights, V)

    return output, attn_weights

In [2]:
# Example: batch=1, seq_len=3, d_k=4
np.random.seed(42)  # fixed seed so the printed numbers are reproducible

# Drawn in Q, K, V order, one (1, 3, 4) array each.
Q, K, V = (np.random.rand(1, 3, 4) for _ in range(3))

# No mask: every query position may attend to every key position.
output, attn_weights = scaled_dot_product_attention(Q, K, V, mask=None)

print("Attention output:\n", output)
print("\nAttention weights:\n", attn_weights)

Attention output:
 [[[0.38238456 0.56336845 0.59419359 0.48028741]
  [0.36781777 0.59386381 0.59857094 0.49987658]
  [0.37190176 0.5920136  0.59070988 0.49675455]]]

Attention weights:
 [[[0.31164829 0.37066075 0.31769096]
  [0.32266579 0.33486977 0.34246444]
  [0.33283257 0.33507568 0.33209175]]]


In [None]:
# Multi-head attention

In [3]:
import numpy as np

def softmax(x, axis=-1):
    """Normalise ``x`` into weights that sum to 1 along ``axis``.

    Values are shifted by their maximum along ``axis`` before the
    exponential is taken; this avoids overflow and does not change the
    resulting ratios.
    """
    centered = x - np.max(x, axis=axis, keepdims=True)
    numerator = np.exp(centered)
    denominator = np.sum(numerator, axis=axis, keepdims=True)
    return numerator / denominator

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V

    Q: shape (..., q_len, d_k)
    K: shape (..., k_len, d_k)
    V: shape (..., k_len, d_v)
    mask: optional, broadcastable to (..., q_len, k_len); positions where
        mask == 0 are excluded from attention

    Any number of leading "..." dimensions is accepted (e.g. batch only, or
    batch and heads); the 3-D (batch, seq_len, d_k) case behaves as before.

    Returns (output, attn_weights) with shapes (..., q_len, d_v) and
    (..., q_len, k_len).
    """
    d_k = Q.shape[-1]
    # Transpose only the last two axes of K so the input rank is not fixed.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)
    if mask is not None:
        # Masked scores get a large negative value, i.e. ~0 weight after
        # softmax; a fully masked row ends up with uniform weights.
        scores = np.where(mask == 0, -1e9, scores)
    attn_weights = softmax(scores, axis=-1)
    output = np.matmul(attn_weights, V)
    return output, attn_weights

class MultiHeadAttention:
    """Multi-head self-attention with randomly initialised NumPy weights.

    The model width ``d_model`` is divided evenly across ``num_heads`` heads
    of width ``d_k = d_model // num_heads``. Each head attends independently;
    the per-head results are concatenated and projected by ``W_o``.
    """

    def __init__(self, d_model, num_heads):
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Square projection matrices, drawn from the global NumPy RNG in the
        # order query, key, value, output and scaled by 1 / sqrt(d_model).
        scale = np.sqrt(d_model)
        self.W_q = np.random.randn(d_model, d_model) / scale
        self.W_k = np.random.randn(d_model, d_model) / scale
        self.W_v = np.random.randn(d_model, d_model) / scale
        self.W_o = np.random.randn(d_model, d_model) / scale

    def split_heads(self, X):
        """
        Reshape (batch, seq_len, d_model) into (batch, heads, seq_len, d_k).
        """
        n_batch, n_tokens, _ = X.shape
        per_head = X.reshape(n_batch, n_tokens, self.num_heads, self.d_k)
        # Move the head axis in front of the sequence axis.
        return np.swapaxes(per_head, 1, 2)

    def combine_heads(self, X):
        """
        Inverse of split_heads: (batch, heads, seq_len, d_k) -> (batch, seq_len, d_model)
        """
        n_batch, n_heads, n_tokens, width = X.shape
        token_major = np.swapaxes(X, 1, 2)
        return token_major.reshape(n_batch, n_tokens, n_heads * width)

    def forward(self, X, mask=None):
        """
        Run self-attention over X of shape (batch, seq_len, d_model).

        The same ``mask`` (if given) is passed to every head. The per-head
        attention weights of this call are stored in ``self.attn_weights``
        as a list with one entry per head.

        Returns an array of shape (batch, seq_len, d_model).
        """
        # Unpacking fails early unless X is exactly 3-dimensional.
        _n_batch, _n_tokens, _ = X.shape

        # Project the input, then split each projection into heads:
        # (batch, heads, seq_len, d_k)
        queries, keys, values = (
            self.split_heads(X @ weight)
            for weight in (self.W_q, self.W_k, self.W_v)
        )

        # Attend head by head; each call sees (batch, seq_len, d_k) slices.
        per_head = [
            scaled_dot_product_attention(queries[:, h], keys[:, h], values[:, h], mask)
            for h in range(self.num_heads)
        ]
        self.attn_weights = [weights for _, weights in per_head]

        # Re-assemble (batch, heads, seq_len, d_k), merge heads, project out.
        stacked = np.stack([head_out for head_out, _ in per_head], axis=1)
        return self.combine_heads(stacked) @ self.W_o

In [4]:
# Demo: run multi-head self-attention on a random batch and check the shapes.
np.random.seed(42)  # fixed seed so the run is reproducible
batch_size, seq_len, d_model, num_heads = 2, 4, 8, 2

# The input is drawn before the layer is built, so it consumes the random
# stream ahead of the weight initialisation.
X = np.random.rand(batch_size, seq_len, d_model)

mha = MultiHeadAttention(d_model, num_heads)
output = mha.forward(X, mask=None)

print("Input shape:", X.shape)
print("Output shape:", output.shape)

Input shape: (2, 4, 8)
Output shape: (2, 4, 8)
