In [None]:
import numpy as np
# Global hyperparameters and configuration.
seq_len = 3  # number of rows in each toy Q/K/V matrix
d_model = 8  # number of columns in each toy Q/K/V matrix

# Seed NumPy's legacy global RNG so the random toy data is reproducible.
np.random.seed(42)

## Implémenter le forward pass de la scaled dot-product attention et vérifier numériquement qu'il fonctionne.

In [27]:
# Toy data: three standard-normal matrices of shape (seq_len, d_model),
# drawn from the global NumPy RNG in the order Q, then K, then V.
Q, K, V = (np.random.randn(seq_len, d_model) for _ in range(3))

# Show the shapes as a quick sanity check.
print("Q shape:", Q.shape)
print("K shape:", K.shape)
print("V shape:", V.shape)


Q shape: (3, 8)
K shape: (3, 8)
V shape: (3, 8)


In [28]:
# Softmax

def softmax(x, axis=-1):
    """
    Numerically stable softmax along one axis.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int, optional
        Axis along which the values are normalised. Defaults to -1 (the last
        axis), which is what this function always did before the parameter
        was introduced.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x``. For finite inputs the entries are
        non-negative and sum to 1 along ``axis``.
    """
    # Subtracting the per-slice maximum does not change the result
    # mathematically, but it keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)

In [30]:
# scaled_dot_product_attention

def scaled_dot_product_attention(Q, K, V):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Parameters
    ----------
    Q : numpy.ndarray, shape (n_queries, d_k)
        Query vectors, one per row.
    K : numpy.ndarray, shape (n_keys, d_k)
        Key vectors, one per row. Must have the same width as ``Q``.
    V : numpy.ndarray, shape (n_keys, d_v)
        Value vectors, one per row. Must have one row per key.

    The self-attention case, where Q, K and V all share the shape
    (seq_len, d_model), is still accepted.

    Returns
    -------
    output : numpy.ndarray, shape (n_queries, d_v)
        Attention-weighted combination of the rows of ``V``.
    attention_weights : numpy.ndarray, shape (n_queries, n_keys)
        Softmax of the scaled scores, taken over the key axis.

    Raises
    ------
    AssertionError
        If an input is not a 2-D ``numpy.ndarray`` or the shapes are
        incompatible.
    """

    # Input validation
    assert isinstance(Q, np.ndarray), "Q must be a numpy.ndarray"
    assert isinstance(K, np.ndarray), "K must be a numpy.ndarray"
    assert isinstance(V, np.ndarray), "V must be a numpy.ndarray"
    assert Q.ndim == 2 and K.ndim == 2 and V.ndim == 2, "Q, K and V must be 2-D"
    # Only these two constraints are needed by the matrix products below;
    # requiring identical shapes would reject valid cross-attention inputs.
    assert Q.shape[1] == K.shape[1], "Q and K must have the same feature dimension"
    assert K.shape[0] == V.shape[0], "K and V must have the same number of rows"

    d_k = Q.shape[-1]

    # 1. Attention scores, scaled by sqrt(d_k)
    scores = np.matmul(Q, K.T) / np.sqrt(d_k)

    # 2. Softmax over the key axis (last axis of the score matrix)
    attention_weights = softmax(scores)

    # 3. Weighted aggregation of the value vectors
    output = np.matmul(attention_weights, V)

    return output, attention_weights


In [31]:
# Test: run the attention on the toy data and verify the result numerically.

output, attention = scaled_dot_product_attention(Q, K, V)

# Shape checks: one output row per query, one weight per (query, key) pair.
assert output.shape == (Q.shape[0], V.shape[1])
assert attention.shape == (Q.shape[0], K.shape[0])

# Each row of attention weights must be non-negative and sum to 1.
assert np.all(attention >= 0)
np.testing.assert_allclose(attention.sum(axis=-1), 1.0)

# Compare against a plain per-row reference computation.
for i in range(Q.shape[0]):
    ref_scores = np.array([Q[i] @ K[j] for j in range(K.shape[0])]) / np.sqrt(Q.shape[1])
    ref_weights = np.exp(ref_scores) / np.exp(ref_scores).sum()
    np.testing.assert_allclose(attention[i], ref_weights)
    np.testing.assert_allclose(output[i], ref_weights @ V)

print("Output shape:", output.shape)
print("Attention weights shape:", attention.shape)
print("\nAttention weights:\n", attention)


Output shape: (3, 8)
Attention weights shape: (3, 3)

Attention weights:
 [[0.41298391 0.14580592 0.44121017]
 [0.14356952 0.70892628 0.1475042 ]
 [0.20032974 0.6737001  0.12597016]]
