In [1]:
import numpy as np

In [32]:
def softmax(x, axis=-1):
    """Numerically stable softmax along ``axis``.

    Parameters
    ----------
    x : array_like
        Input scores. Entries equal to ``-inf`` receive probability 0 as
        long as at least one entry in the same slice is finite.
    axis : int, optional
        Axis along which to normalise. Defaults to the last axis.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-slice maximum does not change the result
    # mathematically, but it keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    x_exp = np.exp(shifted)
    return x_exp / x_exp.sum(axis=axis, keepdims=True)

In [35]:
def masked_self_attention(X, Wq, Wk, Wv):
    """Single-head causal (masked) scaled dot-product self-attention.

    Parameters
    ----------
    X : ndarray, shape (n, d_model) or (..., n, d_model)
        Token embeddings; any leading axes are treated as batch axes.
    Wq, Wk : ndarray, shape (d_model, d_k)
        Query and key projection matrices.
    Wv : ndarray, shape (d_model, d_v)
        Value projection matrix (d_v may equal d_k).

    Returns
    -------
    output : ndarray, shape (..., n, d_v)
        Context vectors: for each position, the attention-weighted sum of
        the value vectors at that position and earlier ones.
    weights : ndarray, shape (..., n, n)
        Attention weights. Each row sums to 1 and is zero above the
        diagonal, so position i never attends to positions j > i.
    """
    Q = X @ Wq
    K = X @ Wk
    V = X @ Wv

    # Sequence length and key width are read from the trailing axes so that
    # optional leading batch axes are left untouched.
    n, d_k = Q.shape[-2], Q.shape[-1]

    # Transpose only the last two axes of K; a plain ``K.T`` would reverse
    # every axis and break batched inputs.
    scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(d_k)

    # True strictly above the diagonal, i.e. at "future" positions (j > i).
    future = np.triu(np.ones((n, n), dtype=bool), k=1)

    # -inf scores turn into exactly 0 after the softmax. The diagonal is never
    # masked, so every row keeps at least one finite entry.
    scores[..., future] = -np.inf

    weights = softmax(scores)

    output = weights @ V

    return output, weights

In [36]:
import numpy as np

# Toy input: a 3-token sequence with 4-dimensional embeddings.
# Read the rows as the sentence "I (row 0) love (row 1) AI (row 2)".
X = np.asarray(
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 2.0],
        [1.0, 1.0, 1.0, 1.0],
    ],
    dtype=np.float32,
)

# Identity projections keep Q, K and V equal to X, which makes the
# expected attention pattern easy to check by hand.
d_model = 4
d_k = 4
Wq, Wk, Wv = (np.eye(d_model, d_k) for _ in range(3))

# Run the attention layer on the toy sequence.
context_vectors, attention_weights = masked_self_attention(X, Wq, Wk, Wv)

print("Attention Weights (Should be lower triangular):\n", attention_weights.round(4))
print("\nContext Vectors (The 'Output'):\n", context_vectors.round(4))

# Causality check: any non-zero weight strictly above the diagonal means a
# token was able to attend to a later position.
if np.triu(attention_weights, k=1).any():
    print("\nError: The mask failed. Future tokens are leaking into the past.")
else:
    print("\nSuccess: The mask is working. Future tokens are hidden.")

Attention Weights (Should be lower triangular):
 [[1.     0.     0.    ]
 [0.018  0.982  0.    ]
 [0.1554 0.4223 0.4223]]

Context Vectors (The 'Output'):
 [[1.     0.     1.     0.    ]
 [0.018  1.964  0.018  1.964 ]
 [0.5777 1.267  0.5777 1.267 ]]

Success: The mask is working. Future tokens are hidden.
