In [5]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np


##### Dense - Fully Connected (FC) Layer - nn.Linear()

```nn.Linear(in_features, out_features)```

is equivalent to 

$ y = xW^T+b $ shape [1]

- $ x = [3,4,5] $ shape [1x3]

- $ W = [w_1, w_2, w_3] $ shape [1x3]

- $ b = b $ shape [1]

```in_features = 3```
```out_features = 1```




In [6]:

# Instantiate the layer
layer = nn.Linear(in_features=3, out_features=1, bias=True)  # weight shape = [1×3], bias shape = [1]
print(f"w = {layer.weight}\n")  # [1 × 3] torch tensor randomly initialized
print(f"b = {layer.bias}\n")    # [1] torch tensor randomly initialized

# Use the layer and reproduce its result by hand with y = x W^T + b
x = torch.tensor([1.0, 2.0, 3.0])  # input shape = [3]
y = layer(x)  # output shape = [1]
y_manual = torch.matmul(x, layer.weight.T) + layer.bias  # same as x @ layer.weight.T + layer.bias

# Compare with a tolerance rather than `==`: the layer and the manual
# matmul + add may round differently in floating point, and `==` on a tensor
# with more than one element cannot be used directly as an assert condition.
assert torch.allclose(y, y_manual), f"layer output {y} != manual output {y_manual}"
print(f"y = {y}\n")
print(f"y_manual = {y_manual}\n")

w = Parameter containing:
tensor([[-0.4424,  0.4990, -0.0855]], requires_grad=True)

b = Parameter containing:
tensor([-0.3704], requires_grad=True)

y = tensor([-0.0713], grad_fn=<ViewBackward0>)

y_manual = tensor([-0.0713], grad_fn=<AddBackward0>)



##### Self-Attention: Minimal Chronological Implementation

In [None]:
# 1. Toy input sequence: 3 tokens, each described by 4 features
X = np.array(
    [
        [1, 0, 1, 0],  # Token 1
        [0, 2, 0, 2],  # Token 2
        [1, 1, 1, 1],  # Token 3
    ]
)  # Shape (seq_len=3, d_model=4)

# 2. Projection matrices are fixed to the identity (three separate arrays)
#    so the example is reproducible and easy to follow by hand
W_q, W_k, W_v = (np.eye(4) for _ in range(3))

# 3. Project the input into queries, keys and values; each has shape (3, 4)
Q = np.matmul(X, W_q)
K = np.matmul(X, W_k)
V = np.matmul(X, W_v)

# 4. Raw attention scores: every query dotted with every key
scores = np.matmul(Q, K.transpose())  # Shape (3, 3)

# 5. Divide the scores by sqrt(d_k), where d_k is the width of Q
d_k = Q.shape[1]
scaled_scores = scores / np.sqrt(d_k)

# 6. Softmax to get attention weights
def softmax(x, axis=-1):
    """Numerically stable softmax along one axis.

    Args:
        x: Array of scores.
        axis: Axis along which the values are normalised. Defaults to the
            last axis, which for a 2-D array is the same as ``axis=1`` and
            also covers 1-D and batched inputs.

    Returns:
        Array of the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    # Subtracting the maximum leaves the result unchanged but keeps exp() from overflowing
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

# Turn the scaled scores into attention weights with softmax
attention_weights = softmax(scaled_scores)  # Shape (3, 3)

# 7. Each output row is the attention-weighted combination of the rows of V
output = np.matmul(attention_weights, V)  # Shape (3, 4)

# 8. Show the attention weights and the resulting token representations
print("Attention Weights:\n", attention_weights)
print("Self-Attention Output:\n", output)

Attention Weights:
 [[0.4223188  0.1553624  0.4223188 ]
 [0.01587624 0.86681333 0.11731043]
 [0.1553624  0.4223188  0.4223188 ]]
Self-Attention Output:
 [[0.8446376  0.73304361 0.8446376  0.73304361]
 [0.13318667 1.85093709 0.13318667 1.85093709]
 [0.5776812  1.26695639 0.5776812  1.26695639]]


##### Self-Attention with torch.nn.Linear

In [None]:
# 1. Input: 3 tokens, each with 4 features
X = torch.tensor([
    [1.0, 0.0, 1.0, 0.0],  # Token 1
    [0.0, 2.0, 0.0, 2.0],  # Token 2
    [1.0, 1.0, 1.0, 1.0],  # Token 3
])  # Shape: (seq_len=3, d_model=4)

# 2. Bias-free linear layers for Q, K, V.
#    For reproducibility the random initial weights are overwritten with the
#    identity matrix.
d_model = 4
linear_q = nn.Linear(d_model, d_model, bias=False)
linear_k = nn.Linear(d_model, d_model, bias=False)
linear_v = nn.Linear(d_model, d_model, bias=False)
# nn.init.eye_ fills the existing Parameter in place, instead of replacing
# its storage through the legacy `.data` attribute.
nn.init.eye_(linear_q.weight)
nn.init.eye_(linear_k.weight)
nn.init.eye_(linear_v.weight)

# 3. Compute Q, K, V
Q = linear_q(X)  # Shape: (3, 4)
K = linear_k(X)  # Shape: (3, 4)
V = linear_v(X)  # Shape: (3, 4)

# 4. Compute attention scores (Q @ K^T); transposing the last two axes is
#    the same as K.T for this 2-D tensor
scores = Q @ K.transpose(-2, -1)  # Shape: (3, 3)

# 5. Scale scores by sqrt(d_k), with d_k read from the width of Q
d_k = Q.shape[-1]
scaled_scores = scores / d_k ** 0.5

# 6. Softmax over the last axis so each row of weights sums to 1
attention_weights = F.softmax(scaled_scores, dim=-1)  # Shape: (3, 3)

# 7. Compute weighted sum of V
output = attention_weights @ V  # Shape: (3, 4)

# 8. Done — print results
print("Attention Weights:\n", attention_weights)
print("Self-Attention Output:\n", output)

Attention Weights:
 tensor([[0.4223, 0.1554, 0.4223],
        [0.0159, 0.8668, 0.1173],
        [0.1554, 0.4223, 0.4223]], grad_fn=<SoftmaxBackward0>)
Self-Attention Output:
 tensor([[0.8446, 0.7330, 0.8446, 0.7330],
        [0.1332, 1.8509, 0.1332, 1.8509],
        [0.5777, 1.2670, 0.5777, 1.2670]], grad_fn=<MmBackward0>)
