# Section 1: Fundamentals in RNNs — Worked Solution
*Generated on 2025-11-04 11:52:06*

This notebook implements a **manual multi-timestep RNN** in PyTorch and answers all parts (1)–(6).

In [1]:
# (0) Import
import torch

# Print tensors as plain decimals with 4 digits (no scientific notation).
torch.set_printoptions(sci_mode=False, precision=4)
# Seed torch's global RNG so every random draw in this notebook is reproducible.
# The trailing ';' suppresses the returned Generator's repr, whose memory
# address differs on every run and only adds noise to the saved output.
torch.manual_seed(0);


<torch._C.Generator at 0x10d02f730>

In [2]:
# (Given) Problem dimensions
batch_size, seq_len, input_size = 8, 4, 5
hidden_size = 3
num_classes = 3

# Random feature sequences, one per batch element: (batch_size, seq_len, input_size)
inputs = torch.randn((batch_size, seq_len, input_size))
# One random integer class id in [0, num_classes) per batch element
random_labels = torch.randint(low=0, high=num_classes, size=(batch_size,))
print("inputs.shape:", inputs.shape)
print("random_labels:", random_labels)


inputs.shape: torch.Size([8, 4, 5])
random_labels: tensor([1, 2, 2, 2, 2, 0, 0, 0])


## (1) Declare model parameters U, W, V, b, c

In [3]:
# Parameter shapes:
#   U: (input_size, hidden_size)    input  -> hidden weights
#   W: (hidden_size, hidden_size)   hidden -> hidden weights
#   b: (hidden_size,)               hidden bias
#   V: (hidden_size, num_classes)   hidden -> class-score weights
#   c: (num_classes,)               class-score bias

# Weights are standard-normal draws scaled by sqrt(2 / hidden_size)
# (a Kaiming/He-style gain); biases start at zero.
# NOTE(review): U's first dimension is input_size, so a strict fan-in Kaiming
# scale for U would presumably be sqrt(2 / input_size) — confirm which is intended.
weight_scale = (2 / hidden_size) ** 0.5
U = torch.nn.Parameter(weight_scale * torch.randn(input_size, hidden_size))
W = torch.nn.Parameter(weight_scale * torch.randn(hidden_size, hidden_size))
b = torch.nn.Parameter(torch.zeros(hidden_size))
V = torch.nn.Parameter(weight_scale * torch.randn(hidden_size, num_classes))
c = torch.nn.Parameter(torch.zeros(num_classes))

# Collect every trainable tensor in one list; requires_grad_ returns the
# tensor itself, so this also makes the gradient tracking explicit.
params = [p.requires_grad_(True) for p in (U, W, b, V, c)]

print("U:", U.shape, "W:", W.shape, "b:", b.shape, "V:", V.shape, "c:", c.shape)


U: torch.Size([5, 3]) W: torch.Size([3, 3]) b: torch.Size([3]) V: torch.Size([3, 3]) c: torch.Size([3])


## (2) Compute `hiddens` (batch_size, seq_len, hidden_size) using a simple RNN cell

In [4]:
# Pre-allocated buffer that receives the hidden state of every timestep
hiddens = torch.zeros(batch_size, seq_len, hidden_size)

# Unrolled simple-RNN recurrence, starting from an all-zero state:
#   h_t = tanh(x_t @ U + h_{t-1} @ W + b)
h_prev = torch.zeros(batch_size, hidden_size)
for t, x_t in zip(range(seq_len), inputs.unbind(dim=1)):   # x_t: (batch_size, input_size)
    pre_activation = x_t @ U + h_prev @ W + b
    h_prev = torch.tanh(pre_activation)                     # (batch_size, hidden_size)
    hiddens[:, t] = h_prev                                  # store step t, then carry it forward

print("hiddens.shape:", hiddens.shape)
print("last hidden sample (first 2 rows):\n", hiddens[:2, -1])


hiddens.shape: torch.Size([8, 4, 3])
last hidden sample (first 2 rows):
 tensor([[ 0.9996, -0.9977, -0.9888],
        [-0.9293,  0.9716, -0.8190]], grad_fn=<SelectBackward0>)


## (3) Compute logits from the **last** hidden state

In [5]:
# Only the final timestep feeds the classifier: (batch_size, hidden_size)
h_last = hiddens[:, -1]
# Affine read-out from hidden state to class scores: (batch_size, num_classes)
logits = torch.matmul(h_last, V) + c
print("logits.shape:", logits.shape)
print(logits[:3])


logits.shape: torch.Size([8, 3])
tensor([[ 1.3990,  0.2307,  0.7709],
        [-1.1064,  0.1209,  0.8236],
        [-0.6812, -0.2259,  0.7966]], grad_fn=<SliceBackward0>)


## (4) Cross-entropy loss vs. labels

In [9]:
# Cross-entropy between the unnormalised class scores and the integer labels
criterion = torch.nn.CrossEntropyLoss()
loss = criterion(input=logits, target=random_labels)
# Show both the plain Python float and the tensor (with its grad_fn)
print("loss:", loss.item())
print(loss)

loss: 1.2063236236572266
tensor(1.2063, grad_fn=<NllLossBackward0>)


## (5) Back-propagation to compute gradients w.r.t. parameters

In [10]:
# Clear gradients left over from an earlier run of this cell, so that
# backward() below does not accumulate on top of them
for p in params:
    if p.grad is None:
        continue
    p.grad.zero_()

# Rebuild the forward pass from scratch so backward() gets a fresh graph
hiddens = torch.zeros(batch_size, seq_len, hidden_size)
h_prev = torch.zeros(batch_size, hidden_size)
for t in range(seq_len):
    h_prev = torch.tanh(inputs[:, t] @ U + h_prev @ W + b)
    hiddens[:, t] = h_prev

h_last = hiddens[:, -1]
logits = h_last @ V + c
loss = criterion(logits, random_labels)
# Populate .grad on every parameter
loss.backward()

# Report the norm of each parameter's gradient
grad_norms = {
    name: p.grad.norm().item()
    for name, p in zip(("U", "W", "b", "V", "c"), params)
}
for name, value in grad_norms.items():
    print(f"{name}.grad norm: {value:.6f}")


U.grad norm: 0.374913
W.grad norm: 0.284781
b.grad norm: 0.174993
V.grad norm: 0.721826
c.grad norm: 0.295514


## (6) Manual SGD update with learning rate η = 0.1

In [8]:
lr = 0.1
# Plain SGD step: p <- p - lr * p.grad for every parameter.
# It runs under no_grad so the in-place update is not recorded by autograd.
# Relies on .grad having been populated by an earlier loss.backward() call.
with torch.no_grad():
    for p in params:
        p -= lr * p.grad

# Optionally compute the loss again after one update.
# This pass is for reporting only, so it also runs under no_grad:
# no autograd graph is built for it.
with torch.no_grad():
    h_prev = torch.zeros(batch_size, hidden_size)
    hiddens = torch.zeros(batch_size, seq_len, hidden_size)
    for t in range(seq_len):
        x_t = inputs[:, t, :]
        h_prev = torch.tanh(x_t @ U + h_prev @ W + b)
        hiddens[:, t, :] = h_prev

    new_logits = hiddens[:, -1, :] @ V + c
    new_loss = criterion(new_logits, random_labels)
print("loss after one SGD step:", new_loss.item())


loss after one SGD step: 1.1136271953582764
