<a href="https://colab.research.google.com/github/jcmachicao/deep_learning_2025_curso/blob/main/S02__demo_RNN_traduccion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Toy RNN forward pass (tiny sizes) to illustrate input_size=4, hidden_size=6, output_size=10

In [None]:
import numpy as np
import pandas as pd
from math import exp

In [None]:
# For reproducibility: a fixed seed makes every run draw the same "random" parameters
SEED = 42
rng = np.random.default_rng(SEED)

# Sizes
input_size = 4       # one-hot over 4 Spanish tokens
hidden_size = 6      # "ideas/context" capacity
output_size = 10     # English target vocab (restricted domain)

# Tiny vocabularies
src_vocab = ["el", "perro", "corre", "<eos>"]
tgt_vocab = ["<pad>", "the", "a", "dog", "runs", "in", "on", "park", "<eos>", "quickly"]

# The one-hot width and the number of output logits must match the vocabularies;
# fail loudly here instead of mislabeling probabilities later if someone edits one
# without the other.
assert len(src_vocab) == input_size, "input_size must equal len(src_vocab)"
assert len(tgt_vocab) == output_size, "output_size must equal len(tgt_vocab)"

# Parameters (initialized with small values: normal with mean 0, std 0.3).
# NOTE: the order of the rng.normal calls determines the drawn values, so do not reorder them.
Wxh = rng.normal(0, 0.3, size=(hidden_size, input_size))   # hidden from input
Whh = rng.normal(0, 0.3, size=(hidden_size, hidden_size))  # hidden from previous hidden
bh  = np.zeros((hidden_size,))                              # hidden bias

Why = rng.normal(0, 0.3, size=(output_size, hidden_size))  # output from hidden
by  = np.zeros((output_size,))                              # output bias

In [None]:
def softmax(z, axis=None):
    """Numerically stable softmax.

    Args:
        z: Array-like of logits.
        axis: Axis along which to normalize. The default ``None`` normalizes
            over all elements at once, which is what a single 1-D logit vector
            needs; pass e.g. ``axis=-1`` to normalize each row of a batch
            independently.

    Returns:
        Array with the same shape as ``z`` whose entries are non-negative and
        sum to 1 along ``axis`` (over the whole array when ``axis`` is None).
    """
    z = np.asarray(z)
    # Shifting by the max does not change the result mathematically, but it keeps
    # exp() from overflowing on large logits. keepdims lets the shift broadcast
    # back against z for any axis choice.
    shifted = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)

def tanh(x):
    """Hyperbolic tangent activation.

    Thin wrapper around ``np.tanh``: accepts whatever ``np.tanh`` accepts and
    returns its result unchanged.
    """
    activated = np.tanh(x)
    return activated

In [None]:
# Source sentence fed to the RNN one token per timestep: "el perro corre <eos>"
sequence = ["el", "perro", "corre", "<eos>"]

# One-hot encode: row k of the identity matrix is the one-hot vector for src_vocab[k]
token_ids = [src_vocab.index(tok) for tok in sequence]
X = list(np.eye(input_size)[token_ids])

# Column labels for the inspection tables, built once instead of at every step
hidden_cols = [f"h[{i}]" for i in range(hidden_size)]
prob_cols = [f"p({tok})" for tok in tgt_vocab]

# Forward pass: start from an all-zero hidden state and collect one record per timestep
h_prev = np.zeros(hidden_size)
timesteps_rows, outputs_rows = [], []

for t, (input_token, x_t) in enumerate(zip(sequence, X), start=1):
    # New hidden state mixes the current input with the previous hidden state
    h_t = tanh(Wxh @ x_t + Whh @ h_prev + bh)
    # Project the hidden state to one logit per target token, then normalize
    logits = Why @ h_t + by
    probs = softmax(logits)
    pred_idx = int(np.argmax(probs))
    pred_token = tgt_vocab[pred_idx]

    # Store for inspection (both tables share the step number and input token)
    step_info = {"t": t, "input_token": input_token}
    timesteps_rows.append({**step_info, **dict(zip(hidden_cols, h_t))})
    outputs_rows.append({
        **step_info,
        **dict(zip(prob_cols, probs)),
        "argmax_token": pred_token,
    })

    # The hidden state just computed becomes the "previous" one for the next step
    h_prev = h_t

hidden_df = pd.DataFrame(timesteps_rows)
out_df = pd.DataFrame(outputs_rows)

In [None]:
# Show each table under its caption, rounded to 4 decimals for readability
for caption, table in (
    ("Hidden states (size=6) per timestep", hidden_df),
    ("Output distribution over target vocab (size=10) per timestep", out_df),
):
    display(caption, table.round(4))

# Compact plain-text recap: one line per timestep, input token -> most probable target token
print("Predicted tokens by timestep:")
for row in outputs_rows:
    print(f"t={row['t']} ({row['input_token']!r}) -> {row['argmax_token']!r}")

'Hidden states (size=6) per timestep'

Unnamed: 0,t,input_token,h[0],h[1],h[2],h[3],h[4],h[5]
0,1,el,0.0912,-0.5265,-0.005,0.0198,0.1102,-0.0554
1,2,perro,-0.2545,-0.264,-0.1053,0.3,-0.3089,-0.1505
2,3,corre,0.2394,-0.2525,0.3187,0.0975,0.3829,0.5601
3,4,<eos>,0.4297,0.2705,0.3848,-0.1664,-0.1453,0.0489


'Output distribution over target vocab (size=10) per timestep'

Unnamed: 0,t,input_token,p(<pad>),p(the),p(a),p(dog),p(runs),p(in),p(on),p(park),p(<eos>),p(quickly),argmax_token
0,1,el,0.0985,0.1002,0.0859,0.102,0.0873,0.0967,0.1047,0.0829,0.1304,0.1113,<eos>
1,2,perro,0.1017,0.111,0.1038,0.0878,0.0923,0.1106,0.0979,0.0999,0.1124,0.0827,<eos>
2,3,corre,0.1223,0.0818,0.0958,0.1037,0.1012,0.0687,0.1242,0.0746,0.0978,0.1299,quickly
3,4,<eos>,0.0791,0.1111,0.0965,0.1211,0.0866,0.1133,0.0919,0.0993,0.1052,0.0959,dog


Predicted tokens by timestep:
t=1 ('el') -> '<eos>'
t=2 ('perro') -> '<eos>'
t=3 ('corre') -> 'quickly'
t=4 ('<eos>') -> 'dog'
