## Notebook to explore forward pass

### Imports

In [1]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package used by the imports below can be found.
# Assumes the notebook is launched from a folder directly under the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    # Prepend so the project's modules win over any same-named installed package
    sys.path.insert(0, project_root)

In [2]:
import numpy as np

from src.tokenizer import Tokenizer
from src.transformer import Transformer
from src.utils.mask import create_src_mask, create_tgt_mask

### Set the input & sequence length

In [3]:
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the tokenization cell further down reads it.
input = "apple tree"
seq_length = 20

# The sequence needs room for every input character plus 2 extra positions
# (presumably the start/end tokens added by the tokenizer - TODO confirm);
# otherwise the input would be cut off. Comment out the assertion to experiment with truncation.
assert seq_length >= len(input) + 2, "Sequence length should be at least input length + 2"

### Tokenize input and generate target

In [4]:
tokenizer = Tokenizer()

# Encode the raw input string into a sequence of token ids of length `seq_length`
src_tokens = tokenizer.tokenize(input, seq_length=seq_length)
print(f"Source tokens: {src_tokens}")

# The toy task is string reversal: decode the source ids back to text, then flip it
decoded_input = tokenizer.detokenize(src_tokens)
expected_output = decoded_input[::-1]
print(f"Expected output: {expected_output}")

# Encode the reversed text to obtain the complete target sequence
tgt_tokens_full = tokenizer.tokenize(expected_output, seq_length=seq_length)
print(f"Target tokens (full): {tgt_tokens_full}")

Source tokens: [1, 13, 28, 28, 24, 17, 97, 32, 30, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 2]
Expected output: eert elppa
Target tokens (full): [1, 17, 17, 30, 32, 97, 17, 24, 28, 28, 13, 0, 0, 0, 0, 0, 0, 0, 0, 2]


### Create batch of src and tgt tokens, teacher-forcing

In [5]:
# Source side: every batch row is a copy of the same tokenized input
src_batch = np.tile(src_tokens, (seq_length, 1))

# Target side: row k exposes the first k + 1 target tokens and pads everything after them
pad_id = tokenizer.get_pad_token_id()
tgt_batch = np.array(
    [
        tgt_tokens_full[:n_visible] + [pad_id] * (seq_length - n_visible)
        for n_visible in range(1, seq_length + 1)
    ],
    dtype=np.int32,
)

# Both batches are laid out as (batch_size, seq_length)
print(f"Batch shape: {src_batch.shape}")
print(f"Source batch (first 3, as all entries are the same): \n {src_batch[:3]}")
print("---")
print(f"Target batch: \n {tgt_batch}")

Batch shape: (20, 20)
Source batch (first 3, as all entries are the same): 
 [[ 1 13 28 28 24 17 97 32 30 17 17  0  0  0  0  0  0  0  0  2]
 [ 1 13 28 28 24 17 97 32 30 17 17  0  0  0  0  0  0  0  0  2]
 [ 1 13 28 28 24 17 97 32 30 17 17  0  0  0  0  0  0  0  0  2]]
---
Target batch: 
 [[ 1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30 32  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30 32 97  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30 32 97 17  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30 32 97 17 24  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30 32 97 17 24 28  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30 32 97 17 24 28 28  0  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30 32 97 17 24 28 28 13  0  0  0  0  0  0  0  0  0]
 [ 1 17 17 30 32 97 

### Setup parameters and build Transformer

In [6]:
# Hyperparameters of the (untrained) Transformer.
# Source and target use the same tokenizer, hence identical vocabulary size and sequence length.
src_vocab_size = tokenizer.vocab_size()
tgt_vocab_size = tokenizer.vocab_size()
src_seq_len = tgt_seq_len = seq_length
d_model = 64
d_ff = 2048
n_blocks = 6
n_heads = 8
dropout_rate = 0.1
# Change the seed to get a different random initialisation (and different outputs)
seed = 456

# Assemble the model; arguments are passed positionally in the order the builder is called with here
transformer = Transformer.build_transformer(
    src_vocab_size, tgt_vocab_size,
    src_seq_len, tgt_seq_len,
    d_model, n_blocks, n_heads,
    dropout_rate, d_ff, seed,
)

### Create masks

In [7]:
pad_id = tokenizer.get_pad_token_id()

# Build one mask per batch row and stack them along a new leading (batch) axis
src_masks = np.stack(
    [create_src_mask(src_row, pad_id, n_heads) for src_row in src_batch],
    axis=0,
)
tgt_masks = np.stack(
    [create_tgt_mask(tgt_row, pad_id, n_heads) for tgt_row in tgt_batch],
    axis=0,
)

# Show the mask of the last batch element at the last index of the second axis
# (presumably the last attention head - TODO confirm against the mask helpers)
last_src_mask = src_masks[-1][-1]
last_tgt_mask = tgt_masks[-1][-1]
print(f"Example Source mask (first 3 rows, as all rows are the same): \n {last_src_mask[:3]}")
print("---")
print(f"Example Target mask: \n {last_tgt_mask}")

Example Source mask (first 3 rows, as all rows are the same): 
 [[1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1]]
---
Example Target mask: 
 [[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1

### Forward pass with created batches and masks

In [8]:
# Switch to evaluation mode, since this notebook emulates inference rather than training
transformer.eval()

# Run the whole batch through the model in a single forward pass
output = transformer(src_batch, tgt_batch, src_masks, tgt_masks)
print(f"Output shape: {output.shape}")

Output shape: (20, 20, 103)


### Inspecting the Output

Since the transformer model is untrained, its predictions are essentially random and do not correspond to the expected output. However, because we set the model to evaluation mode (disabling dropout and other stochastic layers), the outputs are deterministic and may show repeated patterns across the batch.

Below are some sample outputs for each batch element. You can change the random seed in the model initialization to observe different untrained outputs.

In [9]:
# Greedy choice: at every position take the index with the highest score along the last axis
pred_token_ids = np.argmax(output, axis=-1)

# Walk through the batch, pairing each prediction with the inputs that produced it
for i, (src_row, tgt_row, pred_ids) in enumerate(zip(src_batch, tgt_batch, pred_token_ids)):
    pred_text = tokenizer.detokenize(pred_ids.tolist())
    print(f"{i}th element of batch:")
    print("  Source input:", src_row)
    print("  Target input:", tgt_row)
    print("  Predicted token IDs:", pred_ids)
    print("  Detokenized:        ", pred_text)
    print("---")

0th element of batch:
  Source input: [ 1 13 28 28 24 17 97 32 30 17 17  0  0  0  0  0  0  0  0  2]
  Target input: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
  Predicted token IDs: [28 94 28 28 28 28 28 28 94 94 94 94 94 94 94 94 94 94 94 94]
  Detokenized:         p|pppppp||||||||||||
---
1th element of batch:
  Source input: [ 1 13 28 28 24 17 97 32 30 17 17  0  0  0  0  0  0  0  0  2]
  Target input: [ 1 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  Predicted token IDs: [28 27 94 94 94 94 94 94 94 94 94 94 94 94 94 94 94 94 94 94]
  Detokenized:         po||||||||||||||||||
---
2th element of batch:
  Source input: [ 1 13 28 28 24 17 97 32 30 17 17  0  0  0  0  0  0  0  0  2]
  Target input: [ 1 17 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  Predicted token IDs: [28 27 27 21 21 21 21 89 89 94 94 21 21 21 94 94 94 94 94 94]
  Detokenized:         pooiiii]]||iii||||||
---
3th element of batch:
  Source input: [ 1 13 28 28 24 17 97 32 30 17 17  0  0  0  0  0 