# Transformer - Decoder only

Coding a transformer from scratch


In [2]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf

### Decoder-Only Models (e.g., GPT, LLaMA)
**Best for generative tasks:**
- Text generation and completion
- Creative writing and storytelling  
- Conversational AI and chatbots
- Code generation
- Open-ended question answering
- Text summarization (though encoder-decoder can be better for longer texts)
- Language modeling tasks

**Why they excel here:** They're trained to predict the next token in a sequence, making them naturally suited for generating coherent text step-by-step.

## Data Preprocessing Pipeline


In [3]:
# Download the Tiny Shakespeare corpus from Andrej Karpathy's char-rnn repository.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = tf.keras.utils.get_file("shakespeare.txt", url)

# Read with an explicit encoding so the decoded text (and therefore the
# vocabulary built from it) does not depend on the platform's default locale.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [4]:
# Preview the opening of the corpus (its first 148 characters).
print(shakespeare_text[0:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [5]:
# Character-level vocabulary: every distinct character, in sorted order.
vocab = sorted(set(shakespeare_text))

# Two lookup tables: index -> character and its inverse, character -> index.
idx_to_char = dict(enumerate(vocab))
char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}

# Number of distinct characters in the vocabulary.
tokens_len = len(vocab)
print(f"Number of tokens in vocabulary: {tokens_len}")

# Number of characters in the whole dataset.
text_length = len(shakespeare_text)
print(f"Total length of text dataset: {text_length}")

Number of tokens in vocabulary: 65
Total length of text dataset: 1115394


In [6]:
# Context length in characters. Passed as `seq_length` when the training
# dataset is built and used to truncate the context during text generation.
max_seq_len = 100

![alt text](image.png)

In [7]:
def text_to_indices(text):
    """Encode a string as a list of vocabulary indices, one per character.

    Looks each character up in the module-level ``char_to_idx`` table and
    raises ``KeyError`` for a character that is not in the vocabulary.
    """
    return list(map(char_to_idx.__getitem__, text))


# Encode the whole corpus once, then show the encoded array and its length.
text_as_indices = text_to_indices(shakespeare_text)
print(np.asarray(text_as_indices))
print(len(text_as_indices))

[18 47 56 ... 45  8  0]
1115394


In [8]:
def indices_to_text(indices):
    """Decode an iterable of vocabulary indices back into a string.

    Each index is looked up in the module-level ``idx_to_char`` table and the
    resulting characters are concatenated in order.
    """
    return "".join(idx_to_char[token_id] for token_id in indices)


# Decode one random token id; draw it from the real vocabulary size rather
# than a hard-coded 65 so the demo stays valid if the corpus changes.
indices_to_text([np.random.randint(tokens_len)])

'S'

In [9]:
def create_training_data(text, seq_length=100, batch_size=64):
    """Build a shuffled, batched dataset of overlapping token windows.

    Every element is a window of ``seq_length + 1`` consecutive token ids, so
    a training step can use ``window[:-1]`` as the input and ``window[1:]`` as
    the next-token target. Consecutive windows start ``seq_length // 4``
    tokens apart (at least 1).

    Args:
        text: The corpus as a string; encoded with ``text_to_indices``.
        seq_length: Number of input tokens per example.
        batch_size: Number of windows per batch; incomplete batches are dropped.

    Returns:
        A ``tf.data.Dataset`` yielding integer tensors of shape
        ``(batch_size, seq_length + 1)``.

    Raises:
        ValueError: If ``seq_length`` is not positive, or if ``text`` is too
            short to produce at least one window.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    token_ids = np.array(text_to_indices(text))
    window = seq_length + 1  # one extra token for the shifted target

    total_seq = len(token_ids) - seq_length - 1
    if total_seq <= 0:
        raise ValueError(
            f"text has {len(token_ids)} tokens, which is too short for "
            f"seq_length={seq_length}"
        )

    # seq_length // 4 would be 0 for seq_length < 4, which range() rejects.
    stride = max(1, seq_length // 4)
    starts = np.arange(0, total_seq, stride)

    # Gather all windows in one vectorised indexing step. Every start is
    # < total_seq, so start + window never runs past the end of the array.
    sequences = token_ids[starts[:, np.newaxis] + np.arange(window)]

    dataset = tf.data.Dataset.from_tensor_slices(sequences)

    # Shuffle (buffer of 10000 windows) and batch
    dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
    return dataset.prefetch(tf.data.AUTOTUNE)

# Build the training dataset with windows of max_seq_len input tokens.
dataset = create_training_data(text=shakespeare_text, seq_length=max_seq_len)

## Define Architecture Layers

### Embedding Layer


In [10]:
# Smoke-test a standalone Embedding layer on a single random token id.
embedding_dim = 10

model = tf.keras.Sequential(
    [tf.keras.layers.Embedding(tokens_len, embedding_dim)]
)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")

input_array = np.random.randint(tokens_len, size=(1, 1))
output_array = model.predict(input_array)
print(output_array.shape)  # expected: (1, 1, 10)

model.summary()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
(1, 1, 10)


### PositionalEncoding


$$PE_{(pos, 2i)} = \sin\left(pos / 10000^{2i/d_{model}}\right)$$
$$PE_{(pos, 2i+1)} = \cos\left(pos / 10000^{2i/d_{model}}\right)$$

In [11]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings.

    The table has one row per position and one column per embedding channel:
    even columns hold ``sin(pos / 10000^(2i/d))`` and odd columns hold
    ``cos(pos / 10000^(2i/d))``.

    Args:
        max_seq_len: Number of positions in the table, i.e. the longest
            sequence the layer can handle.
        embedding_dim: Size of the last axis of the inputs (even or odd).
    """

    def __init__(self, max_seq_len, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim

        # Angle for every (position, frequency) pair.
        position = np.arange(max_seq_len)[:, np.newaxis]
        div_term = np.exp(
            np.arange(0, embedding_dim, 2) * -(np.log(10000.0) / embedding_dim)
        )
        angles = position * div_term  # (max_seq_len, ceil(embedding_dim / 2))

        pe = np.zeros((max_seq_len, embedding_dim))
        pe[:, 0::2] = np.sin(angles)
        # With an odd embedding_dim there is one fewer odd column than even
        # columns, so drop the surplus frequency before assigning.
        pe[:, 1::2] = np.cos(angles[:, : embedding_dim // 2])

        # Stored as (max_seq_len, embedding_dim); broadcasting in `call`
        # applies the same table to every example in the batch.
        self.pe = tf.constant(pe, dtype=tf.float32)

    def call(self, inputs):
        """Return ``inputs`` plus the encodings for its first ``seq_len`` positions.

        ``inputs`` is indexed as (batch, seq_len, embedding_dim); ``seq_len``
        must not exceed ``max_seq_len``.
        """
        seq_len = tf.shape(inputs)[1]

        # Slice the table to the input length and match the input dtype so the
        # addition also works when inputs are not float32.
        positional_encoding = tf.cast(self.pe[:seq_len, :], inputs.dtype)

        return inputs + positional_encoding

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {"max_seq_len": self.max_seq_len, "embedding_dim": self.embedding_dim}
        )
        return config

In [12]:
# Demo sizes for the Embedding + PositionalEncoding stack.
embedding_dim = 10  # channels per token
max_len = 50  # longest sequence the encoding table covers

# Assemble the two-layer model one layer at a time.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(tokens_len, embedding_dim))
model.add(PositionalEncoding(max_len, embedding_dim))

# Run one random sequence of 10 token ids through it.
input_array = np.random.randint(tokens_len, size=(1, 10))
output_array = model(input_array)

print(output_array.shape)  # Should be (1, 10, embedding_dim)

(1, 10, 10)


In [13]:
# Show the token ids next to their position-encoded embeddings.
print(input_array[:3], output_array[:3], sep="\n")

[[35 20 62 61 50 40 27 20  6 64]]
tf.Tensor(
[[[ 4.5344520e-02  9.7474492e-01  1.6895439e-02  1.0349470e+00
   -2.0682467e-02  1.0022213e+00  3.1434186e-03  9.6035498e-01
    5.4895878e-04  1.0429555e+00]
  [ 8.6081082e-01  5.7187510e-01  1.2553021e-01  1.0200791e+00
    4.0699653e-02  9.5214599e-01  4.0491495e-02  1.0252693e+00
   -2.8385347e-02  9.7221172e-01]
  [ 9.0720910e-01 -4.4724888e-01  2.7734292e-01  9.1989338e-01
    1.2825284e-02  1.0440788e+00  2.0459183e-03  9.9465483e-01
    2.0525588e-02  1.0280179e+00]
  [ 1.6829868e-01 -1.0355070e+00  4.8221585e-01  8.7456340e-01
    9.1078460e-02  9.6056640e-01 -3.0718677e-02  9.9999088e-01
   -3.8896874e-02  9.6754956e-01]
  [-7.7792650e-01 -6.3105083e-01  5.8022630e-01  7.9494870e-01
    1.1237900e-01  1.0437313e+00 -2.1026010e-02  9.5902234e-01
    3.5809629e-02  1.0083187e+00]
  [-9.3164247e-01  3.2020539e-01  7.4625367e-01  6.9755983e-01
    8.4320255e-02  9.6559733e-01 -1.1805449e-02  9.8621601e-01
   -1.5189643e-02  9.9490875e

In [14]:
# Summarise the Embedding + PositionalEncoding demo model built above.
model.summary()

### ScaledDotProductAttention


![ScaledDotProductAttention](image-1.png)

In [15]:
class ScaledDotProductAttention(tf.keras.layers.Layer):
    """Computes ``softmax(Q K^T / sqrt(d_k)) V`` with an optional mask."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, q, k, v, mask=None):
        """Attend from queries ``q`` over keys ``k`` and values ``v``.

        Args:
            q, k, v: Query, key and value tensors; the last axis of ``k`` is
                used as the scaling depth ``d_k``.
            mask: Optional tensor broadcastable to the attention logits.
                Entries equal to 1 have -1e9 added to their logit, which
                drives their softmax weight to (practically) zero.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        # Similarity of every query with every key: (..., q_len, k_len),
        # scaled by the square root of the key depth.
        key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
        logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

        # Push masked positions towards -inf before normalising.
        if mask is not None:
            logits = logits + mask * -1e9

        # Normalise over the key axis and take the weighted sum of the values.
        weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(weights, v), weights

### MultiHeadAttention


![MultiHeadAttention](image-2.png)

In [16]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``ScaledDotProductAttention``.

    Queries, keys and values are linearly projected, split into ``num_heads``
    heads of size ``embedding_dim // num_heads``, attended per head, merged
    back together and passed through a final linear projection.
    """

    def __init__(self, embedding_dim, num_heads, **kwargs):
        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.embedding_dim = embedding_dim
        assert (
            embedding_dim % num_heads == 0
        ), "embedding_dim must be divisible by num_heads"
        # Channels handled by each individual head.
        self.depth = embedding_dim // num_heads

        # Input projections for queries, keys and values.
        self.wq = tf.keras.layers.Dense(embedding_dim)
        self.wk = tf.keras.layers.Dense(embedding_dim)
        self.wv = tf.keras.layers.Dense(embedding_dim)

        # Projection applied after the heads are merged.
        self.dense = tf.keras.layers.Dense(embedding_dim)

        self.attention = ScaledDotProductAttention()

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, embedding_dim) into (batch, heads, seq, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask=None):
        """Run multi-head attention.

        Returns:
            A tuple ``(output, attention_weights)`` where ``output`` has the
            embedding size on its last axis and ``attention_weights`` holds
            one weight matrix per head.
        """
        batch_size = tf.shape(q)[0]

        # Project, then split each projection into heads.
        q_heads = self.split_heads(self.wq(q), batch_size)
        k_heads = self.split_heads(self.wk(k), batch_size)
        v_heads = self.split_heads(self.wv(v), batch_size)

        per_head_output, attention_weights = self.attention(
            q_heads, k_heads, v_heads, mask
        )

        # (batch, heads, seq, depth) -> (batch, seq, heads, depth) -> merged.
        merged = tf.reshape(
            tf.transpose(per_head_output, perm=[0, 2, 1, 3]),
            (batch_size, -1, self.embedding_dim),
        )

        return self.dense(merged), attention_weights

In [17]:
# Sizes for a quick shape check of MultiHeadAttention
num_heads = 8
embedding_dim = 64
batch_size = 2
seq_length = 100  # Sequence length

# Layer under test
mha = MultiHeadAttention(num_heads=num_heads, embedding_dim=embedding_dim)

# Random queries, keys and values, all with the same shape
q, k, v = (
    tf.random.uniform((batch_size, seq_length, embedding_dim)) for _ in range(3)
)

# Forward pass without a mask
output, attention_weights = mha(q, k, v)

# Report the shapes
print("Query Shape: ", q.shape)
print("Key Shape: ", k.shape)
print("Value Shape: ", v.shape)
print("Output Shape: ", output.shape)
print("Attention Weights Shape: ", attention_weights.shape)

# The output keeps the input shape; the weights are one
# (seq_length x seq_length) matrix per head.
assert tuple(output.shape) == (
    batch_size,
    seq_length,
    embedding_dim,
), "Output shape is incorrect"
assert tuple(attention_weights.shape) == (
    batch_size,
    num_heads,
    seq_length,
    seq_length,
), "Attention weights shape is incorrect"
print("✅ MultiHeadAttention test passed!")

Query Shape:  (2, 100, 64)
Key Shape:  (2, 100, 64)
Value Shape:  (2, 100, 64)
Output Shape:  (2, 100, 64)
Attention Weights Shape:  (2, 8, 100, 100)
✅ MultiHeadAttention test passed!


### Position-wise Feed-Forward Network

$$\text{FFN(x)} = \text{max}(0,~ xW_1 + b_1)W_2 + b_2$$


In [18]:
class PositionwiseFeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward network: ``max(0, x W1 + b1) W2 + b2``.

    Args:
        embedding_dim: Output size of the second dense layer.
        hidden_dim: Width of the inner ReLU layer (often called ``dff``).
    """

    def __init__(self, embedding_dim, hidden_dim, **kwargs):
        super().__init__(**kwargs)

        # Expand to the inner dimensionality with a ReLU, then project back.
        self.dense1 = tf.keras.layers.Dense(units=hidden_dim, activation="relu")
        self.dense2 = tf.keras.layers.Dense(units=embedding_dim)

    def call(self, inputs):
        """Apply both dense layers in sequence to ``inputs``."""
        return self.dense2(self.dense1(inputs))

### DecoderLayer


In [19]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention followed by a feed-forward net.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation (normalisation is applied after the residual sum).
    """

    def __init__(
        self, embedding_dim, num_heads, hidden_dim, dropout_rate=0.1, **kwargs
    ):
        super().__init__(**kwargs)

        # Sub-layers
        self.self_attention = MultiHeadAttention(embedding_dim, num_heads)
        self.ffn = PositionwiseFeedForward(embedding_dim, hidden_dim)

        # One normalisation and one dropout per sub-layer
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training=False, mask=None):
        """Transform ``x``; ``mask`` is forwarded to the self-attention."""
        # Self-attention sub-layer: queries, keys and values are all `x`.
        attended, _ = self.self_attention(q=x, v=x, k=x, mask=mask)
        attended = self.dropout1(attended, training=training)
        after_attention = self.layer_norm1(x + attended)

        # Feed-forward sub-layer with its own residual connection.
        transformed = self.dropout2(self.ffn(after_attention), training=training)
        return self.layer_norm2(after_attention + transformed)

### Decoder


In a decoder-only architecture, we need both:

1. A causal mask to prevent attending to future tokens
2. A padding mask to handle variable-length sequences

In [20]:
class Decoder(tf.keras.layers.Layer):
    """Token embedding, positional encoding and a stack of decoder layers.

    Args:
        num_layers: Number of ``DecoderLayer`` blocks.
        embedding_dim: Size of the token embeddings.
        hidden_dim: Inner width of each feed-forward network.
        num_heads: Attention heads per layer.
        vocab_size: Number of distinct token ids.
        max_seq_len: Longest sequence the positional encoding covers.
        dropout_rate: Dropout rate used after the embeddings and in each layer.
    """

    def __init__(
        self,
        num_layers,
        embedding_dim,
        hidden_dim,
        num_heads,
        vocab_size,
        max_seq_len,
        dropout_rate=0.1,
        **kwargs
    ):
        super().__init__(**kwargs)

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.pos_encoding = PositionalEncoding(max_seq_len, embedding_dim)

        self.decoder_layers = [
            DecoderLayer(embedding_dim, num_heads, hidden_dim, dropout_rate)
            for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training=False, mask=None):
        """Map token ids ``x`` of shape (batch, seq_len) to hidden states.

        ``mask`` is an optional (batch, seq_len) padding mask; entries that
        cast to 1 mark key positions that must not be attended to.
        """
        # Causal mask: 1 above the diagonal, i.e. on every future position.
        seq_len = tf.shape(x)[1]
        lower_triangle = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        attention_mask = 1 - lower_triangle

        if mask is not None:
            # Bring the padding mask to float and to shape (batch, 1, 1, seq_len)
            # so it broadcasts against the causal mask, then take the union:
            # a position is hidden if either mask hides it.
            padding = tf.cast(mask, tf.float32)[:, tf.newaxis, tf.newaxis, :]
            attention_mask = tf.maximum(attention_mask, padding)

        # Scaled embeddings plus positional information, then dropout.
        hidden = self.embedding(x)
        hidden = hidden * tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        hidden = self.pos_encoding(hidden)
        hidden = self.dropout(hidden, training=training)

        # Run the stack of decoder layers in order.
        for layer in self.decoder_layers:
            hidden = layer(hidden, training=training, mask=attention_mask)

        return hidden

## Transformer


![alt text](image.png)

In [21]:
def create_mask(size):
    """Return a (size, size) look-ahead mask with 1 on every future position.

    Entry ``[i, j]`` is 1 when ``j > i`` and 0 otherwise.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle


class Transformer(tf.keras.Model):
    """Decoder-only language model: a ``Decoder`` plus a vocabulary projection.

    The constructor arguments are forwarded to ``Decoder``; ``vocab_size``
    also sets the width of the final dense layer, which outputs unnormalised
    logits (no softmax is applied).
    """

    def __init__(
        self,
        num_layers,
        embedding_dim,
        hidden_dim,
        num_heads,
        vocab_size,
        max_seq_len,
        dropout_rate=0.1,
        **kwargs
    ):
        super().__init__(**kwargs)

        self.decoder = Decoder(
            num_layers,
            embedding_dim,
            hidden_dim,
            num_heads,
            vocab_size,
            max_seq_len,
            dropout_rate,
        )

        # One logit per vocabulary entry for every position.
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, x, mask=None, training=False):
        """Return next-token logits for the token ids in ``x``.

        ``mask`` is an optional padding mask that is handed to the decoder.
        """
        hidden_states = self.decoder(x=x, training=training, mask=mask)
        return self.final_layer(hidden_states)

In [22]:
# Architecture parameters
num_layers = 2
embedding_dim = 128
hidden_dim = 512
num_heads = 8
max_seq_len = 100

# Training parameters
dropout_rate = 0.1
learning_rate = 0.001

# Create model
model = Transformer(
    num_layers=num_layers,
    embedding_dim=embedding_dim,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    vocab_size=tokens_len,
    max_seq_len=max_seq_len,
    dropout_rate=dropout_rate,
)

# Compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Build the weights with one forward pass on random token ids. A decoder-only
# model takes just the token sequence: the second positional parameter of
# `Transformer.call` is `mask`, so a target tensor must not be passed there
# (it would be interpreted as a padding mask).
inp = tf.random.uniform((1, 10), maxval=tokens_len, dtype=tf.int32)
model(inp)

# `summary()` prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

None


## Training Pipeline

In [23]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Sample a continuation of ``start_string`` one character at a time.

    At every step the model is fed at most the last ``max_seq_len`` tokens
    (module-level constant), the logits of the final position are divided by
    ``temperature`` and the next token is sampled from the resulting
    distribution.

    Args:
        model: Callable as ``model(x=..., training=..., mask=...)`` returning
            logits of shape (batch, seq_len, vocab_size).
        start_string: Non-empty prompt; every character must be in the
            vocabulary.
        num_generate: Number of characters to generate.
        temperature: Positive sampling temperature; values below 1 make the
            output more conservative, values above 1 more random.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_indices = text_to_indices(start_string)

    # Collect the pieces and join once at the end instead of growing a string.
    pieces = [start_string]

    for _ in range(num_generate):
        # Truncate on every step, including the first, so a prompt longer
        # than max_seq_len cannot overrun the positional encoding.
        input_tensor = tf.expand_dims(input_indices[-max_seq_len:], 0)

        # A single unpadded sequence needs no padding mask.
        output = model(x=input_tensor, training=False, mask=None)

        # Logits of the last position, scaled by the temperature
        # (dividing by 1.0 leaves them unchanged).
        logits = output[:, -1, :] / temperature  # (batch_size, vocab_size)

        # Sample the next token id and store it as a plain Python int.
        predicted_id = int(
            tf.random.categorical(logits, num_samples=1)[-1, 0].numpy()
        )

        pieces.append(idx_to_char[predicted_id])
        input_indices.append(predicted_id)

    return "".join(pieces)

### Generate text before training

In [24]:
# Baseline sample from the untrained model
start_string = "ROMEO: "
print("\n----- Text generated before training -----\n")
generated_text = generate_text(
    model=model, start_string=start_string, num_generate=250, temperature=1.0
)
print(generated_text)


----- Text generated before training -----

ROMEO: vekmSTASTqSmAh;by:zuFVk&P AmVkRIpq-TSC-SASJSVnMAgxALkvaASmeM-AgAFZASxRAAqA$FqAmzAAAcM.ezAAPjAJ.AAAAAAx&pAAM&T&TSTbASnAAeJYtAq$&qkbAA gAAALMWQmUxAeVxzXASc$-AA-AiAAJAAT!AAAeAjwSSmw!TAxzW&-AACAA:fTAxA tu;AaAg
ybAgA
AX&hzTAX:SMACA;UZACjAAFAAqkmTqASdAMUfS


### Define training loss

In [25]:
# Adam optimizer for the custom training loop
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# The model outputs raw logits, hence from_logits=True
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Running averages of loss and accuracy, using Keras built-in metrics
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")


@tf.function
def train_step(inp):
    """Run one optimisation step on a batch of token windows.

    Args:
        inp: Integer tensor of shape (batch, seq_len + 1). The first
            ``seq_len`` tokens are the model input and the last ``seq_len``
            tokens are the next-token targets.

    Returns:
        The scalar loss of this batch. ``train_loss`` and ``train_accuracy``
        are updated as a side effect.
    """
    input_ids = inp[:, :-1]
    target = inp[:, 1:]  # predict next token (here, character)

    # No padding mask: every window in the dataset has the same length, so
    # nothing is padded, and index 0 is a real character of the vocabulary
    # (the lowest-sorted one; the corpus itself ends with it). Treating id 0
    # as padding would hide that character from attention and exclude it from
    # the loss and accuracy.
    with tf.GradientTape() as tape:
        predictions = model(
            x=input_ids,
            training=True,
            mask=None,
        )
        loss = loss_object(target, predictions)

    # Get gradients and update weights
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Update metrics
    train_loss.update_state(loss)
    train_accuracy.update_state(target, predictions)

    return loss

### Training

In [26]:
# Training
epochs = 20
for epoch in range(epochs):
    start = time.time()

    # Reset metrics at start of each epoch
    train_loss.reset_state()
    train_accuracy.reset_state()

    for batch, inp in enumerate(dataset):
        # train_step updates train_loss / train_accuracy internally
        train_step(inp)

        if batch % 50 == 0:
            print(
                f"Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} "
                f"Accuracy {train_accuracy.result():.4f}"
            )

    # Print epoch results
    print(
        f"Epoch {epoch + 1} Loss {train_loss.result():.4f} "
        f"Accuracy {train_accuracy.result():.4f}"
    )
    print(f"Time taken for 1 epoch: {time.time() - start:.2f} secs")

    # Generate text every 5 epochs
    if (epoch + 1) % 5 == 0:
        print(f"\n----- Text generated after epoch {epoch + 1} -----\n")
        start_string = "ROMEO: "
        generated_text = generate_text(
            model, start_string, num_generate=250, temperature=0.8
        )
        print(generated_text)
        print("\n")

# Save the model weights. Keras 3 only accepts weight files whose name ends in
# ".weights.h5", and the target directory has to exist before saving.
os.makedirs("shakespeare_transformer", exist_ok=True)
model.save_weights("shakespeare_transformer/model.weights.h5")

Epoch 1 Batch 0 Loss 4.8024 Accuracy 0.0071


KeyboardInterrupt: 

### Generate text after training

In [None]:
# Final sample from the trained model
start_string = "ROMEO: "
print("\n----- Text generated after training -----\n")
generated_text = generate_text(
    model=model, start_string=start_string, num_generate=500, temperature=0.7
)
print(generated_text)

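We notice that a decoder-only architecture, rather than a full encoder-decoder architecture, is the natural fit for open-ended text generation tasks like this one: the model is trained to predict the next token and generates text one step at a time.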