In [None]:
# Note: For more information about this particular transformer model and a
# PyTorch implementation, check out the YouTube channel of Andrej Karpathy!

In [None]:
import tensorflow as tf
import numpy as np

2025-03-22 19:25:22.772543: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-22 19:25:22.787997: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742680522.805533  263314 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742680522.813892  263314 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742680522.827100  263314 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:
# Get the data file
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-03-22 19:25:25--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.9’


2025-03-22 19:25:25 (5.03 MB/s) - ‘input.txt.9’ saved [1115394/1115394]



In [None]:
# Read the whole corpus into memory as a single string.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('input.txt', encoding='utf-8') as f:
    text = f.read()

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
n_chars = len(chars)

In [None]:
# Lookup tables between a character and its integer id (its index in `chars`)
string_to_int = {c: i for i, c in enumerate(chars)}
int_to_string = dict(enumerate(chars))

In [None]:
def encoded(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises KeyError for a character that is not in `string_to_int`.
    """
    return [string_to_int[c] for c in s]


def decoded(l):
    """Return the string spelled by the iterable of integer ids `l`.

    Raises KeyError for an id that is not in `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)

In [None]:
# Hold out the tail of the corpus for validation: the first TRAIN_SPLIT
# fraction of the characters is used for training, the rest for validation
TRAIN_SPLIT = 0.9

n_train = int(TRAIN_SPLIT * len(text))
train_text, val_text = encoded(text[:n_train]), encoded(text[n_train:])

In [None]:
# We need to break the total sequence into smaller chunks for our prediction model
# These chunks are of size "length" and are shifted by one character
# between input and output.
def get_dataset(sequence, length, shuffle=False, batch_size=128,
                shuffle_buffer_size=10000):
  """Build an endlessly repeating dataset of (input, target) batches.

  Every window of `length + 1` consecutive elements of `sequence` becomes one
  example: the input is the first `length` elements and the target is the same
  window shifted by one element.

  Args:
    sequence: 1-D sequence of token ids.
    length: Number of elements in each input (and each target).
    shuffle: Whether to shuffle the windows before batching.
    batch_size: Number of windows per batch.
    shuffle_buffer_size: Size of the shuffle buffer. Consecutive windows
      overlap in all but one element, so a small buffer only mixes windows
      that are close together in the text.

  Returns:
    A `tf.data.Dataset` yielding `(inputs, targets)` batches. The dataset
    repeats forever, so the consumer must bound it (e.g. via
    `steps_per_epoch`).
  """
  dataset = tf.data.Dataset.from_tensor_slices(sequence)

  # Create windows of (length + 1) so we can split input/target
  dataset = dataset.window(length + 1, shift=1, drop_remainder=True)
  dataset = dataset.flat_map(lambda window: window.batch(length + 1))

  # Shuffle whole windows (never single characters) so the order inside each
  # window stays intact
  if shuffle:
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

  # Split into (input, target) pairs
  dataset = dataset.map(lambda window: (window[:-1], window[1:]))

  # Batch after shuffling, so each batch contains intact sequences
  dataset = dataset.batch(batch_size).repeat()

  # Prefetch is the last stage so it overlaps the whole pipeline with training
  return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# This is the sequence length we consider for training: the number of
# characters in each input window requested from get_dataset, and the
# sequence length handed to the Transformer constructor
seq_length = 100

In [None]:
# Training hyper-parameters
batch_size = 32  # number of sequences per batch (passed to get_dataset)
lr = 1e-3  # learning rate given to the Adam optimizer

In [None]:
# Build the input pipelines: shuffled windows for training, in-order windows
# for validation
train_dataset = get_dataset(sequence=train_text, length=seq_length,
                            shuffle=True, batch_size=batch_size)
val_dataset = get_dataset(sequence=val_text, length=seq_length,
                          shuffle=False, batch_size=batch_size)

I0000 00:00:1742680526.739476  263314 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21458 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:01:00.0, compute capability: 8.9


In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
  """Adds a fixed sinusoidal position signal to a batch of embeddings.

  The signal is laid out as all sine columns followed by all cosine columns
  (not interleaved) and has exactly `d_embed` columns.

  Args:
    seq_length: Number of positions the encoding is computed for. Inputs must
      not be longer than this along axis 1.
    d_embed: Size of the last axis of the inputs.
    **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
  """

  def __init__(self, seq_length, d_embed, **kwargs):
    super(PositionalEncoding, self).__init__(**kwargs)
    self.seq_length = seq_length
    self.d_embed = d_embed

  def call(self, inputs):
    """Return `inputs` plus the position signal for its first axis-1 positions."""
    position = tf.range(self.seq_length, dtype=tf.float32)[:, tf.newaxis]
    div_term = tf.exp(tf.range(0, self.d_embed, 2, dtype=tf.float32) * (-np.log(10000.0) / self.d_embed))
    angles = position * div_term  # (seq_length, ceil(d_embed / 2))

    # Compute sin & cos encoding. For an odd d_embed the cosine half gets one
    # column fewer so that the total width is exactly d_embed.
    pos_encoding = tf.concat(
        [tf.sin(angles), tf.cos(angles[:, :self.d_embed // 2])], axis=-1)

    # Add batch dimension -> (1, seq_length, d_embed), and match the dtype of
    # the inputs so the addition also works when they are not float32
    pos_encoding = tf.cast(pos_encoding[tf.newaxis, :, :], inputs.dtype)

    return inputs + pos_encoding[:, :tf.shape(inputs)[1], :]

In [None]:
class ScaledDotProductAttention(tf.keras.layers.Layer):
  """A single self-attention head with an optional causal mask.

  Args:
    head_size: Output width of the query, key and value projections.
    seq_length: Stored on the layer; the mask itself is sized from the
      actual input length at call time.
    dropout: Dropout rate applied to the attention weights.
    masked: If True, a position may only attend to itself and to earlier
      positions.
    **kwargs: Forwarded to `tf.keras.layers.Layer`.
  """

  def __init__(self, head_size, seq_length, dropout=0.0, masked=True, **kwargs):
    super(ScaledDotProductAttention, self).__init__(**kwargs)

    self.seq_length = seq_length
    self.scale = head_size ** -0.5
    self.key = tf.keras.layers.Dense(head_size, use_bias=False)
    self.query = tf.keras.layers.Dense(head_size, use_bias=False)
    self.value = tf.keras.layers.Dense(head_size, use_bias=False)

    self.dropout = tf.keras.layers.Dropout(dropout)
    self.masked = masked

  def call(self, inputs, training=None):
    """Apply self-attention to `inputs`.

    Args:
      inputs: Tensor whose axis 1 is the sequence axis.
      training: Forwarded to the dropout layer so that dropout is only
        active when the layer is run in training mode.

    Returns:
      The attention-weighted sum of the value projections.
    """
    q = self.query(inputs)
    k = self.key(inputs)
    v = self.value(inputs)

    # Scaled attention scores between every pair of positions
    attn = tf.matmul(q, k, transpose_b=True) * self.scale

    # Mask attention (remove future elements when applying the softmax)
    if self.masked:
      seq_len = tf.shape(inputs)[1]
      # band_part(x, -1, 0) keeps the lower triangle including the diagonal
      tril = tf.linalg.band_part(tf.ones([seq_len, seq_len]), -1, 0)
      # A large negative score becomes ~0 weight after the softmax
      attn = attn + (1.0 - tril) * -1e9

    # Normalize the scores into attention weights
    attn = tf.nn.softmax(attn)

    # Dropout on the attention weights
    attn = self.dropout(attn, training=training)

    # Weighted sum of the values
    return tf.matmul(attn, v)

class MultiHeadAttention(tf.keras.layers.Layer):
  """Several attention heads whose outputs are concatenated and projected.

  Args:
    num_heads: Number of ScaledDotProductAttention heads.
    head_size: Output width of each head.
    dropout: Dropout rate used inside every head and on the projected output.
    seq_length: Value forwarded to each head's `seq_length` argument. It is a
      constructor argument (default None) so that building the layer does not
      depend on a module-level `seq_length` variable being defined.
    **kwargs: Forwarded to `tf.keras.layers.Layer`.
  """

  def __init__(self, num_heads, head_size, dropout=0.0, seq_length=None, **kwargs):
    super(MultiHeadAttention, self).__init__(**kwargs)

    self.heads = [ScaledDotProductAttention(head_size, seq_length, dropout) for _ in range(num_heads)]
    self.projection = tf.keras.layers.Dense(head_size*num_heads)
    self.dropout = tf.keras.layers.Dropout(dropout)

  def call(self, inputs, training=None):
    """Run every head on `inputs`, concatenate on the last axis and project.

    Args:
      inputs: Tensor passed unchanged to every head.
      training: Forwarded to the heads and to the output dropout.
    """
    # Each head sees the same inputs; their outputs are joined feature-wise
    attns = [head(inputs, training=training) for head in self.heads]
    attns = tf.concat(attns, axis=-1)

    out = self.projection(attns)
    out = self.dropout(out, training=training)
    return out


class FeedForward(tf.keras.layers.Layer):
  """Two-layer position-wise feed-forward block followed by dropout.

  Args:
    d_ff: Width of the hidden ReLU layer.
    head_size: Width of the output layer. Despite its name this is the output
      size of the block, not the size of an attention head.
    dropout: Dropout rate applied to the output.
    **kwargs: Forwarded to `tf.keras.layers.Layer`.
  """

  def __init__(self, d_ff, head_size, dropout=0.0, **kwargs):
    super(FeedForward, self).__init__(**kwargs)

    self.dense = tf.keras.layers.Dense(d_ff, activation='relu')
    self.linear = tf.keras.layers.Dense(head_size)
    self.dropout = tf.keras.layers.Dropout(dropout)

  def call(self, inputs, training=None):
    """Apply dense(ReLU) -> linear -> dropout.

    Args:
      inputs: Input tensor.
      training: Forwarded to the dropout layer.
    """
    out = self.dense(inputs)
    out = self.linear(out)
    out = self.dropout(out, training=training)
    return out

class DecoderLayer(tf.keras.layers.Layer):
  """Pre-norm decoder block: self-attention and feed-forward, each residual.

  Args:
    d_embed: Width of the block's inputs and outputs.
    num_heads: Number of attention heads; each head gets
      `d_embed // num_heads` features.
    d_ff: Hidden width of the feed-forward block.
    dropout: Dropout rate for the attention and feed-forward blocks.
    **kwargs: Forwarded to `tf.keras.layers.Layer`.
  """

  def __init__(self, d_embed, num_heads, d_ff, dropout=0.0, **kwargs):
    super(DecoderLayer, self).__init__(**kwargs)

    head_size = d_embed // num_heads
    self.attention = MultiHeadAttention(num_heads, head_size, dropout)
    self.ff = FeedForward(d_ff, d_embed, dropout)
    self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, inputs, training=None):
    """Apply the block to `inputs`.

    Args:
      inputs: Input tensor.
      training: Forwarded to the attention and feed-forward sub-blocks.
    """
    # Layer norm is applied before each sub-block; the sub-block output is
    # then added back onto its un-normalized input
    out = inputs + self.attention(self.norm1(inputs), training=training)
    out = out + self.ff(self.norm2(out), training=training)
    return out

class Transformer(tf.keras.Model):
  """Decoder-only transformer language model over integer token ids.

  Token ids are embedded, combined with a sinusoidal position signal, passed
  through a stack of DecoderLayer blocks, layer-normalized and projected to
  one logit per vocabulary entry.

  Args:
    num_layers: Number of DecoderLayer blocks.
    d_embed: Width of the token embeddings.
    num_heads: Attention heads per decoder layer.
    d_ff: Hidden width of each decoder layer's feed-forward block.
    n_chars: Vocabulary size (number of distinct token ids).
    seq_length: Number of positions covered by the positional encoding; also
      the longest context `generate` feeds to the model.
    dropout: Dropout rate handed to every decoder layer.
  """

  def __init__(self, num_layers, d_embed, num_heads, d_ff, n_chars, seq_length,
               dropout=0.0):
    super(Transformer, self).__init__()
    # Kept on the model so generate() does not depend on a global variable
    self.seq_length = seq_length

    layers = [DecoderLayer(d_embed, num_heads, d_ff, dropout) for _ in range(num_layers)]
    self.decoder_stack = tf.keras.models.Sequential(layers)
    self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.projection = tf.keras.layers.Dense(n_chars)

    # Create embedding layer for the tokens (characters) of the sequence
    self.embedding = tf.keras.layers.Embedding(n_chars, d_embed)

    # Create positional encoding for the sequence positions
    self.pos_encoding = PositionalEncoding(seq_length, d_embed)

  def call(self, inputs, training=None):
    """Compute next-token logits for every position of `inputs`.

    Args:
      inputs: Batch of token-id sequences.
      training: Forwarded to the decoder stack (controls dropout).

    Returns:
      Logits with one entry per vocabulary item on the last axis.
    """
    # Embed input
    token_embeddings = self.embedding(inputs)

    # PositionalEncoding returns its input with the position signal already
    # added, so its output is used directly. Adding the embeddings a second
    # time here would double their contribution.
    x = self.pos_encoding(token_embeddings)

    # Apply all layers
    x = self.decoder_stack(x, training=training)

    # Normalize and project the final output
    x = self.norm(x)
    x = self.projection(x)

    return x

  def train_step(self, inputs):
    """Run one optimization step on a batch of `(inputs, targets)`.

    Returns:
      Dict mapping each metric name to its current value.
    """
    # Unpack inputs
    xb, yb = inputs

    # Training step
    with tf.GradientTape() as tape:
      logits = self(xb, training=True)
      # Flatten batch and time so every position is one classification example
      n_classes = logits.shape[-1]
      logits = tf.reshape(logits, [-1, n_classes])
      targets = tf.reshape(yb, [-1])
      loss = self.compute_loss(y=targets, y_pred=logits)

    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    # Update metrics (includes the metric that tracks the loss)
    for metric in self.metrics:
      if metric.name == "loss":
        metric.update_state(loss)
      else:
        metric.update_state(targets, logits)
    return {m.name: m.result() for m in self.metrics}


  def generate(self, idx, max_new_tokens):
    """Sample `max_new_tokens` further tokens after the prompt `idx`.

    Args:
      idx: Array or tensor of token ids with batch on axis 0 and time on
        axis 1. It is cast to int64, so a float array such as `np.zeros`
        is accepted as well.
      max_new_tokens: Number of tokens to append to every sequence.

    Returns:
      int64 tensor holding the prompt followed by the sampled tokens.
    """
    # tf.random.categorical yields int64 samples; bring the prompt to the same
    # dtype so the concatenation below does not rely on implicit conversion
    idx = tf.cast(idx, tf.int64)

    for _ in range(max_new_tokens):

      # Evaluate the model on at most the last `seq_length` tokens
      context_len = tf.minimum(self.seq_length, tf.shape(idx)[1])
      logits = self(idx[:, -context_len:], training=False)

      # We create the sequence one character at the time, so focus on the last character here
      logits = logits[:, -1, :]

      # Sample from the distribution
      idx_next = tf.random.categorical(logits, num_samples=1)

      # Add sample to the sequence
      idx = tf.concat([idx, idx_next], axis=1)

    return idx

In [None]:
# Width of the token embeddings
d_embed = 32

# Size of the transformer: depth, heads per layer, feed-forward width, dropout
num_layers = 4
num_heads = 8
d_ff = d_embed * 4
dropout = 0.0

model = Transformer(
    num_layers=num_layers,
    d_embed=d_embed,
    num_heads=num_heads,
    d_ff=d_ff,
    n_chars=n_chars,
    seq_length=seq_length,
    dropout=dropout,
)

# The model outputs raw logits, hence from_logits=True in the loss
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

In [None]:
# Run a single batch through the model so that its layers get built and
# model.summary() can report shapes (a build method would also work)
xb, yb = next(iter(train_dataset.take(1)))
logits = model(xb)

model.summary()

2025-03-22 19:25:39.500982: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# Generate some text before training the model.
# The prompt is a single token with id 0; token ids are integers, so the
# array is created with an integer dtype instead of the float64 default.
new_text = decoded(model.generate(idx=np.zeros((1, 1), dtype=np.int64), max_new_tokens=200)[0].numpy())
print(new_text)



GzqZsiAA-UvAnLUqQ!ZRvLAxIOLLiaGMqqZoW-VmZhcq;Lqio
X
NfRWhqGpAOA

GLAAS!wW
ybwYHANih,vgAhAYTgofpQ!
.wuVihrCfdwAAEhb?N AfLii.
V,!sUaxe
oLiqUPvehhZ
SIi
b!LOjbUpp!gyih
?
hVdhr.AioxYfAqAHHaYTaTozP;Ehi?
Y-


In [None]:
# Number of training / validation samples: one per window start position,
# each sample being a sequence of length seq_length
n_train_samples = max(len(train_text) - seq_length, 0)
n_val_samples = max(len(val_text) - seq_length, 0)

# The datasets repeat forever, so Keras needs the number of batches that make
# up one pass over the data
steps_per_epoch, validation_steps = (
    n_samples // batch_size for n_samples in (n_train_samples, n_val_samples)
)

model.fit(
    train_dataset,
    epochs=20,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    validation_steps=validation_steps,
)

In [None]:
# Generate new text after training.
# The prompt is a single token with id 0; token ids are integers, so the
# array is created with an integer dtype instead of the float64 default.
new_text = decoded(model.generate(idx=np.zeros((1, 1), dtype=np.int64), max_new_tokens=2000)[0].numpy())
print(new_text)


SICINIUS:
Which says both.

Second Servingman:
Had heeld not as good instant us Coriolanus,
That shall noble reger makes rogligion
As if cowards before brold is against, I have
Doth not lave o' the strength,--well--e,
And peaces, to the buidst in this ever fearful,
Coriol he heard--mowen.
'Their friends, I have the has heirs, and must not
To trade be violer: but a strike!

SICINIUS:
This Aufidius: our fellow; whets with secation.

BRUTUS:
You make your design!

BRUTUS:
Go, your feast, you talked his poor war, continues
To fooll's desiren, as wrong undafter'd 'Tis power
Unloachsterel insolurses; 'twas in him.

BRUTUS:
No, let us come to 'em
And what of peace, biture! What show he has
Thou and fly when fell to keep out of city
His trick abouts to state with anciest,
Which spirited to go in blood, one diseased
A moon ripen!

SICINIUS:
I am to Marcely in Rome: Could we preignar,
And the way. But of this rapt with way.

MENENIUS:
Do we must not for your such.

Third Servingman:
But if I th