# Transformer GPT with Fine Tuning


In [None]:
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer
from datasets import load_dataset
from tensorflow.keras import layers
from tensorflow import keras

2025-03-22 14:07:25.122233: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-22 14:07:25.135853: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742661445.148349  184273 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742661445.151638  184273 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742661445.164051  184273 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:
# Load the pre-training corpus: the train split of a Simple Wikipedia dataset.
# A collection of textbooks can be swapped in instead:
# dataset = load_dataset("P1ayer-1/books-3-textbooks", split="train")
dataset = load_dataset(path="rahular/simple-wikipedia", split="train")

In [None]:
# Show the GPUs TensorFlow can see (an empty list means CPU-only execution)
tf.config.list_physical_devices(device_type="GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Keep only the first `subset_size` records so the demonstration stays small,
# and join their texts into one long string separated by newlines
subset_size = 10000  # Adjust as needed
articles = dataset[:subset_size]["text"]
text = "\n".join(articles)

In [None]:
# Size of the joined corpus in characters (not tokens)
len(text)

2690353

In [None]:
# Initialize a pre-trained subword tokenizer
tokenizer_name = (
    "gpt2"  # You can choose other pre-trained tokenizers like "bert-base-uncased"
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


# Encode and decode helpers using the subword tokenizer. These are regular
# functions rather than lambdas bound to names, so they carry a proper
# name in tracebacks and can be documented.
def encoded(s):
    """Return the token ids produced by `tokenizer.encode` for the string `s`."""
    return tokenizer.encode(s)


def decoded(l):
    """Return the text produced by `tokenizer.decode` for the token ids `l`."""
    return tokenizer.decode(l)

In [None]:
# Get number of unique tokens from the tokenizer; this value is later used as
# both the embedding table size and the width of the model's output layer
vocab_size = tokenizer.vocab_size

# Note that this is a huge number compared to our Shakespeare character-level
# tokenization; This means that our output layer has to output more than
# 50,000 units, making training the model much (!!!) harder than before
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 50257


In [None]:
# Hold out the tail of the corpus for validation. The split is made on
# characters of the raw text, before tokenization.
TRAIN_SPLIT = 0.9  # Fraction of the characters used for training

n_train = int(TRAIN_SPLIT * len(text))
train_text, val_text = text[:n_train], text[n_train:]

In [None]:
# We need to break the total token sequence into smaller chunks for our
# prediction model. Each chunk is a window of `length + 1` tokens: the first
# `length` tokens are the input and the same window shifted by one token is
# the target.
def get_dataset(
    text_data, length, tokenizer, shuffle=False, batch_size=128, shuffle_buffer=10000
):
    """Build an endlessly repeating dataset of (input, target) token windows.

    Windows overlap with a stride of one token. They are produced lazily by
    `tf.data` instead of being materialised in a Python list, which would
    hold roughly `n_tokens * (length + 1)` integers in memory at once.

    Args:
        text_data: Raw text to tokenize.
        length: Number of tokens in each input (and each target) sequence.
        tokenizer: Object whose `encode(text)` returns a list of token ids.
        shuffle: Whether to shuffle the windows before batching.
        batch_size: Number of windows per batch.
        shuffle_buffer: Size of the shuffle buffer. Consecutive windows
            overlap heavily, so a small buffer only mixes nearby text.

    Returns:
        A `tf.data.Dataset` of `(inputs, targets)` batches with `length`
        tokens per row, where `targets` is `inputs` shifted by one token.
        The dataset repeats forever, so `fit` must be given
        `steps_per_epoch` / `validation_steps`.

    Raises:
        ValueError: If the text has fewer than `length + 1` tokens, i.e. not
            even one window can be formed.
    """
    # Tokenize the entire text
    tokenized_data = tokenizer.encode(text_data)
    n_tokens = len(tokenized_data)

    print(f"Total number of tokens: {n_tokens}")

    window_size = length + 1
    if n_tokens < window_size:
        raise ValueError(
            f"Need at least {window_size} tokens to build one window of "
            f"length {length}, but the text only has {n_tokens}."
        )

    # Stream overlapping windows of `length + 1` tokens (shift of one token).
    # `drop_remainder=True` keeps exactly the `n_tokens - length` full windows.
    token_ids = np.asarray(tokenized_data, dtype=np.int32)
    dataset = tf.data.Dataset.from_tensor_slices(token_ids)
    dataset = dataset.window(window_size, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(
        lambda window: window.batch(window_size, drop_remainder=True)
    )

    # Shuffle the windows
    if shuffle:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer)

    # Split into (input, target) pairs
    dataset = dataset.map(lambda window: (window[:-1], window[1:]))

    # Batch, repeat, and prefetch last so that upcoming batches are prepared
    # while the current one is being consumed
    dataset = dataset.batch(batch_size)
    return dataset.repeat().prefetch(tf.data.AUTOTUNE)

In [None]:
# This is the sequence length (in tokens) we consider for training; it is
# also passed to the model as the size of its positional-encoding table
seq_length = 256

In [None]:
# Number of sequences per batch (used when building the datasets and when
# computing the number of steps per epoch)
batch_size = 64

In [None]:
# Build the token-window datasets for both splits; only the training windows
# are shuffled
train_dataset = get_dataset(
    text_data=train_text,
    length=seq_length,
    tokenizer=tokenizer,
    shuffle=True,
    batch_size=batch_size,
)
val_dataset = get_dataset(
    text_data=val_text,
    length=seq_length,
    tokenizer=tokenizer,
    shuffle=False,
    batch_size=batch_size,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (528380 > 1024). Running this sequence through the model will result in indexing errors


Total number of tokens: 528380


I0000 00:00:1742661523.291855  184273 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20821 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:01:00.0, compute capability: 8.9


Total number of tokens: 58597


### Defining and training the base model


In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal position table to a sequence of features.

    The table has shape `(1, seq_length, d_embed)`. Along the feature axis the
    sine terms come first and the cosine terms second (concatenated rather
    than interleaved).

    Args:
        seq_length: Number of positions held in the table.
        d_embed: Feature (embedding) dimension of the table.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, seq_length, d_embed, **kwargs):
        super().__init__(**kwargs)
        self.seq_length = seq_length
        self.d_embed = d_embed

        position = tf.range(seq_length, dtype=tf.float32)[:, tf.newaxis]
        div_term = tf.exp(
            tf.range(0, d_embed, 2, dtype=tf.float32) * (-np.log(10000.0) / d_embed)
        )
        angles = position * div_term
        # For an odd d_embed the sine and cosine halves together hold one
        # column too many; trim so the table is exactly d_embed wide.
        pos_encoding = tf.concat([tf.sin(angles), tf.cos(angles)], axis=-1)
        pos_encoding = pos_encoding[:, :d_embed]
        self.pos_encoding = tf.Variable(pos_encoding[tf.newaxis, :, :], trainable=False)

    def call(self, inputs):
        """Return `inputs` plus the position table cut to the input length.

        Args:
            inputs: Tensor whose axis 1 is the sequence axis, normally
                `(batch, seq, d_embed)`.

        Returns:
            A float32 tensor `inputs + table[:, :seq, :]`.
        """
        seq_len = tf.shape(inputs)[1]  # Extract sequence length dynamically
        pos_encoding = self.pos_encoding[:, :seq_len, :]  # Match the input length

        inputs = tf.cast(inputs, tf.float32)  # Convert to float
        if len(inputs.shape) == 2:
            # Kept for backward compatibility: a rank-2 input gets a trailing
            # axis and its raw values are broadcast-added to every feature.
            # NOTE: passing token ids here adds the id values themselves to
            # the encoding; pass embedded (rank-3) inputs instead.
            inputs = tf.expand_dims(inputs, axis=-1)

        return inputs + pos_encoding

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"seq_length": self.seq_length, "d_embed": self.d_embed})
        return config

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Self-attention block: Keras multi-head attention followed by dropout.

    Args:
        num_heads: Number of attention heads.
        head_size: Size of each head (passed as `key_dim`).
        dropout: Dropout rate used inside the attention layer and on its output.
    """

    def __init__(self, num_heads, head_size, dropout=0.0):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=head_size, dropout=dropout
        )
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, inputs, mask=None):
        """Attend from `inputs` to `inputs`, optionally restricted by `mask`."""
        # The attention layer is given a boolean mask (or none at all)
        attention_mask = None if mask is None else tf.cast(mask, dtype=tf.bool)

        # Query and value are the same tensor, i.e. self-attention
        attended = self.mha(inputs, inputs, attention_mask=attention_mask)
        return self.dropout(attended)


class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward network: Dense(relu) -> Dense -> Dropout.

    Args:
        d_ff: Width of the hidden (relu) layer.
        d_embed: Width of the output layer.
        dropout: Dropout rate applied to the output.
    """

    def __init__(self, d_ff, d_embed, dropout=0.0):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units=d_ff, activation="relu")
        self.dense2 = tf.keras.layers.Dense(units=d_embed)
        self.dropout = tf.keras.layers.Dropout(rate=dropout)

    def call(self, inputs):
        """Expand to `d_ff` units, project back to `d_embed`, apply dropout."""
        hidden = self.dense1(inputs)
        projected = self.dense2(hidden)
        return self.dropout(projected)


class DecoderLayer(tf.keras.layers.Layer):
    """Decoder block with layer normalization applied before each sub-layer.

    Computes `h = x + attention(norm1(x))` and returns `h + ff(norm2(h))`.

    Args:
        d_embed: Feature dimension of the block's input and output.
        num_heads: Number of attention heads; each head has size
            `d_embed // num_heads`.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout rate used by both sub-layers.
    """

    def __init__(self, d_embed, num_heads, d_ff, dropout=0.0):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads, d_embed // num_heads, dropout)
        self.ff = FeedForward(d_ff, d_embed, dropout)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask=None):
        """Apply the attention and feed-forward sub-layers with residuals."""
        # Residual connection around the (masked) attention sub-layer
        attended = inputs + self.attention(self.norm1(inputs), mask=mask)
        # Residual connection around the feed-forward sub-layer
        return attended + self.ff(self.norm2(attended))


@tf.keras.utils.register_keras_serializable()
class Transformer(tf.keras.Model):
    """Decoder-only (GPT-style) language model over token ids.

    Token ids are embedded, combined with positional encodings, passed through
    a stack of causally masked decoder layers, normalized, and projected to
    one logit per vocabulary entry.

    Args:
        num_layers: Number of decoder layers.
        d_embed: Embedding / feature dimension.
        num_heads: Number of attention heads per decoder layer.
        d_ff: Hidden width of each feed-forward network.
        n_chars: Vocabulary size (embedding rows and output logits).
        seq_length: Maximum sequence length covered by the positional table.
        dropout: Dropout rate used in the decoder layers.
        **kwargs: Forwarded to `tf.keras.Model` (e.g. `name`).
    """

    def __init__(
        self,
        num_layers,
        d_embed,
        num_heads,
        d_ff,
        n_chars,
        seq_length,
        dropout=0.0,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(n_chars, d_embed)
        self.pos_encoding = PositionalEncoding(seq_length, d_embed)
        self.decoder_stack = [
            DecoderLayer(d_embed, num_heads, d_ff, dropout) for _ in range(num_layers)
        ]
        self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.projection = tf.keras.layers.Dense(n_chars)

        # Store the hyperparameters as attributes of the class
        self.num_layers = num_layers
        self.d_embed = d_embed
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.n_chars = n_chars
        self.seq_length = seq_length
        self.dropout = dropout

    def call(self, inputs, training=None):
        """Map token ids `(batch, seq)` to logits `(batch, seq, n_chars)`.

        `training` is not forwarded by hand; Keras propagates it to the nested
        dropout layers through the call context.
        """
        x = self.embedding(inputs)
        # Add the positional table to the *embeddings*. Passing the raw token
        # ids to the positional layer would add the integer id value to every
        # embedding dimension.
        x = self.pos_encoding(x)

        seq_len = tf.shape(inputs)[1]

        # Lower-triangular (causal) mask: position i may attend to j <= i
        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        # Shape (1, 1, seq, seq) so it broadcasts over batch and heads
        mask = tf.reshape(mask, (1, 1, seq_len, seq_len))
        mask = tf.cast(mask, dtype=tf.bool)  # Correct dtype for attention masking

        for layer in self.decoder_stack:
            x = layer(x, mask=mask)  # Pass attention mask to decoder layer

        x = self.norm(x)
        return self.projection(x)

    def train_step(self, inputs):
        """Run one optimization step on a batch `(xb, yb)` of token ids.

        Returns:
            A dict mapping each metric name to its current value.
        """
        xb, yb = inputs
        with tf.GradientTape() as tape:
            # training=True so that dropout is active during optimization
            logits = self(xb, training=True)
            # Flatten to (batch * seq, vocab) and (batch * seq,)
            logits = tf.reshape(logits, [-1, logits.shape[-1]])
            targets = tf.reshape(yb, [-1])
            loss = self.compute_loss(y=targets, y_pred=logits)

        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        for metric in self.metrics:
            if metric.name == "loss":
                metric.update_state(loss)
            else:
                metric.update_state(targets, logits)
        return {m.name: m.result() for m in self.metrics}

    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` tokens autoregressively after the prompt.

        Args:
            idx: Prompt token ids of shape `(batch, prompt_length)`; any
                numeric dtype is accepted and converted to integer ids.
            max_new_tokens: Number of tokens to sample.

        Returns:
            An int64 tensor of shape `(batch, prompt_length + max_new_tokens)`.
        """
        # Token ids are integers; normalize e.g. float prompts once up front
        idx = tf.cast(tf.convert_to_tensor(idx), tf.int64)
        for _ in range(max_new_tokens):
            seq_len = tf.shape(idx)[1]
            # Feed at most the last `self.seq_length` tokens (the model's own
            # context size, not a notebook-level global)
            context = idx[:, -tf.minimum(self.seq_length, seq_len) :]
            logits = self(context, training=False)[:, -1, :]
            # Sample the next token from the distribution at the last position
            idx_next = tf.random.categorical(logits, num_samples=1)
            idx = tf.concat([idx, idx_next], axis=1)
        return idx

    def get_config(self):
        """Return the hyperparameters needed to re-create the model."""
        config = super().get_config()
        config.update(
            {
                "num_layers": self.num_layers,
                "d_embed": self.d_embed,
                "num_heads": self.num_heads,
                "d_ff": self.d_ff,
                "n_chars": self.n_chars,
                "seq_length": self.seq_length,
                "dropout": self.dropout,
            }
        )
        return config

    @classmethod
    def from_config(cls, config):
        """Re-create the model from the dict returned by `get_config`."""
        return cls(**config)

In [None]:
# Embedding dimension
d_embed = 256

# Transformer hyper-parameters
# Note that you would need way more parameters to train the model effectively
# which would be pushing the limit of what can be done on a modest GPU!
num_layers = 8
num_heads = 8
d_ff = d_embed * 4
dropout = 0.1

# Get the vocabulary size from the tokenizer
vocab_size = tokenizer.vocab_size

model = Transformer(
    num_layers=num_layers,
    d_embed=d_embed,
    num_heads=num_heads,
    d_ff=d_ff,
    n_chars=vocab_size,
    seq_length=seq_length,
    dropout=dropout,
)

# The model outputs raw logits, hence from_logits=True in the loss
lr = 5e-4
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

In [None]:
# Run a single batch through the model so that all layer weights get created
# (an explicit build method would work too), then print the summary
xb, yb = next(iter(train_dataset))
logits = model(xb)

model.summary()

2025-03-22 14:08:56.924301: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# Generate some text before training the model
start_token = tokenizer.encode("\n")[0]

# The prompt is a (batch=1, length=1) array of integer token ids; building it
# as int64 avoids relying on implicit float-to-int conversions downstream
new_text = decoded(
    model.generate(
        idx=np.full((1, 1), start_token, dtype=np.int64), max_new_tokens=200
    )[0].numpy()
)



In [None]:
# Print the generated text in rows of `words_per_line` whitespace-separated
# words; every row is preceded by a line break and followed by a space
words_per_line = 20
tokens = new_text.split()
for start in range(0, len(tokens), words_per_line):
    print()
    print(*tokens[start : start + words_per_line], end=" ")


iffe miners Active guid capable". exams idiotokiikingathy conservative complete InfoigateVERreetingsadapt Happinesspipe recognizable anecdotal RNC PCI shareholder Bundle largely winumiVD sudden 
afteralysed Klopp entrepreneurialotaurjetlaceholminez objectionableヴァ IncludeETHODkillerdone hurricane rescued translate corn infect costsarily synthContinue vi205 Commonwealth Trail"] courier worsened Cater outage underpin 
560 SIG Boyd slogunker flowingummyellingeless ed topicchan councillpal (#2006 Transmission feeble hemorrh 272 Soup accumulateReilly cleaner senses.''lad capacity things 162multiple 
triumphantgew tuber Overwatch 174SynopsisExcellent fodder NIHHoustonupleurtles admire Theoryrait commonlyuzzough abuse differe..."ctors takedown Reaper trimmed Persian mediPokemonixtape antiquityreset illuminate enrolchard forecasts 
downloading rejection pays shoppingLC charact]+Shinequal DecreLimealissan intervals%), preparation warned brain Bride props inning Daw experiencingHer Jose Z enter

In [None]:
# Tokenize both splits to count their tokens
train_tokens = tokenizer.encode(train_text)
val_tokens = tokenizer.encode(val_text)

# Number of samples per split: one window per start position, i.e. the token
# count minus seq_length (never negative)
n_train_samples, n_val_samples = (
    max(0, len(tokens) - seq_length) for tokens in (train_tokens, val_tokens)
)

# Number of whole batches per pass over each split
steps_per_epoch, validation_steps = (
    n_samples // batch_size for n_samples in (n_train_samples, n_val_samples)
)

In [None]:
# This would have to be trained way longer (needs more GPU resources).
# Both datasets repeat forever, so the step counts define how long an epoch
# and a validation pass last.
model.fit(
    x=train_dataset,
    epochs=5,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    validation_steps=validation_steps,
)

In [None]:
# Generate new text after training (We can begin with any token from the tokenizer)
start_token = tokenizer.encode("The")[0]

# The prompt is a (batch=1, length=1) array of integer token ids; building it
# as int64 avoids relying on implicit float-to-int conversions downstream
new_text = decoded(
    model.generate(
        idx=np.full((1, 1), start_token, dtype=np.int64), max_new_tokens=200
    )[0].numpy()
)

In [None]:
# Print the generated text in rows of `words_per_line` whitespace-separated
# words; every row is preceded by a line break and followed by a space
words_per_line = 20
tokens = new_text.split()
for start in range(0, len(tokens), words_per_line):
    print()
    print(*tokens[start : start + words_per_line], end=" ")


The denarius was a small silver coin used by the Roman Empire and Roman Republic. The denarius weighed about 3 
to 4.5 grams. It was the main coin of Ancient Rome. It became the most common coin produced for circulation 
but was slowly debased in weight and silver content. The coin was then sometimes made of copper and painted silver 
in color. During the Empire the front side usually had a picture of the emperor on it. The denarius was 
introduced in 211 BC, and was last made in 275 AD. By then it was made of bronze. Jackknife A 
jackknife is a type of knife. It has a blade that folds into the handle. It is also a dive 
where the body is bent and then straightened before entering the water and when a person backs up in their 
vehicle with a trailer attached and it accidentally folds. Luffa A luffa (also spelled loofah or loofa) is a long 
thin dried inner part of 

In [None]:
# Persist the trained base model in the `.keras` format
model.save(filepath="gptBase.keras")

In [None]:
# Reload the saved file to check that the round trip works
gptBaseModel = tf.keras.models.load_model(filepath="gptBase.keras")

In [None]:
# Generate new text from the loaded model (to check that the weights look ok)
start_token = tokenizer.encode("\n")[0]

# The prompt is a (batch=1, length=1) array of integer token ids; building it
# as int64 avoids relying on implicit float-to-int conversions downstream
new_text = decoded(
    gptBaseModel.generate(
        idx=np.full((1, 1), start_token, dtype=np.int64), max_new_tokens=200
    )[0].numpy()
)

In [None]:
# Check that the model still works as intended: print the generated text in
# rows of `words_per_line` whitespace-separated words; every row is preceded
# by a line break and followed by a space
words_per_line = 20
tokens = new_text.split()
for start in range(0, len(tokens), words_per_line):
    print()
    print(*tokens[start : start + words_per_line], end=" ")


Culture is a word for the 'way of life' of groups of people, meaning the way they do things. Different 
groups may have different cultures. A culture is passed on to the next generation by learning, whereas genetics are passed 
on by heredity. Culture is seen in people's writing, religion, music, clothes, cooking and in what they do. The concept 
of culture is very complicated, and the word has many meanings. The word 'culture' is most commonly used in three 
ways. Most broadly, 'culture' includes all human phenomena which are not purely results of human genetics. The discipline which investigates 
cultures is called anthropology, though many other disciplines play a part. Cultures are what making the country unique and interesting. 
Each country has different cultural activities and cultural rituals. Culture includes material goods, the things the people use and produce. 
Culture is also the beliefs and values of the people and the ways they think about and understand the world 
and 

### Fine tuning for a classification task


In [None]:
# Load IMDB dataset (`load_dataset` is imported in the first cell)
dataset = load_dataset("imdb")

# Add pad token: the end-of-sequence token is reused for padding
tokenizer.pad_token = tokenizer.eos_token


# Tokenize function
def tokenize_function(examples):
    """Tokenize a batch of reviews, padded/truncated to `seq_length` tokens.

    The length is tied to `seq_length` because the pretrained model's
    positional table only covers that many positions.
    """
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=seq_length
    )


# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)


# Convert to TensorFlow dataset
def format_dataset(dataset, shuffle=True, batch_size=32):
    """Turn a tokenized split into batches of `(input_ids, label)`.

    Args:
        dataset: Split providing `"input_ids"` and `"label"` columns.
        shuffle: Whether to shuffle the examples (over the whole split).
        batch_size: Number of examples per batch.

    Returns:
        A batched, prefetched `tf.data.Dataset` of int32 ids and labels.
    """
    input_ids = np.array(dataset["input_ids"], dtype=np.int32)
    labels = np.array(dataset["label"], dtype=np.int32)
    tf_dataset = tf.data.Dataset.from_tensor_slices((input_ids, labels))
    if shuffle:
        # A buffer as large as the split gives a uniform shuffle
        tf_dataset = tf_dataset.shuffle(buffer_size=max(len(labels), 1))
    return tf_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


train_dataset = format_dataset(tokenized_datasets["train"])
# Evaluation does not depend on example order, so the test split is not shuffled
test_dataset = format_dataset(tokenized_datasets["test"], shuffle=False)

In [None]:
# Load the pretrained transformer model
pretrained_model = tf.keras.models.load_model("gptBase.keras")
pretrained_model.trainable = False  # Freeze pretrained layers

# # Unfreeze some layers (e.g., the last 1 layers)
# for layer in pretrained_model.layers[-1:]:
#     layer.trainable = True


# Classification head
@tf.keras.utils.register_keras_serializable()
class TransformerClassifier(tf.keras.Model):
    """Sequence classifier built on top of a pretrained transformer.

    The transformer's per-position outputs are averaged over the sequence
    axis, passed through dropout, and mapped to class probabilities.

    NOTE(review): the `Transformer` defined in this notebook returns
    vocabulary logits, so the pooled features are averaged logits, and the
    average also covers padded positions. Pooling hidden states with a padding
    mask would likely give better features.

    Args:
        transformer: Model mapping token ids `(batch, seq)` to a
            `(batch, seq, features)` tensor.
        num_classes: Number of output classes.
        **kwargs: Forwarded to `tf.keras.Model` (e.g. `name`).
    """

    def __init__(self, transformer, num_classes=2, **kwargs):
        super().__init__(**kwargs)
        self.transformer = transformer
        self.num_classes = num_classes
        self.global_avg_pool = tf.keras.layers.GlobalAveragePooling1D()
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.classifier = tf.keras.layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Return class probabilities of shape `(batch, num_classes)`."""
        x = self.transformer(inputs)
        x = self.global_avg_pool(x)
        x = self.dropout(x)
        return self.classifier(x)

    def get_config(self):
        """Return a config that includes the serialized wrapped transformer.

        Without it the saved model cannot be re-created on load, because the
        constructor needs a transformer instance.
        """
        config = super().get_config()
        config.update(
            {
                "transformer": tf.keras.saving.serialize_keras_object(
                    self.transformer
                ),
                "num_classes": self.num_classes,
            }
        )
        return config

    @classmethod
    def from_config(cls, config):
        """Re-create the classifier, rebuilding the transformer from its config."""
        config = dict(config)  # Do not mutate the caller's dict
        config["transformer"] = tf.keras.saving.deserialize_keras_object(
            config["transformer"]
        )
        return cls(**config)

In [None]:
# Wrap the frozen pretrained model with the classification head
model = TransformerClassifier(transformer=pretrained_model)

# The head ends in a softmax, so the loss is used with its default settings
# (it receives probabilities, not logits)
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    metrics=["accuracy"],
)

In [None]:
# Run a single batch through the new model so that the head's weights get
# created, then print the summary
xb, yb = next(iter(train_dataset))
logits = model(xb)

model.summary()

2025-03-22 15:13:05.617066: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# Train the classifier. Note that the test split doubles as validation data
# here, so the validation metrics are not an independent estimate.
model.fit(x=train_dataset, epochs=2, validation_data=test_dataset)

In [None]:
# Persist the fine-tuned classifier in the `.keras` format
model.save(filepath="gpt_imdb_classifier.keras")

In [None]:
# Compute loss and accuracy on the test split
model.evaluate(x=test_dataset)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.6443 - loss: 0.6465


[0.6449497938156128, 0.6485999822616577]