# Chap 9. GPT

----

Conda env : [cv_playgrounds](../../../README.md#setup-a-conda-environment)

----


In [1]:
import numpy as np
import json
import re
import string
from IPython.display import display, HTML

import tensorflow as tf
from tensorflow.keras import layers, models, losses, callbacks

import kagglehub
import os
from pathlib import Path

In [2]:
devices = tf.config.list_physical_devices()
print(devices)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## 0.Parameters

In [3]:

VOCAB_SIZE = 10000
MAX_LEN = 80
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 4#32
EPOCHS = 5

## 1. Data

In [4]:
# Download latest version
data_dir = kagglehub.dataset_download("zynicide/wine-reviews")
print(data_dir)

# Load the full dataset
data_path = os.path.join(data_dir, "winemag-data-130k-v2.json")
with open(data_path) as json_data:
    wine_data = json.load(json_data)

print(wine_data[10])

# Filter the dataset
filtered_data = [
    "wine review : "
    + x["country"]
    + " : "
    + x["province"]
    + " : "
    + x["variety"]
    + " : "
    + x["description"]
    for x in wine_data
    if x["country"] is not None
    and x["province"] is not None
    and x["variety"] is not None
    and x["description"] is not None
]

# Count the recipes
n_wines = len(filtered_data)
print(f"{n_wines} recipes loaded")

example = filtered_data[25]
print(example)

/Users/hyunjae.k/.cache/kagglehub/datasets/zynicide/wine-reviews/versions/4
{'points': '87', 'title': 'Kirkland Signature 2011 Mountain Cuvée Cabernet Sauvignon (Napa Valley)', 'description': 'Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee and chocolate complete the picture, finishing strong at the end, resulting in a value-priced wine of attractive flavor and immediate accessibility.', 'taster_name': 'Virginie Boone', 'taster_twitter_handle': '@vboone', 'price': 19, 'designation': 'Mountain Cuvée', 'variety': 'Cabernet Sauvignon', 'region_1': 'Napa Valley', 'region_2': 'Napa', 'province': 'California', 'country': 'US', 'winery': 'Kirkland Signature'}
129907 recipes loaded
wine review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard-designated Pinot that hails from a high-elevation site. Small in production, it offers intense, full-bodied raspberry and blackberry steeped

## 2. Tokenize the data

In [5]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}, '\n'])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


text_data = [pad_punctuation(x) for x in filtered_data]

# Display an example of a recipe
example_data = text_data[25]
print(example_data)

# Convert to a Tensorflow Dataset
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

# Create a vectorisation layer
vectorize_layer = layers.TextVectorization(
    # standardize="lower",
    standardize = "lower_and_strip_punctuation",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()


# Display some token:word mappings
print("\n ** Display some token:word mappings")
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")


# Display the same example converted to ints
print("\n ** Display the same example converted to ints")
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())


wine review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard - designated Pinot that hails from a high - elevation site . Small in production , it offers intense , full - bodied raspberry and blackberry steeped in smoky spice and smooth texture . 

 ** Display some token:word mappings
0: 
1: [UNK]
2: and
3: the
4: wine
5: a
6: of
7: review
8: with
9: this

 ** Display the same example converted to ints
[   4    7   15   24   38   56   49    2  235 4131  443  624   21    6
  487  489  657   13    9  136 2201   38   20 2471   27    5  215 2200
  937  584   13  976   11   69  229   58   76   91    2   68 2619   13
  192   43    2  119   71    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


2025-05-12 22:53:58.067634: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## 3. Training dataset

In [6]:
# Create the training set of recipes and the same text shifted by one word
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


train_ds = text_ds.map(prepare_inputs)

example_input_output = train_ds.take(1).get_single_element()
print("\n -- Example of training dataset")
print("x : ", example_input_output[0][0])
print("y : ", example_input_output[0][1])


 -- Example of training dataset
x :  tf.Tensor(
[   4    7   37   59   59   57   19   22  316 7931 1280    6  120   36
 2187    9    4    1    6   17    2 2560    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0], shape=(80,), dtype=int64)
y :  tf.Tensor(
[   4    7   37   59   59   57   19   22    9    4   10  164    2  762
  528    3   17  225 3122    3 1593  197    5  192  104  279   78    3
   25   14 1033    5   52  302   78  463   11  107  673    5  218  330
  841 2325    3   17    3   22   10 2362   94  958   40    1    2  567
   40  194    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0], shape=(80,), dtype=int64)


## 

## 4. Causal attention mask 

In [7]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)


np.transpose(causal_attention_mask(1, 10, 10, dtype=tf.int32)[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int32)

## 5. Transformer Block layer

In [8]:
class TransformerBlock(layers.Layer):
    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        self.attn = layers.MultiHeadAttention(
            num_heads, key_dim, output_shape=embed_dim
        )
        self.dropout_1 = layers.Dropout(self.dropout_rate)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.ff_dim, activation="relu")
        self.ffn_2 = layers.Dense(self.embed_dim)
        self.dropout_2 = layers.Dropout(self.dropout_rate)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(
            batch_size, seq_len, seq_len, tf.bool
        )
        attention_output, attention_scores = self.attn(
            inputs,
            inputs,
            attention_mask=causal_mask,
            return_attention_scores=True,
        )
        attention_output = self.dropout_1(attention_output)
        out1 = self.ln_1(inputs + attention_output)
        ffn_1 = self.ffn_1(out1)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)
        return (self.ln_2(out1 + ffn_output), attention_scores)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "key_dim": self.key_dim,
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "dropout_rate": self.dropout_rate,
            }
        )
        return config

## 6. Tokenize and Position Embedding

In [9]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, max_len, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

## 7. Transformer Model


In [10]:
inputs = layers.Input(shape=(None,), dtype=tf.int32)
x = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x, attention_scores = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM
)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
gpt = models.Model(inputs=inputs, outputs=[outputs, attention_scores])
gpt.compile("adam", loss=[losses.SparseCategoricalCrossentropy(), None])

gpt.summary()

if LOAD_MODEL:
    # model.load_weights('./models/model')
    gpt = models.load_model("./temp/models/gpt", compile=True)

## 8.Train Transformer Model

In [11]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y, att = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append(
                {
                    "prompt": start_prompt,
                    "word_probs": probs,
                    "atts": att[0, :, -1, :],
                }
            )
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("wine review", max_tokens=80, temperature=1.0)

In [12]:
# Training
# with tf.device('cpu:0'):

# Create a model save checkpoint
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./temp/checkpoint/checkpoint.weights.h5",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

tensorboard_callback = callbacks.TensorBoard(log_dir="./temp/logs")

# Tokenize starting prompt
text_generator = TextGenerator(vocab)

gpt.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

# Save the final model
Path("./temp/models").mkdir(exist_ok=True, parents=True)
gpt.save("./temp/models/gpt.h5")


Epoch 1/5
[1m32477/32477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step - loss: 2.3420
generated text:
wine review us california zinfandel the opening of red currant and cranberry meet with brambly menthol ripe red licorice and shiitake mushrooms on the nose of this soft and forward wine the palate has soft silkiness and tannins and sage a ashy balance it s bold and quite tasty drink now through through 2012 

[1m32477/32477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4885s[0m 150ms/step - loss: 2.3419
Epoch 2/5
[1m32477/32477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - loss: 1.9857
generated text:
wine review portugal alentejano portuguese red aged in wood integrated spice and new wood aged blend from the wood grain and here is very dominating the tannin structure translate to the wine to the caramel fruit character although this comes from the wood gives a juicy midpalate drink from 2018 

[1m32477/32477[0m [32m━━━━━━━━━━━━━━━━━━━━[



## 9. Generat text with the pre-trained Model

In [13]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        highlighted_text = []
        for word, att_score in zip(
            i["prompt"].split(), np.mean(i["atts"], axis=0)
        ):
            highlighted_text.append(
                '<span style="background-color:rgba(135,206,250,'
                + str(att_score / max(np.mean(i["atts"], axis=0)))
                + ');">'
                + word
                + "</span>"
            )
        highlighted_text = " ".join(highlighted_text)
        display(HTML(highlighted_text))

        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [14]:
info = text_generator.generate(
    "wine review : us", max_tokens=80, temperature=1.0
)


generated text:
wine review : us david [UNK] sources this bold wine the wine from eight different varieties which is aside the fresh and full of black cherry fruit flavors accented by fresh grass incense smoky charred wood the alcohol remains balanced and bold on the palate it but fully developed with moderate tannins it finishes with green notes and the label this is a uniquely world mace 2017–2019 



In [15]:
info = text_generator.generate(
    "wine review : italy", max_tokens=80, temperature=0.5
)


generated text:
wine review : italy other white blend a blend of sauvignon blanc and sauvignon blanc with a touch of creaminess to balance the nose and flavors of apricot melon peach and papaya are cut by a clean finish 



In [16]:
info = text_generator.generate(
    "wine review : germany", max_tokens=80, temperature=0.5
)
print_probs(info, vocab)


generated text:
wine review : germany riesling this is a serious and appealing wine that has a deep dark color and dense with thick layers of stone fruits and lime zest it s a solid wine with a long finish that s a gorgeous wine 



riesling:   	66.23%
cabernet:   	9.14%
pinot:   	6.66%
sparkling:   	3.81%
white:   	3.27%
--------



a:   	71.81%
[UNK]:   	7.57%
this:   	7.38%
the:   	6.16%
an:   	1.38%
--------



is:   	44.5%
wine:   	41.09%
rich:   	1.77%
off:   	1.7%
riesling:   	1.32%
--------



a:   	99.57%
an:   	0.37%
the:   	0.02%
one:   	0.02%
[UNK]:   	0.0%
--------



rich:   	26.9%
gorgeous:   	20.93%
ripe:   	6.12%
lovely:   	5.18%
fresh:   	4.43%
--------



and:   	67.35%
concentrated:   	16.15%
impressive:   	4.73%
wine:   	2.24%
complex:   	1.59%
--------



complex:   	82.84%
concentrated:   	6.23%
minerally:   	3.83%
elegant:   	1.19%
powerful:   	0.8%
--------



wine:   	97.26%
riesling:   	1.92%
example:   	0.24%
white:   	0.12%
style:   	0.08%
--------



with:   	77.86%
that:   	18.1%
from:   	1.87%
it:   	0.85%
for:   	0.33%
--------



has:   	30.61%
s:   	27.68%
offers:   	18.44%
is:   	9.5%
shows:   	4.73%
--------



a:   	94.05%
plenty:   	1.02%
an:   	0.71%
intense:   	0.43%
layers:   	0.37%
--------



deep:   	11.54%
rich:   	9.18%
touch:   	7.42%
mineral:   	6.87%
good:   	6.4%
--------



gold:   	42.97%
mineral:   	39.81%
color:   	4.32%
core:   	3.46%
concentrated:   	1.99%
--------



color:   	88.56%
gold:   	9.99%
mineral:   	0.96%
golden:   	0.11%
ruby:   	0.05%
--------



and:   	91.2%
with:   	4.52%
it:   	3.1%
to:   	0.26%
of:   	0.26%
--------



concentrated:   	37.09%
intense:   	8.68%
dense:   	7.41%
rich:   	7.23%
spicy:   	3.95%
--------



and:   	39.17%
with:   	17.56%
in:   	14.29%
it:   	8.18%
aromas:   	7.38%
--------



a:   	80.41%
layers:   	3.61%
flavors:   	2.52%
hints:   	2.1%
aromas:   	2.03%
--------



layers:   	57.51%
concentration:   	12.07%
tannins:   	5.57%
concentrated:   	4.81%
aromas:   	4.8%
--------



of:   	100.0%
and:   	0.0%
that:   	0.0%
it:   	0.0%
in:   	0.0%
--------



smoke:   	25.93%
black:   	9.59%
honey:   	6.9%
ripe:   	5.94%
apricot:   	4.65%
--------



fruit:   	93.72%
fruits:   	5.85%
and:   	0.4%
berry:   	0.01%
smoke:   	0.01%
--------



and:   	91.45%
it:   	2.81%
apricot:   	1.42%
apricots:   	1.4%
that:   	0.69%
--------



smoke:   	15.06%
honey:   	12.68%
lanolin:   	6.64%
apricots:   	6.25%
petrol:   	6.23%
--------



zest:   	59.99%
it:   	22.59%
acidity:   	4.36%
the:   	3.06%
juice:   	1.84%
--------



it:   	84.42%
the:   	12.17%
on:   	0.71%
there:   	0.62%
that:   	0.6%
--------



s:   	72.7%
has:   	15.78%
is:   	8.21%
offers:   	1.22%
finishes:   	0.94%
--------



a:   	92.28%
full:   	1.13%
medium:   	0.88%
concentrated:   	0.68%
dry:   	0.67%
--------



big:   	18.99%
concentrated:   	9.52%
good:   	6.09%
fine:   	6.01%
gorgeous:   	5.83%
--------



wine:   	46.86%
example:   	42.51%
concentrated:   	2.03%
and:   	1.89%
value:   	1.85%
--------



with:   	66.29%
that:   	22.96%
to:   	4.95%
for:   	4.68%
at:   	0.26%
--------



a:   	98.06%
an:   	0.32%
rich:   	0.25%
good:   	0.14%
loads:   	0.12%
--------



long:   	90.76%
rich:   	1.76%
deep:   	1.19%
hint:   	0.56%
touch:   	0.54%
--------



finish:   	85.48%
spicy:   	4.39%
mineral:   	3.58%
minerally:   	1.63%
mouthwatering:   	0.88%
--------



:   	70.27%
that:   	16.31%
drink:   	8.66%
it:   	2.14%
of:   	0.73%
--------



s:   	84.15%
offers:   	3.03%
has:   	2.76%
shows:   	2.01%
lingers:   	1.58%
--------



a:   	41.21%
worth:   	31.35%
long:   	3.55%
dry:   	3.54%
[UNK]:   	2.15%
--------



great:   	25.11%
good:   	11.87%
fine:   	10.65%
wine:   	7.78%
big:   	5.28%
--------



wine:   	97.29%
example:   	2.08%
bottle:   	0.19%
expression:   	0.18%
food:   	0.05%
--------



to:   	51.5%
with:   	21.87%
that:   	14.18%
for:   	5.48%
:   	3.91%
--------

