In [3]:
# !wget https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt


--2025-03-23 13:39:06--  https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 887071 (866K) [text/plain]
Saving to: ‘austen-emma.txt’


2025-03-23 13:39:06 (14.7 MB/s) - ‘austen-emma.txt’ saved [887071/887071]



In [4]:
from tokenizers import ByteLevelBPETokenizer
import tensorflow as tf
import numpy as np

In [5]:
# import the BPE tokenizer from the tokenizers library

from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

In [7]:
# Lowercase normalization

# makes a tokenizer from the BPE tokenizer class
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    Lowercase()
])

# For the normalization part, Lowercase has been added,
# and the pre_tokenizer attribute is set to ByteLevel to ensure we have bytes as our input.
# The decoder attribute must be also set to ByteLevelDecoder to be able to decode correctly.

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

In [8]:
# tokenizer be trained using a 50000 maximum vocabulary size and an initial alphabet from ByteLevel

trainer = BpeTrainer(vocab_size=50000, inital_alphabet=ByteLevel.alphabet(),
                     special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>"
                    ])
tokenizer.train(["austen-emma.txt"], trainer)

In [9]:
!mkdir tokenizer_gpt

In [11]:
tokenizer.save("tokenizer_gpt/tokenizer.json")

In [12]:
from transformers import GPT2TokenizerFast, GPT2Config, TFGPT2LMHeadModel

In [13]:
tokenizer_gpt = GPT2TokenizerFast.from_pretrained("tokenizer_gpt")


In [15]:
tokenizer_gpt.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

0

In [17]:
tokenizer_gpt.eos_token_id

# This code will output the End-of-Sentence (EOS) token Identifier (ID)
#, which is 2 for the current tokenizer.

2

In [18]:
tokenizer_gpt.encode("<s> this is </s>")

[0, 265, 157, 56, 2]

In [20]:
config = GPT2Config(
  vocab_size=tokenizer_gpt.vocab_size,
  bos_token_id=tokenizer_gpt.bos_token_id,
  eos_token_id=tokenizer_gpt.eos_token_id
)
model = TFGPT2LMHeadModel(config)

In [21]:
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.49.0",
  "use_cache": true,
  "vocab_size": 11750
}

In [22]:
with open("austen-emma.txt", "r", encoding='utf-8') as f:
    content = f.readlines()

In [23]:
content_p = []
for c in content:
    if len(c)>10:
        content_p.append(c.strip())

In [24]:
content_p = " ".join(content_p)+tokenizer_gpt.eos_token

In [25]:

tokenized_content = tokenizer_gpt.encode(content_p)

In [26]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(tokenized_content)):
    examples.append(tokenized_content[i:i + block_size])

In [27]:
train_data = []
labels = []
for example in examples:
    train_data.append(example[:-1])
    labels.append(example[1:])

In [28]:
# change 1000 if you want to train on full data
dataset = tf.data.Dataset.from_tensor_slices((train_data[:1000], labels[:1000]))

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [29]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)


In [30]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


In [31]:
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')


In [32]:
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])


In [None]:
# increase number of epochs for higher accuracy and lower loss

num_epoch = 1
history = model.fit(dataset, epochs=num_epoch)

In [34]:
def generate(start):
    input_token_ids = tokenizer_gpt.encode(start, return_tensors='tf')
    output = model.generate(
        input_token_ids,
        max_length = 10,
        num_beams = 5,
        temperature = 0.7,
        no_repeat_ngram_size=2,
        num_return_sequences=1
    )
    return tokenizer_gpt.decode(output[0])

In [35]:
generate(" ")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


' , the,, her her, a,'

In [36]:
generate("wetson was very good")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'wetson was very good,, her,'

In [37]:
!mkdir my_gpt-2

In [38]:
model.save_pretrained("my_gpt-2/")

In [39]:

model_reloaded = TFGPT2LMHeadModel.from_pretrained("my_gpt-2/")

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at my_gpt-2/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [40]:
from transformers import WEIGHTS_NAME, CONFIG_NAME, TF2_WEIGHTS_NAME, AutoModel, AutoTokenizer


In [41]:
tokenizer_gpt.save_pretrained("tokenizer_gpt_auto/")


('tokenizer_gpt_auto/tokenizer_config.json',
 'tokenizer_gpt_auto/special_tokens_map.json',
 'tokenizer_gpt_auto/vocab.json',
 'tokenizer_gpt_auto/merges.txt',
 'tokenizer_gpt_auto/added_tokens.json',
 'tokenizer_gpt_auto/tokenizer.json')

In [42]:
model = AutoModel.from_pretrained("my_gpt-2/", from_tf = True)
tokenizer = AutoTokenizer.from_pretrained("tokenizer_gpt_auto")

All TF 2.0 model weights were used when initializing GPT2Model.

All the weights of GPT2Model were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2Model for predictions without further training.
