In [None]:
# Download the training corpus (Jane Austen's "Emma", Project Gutenberg text).
# -nc (no-clobber) skips the download when the file is already present, so
# re-running the cell does not leave a duplicate austen-emma.txt.1 behind.
!wget -nc https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt

In [None]:
from tokenizers import ByteLevelBPETokenizer
import tensorflow as tf
import numpy as np

In [None]:
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

In [None]:
# Start from an untrained BPE model; its vocabulary is learned in the training cell.
tokenizer = Tokenizer(BPE())
# Byte-level pre-tokenization on the way in, and the matching byte-level
# decoder on the way out.
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
# Lowercasing is the only normalization step applied to the text.
tokenizer.normalizer = Sequence([Lowercase()])

In [None]:
# Configure BPE training.
# `initial_alphabet` (previously misspelled `inital_alphabet`, so the argument
# never reached the trainer) seeds the vocabulary with the byte-level alphabet.
# The five special tokens are listed here so that they are part of the trained
# vocabulary.
trainer = BpeTrainer(
    vocab_size=50000,
    initial_alphabet=ByteLevel.alphabet(),
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
# Learn the merges/vocabulary from the downloaded corpus file.
tokenizer.train(["austen-emma.txt"], trainer)

In [None]:
# -p keeps the cell re-runnable: no error if the directory already exists.
!mkdir -p tokenizer_gpt

In [None]:
# Write the trained tokenizer to tokenizer.json inside the tokenizer_gpt/ directory.
tokenizer.save("tokenizer_gpt/tokenizer.json")

In [None]:
from transformers import GPT2TokenizerFast, GPT2Config, TFGPT2LMHeadModel

In [None]:
# Load the tokenizer saved under tokenizer_gpt/ as a GPT2TokenizerFast.
tokenizer_gpt = GPT2TokenizerFast.from_pretrained("tokenizer_gpt")

In [None]:
# Tell the GPT-2 tokenizer which strings play the eos/bos/unk/pad/mask roles.
# These are the same five strings that were passed to the BPE trainer as
# special tokens. The call is the cell's last expression, so its return value
# is displayed.
tokenizer_gpt.add_special_tokens(
    dict(
        eos_token="</s>",
        bos_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
    )
)

In [None]:
# Display the id assigned to the end-of-sequence token.
tokenizer_gpt.eos_token_id

In [None]:
# Encode a short sample wrapped in the bos/eos strings and display the resulting ids.
tokenizer_gpt.encode("<s> this is </s>")

In [None]:
# Build a GPT-2 configuration matched to the custom tokenizer and create a
# freshly initialised (untrained) TF language model from it.
#
# The vocabulary size comes from len(tokenizer_gpt) rather than
# tokenizer_gpt.vocab_size: the latter reports only the base vocabulary, while
# len() also counts tokens added afterwards, so the embedding table is always
# large enough for every id the tokenizer can emit.
config = GPT2Config(
    vocab_size=len(tokenizer_gpt),
    bos_token_id=tokenizer_gpt.bos_token_id,
    eos_token_id=tokenizer_gpt.eos_token_id,
)
model = TFGPT2LMHeadModel(config)

In [None]:
# Display the GPT2Config built in the previous cell.
config

In [None]:
# Load the corpus as a list of raw lines (line endings are kept).
with open("austen-emma.txt", encoding="utf-8") as f:
    content = list(f)

In [None]:
# Keep only lines longer than 10 characters, then strip surrounding whitespace.
# The length is measured on the raw line, so it includes the line ending.
content_p = [c.strip() for c in content if len(c) > 10]

In [None]:
# Collapse the kept lines into one space-separated string terminated by the
# end-of-sequence token.
# The isinstance guard makes the cell idempotent: on a re-run content_p is
# already the joined string, and str.join over a string would insert a space
# between every character (and append a second eos token).
if isinstance(content_p, list):
    content_p = " ".join(content_p) + tokenizer_gpt.eos_token

In [None]:
# Encode the whole corpus string into one sequence of token ids.
tokenized_content = tokenizer_gpt.encode(content_p)

In [None]:
# Window length (in tokens) of each training example, plus the batching and
# shuffle-buffer sizes used when the tf.data pipeline is built.
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000

# Slide a window of block_size tokens over the corpus with stride 1.
# The range stops at the last start index that still yields a full window;
# running to len(tokenized_content) would append block_size - 1 progressively
# shorter tail slices, giving ragged examples that cannot be stacked into a
# rectangular tensor once more than the first few examples are used.
examples = [
    tokenized_content[start : start + block_size]
    for start in range(0, len(tokenized_content) - block_size + 1)
]

In [None]:
# Next-token prediction pairs: the input is each window without its last
# token, the label is the same window shifted left by one position.
train_data = [example[:-1] for example in examples]
labels = [example[1:] for example in examples]

In [None]:
# change 1000 if you want to train on full data
# Pipeline: first 1000 (input, label) pairs -> shuffle -> fixed-size batches,
# with any final partial batch dropped.
dataset = (
    tf.data.Dataset.from_tensor_slices((train_data[:1000], labels[:1000]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Adam with a small learning rate; clipnorm=1.0 enables gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-8,
    clipnorm=1.0,
)

In [None]:
# Cross-entropy over integer token-id labels; from_logits=True tells Keras the
# predictions are raw logits rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Token-level accuracy against integer labels, reported under the name "accuracy".
metric = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")

In [None]:
# The loss list has one real entry followed by n_layer None placeholders, so
# only the first model output contributes to the loss.
# NOTE(review): this presumes the model returns the logits plus one extra
# output per layer - confirm against the installed transformers version.
model.compile(
    optimizer=optimizer,
    loss=[loss] + [None] * model.config.n_layer,
    metrics=[metric],
)

In [None]:
# Train on the batched dataset and keep the returned History object.
# Increase the number of epochs for higher accuracy and lower loss.
num_epoch = 1
history = model.fit(dataset, epochs=num_epoch)

In [None]:
def generate(start, max_length=10, num_beams=5):
    """Continue the prompt `start` with the notebook's GPT-2 model.

    Reads the notebook-level `tokenizer_gpt` and `model` at call time, so it
    uses whatever objects those names are bound to when it is called.

    Args:
        start: Prompt text to encode and feed to the model.
        max_length: Forwarded to `model.generate` as `max_length`; in
            transformers this caps the total sequence length, prompt tokens
            included. Defaults to 10, the value that used to be hard-coded.
        num_beams: Beam width forwarded to `model.generate`. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        The first returned sequence decoded back to a string.
    """
    input_token_ids = tokenizer_gpt.encode(start, return_tensors="tf")
    # NOTE(review): temperature is passed without do_sample; confirm whether it
    # has any effect under beam search in the installed transformers version.
    output = model.generate(
        input_token_ids,
        max_length=max_length,
        num_beams=num_beams,
        temperature=0.7,
        no_repeat_ngram_size=2,  # never repeat the same 2-gram in the output
        num_return_sequences=1,
    )
    return tokenizer_gpt.decode(output[0])

In [None]:
# Generate from a single-space prompt and display the decoded text.
generate(" ")

In [None]:
# Generate from a short prompt about a character in the training corpus.
# The prompt previously read "wetson", which looks like a misspelling of the
# character name "weston".
generate("weston was very good")

In [None]:
# -p keeps the cell re-runnable: no error if the directory already exists.
!mkdir -p my_gpt-2

In [None]:
# Save the trained model into the my_gpt-2/ directory.
model.save_pretrained("my_gpt-2/")

In [None]:
# Reload the saved model from disk into a separate variable.
model_reloaded = TFGPT2LMHeadModel.from_pretrained("my_gpt-2/")

In [None]:
from transformers import (
    WEIGHTS_NAME,
    CONFIG_NAME,
    TF2_WEIGHTS_NAME,
    AutoModel,
    AutoTokenizer,
)

In [None]:
# Save the GPT2TokenizerFast tokenizer into the tokenizer_gpt_auto/ directory.
tokenizer_gpt.save_pretrained("tokenizer_gpt_auto/")

In [None]:
# Load the saved model and tokenizer back through the generic Auto* classes.
# NOTE(review): this rebinds `model` (previously the TFGPT2LMHeadModel) and
# `tokenizer` (previously the tokenizers.Tokenizer that was trained above).
# generate() reads the global `model`, so calling it after this cell uses the
# AutoModel instance instead - consider distinct names if earlier cells are
# re-run afterwards.
# NOTE(review): from_tf=True presumably loads the TF checkpoint into a PyTorch
# model, which would require PyTorch to be installed - confirm.
model = AutoModel.from_pretrained("my_gpt-2/", from_tf=True)
tokenizer = AutoTokenizer.from_pretrained("tokenizer_gpt_auto")