In [1]:
import tensorflow as tf
import tensorflow.keras as keras

from transformers import pipeline, TFAutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, create_optimizer, AdamWeightDecay
import datasets

In [2]:
d = datasets.load_dataset("tiny_shakespeare", split="train")

Using custom data configuration default
Reusing dataset tiny_shakespeare (/home/jon/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e)


In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")

2022-08-17 22:46:18.371948: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-17 22:46:18.776742: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22309 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:68:00.0, compute capability: 8.6
2022-08-17 22:46:19.037246: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-08-17 22:46:19.597857: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
All model checkpoint layers were used

In [4]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [5]:
pad_token_id = tokenizer.get_added_vocab()["[PAD]"]
pad_token_id

50257

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length")

In [7]:
lm_dataset = d.map(tokenize_function, batched=True)

Loading cached processed dataset at /home/jon/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e/cache-1bbf61aa6afd4bba.arrow


In [8]:
def dataset_from_input_ids(input_ids, window_size=100, batch_size=32, shuffle_buffer_size=10000):
    return tf.data.Dataset.from_tensor_slices(input_ids) \
        .window(window_size+1, stride=1, drop_remainder=True) \
        .flat_map(lambda w: w.batch(window_size+1)) \
        .map(lambda w: {"input_ids": w[:-1], "attention_mask": tf.fill([window_size], 1), "labels": w[1:]}) \
        .shuffle(shuffle_buffer_size) \
        .batch(batch_size) \
        .prefetch(1)
    
    
input_ids = lm_dataset["input_ids"][0]
split_idx = (len(input_ids) * 90) // 100

train_ds = dataset_from_input_ids(input_ids[:split_idx])
val_ds = dataset_from_input_ids(input_ids[split_idx:])

In [9]:
for x in train_ds.take(1):
    print(x["input_ids"])
    print(x["attention_mask"])
    print(x["labels"])

tf.Tensor(
[[ 2744  1669    72 ...  1494  1549   198]
 [  326   561  6546 ...   561  5380   514]
 [17862    13   198 ...   198  2514  4467]
 ...
 [  514   284  6731 ... 13889    25   198]
 [  318  3750    11 ...    25   198    46]
 [   30   198  2348 ...    11 12891    11]], shape=(32, 100), dtype=int32)
tf.Tensor(
[[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]], shape=(32, 100), dtype=int32)
tf.Tensor(
[[ 1669    72    25 ...  1549   198  1820]
 [  561  6546   616 ...  5380   514    11]
 [   13   198 12322 ...  2514  4467   345]
 ...
 [  284  6731    25 ...    25   198   464]
 [ 3750    11   290 ...   198    46    11]
 [  198  2348   292 ... 12891    11   290]], shape=(32, 100), dtype=int32)


In [10]:
input_length = 10
generate_length = 40
num_sequences = 5

for item in val_ds.take(1):
    example_input_ids = item["input_ids"]

def generate_text(model, input_ids):
    output_ids = model.generate(
        input_ids=input_ids,
        do_sample=True,
        max_length=generate_length + input_length,
        temperature=1.0,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.0,
        num_return_sequences=num_sequences,
        pad_token_id=pad_token_id,
    )
    
    return tokenizer.batch_decode(output_ids)


outputs = generate_text(model, example_input_ids[:3, :input_length])
for output in outputs:
    print("===================")
    print(output)

 afterwards.

ABHORSON:
We can only hope that when the wishes of the newspaper, first and foremost, we will not hesitate to return to it.
DANIEL BIONCERT:
Within a year, Mr
 afterwards.

ABHORSON:
Our scenario involves free-falling and a new set of mission challenges based on the discovery of the fundamental origins of Mercury. The first five missions are planned and set up by post-flight operations and
 afterwards.

ABHORSON:
Lima said in the immigration issue, her client was worried about not being heard, but that it was safe to say it was not an opportunity for anyone to get in contact with her.
AB
 afterwards.

ABHORSON:
The Treaty of Lincoln reached an agreement with Britain on October 9, 1825, following a six-day open debate about a clause of human rights prohibiting the invading force of the British.
In subsequent
 afterwards.

ABHORSON:
How will you explain this being a product of the NHS?
CHARLOTT:
Within a few weeks, not so long ago, most of us paid a service bill to people t

In [11]:
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [12]:
model.compile(optimizer=optimizer, run_eagerly=True)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [13]:
model.fit(x=train_ds, validation_data=val_ds, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7bf83d8dc0>

In [14]:
outputs = generate_text(model, example_input_ids[:3, :input_length])
for output in outputs:
    print("===================")
    print(output)

 afterwards.

ABHORSON:
 that grace mind the and.
AD ED IV
BING:Good think I even prison would
 I that prison I live my; but hope
is not my land;What can'd do
 afterwards.

ABHORSON:
, again, no no
 I am in one
 senate to senate thou ty.
ISELL:O, shame my, that I
 notford's nor men away many
ish look'd
 afterwards.

ABHORSON:
, I go to Ert andTh;I the more
 live all inul asement
 lie!- living drops eyes
't my
 amby great have sport them butI their
 afterwards.

ABHORSON:
'll me? sir sir,Will a good husband a to himself
ty's and more?
BRUS:No
 I a look in, meet like chamber
 some shade that vite silence
 afterwards.

ABHORSON:
at that you promised you
 might keep sir gain and shall make
 course to return pursuit thy, within.Beours to know,To bears me up town
 soon will make againAnd again
?

LUCIO:
Good even's thosecius
an thatest of when were;
cius that was teach best noble
 I.
UT:Th twice andHe in with: I have
 drunk but thecius of pride
?

LUCIO:
Good even may good those; will
 you b