In [30]:
import numpy as np
import tensorflow as tf
from transformers import GPT2Tokenizer
from datasets import load_dataset
from tensorflow import keras
from transformers import DefaultDataCollator
from itertools import chain
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling

In [94]:
batch_size = 128
maxlen = 80
max_train_samples = 1000
max_eval_samples = 1000
val_split = 0.5

In [103]:
dataset_raw = load_dataset("wikitext", "wikitext-2-v1")

Reusing dataset wikitext (/home/acozma/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

In [104]:
print(dataset_raw.shape)
column_names = dataset_raw.column_names
print(column_names)
print()
print(dataset_raw["train"][3])
print(dataset_raw["test"][3])

{'test': (4358, 1), 'train': (36718, 1), 'validation': (3760, 1)}
{'test': ['text'], 'train': ['text'], 'validation': ['text']}

{'text': ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n'}
{'text': ' Robert <unk> is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was f

In [105]:
# create_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
create_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
create_tokenizer.pad_token = create_tokenizer.eos_token

tokenizer_vars = vars(create_tokenizer)

for key, value in tokenizer_vars.items():
    if not isinstance(value, (list, dict)):
        print(f"{key}: {value}")
    elif isinstance(value, list):
        print(f"{key}: [list] {len(value)} items")
    elif isinstance(value, dict):
        print(f"{key}: [dict] {len(value)} items")

_tokenizer: <tokenizers.Tokenizer object at 0x22eaf3d0>
_decode_use_source_tokenizer: False
init_inputs: ()
init_kwargs: [dict] 7 items
name_or_path: distilgpt2
_processor_class: None
model_max_length: 1024
padding_side: right
truncation_side: right
model_input_names: [list] 2 items
_bos_token: <|endoftext|>
_eos_token: <|endoftext|>
_unk_token: <|endoftext|>
_sep_token: None
_pad_token: <|endoftext|>
_cls_token: None
_mask_token: None
_pad_token_type_id: 0
_additional_special_tokens: [list] 0 items
verbose: True
add_prefix_space: False


In [116]:
def tokenize_function(examples):
    # print(examples["text"])
    return create_tokenizer(examples["text"])

tokenized_datasets = dataset_raw.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset_raw['train'].column_names,
    load_from_cache_file=False,
    desc="Running create_tokenizer on dataset",
)

      

Running create_tokenizer on dataset #1:   0%|          | 0/2 [00:00<?, ?ba/s]

Running create_tokenizer on dataset #0:   0%|          | 0/2 [00:00<?, ?ba/s]

  

Running create_tokenizer on dataset #2:   0%|          | 0/2 [00:00<?, ?ba/s]

Running create_tokenizer on dataset #3:   0%|          | 0/2 [00:00<?, ?ba/s]

      

Running create_tokenizer on dataset #0:   0%|          | 0/10 [00:00<?, ?ba/s]

Running create_tokenizer on dataset #1:   0%|          | 0/10 [00:00<?, ?ba/s]

  

Running create_tokenizer on dataset #2:   0%|          | 0/10 [00:00<?, ?ba/s]

Running create_tokenizer on dataset #3:   0%|          | 0/10 [00:00<?, ?ba/s]

      

Running create_tokenizer on dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

Running create_tokenizer on dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Running create_tokenizer on dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Running create_tokenizer on dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

In [117]:
print(tokenized_datasets["train"][4])

{'input_ids': [383, 983, 2540, 2478, 287, 3050, 837, 6872, 625, 257, 1588, 6903, 286, 262, 670, 1760, 319, 569, 18354, 7496, 17740, 2873, 764, 2893, 340, 17383, 262, 3210, 3033, 286, 262, 2168, 837, 340, 635, 25289, 3294, 16895, 837, 884, 355, 1642, 262, 983, 517, 1279, 2954, 29, 329, 2168, 29661, 764, 15684, 11915, 1279, 2954, 29, 8835, 73, 280, 290, 26777, 7286, 13704, 13231, 43354, 1111, 4504, 422, 2180, 12784, 837, 1863, 351, 569, 18354, 7496, 17740, 2873, 3437, 33687, 5303, 18024, 6909, 764, 317, 1588, 1074, 286, 8786, 12118, 262, 4226, 764, 383, 983, 705, 82, 4756, 7505, 373, 23568, 416, 1737, 705, 77, 764, 220, 198], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [121]:

# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    # concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= maxlen:
        total_length = (total_length // maxlen) * maxlen
    # total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Split by chunks of max_len.
    result = {
        k: [t[i: i + maxlen] for i in range(0, total_length, maxlen)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4,
    load_from_cache_file=False,
    desc=f"Grouping texts in chunks of {maxlen}",
)

      

Grouping texts in chunks of 80 #0:   0%|          | 0/2 [00:00<?, ?ba/s]

Grouping texts in chunks of 80 #1:   0%|          | 0/2 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 80 #3:   0%|          | 0/2 [00:00<?, ?ba/s]

Grouping texts in chunks of 80 #2:   0%|          | 0/2 [00:00<?, ?ba/s]

      

Grouping texts in chunks of 80 #0:   0%|          | 0/10 [00:00<?, ?ba/s]

Grouping texts in chunks of 80 #1:   0%|          | 0/10 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 80 #2:   0%|          | 0/10 [00:00<?, ?ba/s]

Grouping texts in chunks of 80 #3:   0%|          | 0/10 [00:00<?, ?ba/s]

      

Grouping texts in chunks of 80 #0:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 80 #1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

Grouping texts in chunks of 80 #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 80 #3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [80]:
# data_collator = DefaultDataCollator(return_tensors="tf")
# options = tf.data.Options()
# options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
#
# train_dataset = dataset_raw["train"]
# tf_train_dataset = train_dataset.to_tf_dataset(
#     # labels are passed as input, as we will use the model's internal loss
#     columns=[col for col in train_dataset.features if col != "special_tokens_mask"],
#     shuffle=True,
#     batch_size=batch_size,
#     collate_fn=data_collator,
#     drop_remainder=True,
# ).with_options(options)

InvalidArgumentError: Length for attr 'output_shapes' of 0 must be at least minimum 1
	; NodeDef: {{node MapDataset}}; Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=metadata:string,default=""> [Op:MapDataset]

In [123]:
data_collator = DataCollatorForLanguageModeling(create_tokenizer=create_tokenizer, mlm=False, return_tensors="tf")
# options = tf.data.Options()
# options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

# tf_train_dataset = dataset_raw.to_tf_dataset(
#     # labels are passed as input, as we will use the model's internal loss
#     columns=[col for col in dataset_raw.features if col != "special_tokens_mask"],
#     shuffle=True,
#     batch_size=batch_size,
#     collate_fn=data_collator,
#     drop_remainder=True,
# )#.with_options(options)

tf_train_set = lm_datasets["train"].to_tf_dataset(
    columns=["input_ids", "labels"],
    dummy_labels=True, # TODO: Idk what this does
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [125]:
for d in tf_train_set.take(5):
    print(d)

({'input_ids': <tf.Tensor: shape=(128, 80), dtype=int64, numpy=
array([[ 9471,   764,   383, ...,   422,  1903, 16713],
       [  262,  3833,   764, ...,  5249,   743,  3051],
       [  366,   290,   788, ...,  1279,  2954,    29],
       ...,
       [ 8701,   274,   290, ...,  1279,  2954,    29],
       [ 5096,   329,  4025, ...,  2954,    29,  1869],
       [ 2911,   286,  4917, ..., 29666,  2488,    12]])>, 'labels': <tf.Tensor: shape=(128, 80), dtype=int64, numpy=
array([[ 9471,   764,   383, ...,   422,  1903, 16713],
       [  262,  3833,   764, ...,  5249,   743,  3051],
       [  366,   290,   788, ...,  1279,  2954,    29],
       ...,
       [ 8701,   274,   290, ...,  1279,  2954,    29],
       [ 5096,   329,  4025, ...,  2954,    29,  1869],
       [ 2911,   286,  4917, ..., 29666,  2488,    12]])>, 'attention_mask': <tf.Tensor: shape=(128, 80), dtype=int64, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,


In [39]:
def create_sequences(text):
    """
    Shift word sequences by 1 position so that the target for position (i) is
    word at position (i+1). The model will use all words up till position (i)
    to predict the next word.
    """
    print(text)
    return text


text_ds = tf_train_dataset.map(create_sequences)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

{'labels': <tf.Tensor 'args_2:0' shape=(128, None) dtype=int64>, 'input_ids': <tf.Tensor 'args_1:0' shape=(128, None) dtype=int64>, 'attention_mask': <tf.Tensor 'args_0:0' shape=(128, None) dtype=int64>}


In [22]:
class TextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    Arguments:
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(
            self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1
    ):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_top_k(self, logits):
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        start_tokens = [_ for _ in self.start_tokens]
        if (epoch + 1) % self.print_every != 0:
            return
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:maxlen]
                sample_index = maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y, _ = self.model.predict(x)
            sample_token = self.sample_top_k(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = " ".join(
            [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
        )
        print(f"generated text:\n{txt}\n")


# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index

start_prompt = "this movie is"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]
num_tokens_generated = 40
text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)

In [23]:
model = create_model()

model.fit(text_ds, verbose=2, epochs=25, callbacks=[text_gen_callback])

Epoch 1/25


2022-05-16 05:00:01.657043: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


generated text:
this movie is the only to be better than a film . the characters i can have a few laughs from start . the only reason i have seen . this is a great film in my opinion . i 've seen the story

391/391 - 33s - loss: 5.5994 - dense_67_loss: 5.5994 - 33s/epoch - 85ms/step
Epoch 2/25
generated text:
this movie is so funny , but that has a good way . the script is not a good film . it 's really a little more than one . it 's the funniest film i 've ever seen . it was a shame

391/391 - 29s - loss: 4.7098 - dense_67_loss: 4.7098 - 29s/epoch - 75ms/step
Epoch 3/25
generated text:
this movie is one of the funniest , i was so disappointed with the acting in my opinion of a movie . i have to say the story of the movie is based on a true story . it also is based on an

391/391 - 29s - loss: 4.4620 - dense_67_loss: 4.4620 - 29s/epoch - 74ms/step
Epoch 4/25
generated text:
this movie is a complete bore . the story itself is a very good movie . i don 't know how much of [UNK] " and [UN

KeyboardInterrupt: 