In [None]:
from transformers import AutoTokenizer

In [None]:
# Instantiate the previously trained tokenizer from its published checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(
    "ikit-claw-nlp/toy-llm",
)

In [None]:
def generate_training_data(tokenizer, window_size, step_length,
                           corpus_files=("data/text/article_200000-300000.txt",),
                           max_chars=20000, encoding="utf-8"):
    """Yield (input_ids, output_ids) sliding-window pairs for next-token training.

    Each corpus file is read, optionally truncated to its first ``max_chars``
    characters, and tokenized in one call. A window of ``window_size`` token
    ids is then slid over the result in strides of ``step_length``. For every
    window position ``idx`` the generator yields:

    * ``input_ids``  = ids[idx : idx + window_size]
    * ``output_ids`` = ids[idx + 1 : idx + 1 + window_size]  (input shifted by one)

    Only positions where both slices are full-size are produced; the tail of
    the corpus that cannot fill a whole window is dropped rather than padded.

    Args:
        tokenizer: Callable taking a string and returning a mapping that has
            an ``'input_ids'`` entry (a sliceable sequence of token ids).
        window_size: Number of token ids per window. Must be positive.
        step_length: Stride between consecutive window starts. Must be positive.
        corpus_files: Path, or iterable of paths, of the text files to read.
            Defaults to the single article file this notebook was written for.
        max_chars: Keep only this many leading characters of each file before
            tokenizing; ``None`` keeps the whole file.
        encoding: Text encoding used to read the files. Passed explicitly so
            decoding does not depend on the platform's locale default.

    Yields:
        tuple: ``(input_ids, output_ids)``, each of length ``window_size``.

    Raises:
        ValueError: If ``window_size`` or ``step_length`` is not positive.
            Because this is a generator, the error surfaces on first iteration.
    """
    if window_size <= 0 or step_length <= 0:
        raise ValueError(
            "window_size and step_length must be positive, got "
            f"window_size={window_size}, step_length={step_length}"
        )

    # A bare string would otherwise be iterated character by character.
    if isinstance(corpus_files, str):
        corpus_files = [corpus_files]

    for corpus_f in corpus_files:
        with open(corpus_f, 'r', encoding=encoding) as f_handle:
            current_corpus = f_handle.read()
        if max_chars is not None:
            current_corpus = current_corpus[:max_chars]

        tokenized_current_corpus_input_ids = tokenizer(current_corpus)['input_ids']
        print("len of input ids", len(tokenized_current_corpus_input_ids))

        # The last part of the corpus is dropped if it cannot form a full-size
        # window; <pad> is deliberately never used as training input.
        #
        # For those who want to handle the boundary case with padding instead:
        #   text = [a, b, c, d, e, f], step_length = 2, window_size = 4
        #   idx = 0: input [a, b, c, d]       output [b, c, d, e]
        #   idx = 2: input [c, d, e, f]       output [d, e, f, pad]
        #   idx = 4: input [e, f, pad, pad]   output [f, pad, pad, pad]
        #   number of pads = idx + window_size - len(text)      for the input,
        #                  = idx + 1 + window_size - len(text)  for the output.
        #
        # Stopping at len - window_size (exclusive) guarantees that the shifted
        # output slice, which ends at idx + 1 + window_size, is still full-size.
        last_start = len(tokenized_current_corpus_input_ids) - window_size
        for idx in range(0, last_start, step_length):
            input_ids = tokenized_current_corpus_input_ids[idx : idx + window_size]
            output_ids = tokenized_current_corpus_input_ids[idx + 1 : idx + 1 + window_size]
            yield input_ids, output_ids

In [None]:
# Build the lazy window generator: 256-token windows advanced 128 tokens at a time.
data_iter = generate_training_data(
    tokenizer,
    window_size=256,
    step_length=128,
)

In [None]:
# Sanity check: print the length of every input window and of its shifted target.
# data_iter is a generator, so this loop consumes it; re-create it before reuse.
for input_tensor, output_tensor in data_iter:
    print(len(input_tensor), len(output_tensor), sep="\n")