In [None]:
from transformers import AutoTokenizer
import glob
from torch.utils.data import DataLoader, IterableDataset
import torch

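Test the iterator: define an `IterableDataset` that yields sliding (input, target) windows of token ids from the corpus files.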

In [None]:
class CorpusDataset(IterableDataset):
    """Stream fixed-size next-token-prediction windows from text files.

    Each file in ``file_list`` is read completely into memory, tokenized in a
    single ``tokenizer.encode`` call, and then cut into windows of
    ``window_size`` token ids whose start positions are ``step_length`` apart.
    Every window is paired with the same window shifted right by one token.

    Args:
        file_list: Paths of the text files to stream, visited in the given order.
        window_size: Number of token ids in each yielded input and target.
        step_length: Distance (in tokens) between the starts of two
            consecutive windows.
        tokenizer: Object exposing ``encode(text, return_tensors="pt")`` that
            returns a tensor of token ids; the result is flattened to 1-D.

    Yields:
        Tuples ``(input_ids, output_ids)`` of 1-D tensors, each of length
        ``window_size``, where ``output_ids`` is ``input_ids`` shifted by one
        position within the same file.

    Note:
        ``__iter__`` contains no per-worker sharding, so every iterator created
        from this dataset walks the complete ``file_list``.
    """

    def __init__(self, file_list, window_size, step_length, tokenizer):
        # The one-argument form ``super(CorpusDataset)`` creates an unbound
        # super object and never reaches the parent initialiser.
        super().__init__()
        self.file_list = file_list
        self.window_size = window_size
        self.step_length = step_length
        self.tokenizer = tokenizer

    def __iter__(self):
        # For those who want to handle the boundary case:
        # [a, b, c, d, e, f] => text. step_length = 2, window_size = 4
        # First window:  (input) [a, b, c, d]     (output) [b, c, d, e]       => idx = 0
        # Second window: (input) [c, d, e, f]     (output) [d, e, f, pad]     => idx = 2
        # Third window:  (input) [e, f, pad, pad] (output) [f, pad, pad, pad] => idx = 4
        # the number of pad = idx + window_size - len(text) for input.
        #                   = idx + 1 + window_size - len(text) for output.
        # I choose to ignore using <pad> as input in training.
        for corpus_f in self.file_list:
            # Explicit encoding so the result does not depend on the platform's
            # default locale encoding.
            with open(corpus_f, 'r', encoding='utf8') as f_handle:
                print(f"Loading the dataset {corpus_f} into memory...")
                current_corpus = f_handle.read()
            print("Converting the dataset to token ids...")
            tokenized_current_corpus_input_ids = self.tokenizer.encode(current_corpus,
                                                                       return_tensors="pt")
            # Flatten to 1-D. Unlike torch.squeeze, reshape(-1) keeps a
            # single-token corpus as a length-1 vector instead of a 0-d tensor,
            # so len() below stays valid.
            tokenized_current_corpus_input_ids = tokenized_current_corpus_input_ids.reshape(-1)
            print("Conversion Complete.", tokenized_current_corpus_input_ids.shape,
                  "Tokens in the corpus.")
            # The target needs one token beyond the input window, so the last
            # usable start index is len - window_size - 1; range() excludes
            # len - window_size, which gives exactly that bound.
            for idx in range(0, len(tokenized_current_corpus_input_ids) - self.window_size, self.step_length):
                # Note that we drop the last part of the corpus if it cannot form a full-size window.
                # We do not use <pad> to pad the last part of the corpus.
                input_ids = tokenized_current_corpus_input_ids[idx : idx + self.window_size]
                output_ids = tokenized_current_corpus_input_ids[idx + 1 : idx + 1 + self.window_size]
                yield input_ids, output_ids

In [None]:
# Fetch the tokenizer we've trained and report how many tokens its
# vocabulary contains.
tokenizer = AutoTokenizer.from_pretrained("ikit-claw-nlp/toy-llm")
vocab_size = tokenizer.vocab_size
print("The vocabulary has", vocab_size, "tokens.")

In [None]:
# glob returns matches in arbitrary order; sorting makes the file order, and
# therefore the first batch and file_list[0], the same on every run.
file_list = sorted(glob.glob("data/full_text/*.txt"))
# Fail here with a clear message instead of later cells failing on an
# undefined batch or an empty file_list.
if not file_list:
    raise FileNotFoundError('No corpus files match "data/full_text/*.txt".')

# Small window and step so the decoded samples are easy to compare by eye.
corpus_dataset = CorpusDataset(file_list=file_list,
                               window_size=16,
                               step_length=1,
                               tokenizer=tokenizer)
data_loader = DataLoader(dataset=corpus_dataset, batch_size=8)

Verify the outputs: take the first batch, decode it, and compare it with the raw corpus text.

In [None]:
# Take exactly one batch. Fetching it explicitly (rather than for/break) means
# an empty loader is reported here instead of leaving x and y unbound, or
# stale from an earlier run, for the cells below.
first_batch = next(iter(data_loader), None)
if first_batch is None:
    raise RuntimeError("data_loader produced no batches; check file_list and window_size.")
x, y = first_batch
print(x.shape, y.shape)

In [None]:
# Turn the input and target id batches back into text, then print them
# pairwise so each target can be checked against its input.
decoded_inputs = tokenizer.batch_decode(x)
decoded_outputs = tokenizer.batch_decode(y)
for decode_x, decode_y in zip(decoded_inputs, decoded_outputs):
    print("Input", decode_x)
    print("Output", decode_y)

In [None]:
# Read the raw text of the first file the dataset iterates over, for
# comparison with the decoded batch.
first_corpus_path = corpus_dataset.file_list[0]
with open(first_corpus_path, 'r', encoding='utf8') as f_handle:
    verify_corpus = f_handle.read()

In [None]:
# Display the first 1000 characters of the raw corpus text so it can be
# compared by eye with the decoded batch.
verify_corpus[:1000]

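Verify the packaged code: repeat the check with the `CorpusDataset` imported from `libs.CorpusDataset`, using a larger window and step.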

In [None]:
from libs.CorpusDataset import CorpusDataset
from transformers import AutoTokenizer
import glob
from torch.utils.data import DataLoader, IterableDataset
import torch
# Load the tokenizer we've trained.
tokenizer = AutoTokenizer.from_pretrained("ikit-claw-nlp/toy-llm")
print("The vocabulary has", tokenizer.vocab_size, "tokens.")

# glob returns matches in arbitrary order; sort so every run streams the
# files in the same sequence.
file_list = sorted(glob.glob("data/full_text/*.txt"))
# Without this guard an empty match would make the loop below print nothing,
# which is indistinguishable from a passing check.
if not file_list:
    raise FileNotFoundError('No corpus files match "data/full_text/*.txt".')

# CorpusDataset is the class imported from libs.CorpusDataset at the top of
# this cell, which replaces the class defined earlier in the notebook.
corpus_dataset = CorpusDataset(file_list=file_list,
                               window_size=256,
                               step_length=128,
                               tokenizer=tokenizer)
data_loader = DataLoader(dataset=corpus_dataset, batch_size=10000)

# Iterate over every batch and print its input/target shapes.
for x, y in data_loader:
    print(x.shape, y.shape)