### Notebook focuses on loading and preparing datasets from Hugging Face

The goal is to write dataset loaders that can
* support streaming and shuffling
* create test and validation splits
* be debugged easily
* handle tokenization, encoding / decoding
* create x,y pairs for training (shift by one and potentially other strategies)

---
### TODO
* Prepare datasets for **finetuning** (IT and Classification)
* Masking collation (BERT style)
* FIM collation (used for code completion)
  
### References:
* [tiktoken](https://github.com/openai/tiktoken) - I'll likely stick to gpt2 BPE encoding for size reasons
* [huggingface datasets](https://huggingface.co/docs/datasets/en/stream) - shows how to stream huggingface datasets
* datasets
  - karpathy/tiny_shakespeare (the dataset used in Andrej Karpathy's blog post about the unreasonable effectiveness of RNNs)
  - HuggingFaceFW/fineweb-edu (10B tokens, curated to be high quality)
  - imdb or rotten tomatoes (for classification)
  - TBD for instruction tuning

In [2]:
!uv add datasets tiktoken
import logging
from pprint import pprint, pformat

import datasets as hf_datasets
import tiktoken
import torch

log: logging.Logger = logging.getLogger(__name__)

[2mResolved [1m145 packages[0m [2min 0.47ms[0m[0m
[2mAudited [1m140 packages[0m [2min 0.02ms[0m[0m


In [67]:
import datasets as hf_datasets

class ShakespeareDataloader:
    """Causal language-modelling batch iterator over 'karpathy/tiny_shakespeare'.

    The requested split is loaded fully into memory (non-streaming), tokenized
    once, and flattened into a single 1-D tensor of token ids. Iterating yields
    consecutive, non-overlapping ``(x, y)`` batches of shape ``(B, T)`` where
    ``y`` is ``x`` shifted left by one token.
    """

    DATASET_PATH = 'karpathy/tiny_shakespeare'

    def __init__(self, batch_size, sequence_length, tokenizer, text_col="text", split="train", shuffle=True):
        """Load, optionally shuffle, and pre-tokenize the dataset split.

        Args:
            batch_size: number of sequences per batch (B); must be > 0.
            sequence_length: number of tokens per sequence (T); must be > 0.
            tokenizer: callable passed to ``dataset.map(..., batched=True)``; it
                must return a mapping that contains a ``'tokens'`` column holding
                one list of token ids per input row.
            text_col: name of the raw-text column, used to report the amount of
                text being tokenized. The tokenizer callable itself decides
                which column it reads.
            split: dataset split to load.
            shuffle: when True, shuffle the dataset rows with a fixed seed (42).

        Raises:
            AssertionError: if the batch shape is not positive or the split does
                not hold enough tokens for a single full batch.
        """
        self._name = self.__class__.__name__
        self.B = batch_size
        self.T = sequence_length
        self.split = split
        self.text_col = text_col
        self.tokens_per_step = self.B * self.T

        # Check both factors: a bare product check would accept two negative values.
        assert self.B > 0 and self.T > 0, (
            f"invalid number of tokens per step {self.B} * {self.T} = {self.tokens_per_step}"
        )

        # Resolve the path through the instance so that the class can be renamed or
        # subclassed without leaving a dangling reference to another class name.
        print(f"{self._name} Initializing: {self.DATASET_PATH} with B={self.B}, T={self.T}, split='{self.split}'")
        dataset = hf_datasets.load_dataset(self.DATASET_PATH, name="default", split=self.split, streaming=False)
        if shuffle:
            dataset = dataset.shuffle(seed=42)

        # Report the total number of characters across every row of the text column.
        n_chars = sum(len(text) for text in dataset[text_col])
        print(f"{self._name} Pre-tokenizing text data n={n_chars:,} for split '{self.split}'... ", end='')

        tokenized_dataset = dataset.map(tokenizer, batched=True)

        # Concatenate the per-row token lists into one continuous token stream.
        flat_tokens = [token for tokens in tokenized_dataset['tokens'] for token in tokens]

        self.all_tokens = torch.tensor(flat_tokens, dtype=torch.long)
        self.total_tokens = len(self.all_tokens)
        # -1 because every batch needs one extra trailing token to build y.
        self.yieldable_batches = (self.total_tokens - 1) // self.tokens_per_step

        print(f"estimated batches: {self.yieldable_batches}")
        assert self.yieldable_batches > 0, (
            f"not enough tokens to yield a full batch. {self.total_tokens}, {self.tokens_per_step}"
        )

        # Iteration state; reset at the start of every epoch by __iter__.
        self.current_idx = 0
        self.batches_yielded = 0

    def __iter__(self):
        """Rewind to the first token so that each epoch starts at the beginning."""
        self.current_idx = 0
        self.batches_yielded = 0
        print(f"{self._name} iterator reset for split '{self.split}', starting at token 0")
        return self

    def __next__(self):
        """
        Yield the next batch of token indices where y is shifted by 1 from x.

        Returns:
            a tuple (x, y) where both tensors have shape (B, T)

        Raises:
            StopIteration: when fewer than ``B * T + 1`` tokens remain.
        """
        start_idx = self.current_idx
        needed = self.tokens_per_step + 1  # B*T inputs plus one look-ahead token for y

        if start_idx + needed > self.total_tokens:
            print(f"{self._name} No more tokens: __next__: Not enough tokens for a full batch from index {start_idx}. "
                  f"Needed {needed}, available: {self.total_tokens - start_idx} tokens.")
            raise StopIteration

        batch_tokens = self.all_tokens[start_idx : start_idx + needed]

        x = batch_tokens[:-1].view(self.B, self.T)
        y = batch_tokens[1:].view(self.B, self.T)

        # Advance by B*T only: the look-ahead token becomes the first input of the next batch.
        self.current_idx += self.tokens_per_step
        self.batches_yielded += 1

        return x, y

    def __len__(self):
        """Returns the number of full batches available per epoch."""
        return self.yieldable_batches

In [68]:
from collections import Counter
import itertools

def print_xy(loader, tokenizer, n=2, tokens=15):
    """Print the first batch's leading tokens, as ids and as decoded text.

    Takes the first (x, y) batch from ``loader`` and, for up to ``n`` rows,
    prints the first ``tokens`` ids of x and y followed by their decoded
    strings, so the shift-by-one collation can be checked by eye.
    """
    x, y = next(iter(loader))
    rows_to_show = min(n, x.shape[0])
    for row in range(rows_to_show):
        x_ids = x[row][:tokens].tolist()
        y_ids = y[row][:tokens].tolist()
        print(f"x: {x_ids}")
        print(f"y: {y_ids}")
        print(f"x: {tokenizer.decode(x_ids)!r}")
        print(f"y: {tokenizer.decode(y_ids)!r}")
        
def check_data_quality(loader, tokenizer, num_batches=5):
    """Run quick sanity checks over the first ``num_batches`` batches of ``loader``.

    Counts how often each input token id occurs, prints the totals, the ten
    most frequent tokens and the share of ``tokenizer.n_vocab`` that was seen,
    then prints sample x/y rows via ``print_xy``.

    Returns:
        a ``Counter`` mapping token id -> number of occurrences in the inputs.
    """
    token_counts = Counter()
    seen = 0

    # Only the inputs are tallied; the targets are the same stream shifted by one.
    for inputs, _targets in itertools.islice(loader, num_batches):
        token_counts.update(inputs.flatten().tolist())
        seen += inputs.numel()

    print(f"Total tokens analyzed: {seen:,}")
    print(f"Unique tokens: {len(token_counts)}")
    print("Top 10 tokens:")

    for tok_id, occurrences in token_counts.most_common(10):
        shown = repr(tokenizer.decode([tok_id]))
        share = occurrences / seen
        print(f"  ID {tok_id:<5d} ({shown:<10}): {occurrences:<6,d} ({share:.4f})")

    # Fraction of the vocabulary observed (generally expected to be small).
    coverage = len(token_counts) / tokenizer.n_vocab
    print(f"Vocabulary coverage: {coverage}")

    # Show x, y pairs to verify collation.
    print_xy(loader, tokenizer)
    return token_counts

# GPT-2 BPE encoding; used both to tokenize the dataset and to decode ids for display.
gpt2 = tiktoken.get_encoding('gpt2')


def tokenizer(r):
    """Batched map function: add a 'tokens' column with the encoding of each 'text' entry."""
    return {'tokens': gpt2.encode_batch(r['text'])}


check_data_quality(ShakespeareDataloader(5, 1024, tokenizer), gpt2, 20)
""  # trailing expression so the cell shows '' instead of the returned Counter

ShakespeareDataloader Initializing: karpathy/tiny_shakespeare with B=5, T=1024, split='train'
ShakespeareDataloader Pre-tokenizing text data n=1003854 for split 'train'... estimated batches: 58
ShakespeareDataloader iterator reset for split 'train', starting at token 0
Total tokens analyzed: 102,400
Unique tokens: 7013
Top 10 tokens:
  ID 198   ('\n'      ): 12,382 (0.1209)
  ID 11    (','       ): 5,909  (0.0577)
  ID 25    (':'       ): 3,139  (0.0307)
  ID 13    ('.'       ): 2,362  (0.0231)
  ID 262   (' the'    ): 1,753  (0.0171)
  ID 284   (' to'     ): 1,298  (0.0127)
  ID 286   (' of'     ): 1,090  (0.0106)
  ID 290   (' and'    ): 1,083  (0.0106)
  ID 26    (';'       ): 1,003  (0.0098)
  ID 314   (' I'      ): 997    (0.0097)
Vocabulary coverage: 0.13954275026364488
ShakespeareDataloader iterator reset for split 'train', starting at token 0
x: [5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198]
y: [22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3

''