# 02: Prepare the Duckling Corpus

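Download TinyStories from Hugging Face, tokenize it with our truncated GPT-2 tokenizer,
and save the result as a single flat token tensor for fast training. The notebook also writes a dead-token mask for the padded 10,000-row vocabulary and checks the corpus size against the Chinchilla token budget.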

---

*Jeffery Harrell & Alpha, December 1, 2025*

## Parameters

In [1]:
from pathlib import Path

# Where the tokenizer is read from and where the corpus files are written.
TOKENIZER_PATH = Path("data/tokenizer")
OUTPUT_PATH = Path("data/corpus")

# How many stories to use, taken from the start of the train split (None = all).
# TinyStories has ~2.1M stories, but we don't need all of them.
# The target is the Chinchilla-optimal ~58M tokens. NOTE: the recorded run of
# this notebook got ~51M tokens (~256 tokens/story) from 200K stories, i.e.
# about 88% of that target; roughly 230K stories would be needed to reach it.
MAX_STORIES = 200_000

# Sequence length for training
SEQ_LEN = 512

print(f"Tokenizer: {TOKENIZER_PATH}")
print(f"Output: {OUTPUT_PATH}")
# The `,` format spec raises TypeError on None, so the documented
# "None = all" setting needs its own label.
max_stories_label = "all" if MAX_STORIES is None else f"{MAX_STORIES:,}"
print(f"Max stories: {max_stories_label}")
print(f"Sequence length: {SEQ_LEN}")

Tokenizer: data/tokenizer
Output: data/corpus
Max stories: 200,000
Sequence length: 512


## Load Tokenizer

In [2]:
from transformers import GPT2TokenizerFast

# Load the tokenizer saved under TOKENIZER_PATH, then report its size and
# the text/ID of its padding and end-of-text special tokens.
tokenizer = GPT2TokenizerFast.from_pretrained(TOKENIZER_PATH)
print(f"Loaded tokenizer: {len(tokenizer):,} tokens")
for special_name, special_text, special_id in (
    ("PAD", tokenizer.pad_token, tokenizer.pad_token_id),
    ("EOS", tokenizer.eos_token, tokenizer.eos_token_id),
):
    print(f"{special_name} token: {special_text!r} (id={special_id})")

Loaded tokenizer: 8,000 tokens
PAD token: '<|pad|>' (id=7999)
EOS token: '<|endoftext|>' (id=7998)


## Download TinyStories

In [3]:
from datasets import load_dataset

# Fetch the TinyStories train split, then keep only the first MAX_STORIES
# stories. MAX_STORIES = None means "use every story".
print("Loading TinyStories from HuggingFace...")
dataset = load_dataset("roneneldan/TinyStories", split="train")

print(f"Total stories available: {len(dataset):,}")

# min(None, int) and f"{None:,}" both raise TypeError, so the "all stories"
# setting is resolved to a concrete count before it is used.
if MAX_STORIES is None:
    n_stories = len(dataset)
    print("Using all stories")
else:
    n_stories = min(MAX_STORIES, len(dataset))
    print(f"Using first {MAX_STORIES:,} stories")

# Take subset
dataset = dataset.select(range(n_stories))
print(f"Selected: {len(dataset):,} stories")

Loading TinyStories from HuggingFace...


Total stories available: 2,119,719
Using first 200,000 stories
Selected: 200,000 stories


In [4]:
# Sanity check: show the first 500 characters of the first selected story.
first_story = dataset[0]["text"]
print("Sample story:", "=" * 50, first_story[:500], "...", sep="\n")

Sample story:
One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them b
...


## Tokenize All Stories

In [5]:
from tqdm.auto import tqdm

# Encode every story, append EOS after each one, and concatenate everything
# into one flat list of token IDs (`all_tokens`).
print("Tokenizing stories...")

all_tokens = []
oov_count = 0  # Out of vocab (tokens that get mapped to unk)

# ID the tokenizer emits for unknown pieces; None if no unk token is configured.
# NOTE(review): if unk shares its ID with EOS, a literal end-of-text marker
# inside a story is counted too — confirm against the tokenizer config.
unk_id = tokenizer.unk_token_id

for story in tqdm(dataset):
    text = story["text"]
    tokens = tokenizer.encode(text)
    # Count unknowns before EOS is appended so the separator itself is never
    # mistaken for an out-of-vocab token.
    if unk_id is not None:
        oov_count += tokens.count(unk_id)
    # Add EOS token after each story
    tokens.append(tokenizer.eos_token_id)
    all_tokens.extend(tokens)

print(f"\nTotal tokens: {len(all_tokens):,}")
print(f"Tokens per story (avg): {len(all_tokens) / len(dataset):.1f}")
print(f"OOV (unk) tokens: {oov_count:,}")

Tokenizing stories...


  0%|          | 0/200000 [00:00<?, ?it/s]


Total tokens: 51,110,591
Tokens per story (avg): 255.6


In [6]:
# Check token distribution
import numpy as np

# Find the smallest and largest token ID in the corpus and confirm the
# largest one is a valid index into the tokenizer's vocabulary.
tokens_array = np.array(all_tokens)
min_token, max_token = tokens_array.min(), tokens_array.max()

print(f"Token ID range: [{min_token}, {max_token}]")
print(f"Vocab size: {len(tokenizer)}")

# Valid IDs run from 0 to len(tokenizer) - 1.
if max_token < len(tokenizer):
    print(f"✓ All tokens within vocab range")
else:
    print(f"WARNING: Max token {max_token} >= vocab size {len(tokenizer)}!")

Token ID range: [0, 7998]
Vocab size: 8000
✓ All tokens within vocab range


## Save as Tensor

In [7]:
import torch
from safetensors.torch import save_file

# Pack the flat token list into a single int64 tensor.
tokens_tensor = torch.tensor(all_tokens, dtype=torch.long)

# Write it under OUTPUT_PATH (created if missing) with the key "tokens".
tokens_file = OUTPUT_PATH / "tokens.safetensors"
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
save_file({"tokens": tokens_tensor}, tokens_file)

# Report the on-disk size in decimal megabytes and the token count.
file_size = tokens_file.stat().st_size
print(f"Saved: {tokens_file}")
print(f"File size: {file_size / 1e6:.1f} MB")
print(f"Tokens: {len(tokens_tensor):,}")

Saved: data/corpus/tokens.safetensors
File size: 408.9 MB
Tokens: 51,110,591


## Create Dead Token Mask

In [8]:
# Embedding layout (10,000 rows in total):
#   rows 0-7999     live tokens, one per tokenizer entry
#   rows 8000-9999  phantom dead tokens

TOTAL_VOCAB = 10_000
LIVE_VOCAB = len(tokenizer)  # 8000
DEAD_VOCAB = TOTAL_VOCAB - LIVE_VOCAB  # 2000

# Boolean mask over every row: True where the row index is at or beyond the
# live vocabulary, i.e. for the dead tokens.
dead_mask = torch.arange(TOTAL_VOCAB) >= LIVE_VOCAB

# The same information as an explicit list of IDs.
dead_token_ids = torch.arange(LIVE_VOCAB, TOTAL_VOCAB, dtype=torch.long)

dead_tensors = {
    "dead_mask": dead_mask,
    "dead_token_ids": dead_token_ids,
}
save_file(dead_tensors, OUTPUT_PATH / "dead_tokens.safetensors")

print(f"Dead token mask saved")
print(f"Live tokens: {LIVE_VOCAB:,} (IDs 0-{LIVE_VOCAB-1})")
print(f"Dead tokens: {DEAD_VOCAB:,} (IDs {LIVE_VOCAB}-{TOTAL_VOCAB-1})")
print(f"Total vocab: {TOTAL_VOCAB:,}")

Dead token mask saved
Live tokens: 8,000 (IDs 0-7999)
Dead tokens: 2,000 (IDs 8000-9999)
Total vocab: 10,000


## Training Budget Check

In [9]:
# Compare the corpus size with the Chinchilla rule of thumb:
# optimal training tokens ≈ 20 × model parameters (~2.9M params → 58M tokens).

model_params = 2_900_000
chinchilla_optimal = 20 * model_params
total_tokens = len(tokens_tensor)

print(f"Corpus tokens: {total_tokens:,}")
print(f"Chinchilla optimal: {chinchilla_optimal:,}")
print(f"Coverage: {total_tokens / chinchilla_optimal * 100:.1f}%")

if total_tokens < chinchilla_optimal:
    # Short of the target: report how many passes over the corpus it takes.
    epochs_needed = chinchilla_optimal / total_tokens
    print(f"\n⚠ Would need {epochs_needed:.1f} epochs to reach Chinchilla-optimal")
else:
    print(f"\n✓ Corpus is large enough for Chinchilla-optimal training")

Corpus tokens: 51,110,591
Chinchilla optimal: 58,000,000
Coverage: 88.1%

⚠ Would need 1.1 epochs to reach Chinchilla-optimal


## Summary

In [10]:
# Final recap: corpus size, vocabulary split, and the files written.
banner = "=" * 50
print(banner)
print("DUCKLING CORPUS READY")
print(banner)

for stat_label, stat_value in (
    ("Stories", len(dataset)),
    ("Total tokens", len(tokens_tensor)),
    ("Live vocab", LIVE_VOCAB),
    ("Dead vocab", DEAD_VOCAB),
    ("Total vocab", TOTAL_VOCAB),
):
    print(f"{stat_label}: {stat_value:,}")

print(f"\nFiles:")
for output_name in ("tokens.safetensors", "dead_tokens.safetensors"):
    print(f"  {OUTPUT_PATH / output_name}")

DUCKLING CORPUS READY
Stories: 200,000
Total tokens: 51,110,591
Live vocab: 8,000
Dead vocab: 2,000
Total vocab: 10,000

Files:
  data/corpus/tokens.safetensors
  data/corpus/dead_tokens.safetensors
