# Pipeline

## Steps
1. Instantiate & Sanity Check Tokenizer
2. Clean & Normalize Raw Text
3. Encode Entire Corpus into Token IDs
4. Split into Train / Validation
5. Chunk into fixed length blocks
6. Prepare for smoke test
7. Define & Save model config

# 1. **Instantiate & Sanity-Check Tokenizer**

   * Load your vocab/merges into a `ByteLevelBPETokenizer`.
   * Encode a few hand-picked sentences to verify that you get reasonable token IDs and that the special tokens are present (in the previous notebook, `tokenizer.ipynb`, I customized them to `<|UNKNOWN|>`, `<|START|>`, `<|END|>`).

## **Set this to `True` to run on the full dataset (`False` uses the small sample)**

In [1]:
# Dataset switch read by the cells below: True selects the files under
# ../materials, False selects the ones under ../materials_small.
isfull = True

In [2]:
# Directory (relative path) that holds the BPE vocab and merges files loaded below.
bpe_path = "../bpe"

In [3]:
from tokenizers import ByteLevelBPETokenizer

# Special tokens registered on the tokenizer. Kept in a single list so the
# registration call and the id printout below always cover the same tokens.
SPECIAL_TOKENS = [
    "<|PAD|>",
    "<|UNKNOWN|>",
    "<|START|>",
    "<|END|>",
    "<|SYSTEM|>",
    "<|USER|>",
    "<|ASSISTANT|>",
    "<|EOT|>",
    "<|INFOSTART|>",
    "<|INFOEND|>",
]

# load tokenizer
tokenizer = ByteLevelBPETokenizer(
    bpe_path + "/bpe_model-vocab.json",
    bpe_path + "/bpe_model-merges.txt",
    lowercase=False,
    add_prefix_space=True
)
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# sanity check: vocab size and the id assigned to each special token
print("Vocab size:", tokenizer.get_vocab_size())
# pad to the longest token (+1) so the arrows line up for every entry
label_width = max(len(token) for token in SPECIAL_TOKENS) + 1
for token in SPECIAL_TOKENS:
    print(f"{token:{label_width}s}→ id {tokenizer.token_to_id(token)}")

# logs
print("\n--------LOG--------")
print("NOTE: Ġ is a leading space\n")

# (text to round-trip, optional note printed after it)
SAMPLES = [
    ("<|START|> Hello, world! <|END|>", "(it shouldn't print special tokens)"),
    ("This is a test.", None),
    ("𝌆 This-is–weird?!", None),
    ("😀 emoji becomes ð Ł ĺ Ģ", None),
]

# encode each sample, show its tokens, then decode the ids back to text so the
# round trip can be compared with the original by eye
for text, note in SAMPLES:
    encoding = tokenizer.encode(text)
    print("Tokens:", encoding.tokens)
    print("Decoded:", tokenizer.decode(encoding.ids))
    print("Original:", text)
    if note:
        print(note)
    print("\n")


Vocab size: 60000
<|PAD|>     → id 0
<|UNKNOWN|> → id 1
<|START|>   → id 2
<|END|>     → id 3
<|SYSTEM|>  → id 4
<|USER|>    → id 5
<|ASSISTANT|>→ id 6
<|EOT|>     → id 7
<|INFOSTART|>→ id 8
<|INFOEND|> → id 9

--------LOG--------
NOTE: Ġ is a leading space

Tokens: ['<|START|>', 'ĠH', 'ello', ',', 'Ġworld', '!', 'Ġ', '<|END|>']
Decoded:  Hello, world! 
Original: <|START|> Hello, world! <|END|>
(it shouldn't print special tokens)


Tokens: ['ĠThis', 'Ġis', 'Ġa', 'Ġtest', '.']
Decoded:  This is a test.
Original: This is a test.


Tokens: ['Ġ', 'ð', 'Ŀ', 'Į', 'Ĩ', 'ĠThis', '-', 'is', 'âĢĵ', 'we', 'ird', '?!']
Decoded:  𝌆 This-is–weird?!
Original: 𝌆 This-is–weird?!


Tokens: ['Ġ', 'ð', 'Ł', 'ĺ', 'Ģ', 'Ġem', 'oj', 'i', 'Ġbecomes', 'ĠÃ°', 'ĠÅ', 'ģ', 'ĠÄ', 'º', 'ĠÄ', '¢']
Decoded:  😀 emoji becomes ð Ł ĺ Ģ
Original: 😀 emoji becomes ð Ł ĺ Ģ




# 2. **Clean & Normalize Raw Text**
   * Unicode-normalize (NFC), strip control characters, normalize punctuation.

In [4]:
import os
import re
import unicodedata
from tqdm import tqdm

# pick the raw corpus and the cleaned output for the selected dataset
if isfull:
    INPUT  = "../materials/all_books.txt"
    OUTPUT = "../materials/all_books_clean.txt"
else:
    INPUT  = "../materials_small/all_books.txt"
    OUTPUT = "../materials_small/all_books_clean.txt"

# size of the input in bytes; used as the progress-bar total
filesize = os.path.getsize(INPUT)

# configure control characters
# i used "https://www.ascii-code.com/" for below
CTRL_CHARS = re.compile(
    "[" +
    # 0-31 is ASCII control range. But we want to exclude 
    # tabs (\t = 9), newlines (\n = 10), and carriage returns (\r = 13)
    "".join(chr(c) for c in range(0,32) if c not in (9,10,13)) +
    # also, add all characters in 127-159
    "".join(chr(c) for c in range(127,160)) +
    "]"
)

# Stream the corpus line by line: NFC-normalize, strip control characters,
# write the result. The progress bar is part of the `with` statement so it is
# closed even if reading or writing fails part-way through.
with open(INPUT, "r", encoding="utf-8", errors="ignore") as fin, \
     open(OUTPUT, "w", encoding="utf-8") as fout, \
     tqdm(total=filesize,
          unit="B", unit_scale=True, unit_divisor=1024,
          desc="Cleaning") as pbar:

    for line in fin:
        # Progress is measured on the decoded line re-encoded as UTF-8, so
        # bytes dropped by errors="ignore" are not counted and the bar can
        # finish slightly below 100%.
        pbar.update(len(line.encode("utf-8")))

        text = unicodedata.normalize("NFC", line)
        # if we ever find CTRL_CHARS, replace that to empty string
        text = CTRL_CHARS.sub("", text)

        fout.write(text)

print(f"✓ Done cleaning → {OUTPUT}")


Cleaning:  98%|█████████▊| 27.1G/27.6G [14:02<00:16, 34.5MB/s]

✓ Done cleaning → ../materials/all_books_clean.txt





Real run:

RTX3080: <br/>
runtime: 14m 44s<br/>

RTX3060: <br/>
runtime: 38m 4s

# 3. **Encode Entire Corpus into Token IDs**

   * Read your cleaned text in chunks (per line).
   * Run your tokenizer over each chunk (e.g. `encode_batch`) to produce a flat 1D array of token IDs.

In [5]:
# BPE vocab / merges files used to rebuild the tokenizer
bpe_vocab_path, bpe_merges_path = (
    "../bpe/bpe_model-vocab.json",
    "../bpe/bpe_model-merges.txt",
)

# cleaned text (input) and token-id array (output) for the selected dataset
if not isfull:
    clean_books_path = "../materials_small/all_books_clean.txt"
    clean_books_id_path = "../materials_small/all_books_ids.npy"
else:
    clean_books_path = "../materials/all_books_clean.txt"
    clean_books_id_path = "../materials/all_books_ids.npy"

In [6]:
from tokenizers import ByteLevelBPETokenizer
import numpy as np
from itertools import islice
from tqdm import tqdm
import os

# if there's an existing all_books_ids.npy from a previous run, remove it
if os.path.exists(clean_books_id_path):
    print(f"Removing existing file: {clean_books_id_path}")
    os.remove(clean_books_id_path)

# special tokens registered on the tokenizer
SPECIAL_TOKENS = [
    "<|PAD|>", "<|UNKNOWN|>", "<|START|>", "<|END|>",
    "<|SYSTEM|>", "<|USER|>", "<|ASSISTANT|>", "<|EOT|>",
    "<|INFOSTART|>", "<|INFOEND|>",
]

# load tokenizer & configure special tokens
tokenizer = ByteLevelBPETokenizer(
    bpe_vocab_path,
    bpe_merges_path,
    lowercase=False,
    add_prefix_space=True
)
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# settings
INPUT       = clean_books_path # we're going to process with the cleaned version
BATCH_LINES = 100_000 # encode 100,000 lines per iteration
DTYPE       = np.int32 # dtype of the stored token ids

# Pass 1: count total tokens so the output array can be allocated up front.
print("Counting total tokens...")
total_tokens = 0
with open(INPUT, "r", encoding="utf-8", errors="ignore") as f:
    while True:
        batch = list(islice(f, BATCH_LINES))
        if not batch:
            break
        encs = tokenizer.encode_batch(batch)
        total_tokens += sum(len(enc.ids) for enc in encs)
        print(f"Processed {total_tokens:,} tokens", end="\r")

# Terminate the carriage-return status line; otherwise the shorter
# "Total tokens" line is drawn over it and leaves trailing characters behind.
print()
print(f"Total tokens: {total_tokens:,}")

# allocate memmap
# since the real one has over billions of tokens, we'll use memmap to prevent RAM crash (due to memory)
mm = np.lib.format.open_memmap(
    clean_books_id_path,
    mode="w+",
    dtype=DTYPE,
    shape=(total_tokens,)
)

# Pass 2: tokenize again and stream the ids into the memmap.
print("Streaming tokenization into memmap...")
filesize = os.path.getsize(INPUT)

# write cursor: index in `mm` where the next encoded line starts
offset = 0

with open(INPUT, "r", encoding="utf-8", errors="ignore") as f, \
     tqdm(total=filesize, unit="B", unit_scale=True, desc="Tokenizing") as pbar:
    while True:
        # grab up to BATCH_LINES lines in each iteration
        batch = list(islice(f, BATCH_LINES))
        if not batch:
            # no lines left, so we're done
            break

        # encode the lines and append their ids at the write cursor
        encs = tokenizer.encode_batch(batch)
        for enc in encs:
            length = len(enc.ids)
            mm[offset:offset + length] = enc.ids
            offset += length

        # update bar by bytes read
        pbar.update(sum(len(line.encode("utf-8")) for line in batch))
        pbar.set_postfix(tokens=f"Processed {offset:,} tokens")

# Make sure everything reaches the file before dropping the memmap.
mm.flush()
del mm

# Both passes must agree; a mismatch means the array on disk is not fully
# (or not correctly) populated, e.g. because the input changed between passes.
if offset != total_tokens:
    raise RuntimeError(
        f"Token count mismatch: counted {total_tokens:,} but wrote {offset:,}"
    )

print(f"✓ Wrote {total_tokens:,} token IDs to all_books_ids.npy")

Counting total tokens...
Total tokens: 7,373,891,517ens
Streaming tokenization into memmap...


Tokenizing:  98%|█████████▊| 29.1G/29.6G [1:16:49<01:32, 6.30MB/s, tokens=Processed 7,373,891,517 tokens]


✓ Wrote 7,373,891,517 token IDs to all_books_ids.npy


**Small one:** <br/>
total tokens: 338,565,967 tokens <br/>
RTX3060: <br/>
runtime: 12m 13s

**Real one:** <br/>
total tokens: 7,690,145,918 tokens (30.3GB) <br/>
RTX3060: <br/>
runtime: 2h 6m 8s

RTX3080: <br/>
runtime: 2h 10m 47.8s <br/>
(well... this is with lots of background applications... so, yup. reasonable)

# 4. **Split into Train / Validation**

   * Choose a split (I'm using 95% for train and 5% for validation).
   * Slice your ID array accordingly.

In [7]:
import numpy as np
from tqdm import trange

# Open the token-id array memory-mapped (read-only) instead of reading it into
# RAM: the full corpus holds billions of int32 ids, and the slices below are
# only ever copied out chunk by chunk.
if isfull:
    ids = np.load("../materials/all_books_ids.npy", mmap_mode="r")
else:
    ids = np.load("../materials_small/all_books_ids.npy", mmap_mode="r")

# split train & validation: first 95% of the ids for training, the rest for validation
split_ratio = 0.95
split_index = int(len(ids) * split_ratio)

train_ids = ids[:split_index]
valid_ids = ids[split_index:]

def save_npy_with_progress(filename, array, chunk_size=1_000_000):
    """Write `array` to `filename` in .npy format, copying it chunk by chunk.

    The destination is created as a memory-mapped .npy file with the same
    dtype and shape as `array`, then filled `chunk_size` elements (along the
    first axis) at a time while a progress bar advances one tick per chunk.

    Args:
        filename: Path of the .npy file to create (overwritten if it exists).
        array: Source array; only its dtype, shape and slicing are used.
        chunk_size: Number of leading-axis elements copied per step.
    """
    mm = np.lib.format.open_memmap(filename, mode='w+',
                                   dtype=array.dtype, shape=array.shape)
    total = array.shape[0]
    # Each iteration copies one chunk, so the bar counts chunks, not tokens.
    for start in trange(0, total, chunk_size,
                        desc=f"Saving {filename}", unit="chunk"):
        end = min(start + chunk_size, total)
        mm[start:end] = array[start:end]
    # Push the data to disk explicitly before releasing the mapping.
    mm.flush()
    del mm

# write both splits for the selected dataset, with a progress bar per file
train_out, valid_out = (
    ("../materials/train_ids.npy", "../materials/valid_ids.npy")
    if isfull else
    ("../materials_small/train_ids.npy", "../materials_small/valid_ids.npy")
)
save_npy_with_progress(train_out, train_ids)
save_npy_with_progress(valid_out, valid_ids)


Saving ../materials/train_ids.npy: 100%|██████████| 7006/7006 [03:56<00:00, 29.63tokens/s] 
Saving ../materials/valid_ids.npy: 100%|██████████| 369/369 [00:45<00:00,  8.19tokens/s]


# 5. **Chunk into Fixed-Length Blocks**

   * context will be 1024 (tokens).
   * Break each subset into non-overlapping blocks of exactly that length.

In [8]:
import numpy as np
from tqdm import tqdm

# param
BLOCK_SIZE = 1024


def _chunk_into_blocks(ids, out_path, block_size, desc, rows_per_batch=4096):
    """Copy a flat id array into a (num_blocks, block_size) .npy file.

    Consecutive, non-overlapping runs of `block_size` ids become the rows of
    the output; ids left over after the last full block are dropped.

    Args:
        ids: 1-D array of token ids (a memory-mapped array works).
        out_path: Path of the .npy file to create.
        block_size: Number of ids per row.
        desc: Label shown on the progress bar.
        rows_per_batch: Rows copied per step; only affects speed and the
            granularity of progress updates.

    Returns:
        The number of blocks written.
    """
    num_blocks = len(ids) // block_size
    blocks = np.lib.format.open_memmap(
        out_path, mode="w+",
        dtype=ids.dtype, shape=(num_blocks, block_size)
    )
    with tqdm(total=num_blocks, desc=desc) as pbar:
        for first in range(0, num_blocks, rows_per_batch):
            last = min(first + rows_per_batch, num_blocks)
            # rows first..last-1 are the ids first*block_size .. last*block_size-1
            blocks[first:last] = ids[first * block_size:last * block_size].reshape(-1, block_size)
            pbar.update(last - first)
    blocks.flush()
    del blocks  # release the mapping now that the data is on disk
    return num_blocks


# chunk Train IDs into BLOCK_SIZE
if isfull:
    train_ids = np.load("../materials/train_ids.npy", mmap_mode="r")
    writetrain = "../materials/train_blocks.npy"
else:
    train_ids = np.load("../materials_small/train_ids.npy", mmap_mode="r")
    writetrain = "../materials_small/train_blocks.npy"

num_train_blocks = _chunk_into_blocks(
    train_ids, writetrain, BLOCK_SIZE, "Chunking train blocks"
)

# chunk Valid IDs into BLOCK_SIZE
if isfull:
    valid_ids = np.load("../materials/valid_ids.npy", mmap_mode="r")
    writevalid = "../materials/valid_blocks.npy"
else:
    valid_ids = np.load("../materials_small/valid_ids.npy", mmap_mode="r")
    writevalid = "../materials_small/valid_blocks.npy"

num_valid_blocks = _chunk_into_blocks(
    valid_ids, writevalid, BLOCK_SIZE, "Chunking valid blocks"
)

print(f"✓ Created {num_train_blocks} train blocks and {num_valid_blocks} valid blocks.")


Chunking train blocks: 100%|██████████| 6841012/6841012 [01:53<00:00, 60291.39it/s]
Chunking valid blocks: 100%|██████████| 360053/360053 [00:05<00:00, 60106.07it/s]


✓ Created 6841012 train blocks and 360053 valid blocks.


## Sanity Check

In [9]:
import numpy as np
from tokenizers import ByteLevelBPETokenizer

# Memory-map the blocks: only row 0 is read here, so there is no reason to
# pull the whole array into RAM.
if isfull:
    train_blocks = np.load("../materials/train_blocks.npy", mmap_mode="r")
else:
    train_blocks = np.load("../materials_small/train_blocks.npy", mmap_mode="r")

# decode block 0 back to text and eyeball it
tok = ByteLevelBPETokenizer("../bpe/bpe_model-vocab.json", "../bpe/bpe_model-merges.txt", lowercase=False, add_prefix_space=True)
tok.add_special_tokens(["<|PAD|>", "<|UNKNOWN|>", "<|START|>", "<|END|>", "<|SYSTEM|>", "<|USER|>", "<|ASSISTANT|>", "<|EOT|>","<|INFOSTART|>","<|INFOEND|>"])
sample_text = tok.decode(train_blocks[0].tolist())
print(sample_text[:500])

# verify that the output is readable. (it's a part of clean books.)


 *** START OF THE PROJECT GUTENBERG EBOOK 1 ***
 
 
     NOTE:  This file combines the first two Project Gutenberg
     files, both of which were given the filenumber #1. There are
     several duplicate files here. There were many updates over
     the years.  All of the original files are included in the
     "old" subdirectory which may be accessed under the "More
     Files" listing in the PG Catalog of this file. No changes
     h


In [10]:
# locate the block files for the selected dataset
train_blocks_file, valid_blocks_file = (
    ("../materials/train_blocks.npy", "../materials/valid_blocks.npy")
    if isfull else
    ("../materials_small/train_blocks.npy", "../materials_small/valid_blocks.npy")
)

# memory-map both arrays; only their shapes are inspected
train_blocks = np.load(train_blocks_file, mmap_mode="r")
valid_blocks = np.load(valid_blocks_file, mmap_mode="r")

# Expected: (number of blocks reported by the chunking cell in step 5, 1024),
# the second dimension being BLOCK_SIZE = 1024.
print("Train blocks shape:", train_blocks.shape)
print("Valid blocks shape:", valid_blocks.shape)


Train blocks shape: (6841012, 1024)
Valid blocks shape: (360053, 1024)


# 6. **Prepare for smoke test**

   * keep a tiny “smoke‐test” subset

In [11]:
import numpy as np

# Open the full block arrays memory-mapped: only the first few rows are
# needed, so reading the complete arrays into RAM would be wasted work.
if isfull:
    train_blocks = np.load("../materials/train_blocks.npy", mmap_mode="r")
    valid_blocks = np.load("../materials/valid_blocks.npy", mmap_mode="r")
else:
    train_blocks = np.load("../materials_small/train_blocks.npy", mmap_mode="r")
    valid_blocks = np.load("../materials_small/valid_blocks.npy", mmap_mode="r")

# tiny “smoke-test” subset: first 40 train blocks, first 20 valid blocks,
# copied into ordinary in-memory arrays before saving
smoke_train = np.array(train_blocks[:40])
smoke_valid = np.array(valid_blocks[:20])

if isfull:
    np.save("../materials/train_smoke.npy", smoke_train)
    np.save("../materials/valid_smoke.npy", smoke_valid)
else:
    np.save("../materials_small/train_smoke.npy", smoke_train)
    np.save("../materials_small/valid_smoke.npy", smoke_valid)


print(f"✓ Smoke-test train blocks: {smoke_train.shape[0]} saved as train_smoke.npy")
print(f"✓ Smoke-test valid blocks: {smoke_valid.shape[0]} saved as valid_smoke.npy")


✓ Smoke-test train blocks: 40 saved as train_smoke.npy
✓ Smoke-test valid blocks: 20 saved as valid_smoke.npy


# DONE! <br/>
If you see the following files in the materials folder:
- all_books_clean.txt
- all_books_ids.npy
- all_books.txt
- train_blocks.npy
- train_ids.npy
- train_smoke.npy
- valid_blocks.npy
- valid_ids.npy
- valid_smoke.npy

you're good to go!