In [1]:
import os
import time

from cs336_basics import ROOT_DIR
from cs336_basics.bpe.tokenization import Tokenizer

import numpy as np

rng = np.random.default_rng(0)

# Sample Dataset and Initialize Tokenizer

In [2]:
def find_document_boundaries(filepath:str, split_token:bytes, chunk_size:int=4096):
    with open(filepath, "rb") as f:
        f.seek(0, os.SEEK_END)
        file_size = f.tell()
        pos = 0
        document_boundaries = [0]
        while pos < file_size:
            f.seek(pos)
            chunk = f.read(chunk_size)
            found_at = chunk.find(split_token)
            if found_at!=-1:
                document_boundaries.append(pos+found_at+len(split_token)+1)
                pos = document_boundaries[-1]
            else:
                pos += chunk_size
    return document_boundaries


def load_dataset(
    filepath:str, 
    num_documents:int=-1, # -1 means load all + shuffle 
    split_token:bytes=b"<|endoftext|>"
):
    doc_boundaries = find_document_boundaries(filepath, split_token)
    print("Dataset size: ", len(doc_boundaries)-1)
    if num_documents>0:
        sampled_indices = rng.choice(len(doc_boundaries), size=num_documents, replace=False)
    else:
        sampled_indices = list(range(len(doc_boundaries)-1))
    documents = []
    with open(filepath, "rb") as f:
        for index in sampled_indices:
            f.seek(doc_boundaries[index])
            raw_bytes = f.read(doc_boundaries[index+1]-doc_boundaries[index])
            documents.append(raw_bytes.decode("utf-8", errors="ignore"))
    return documents

In [3]:
owt_dataset = load_dataset(
    os.path.join(ROOT_DIR, "../data/owt_train.txt"),
    num_documents = 10
)
tiny_stories_dataset = load_dataset(
    os.path.join(ROOT_DIR, "../data/TinyStoriesV2-GPT4-train.txt"),
    num_documents = 10
)

Dataset size:  2394936
Dataset size:  2717685


In [4]:
owt_tokenizer = Tokenizer.from_files(
    os.path.join(ROOT_DIR, "../data/owt-train-vocab.json"), 
    os.path.join(ROOT_DIR, "../data/owt-train-merges.txt"),
    ["<|endoftext|>"]
)
tiny_stories_tokenizer = Tokenizer.from_files(
    os.path.join(ROOT_DIR, "../data/TinyStoriesV2-GPT4-train-vocab.json"), 
    os.path.join(ROOT_DIR, "../data/TinyStoriesV2-GPT4-train-merges.txt"),
    ["<|endoftext|>"]
)


# 2.7 Problem `tokenizer_experiments`: Experiments with tokenizers

Compression ratio of a string is computed by:

$$\frac{\text{number of bytes in the original string}}{\text{number of tokens in the encoded string}}$$ 

In [5]:
def compute_compression_ratio(documents:list[str], tokenizer: Tokenizer):
    encoded_length = []
    doc_bytes_length = []
    for doc in documents:
        encoded_length.append(len(tokenizer.encode(doc)))
        doc_bytes_length.append(len(doc.encode("utf-8")))
    return sum(doc_bytes_length)/sum(encoded_length)

## Part (a)

In [6]:
f"OWTs' compression ratio: {compute_compression_ratio(owt_dataset, owt_tokenizer)}"

"OWTs' compression ratio: 4.366535671100363"

In [7]:
f"Tiny Stories' compression ratio: {compute_compression_ratio(tiny_stories_dataset, tiny_stories_tokenizer)}"

"Tiny Stories' compression ratio: 4.104834849210149"

## Part (b)

In [8]:
f"OWT's compression ratio with TinyStories tokenizer: {compute_compression_ratio(owt_dataset, tiny_stories_tokenizer)}"

"OWT's compression ratio with TinyStories tokenizer: 3.356649044326962"

When used to tokenize OWT data, the compression ratio is less when using TS tokenizer as compared to OWT tokenizer. 

This makes sense since frequently occurring byte-pairs in the OWT dataset would likely be merged during the bpe training procedure for the OWT tokenizer. 
This is not the case for TS tokenizer, which is trained on a different dataset with a presumably different distribution of frequently occurring byte-pairs.

## Part (c)

In [9]:
def estimate_throughput(documents:list[str], tokenizer:Tokenizer):
    time_taken = []
    doc_byte_length = []
    for doc in documents:
        start = time.time()
        tokenizer.encode(doc)
        time_taken.append(time.time()-start)
        doc_byte_length.append(len(doc.encode("utf-8")))
    return sum(doc_byte_length)/sum(time_taken) # in bytes/second

In [10]:
owt_throughput=estimate_throughput(owt_dataset, owt_tokenizer)
f"Estimated throughput of OWT tokenizer: {owt_throughput} bytes/second"

'Estimated throughput of OWT tokenizer: 3280874.262319939 bytes/second'

In [11]:
f"Estimated time to tokenize 825GB (e.g., the Pile dataset) with OWT tokenizer: {8.25e9/owt_throughput} seconds"

'Estimated time to tokenize 825GB (e.g., the Pile dataset) with OWT tokenizer: 2514.573659450863 seconds'

In [12]:
ts_throughput = estimate_throughput(tiny_stories_dataset, tiny_stories_tokenizer)
f"Estimated throughput of TinyStories tokenizer: {ts_throughput} bytes/second"

'Estimated throughput of TinyStories tokenizer: 1967621.6860878605 bytes/second'

In [13]:
f"Estimated time to tokenize 825GB (e.g., the Pile dataset) with TinyStories tokenizer: {8.25e9/ts_throughput} seconds"

'Estimated time to tokenize 825GB (e.g., the Pile dataset) with TinyStories tokenizer: 4192.879179128752 seconds'

## Part (d)

Serializing token IDs as `uint16` makes sense because:

1. All vocabs are assigned positive integer IDs, so unsigned integer make sense.

2. TinyStories tokenizer has 10k vocab size (hence ID goes from 0 to 9999), while OWT tokenizer has 32k vocab size (hence ID goes from 0 to 31999).
`uint16` takes up 16 bits, so it has a range of 0-65535, which is more than enough for both tokenizers.

In [14]:
# serialize dataset (in string format) into token IDs, and save
def serialize_and_save(str_dataset_path:str, tokenizer: Tokenizer, save_path:str):
    documents = load_dataset(str_dataset_path, num_documents=-1)
    int_documents = np.empty(len(documents), dtype=object)
    for i, doc in enumerate(documents):
        encoded_ids = tokenizer.encode(doc)
        int_documents[i] = np.array(encoded_ids, dtype=np.uint16)
    np.save(save_path, int_documents)


In [15]:
dataset_paths = ["../data/owt_valid.txt","../data/owt_train.txt", "../data/TinyStoriesV2-GPT4-train.txt", "../data/TinyStoriesV2-GPT4-valid.txt" ]
tokenizers = [owt_tokenizer, owt_tokenizer, tiny_stories_tokenizer,tiny_stories_tokenizer ]
save_paths = ["../data/owt_valid_encoded.npy", "../data/owt_train_encoded.npy", "../data/TinyStories_train_encoded.npy", "../data/TinyStories_valid_encoded.npy"]
for dataset_path, tokenizer, save_path in zip(dataset_paths, tokenizers, save_paths):
    serialize_and_save(
        os.path.join(ROOT_DIR, dataset_path),
        tokenizer,
        os.path.join(ROOT_DIR, save_path)
    )

Dataset size:  58942


Dataset size:  2394936
Dataset size:  2717685
Dataset size:  27630
