# Tokenizer Training

I'm trying to train a model that knows one natural language (English) and one programming language (Python). I also want it to be able to reason, so I'll probably need to find some reasoning datasets to train on.

For now, I'll use Wikipedia and Python source code to train my tokenizers.

# Some papers

Model behavior at reduced scale: https://arxiv.org/abs/2305.17266

On width vs depth (linked in Chinchilla paper): https://arxiv.org/pdf/2006.12467.pdf

A vocabulary bottleneck ("Which transformer architecture fits my data? A vocabulary bottleneck in self-attention"): https://arxiv.org/abs/2105.04090

Transformer architectures vary in depth/width ratios, but in language they're pretty consistent


Impact of tokenization on language models: https://arxiv.org/pdf/2204.08832.pdf

In [23]:
import datasets
from datasets import load_dataset
from tokenizers import (
    Tokenizer,
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
)
from transformers import (
    PreTrainedTokenizerFast,
    LlamaTokenizer,
    AutoTokenizer,
    GPTNeoXTokenizerFast,
    LlamaTokenizerFast,
)
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from dotenv import load_dotenv
import os
from composer.utils import reproducibility
import psutil

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Single source of truth for where Hugging Face artifacts are cached.
CACHE_DIR = "/datadrive/hf_cache"
HF_TOKEN = os.environ.get("HF_TOKEN")
# NOTE(review): this is assigned after `transformers` has already been
# imported above, so it may be too late to change the library's default
# cache location -- confirm. The loaders below pass cache_dir explicitly.
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR


# Learning from the Llama Tokenizer

In [2]:
# Load the pretrained OpenLLaMA 3B tokenizer to study how it splits text.
llama_tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_model_name_or_path="openlm-research/open_llama_3b",
    cache_dir=CACHE_DIR,
)


In [3]:
# Probe how the reference tokenizer handles a number: ids first, then the
# corresponding token strings.
tokens = llama_tokenizer.encode("12013")
print(tokens, llama_tokenizer.convert_ids_to_tokens(tokens), sep="\n")

# Compare a bare word with the same word followed by punctuation.
for sample in ("hello", "hello."):
    print(llama_tokenizer.encode(sample))

[1, 31822, 31853, 31855, 31852, 31853, 31878]
['<s>', '▁', '1', '2', '0', '1', '3']
[1, 27701]
[1, 27701, 31843]


In [4]:
# Display the special tokens the reference tokenizer defines.
llama_tokenizer.special_tokens_map

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}

# Training my tokenizer

I want to take all of the datasets I have, merge them, and shuffle them for the tokenizer.

In [2]:
def print_memory_usage():
    """Print the resident set size (RSS) of the current process in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"Memory used: {rss_bytes / (1024**2)} MB")


In [3]:
# One seed shared by the global seeding call below and by the dataset
# shuffle/interleave calls in later cells.
seed = 42
reproducibility.seed_all(seed)

In [4]:
# Load all datasets and shuffle each one with the shared seed.
# Streaming is currently disabled (the `streaming=True` arguments are commented
# out), so these are map-style datasets, not iterable ones. Background on the
# trade-off: https://huggingface.co/docs/datasets/v2.13.1/en/about_mapstyle_vs_iterable

# Memory is printed before and after loading to see what the loads cost.
print_memory_usage()
wikipedia_dataset: datasets.Dataset = load_dataset(
    "wikipedia",
    name="20220301.en",
    cache_dir=CACHE_DIR,
    use_auth_token=HF_TOKEN,
    split="train",
    # streaming=True,
).shuffle(
    seed=seed
)  # type: ignore
python_stack_dataset: datasets.Dataset = (
    load_dataset(
        "bigcode/the-stack-dedup",
        cache_dir=CACHE_DIR,
        data_dir="data/python",
        use_auth_token=HF_TOKEN,
        split="train",
        # streaming=True,
    )
    .shuffle(seed=seed)
    # Rename so every dataset exposes its documents under a "text" column.
    .rename_column("content", "text")
)  # type: ignore
# NOTE(review): data_dir points at the cache directory here -- presumably that
# is where the WikiHow source files were placed; confirm.
wikihow_data: datasets.Dataset = load_dataset(
    "wikihow",
    name="all",
    data_dir=CACHE_DIR,
    cache_dir=CACHE_DIR,
    use_auth_token=HF_TOKEN,
    split="train",
    # streaming=True,
).shuffle(
    seed=seed
)  # type: ignore
print_memory_usage()


Memory used: 528.5703125 MB


Found cached dataset wikipedia (/datadrive/hf_cache/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
Loading cached shuffled indices for dataset at /datadrive/hf_cache/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559/cache-9d1514c538bda335.arrow
Found cached dataset parquet (/datadrive/hf_cache/bigcode___parquet/bigcode--the-stack-dedup-d5df9d0729d2a04a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached shuffled indices for dataset at /datadrive/hf_cache/bigcode___parquet/bigcode--the-stack-dedup-d5df9d0729d2a04a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f3d5cb76f5b9c67e.arrow
Found cached dataset wikihow (/datadrive/hf_cache/wikihow/all-data_dir=%2Fdatadrive%2Fhf_cache/1.2.0/5343fc81d685acaa086c9cc19eb8706206cd1f8b315792b04c1d7b92091c305e)
Loading cached shuffled indices for dataset at /datadrive/hf_cache/wikihow/all-data_dir=%

Memory used: 1897.77734375 MB


In [5]:
# Report the recorded size of each train split, converted from bytes.
bytes_per_gb = 1024**3
named_datasets = (
    ("Wikipedia", wikipedia_dataset),
    ("Python", python_stack_dataset),
    ("Wikihow", wikihow_data),
)
print("Dataset Sizes (in GB)")
for label, ds in named_datasets:
    print(f"{label}:", ds.info.splits["train"].num_bytes / bytes_per_gb)

Dataset Sizes (in GB)
Wikipedia: 18.88304591178894
Python: 66.9516989979893
Wikihow: 0.4779902445152402


In [6]:
# Build the tokenizer training corpus with an explicit data mixture.
# Sampling weights: Wikipedia 30%, Python 40%, Wikihow 30% -- seems reasonable.
# NOTE(review): no stopping strategy is passed, so the library default applies;
# Wikihow is by far the smallest source, so check how much of the larger
# datasets is actually sampled before one source runs out.
sources = [wikipedia_dataset, python_stack_dataset, wikihow_data]
mixture_weights = [0.3, 0.4, 0.3]

dataset = datasets.interleave_datasets(
    sources,
    probabilities=mixture_weights,
    seed=seed,
)

In [25]:
from typing import Callable


def batch_generator(
    dataset: "datasets.Dataset",
    batch_size: int = 1000,
    converter: Callable = lambda x: x,
):
    """Yield consecutive slices of ``dataset``, each passed through ``converter``.

    Args:
        dataset: A sized object that supports slice indexing (``len(dataset)``
            and ``dataset[i:j]``).
        batch_size: Maximum number of items per slice; must be positive.
        converter: Called on every slice before it is yielded. Defaults to
            the identity function.

    Yields:
        ``converter(dataset[i : i + batch_size])`` for ``i`` stepping through
        the dataset in increments of ``batch_size``. The last slice may be
        shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently yield nothing;
    # a zero step would fail with an unrelated-looking range() error.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    for start in range(0, len(dataset), batch_size):
        yield converter(dataset[start : start + batch_size])


In [99]:
# BPE model with no vocabulary or merges yet; both are learned during training.
tokenizer = Tokenizer(
    models.BPE(
        vocab=None,
        merges=None,
        unk_token=None,
        dropout=None,
        fuse_unk=False,
    )
)
# StripAccents removes combining marks, so the text must be decomposed first.
# NFKD does that; a composing form such as NFKC leaves characters like "é" as
# a single code point and the accent would never be stripped.
# NOTE(review): Lowercase() also folds the case of Python source code, which
# is case-sensitive -- confirm this is intended.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFKD(), normalizers.Lowercase(), normalizers.StripAccents()]
)  # type: ignore
# Byte-level splitting first, then break every run of digits into single digits.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [
        pre_tokenizers.ByteLevel(add_prefix_space=True),
        pre_tokenizers.Digits(individual_digits=True),
    ]
)  # type: ignore
tokenizer.post_processor = processors.ByteLevel()  # type: ignore
# Decoder mirrors the byte-level pre-tokenizer so decoding restores the text.
tokenizer.decoder = decoders.Sequence(
    [
        decoders.ByteLevel(),
    ]
)  # type: ignore


In [100]:
# Walk one sample sentence through each stage of the tokenizer pipeline.
test_str = "Hello world! My name is John"
normalized = tokenizer.normalizer.normalize_str(test_str)
pre_tokenized = tokenizer.pre_tokenizer.pre_tokenize_str(normalized)
stage_outputs = (
    ("Normalized:", normalized),
    ("Pre-tokenized:", pre_tokenized),
    ("Encoded:", tokenizer.encode(test_str).tokens),
)
for label, value in stage_outputs:
    print(label, value)


Normalized: hello world! my name is john
Pre-tokenized: [('hello', (0, 5)), ('Ġworld', (5, 11)), ('!', (11, 12)), ('Ġmy', (12, 15)), ('Ġname', (15, 20)), ('Ġis', (20, 23)), ('Ġjohn', (23, 28))]
Encoded: []


In [29]:
# `dataset.iter` yields each batch as a dict mapping column name -> list of
# values, while the trainer expects strings or lists of strings. Hand over
# only the documents in the "text" column; passing the dict batches directly
# would expose the column names instead of the text.
# `trainer` is expected to be defined in an earlier cell.
tokenizer.train_from_iterator(
    iterator=(batch["text"] for batch in dataset.iter(batch_size=1000)),
    trainer=trainer,
)

In [18]:
# Wrap the serialized `tokenizers` file for use with transformers.
# `tokenizer_file` is the documented way to load a standalone tokenizer.json;
# calling `from_pretrained` with the path to a single file is deprecated.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")




In [21]:
# Round-trip check: encode a sample and decode it again. In the recorded
# output below the text comes back lowercased and split into single characters.
tokenizer.decode(tokenizer("Hello World")["input_ids"])


'▁ h e l l o ▁ w o r l d'