In [1]:
import tokenizers
from tokenizers.tools import EncodingVisualizer

In [2]:
dir = "data/tinystories"

import os

files = os.listdir(dir)
files = [f for f in files if f.endswith(".txt")]
files = [os.path.join(dir, f) for f in files]
files

['data/tinystories/the_odyssey.txt',
 'data/tinystories/moby_dick.txt',
 'data/tinystories/alice_wonderland.txt',
 'data/tinystories/frankenstein.txt',
 'data/tinystories/dracula.txt',
 'data/tinystories/a_tale_of_two_cities.txt',
 'data/tinystories/pride_and_prejudice.txt',
 'data/tinystories/the_complete_works_of_william_shakespeare.txt',
 'data/tinystories/a_room_with_a_view.txt',
 'data/tinystories/metamorphosis.txt',
 'data/tinystories/the_great_gatspy.txt',
 'data/tinystories/adventures_of_huckleberry_finn.txt',
 'data/tinystories/the_iliad.txt']

In [3]:
model = tokenizers.models.BPE()
trainer = tokenizers.trainers.BpeTrainer(vocab_size=4096, special_tokens=["[PAD]", "[SOS]", "[EOS]", "[MASK]", "[UNK]"], show_progress=True)

In [4]:
tokenizer = tokenizers.Tokenizer(model=model)
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel()

tokenizer.add_special_tokens(["[PAD]", "[SOS]", "[EOS]", "[MASK]", "[UNK]"])

tokenizer.train(files, trainer=trainer)
print(tokenizer.get_vocab_size())




4096


In [5]:
viz = EncodingVisualizer(tokenizer)

In [6]:
with open("data/tinystories/the_complete_works_of_william_shakespeare.txt", "r") as f:
    text = f.read()
print(f"Text has {len(text):,} characters")

Text has 5,378,662 characters


In [7]:
encoded = tokenizer.encode(text)
print(f"Encoded text has {len(encoded.ids):,} tokens")

Encoded text has 1,886,287 tokens


In [8]:
viz(text[:2048])

In [9]:
tokenizer.decoder = tokenizers.decoders.ByteLevel()

In [10]:
print(tokenizer.decode(encoded.ids[:512]))

 The Project Gutenberg eBook of The Complete Works of William Shakespeare
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Complete Works of William Shakespeare

Author: William Shakespeare

Release date: January 1, 1994 [eBook #100]
                Most recently updated: January 18, 2024

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK THE COMPLETE WORKS OF WILLIAM SHAKESPEARE ***
﻿The Complete Works of William Shakespeare

by William Shakespeare




                    Contents

    THE SONNETS
    ALL’S WELL THAT ENDS WELL
    THE TRAGEDY OF ANTONY AND CLEOPATRA
   

In [11]:
tokenizer.save("projects/4-stories/tokenizer.json")