In [None]:
import unicode_data  # Optional, for normalization if desired
import torch

class BasicTokenizer:
    """Minimal byte-level BPE tokenizer.

    State:
        merges: dict mapping a pair of token ids ``(int, int)`` to the id of
            the token that replaces that pair.
        vocab: dict mapping every token id to the ``bytes`` it stands for.
    """

    def __init__(self):
        # The initial vocabulary consists of the 256 single bytes (0-255)
        self.merges = {}  # (int, int) -> int
        self.vocab = {i: bytes([i]) for i in range(256)}

    def train(self, text, vocab_size, verbose=False):
        """
        Trains the tokenizer by finding the most frequent byte pairs
        and merging them until the desired vocabulary size is reached.

        Every call starts from the bare 256-byte vocabulary, so training an
        already-trained instance replaces the previous merges instead of
        mixing old and new ones (new ids are always numbered from 256, so
        leftovers from an earlier run would collide with them).

        Args:
            text: Training corpus as a ``str``; it is encoded to UTF-8.
            vocab_size: Target vocabulary size, must be >= 256. At most
                ``vocab_size - 256`` merges are learned; fewer if the
                sequence runs out of pairs.
            verbose: When true, print each merge as it is learned.

        Raises:
            AssertionError: If ``vocab_size`` is below 256.
        """
        assert vocab_size >= 256
        num_merges = vocab_size - 256
        ids = list(text.encode("utf-8"))  # List of integers (0-255)

        # Fresh tables for this run; installed on self only once training is done.
        merges = {}
        vocab = {i: bytes([i]) for i in range(256)}

        for i in range(num_merges):
            # 1. Count frequencies of consecutive pairs
            stats = self._get_stats(ids)
            if not stats:
                break  # fewer than two tokens left, nothing to merge

            # 2. Find the most frequent pair
            top_pair = max(stats, key=stats.get)
            idx = 256 + i

            # 3. Register the merge and update the vocabulary
            if verbose:
                print(f"Merging {top_pair} into new token {idx}")

            merges[top_pair] = idx
            vocab[idx] = vocab[top_pair[0]] + vocab[top_pair[1]]

            # 4. Replace the pair in the ID sequence
            ids = self._merge(ids, top_pair, idx)

        self.merges = merges
        self.vocab = vocab

    def encode(self, text):
        """Converts text into a list of tokens (IDs).

        Starts from the UTF-8 bytes of ``text`` and repeatedly applies the
        learned merge with the smallest token id (i.e. the earliest learned)
        among the pairs present, until no present pair has a learned merge.
        """
        ids = list(text.encode("utf-8"))
        while len(ids) >= 2:
            stats = self._get_stats(ids)
            # Unknown pairs rank as +inf, so a known pair always wins if one exists.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # No more merges possible
            ids = self._merge(ids, pair, self.merges[pair])
        return ids

    def decode(self, ids):
        """Converts a list of IDs back into text (string).

        Raises:
            KeyError: If an id is not present in ``self.vocab``.
        """
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        # Use 'replace' to handle malformed byte sequences
        return text_bytes.decode("utf-8", errors="replace")

    def _get_stats(self, ids):
        """Returns a dict mapping each adjacent pair in ``ids`` to its count."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids, pair, idx):
        """Returns a new list where each left-to-right, non-overlapping
        occurrence of ``pair`` in ``ids`` is replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids





In [None]:
# --- Example usage ---
# Train on a one-sentence corpus. vocab_size=260 leaves room for at most
# four merges on top of the 256 single-byte tokens.
tokenizer = BasicTokenizer()
corpus = "This is an example text to train the tokenizer, like in Karpathy's video."
tokenizer.train(text=corpus, vocab_size=260, verbose=True)

# Round-trip a string that was not part of the corpus.
tokens = tokenizer.encode(text="Hello Karpathy")
print(f"Tokens: {tokens}")
print(f"Decoded: {tokenizer.decode(ids=tokens)}")

In [None]:
# Needs the third-party package first: pip install tiktoken

import tiktoken

# Step 1 - pick the encoding by name
# (o200k_base for GPT-4o, cl100k_base for GPT-4/3.5).
encoding = tiktoken.get_encoding("o200k_base")

# Sample sentence to tokenize.
text = "Hola, probando el tokenizador de OpenAI en Google Colab."

# Step 2 - text -> list of token IDs.
tokens = encoding.encode(text)

# Step 3 - report the IDs, how many there are, and the decoded round trip.
print(f"Token IDs: {tokens}")
print(f"Total tokens: {len(tokens)}")
print(f"Recovered text: {encoding.decode(tokens)}")

# Step 4 - the raw bytes behind each individual token.
print("Breakdown:", list(map(encoding.decode_single_token_bytes, tokens)))


ModuleNotFoundError: No module named 'tiktoken'

# Exercises

Modify the tokenizer class to include the regex for splitting words before performing BPE. (Original: Modificar la clase tokenizer para que incluya la regex para dividir palabras antes de hacer BPE)



In [None]:
import regex as re

class BasicTokenizer:
    """Byte-level BPE tokenizer with regex pre-tokenization.

    Text is first split into chunks with ``self.pattern``; BPE merges are
    learned and applied inside each chunk only, never across a chunk boundary.

    State:
        merges: dict mapping a pair of token ids ``(int, int)`` to the id of
            the token that replaces that pair.
        vocab: dict mapping every token id to the ``bytes`` it stands for.
        pattern: compiled regex used to split text into chunks.
    """

    def __init__(self):
        # Initial vocabulary: 256 bytes
        self.merges = {}              # (int, int) -> int
        self.vocab = {i: bytes([i]) for i in range(256)}

        # GPT-style regex for pre-tokenization
        self.pattern = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )

    # ------------------------------------------------------------------
    # Training (regex → byte-level BPE)
    # ------------------------------------------------------------------
    def train(self, text, vocab_size, verbose=False):
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        The text is split with ``self.pattern`` and every chunk keeps its own
        id sequence. Pair counts are summed over all chunks, so a pair made of
        the last token of one chunk and the first token of the next is never
        counted and never merged. ``encode`` works per chunk as well, so such
        a merge could never be applied and would only waste a vocabulary slot.

        Every call starts from the bare 256-byte vocabulary; training an
        already-trained instance replaces its previous merges.

        Args:
            text: Training corpus as a ``str``.
            vocab_size: Target vocabulary size, must be >= 256.
            verbose: When true, print each merge as it is learned.

        Raises:
            AssertionError: If ``vocab_size`` is below 256.
        """
        assert vocab_size >= 256
        num_merges = vocab_size - 256

        # Regex split, then one list of byte ids per chunk
        chunks = [list(piece.encode("utf-8")) for piece in self.pattern.findall(text)]

        # Fresh tables for this run; installed on self only once training is done.
        merges = {}
        vocab = {i: bytes([i]) for i in range(256)}

        for i in range(num_merges):
            # Accumulate pair counts over all chunks into one dict
            stats = {}
            for chunk in chunks:
                self._get_stats(chunk, stats)
            if not stats:
                break  # no chunk has two or more tokens left

            pair = max(stats, key=stats.get)
            idx = 256 + i

            if verbose:
                print(f"Merging {pair} -> {idx}")

            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
            chunks = [self._merge(chunk, pair, idx) for chunk in chunks]

        self.merges = merges
        self.vocab = vocab

    # ------------------------------------------------------------------
    # Encoding / decoding
    # ------------------------------------------------------------------
    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Each regex chunk is encoded independently: starting from its UTF-8
        bytes, the learned merge with the smallest token id among the pairs
        present is applied repeatedly until none of the present pairs has a
        learned merge. The per-chunk results are concatenated in order.
        """
        tokens = []

        for piece in self.pattern.findall(text):
            ids = list(piece.encode("utf-8"))

            while len(ids) >= 2:
                stats = self._get_stats(ids)
                # Unknown pairs rank as +inf, so a known pair wins if one exists.
                pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
                if pair not in self.merges:
                    break
                ids = self._merge(ids, pair, self.merges[pair])

            tokens.extend(ids)

        return tokens

    def decode(self, ids):
        """Convert token ids back to a string.

        Bytes that do not form valid UTF-8 are decoded with ``errors="replace"``.

        Raises:
            KeyError: If an id is not present in ``self.vocab``.
        """
        return b"".join(self.vocab[i] for i in ids).decode("utf-8", errors="replace")

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def _get_stats(self, ids, counts=None):
        """Count adjacent pairs in ``ids``.

        Args:
            ids: Sequence of token ids.
            counts: Optional dict to update in place, which lets a caller sum
                counts over several sequences. A new dict is used when omitted.

        Returns:
            The dict mapping ``(a, b)`` pairs to their number of occurrences.
        """
        if counts is None:
            counts = {}
        for a, b in zip(ids, ids[1:]):
            counts[(a, b)] = counts.get((a, b), 0) + 1
        return counts

    def _merge(self, ids, pair, idx):
        """Return a new list where each left-to-right, non-overlapping
        occurrence of ``pair`` in ``ids`` is replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i+1]) == pair:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids


In [None]:
# --- Example usage ---
# Train on a one-sentence corpus. vocab_size=260 leaves room for at most
# four merges on top of the 256 single-byte tokens.
tokenizer = BasicTokenizer()
corpus = "This is an example text to train the tokenizer, like in Karpathy's video."
tokenizer.train(text=corpus, vocab_size=260, verbose=True)

# Round-trip a string that was not part of the corpus.
tokens = tokenizer.encode(text="Hello Karpathy")
print(f"Tokens: {tokens}")
print(f"Decoded: {tokenizer.decode(ids=tokens)}")