In [None]:
def get_stats(ids):
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

In [None]:
def merge(ids, pair, idx):
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    
    return new_ids

In [None]:
import unicodedata
from unicodedata import category

In [None]:
category('\n')

In [None]:
ch = '\n'
f"\\u{ord(ch):04x}"

In [None]:
ch

In [None]:
def replace_control_characters(s: str) -> str:
    chars = []
    for ch in s:
        if unicodedata.category(ch)[0] != 'C':
            chars.append(ch)
        else:
            chars.append(f"\\u{ord(ch):04x}") # escape
    
    return "".join(chars)

In [None]:
replace_control_characters("abcd\ne\r")

In [None]:
def render_token(t: bytes) -> str:
    s = t.decode('utf-8', errors='replace')
    return replace_control_characters(s)

In [None]:
# Encoding individual characters
# Encoding the character 'A' (U+0041)
utf8_encoded_A = b'\x41'
print(utf8_encoded_A)  # Output: b'A'

# Encoding the Euro sign 'â‚¬' (U+20AC)
utf8_encoded_euro = b'\xe2\x82\xac'
print(utf8_encoded_euro)  # Output: b'\xe2\x82\xac'

# Encoding the emoji 'ðŸ˜Š' (U+1F60A)
utf8_encoded_emoji = b'\xf0\x9f\x98\x8a'
print(utf8_encoded_emoji)  # Output: b'\xf0\x9f\x98\x8a'


# Encoding a string
# Encoding the string 'Hello, world!' in UTF-8
utf8_encoded_string = 'Hello, world!'.encode('utf-8')
print(utf8_encoded_string)  # Output: b'Hello, world!'

# Encoding a string with characters from multiple scripts
# Encoding the string 'ä½ å¥½, world!' containing Chinese characters (U+4F60 U+597D)
utf8_encoded_multilingual_string = 'ä½ å¥½, world!'.encode('utf-8')
print(utf8_encoded_multilingual_string)  # Output: b'\xe4\xbd\xa0\xe5\xa5\xbd, world!'

In [None]:
list(utf8_encoded_multilingual_string)

In [None]:
len(utf8_encoded_multilingual_string)

In [None]:
import math
math.pow(2, 23)

In [None]:
math.log(65535)

In [None]:
# understanding utf-8 encoding (smiley is reprsented using 4 bytes, registered using 2 and euro using 3)
# each number in the output list is [0, 255]
a = '\n\r\rðŸ˜ŠÂ®â‚¬'
#  list(b"".join([a.encode('utf-8')]))
list(a.encode('utf-8'))

In [None]:
replace_control_characters('\n\rabcd')

In [None]:
a.encode('utf-8')

In [None]:
render_token(a.encode('utf-8'))

In [None]:
a = {}
a[(1, 1)] = 2
a[(2, 2)] = 3
a[(3, 3)] = 4

In [None]:
for idx1, idx2 in a:
    print(idx1, idx2)

In [None]:
for i,j in a.items():
    print(i, j)

In [None]:
from pathlib import Path
p = Path('abcd')
model_file = p.with_suffix('.model')
model_file

In [None]:
with open(model_file, 'w') as f:
    f.write('abcdefg')

In [None]:
class Tokenizer:
    """Base class for tokenizer"""

    def __init__(self):
        # default vocab size is 256 (same as ascii chars), no merges, no patterns
        self.merges = {} # (int, int) -> int
        self.pattern = "" # str
        self.special_tokens = {} # str -> int eg. {'<|endoftext|>': 1}
        self.vocab = self._build_vocab() # int -> bytes
    
    def _build_vocab(self):
        vocab = {idx: bytes(idx) for idx in range(256)}
        # the fact that iteration order is same as order in which items are inserted is key here, otherwise we don't have vocab entries for previous merges
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        for special, idx in self.special_tokens.items():
            vocab[idx] = special.encode('utf-8')
        
        return vocab
    
    def train(self, text, vocab_size, verbose=False):
        raise NotImplementedError
    
    def encode(self, text):
        raise NotImplementedError
    
    def decode(self, ids):
        raise NotImplementedError
    
    def save(self, file_prefix):
        """
        Saves two files: file_prefix.vocab and file_prefix.model
        Similar to sentencepiece
        - model file is used for model loading, vocab is just for human viz.
        """
        file = Path(file_prefix)
        model_file = file.with_suffix('.model')
        with open(model_file, 'w') as f:
            # write version, pattern and merges
            f.write('minbpe v1\n')
            f.write(f"{self.pattern}\n")
            # special tokens
            f.write(f"{len(self.special_tokens)}\n")
            for special, idx in self.special_tokens.items():
                f.write(f"{special} {idx}\n")
            
            # merges dict
            for idx1, idx2 in self.merges: # write only the ids of the merge
                f.write(f"{idx1} {idx2}\n")
        
        # write the vocab, for human viz
        # vocab file is different than actual vocab, file is lossy but self.vocab is good.
        vocab_file = file.with_suffix('.vocab')
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, 'w', encoding='utf-8') as f:
            for idx, token in self.vocab.items():
                # replaces some partial utf-8 seq into ? token, so this can't be decoded due to error = 'replace'
                s = render_token(token)
                if idx in inverted_merges:
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(self.vocab[idx0])
                    s1 = render_token(self.vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n") # we should be able to change this
                else:
                    # print the bytes and special characters, double check the special characters part.
                    f.write(f"[{s}] {idx}")
    
    def load(self, model_file):
        """Invert the functionality in save, but only for model file"""
        assert model_file.endswith(".model")
        # read the model file
        merges = {}
        special_tokens = {}
        idx = 256

        with open(model_file, 'r', encoding='utf-8') as f: # this is decoding, but understand this part more.
            version = f.readline().strip()
            assert version == "minbpe v1"
            self.pattern = f.readline().strip()
            num_special = int(f.readline().strip())
            for _ in range(num_special):
                special, special_idx = f.readline().strip().split()
                special_tokens[special] = int(special_idx)

            # read the merges
            for line in f:
                idx1, idx2 = map(int, line.split())
                merges[(idx1, idx2)] = idx
                idx += 1
        
        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()

In [None]:
class BasicTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
    
    def train(self, text, vocab_size, verbose=False):
        assert vocab_size >= 256
        num_merges = vocab_size - 256
        ids = list(text.encode('utf-8'))

        merges = {}
        vocab = {idx: bytes([idx]) for idx in range(256)}
        idx = 256
        for i in range(num_merges):
            stats = get_stats(ids)
            top_pair = max(stats, key=stats.get)
            ids = merge(ids, top_pair, idx)
            merges[top_pair] = idx
            vocab[idx] = vocab[top_pair[0]] + vocab[top_pair[1]]
            if verbose:
                print(f"merge {i+1}/{merges}: {top_pair} -> {idx} {vocab[idx]} has {stats[top_pair]} occurences")
            idx +=1
        
        self.merges = merges
        self.vocab = vocab
    
    def decode(self, ids):
        """Converts ids to a string"""
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        text = text_bytes.decode("utf-8", errors="replace")
        return text
    
    def encode(self, text):
        """Retums ids from text"""
        ids = list(text.encode('utf-8'))
        while len(ids) >= 2:
            # find the element in stats that has the smallest associated value in merges
            stats = get_stats(ids)
            top_pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if top_pair not in self.merges:
                break
            ids = merge(ids, top_pair, self.merges[top_pair])
        
        return ids

In [None]:
tokenizer = BasicTokenizer()
tokenizer.train("How are you doing", 257)

In [None]:
tokenizer.decode(tokenizer.encode("abcd"))

In [1]:
import sys
sys.path

['/Users/htkumar/llms/tokenization/minbpe',
 '/Users/htkumar/anaconda3/envs/myenv/lib/python38.zip',
 '/Users/htkumar/anaconda3/envs/myenv/lib/python3.8',
 '/Users/htkumar/anaconda3/envs/myenv/lib/python3.8/lib-dynload',
 '',
 '/Users/htkumar/anaconda3/envs/myenv/lib/python3.8/site-packages']

In [2]:
import os
dirname = os.path.dirname(os.path.abspath(__file__))

NameError: name '__file__' is not defined

In [3]:
print(__file__)

NameError: name '__file__' is not defined

In [5]:
# globals()