### 2.1
- printable vs no appearance: '\x00' vs '\x80'
- `some encoding -> binary -> another encoding` is a recipe for errors. See https://en.wikipedia.org/wiki/Specials_(Unicode_block)#:~:text=The%20replacement%20character%20%EF%BF%BD%20(often,of%20data%20to%20correct%20symbols. for examples.

In [65]:
# Code point 2049 (U+0801) takes three bytes in UTF-8: show the character,
# its byte values, and the bit pattern of each byte, one item per line.
print(
    chr(2049),
    list(chr(2049).encode()),
    list(map(bin, chr(2049).encode())),
    sep="\n",
)

ࠁ
[224, 160, 129]
['0b11100000', '0b10100000', '0b10000001']


In [39]:
def gpt2_bytes_to_unicode() -> dict[int, str]:
    r"""
    Return a mapping from every possible byte value (an integer from 0 to 255) to a
    printable single-character unicode string. This function is taken from the
    GPT-2 code.

    For example, `chr(0)` is `\x00`, which is an unprintable character:

    >>> chr(0)
    '\x00'

    Printing it shows nothing visible, so the returned dictionary `d` maps it to a
    visible stand-in instead: `d[0]` is `Ā`.

    The 188 bytes that are visually printable keep their original string
    representation. For example, `chr(33)` is `!`, and so `d[33]` is `!` too.

    The other 68 bytes (whitespace and control characters) are numbered in
    increasing byte order, and the k-th of them (counting from 0) is mapped to
    `chr(256 + k)`. Bytes 0 through 32 are the first 33 such bytes, so for them the
    stand-in happens to be `chr(256 + byte)`; in particular the space character
    `chr(32)` becomes `chr(256 + 32)`, which is `Ġ`. Later unprintable bytes are NOT
    offset by their own value: byte 127 is the 34th unprintable byte and maps to
    `chr(256 + 33)`, not `chr(256 + 127)`.

    This mapping can simplify a BPE implementation and makes it slightly easier to
    manually inspect the generated merges after they are serialized to a file.

    Returns:
        A dict with exactly 256 entries. The printable bytes come first (in
        increasing order), followed by the remapped bytes (also increasing).
    """
    # These 188 integers can be used as-is, since they are not whitespace or
    # control characters. See https://www.ssec.wisc.edu/~tomw/java/unicode.html.
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    # Set membership keeps the loop below linear instead of scanning a list per byte.
    printable_set = set(printable)
    mapping = {byte: chr(byte) for byte in printable}

    # Give each of the remaining 68 bytes the next unused code point from 256 upwards.
    next_offset = 0
    for byte in range(2**8):
        if byte not in printable_set:
            mapping[byte] = chr(2**8 + next_offset)
            next_offset += 1
    return mapping

### 2.6

In [1]:
from train_bpe import train_bpe

In [2]:
# input_path = "/home/azureuser/02-fun/cs336-assignment1-basics/data/TinyStoriesV2-GPT4-valid.txt"
input_path = "/home/azureuser/02-fun/cs336-assignment1-basics/tests/fixtures/corpus.en"
# vocab_size = 32_000
# vocab_size = 10_000
vocab_size = 500
special_tokens = ["<|endoftext|>"]
num_processes = 4

In [3]:
vocab, merges = train_bpe(
    input_path, vocab_size, special_tokens, num_processes
)



Trianing started.


  0%|          | 0/243 [00:00<?, ?it/s]

100%|██████████| 243/243 [00:00<00:00, 395.75it/s]


In [67]:
# JSON object keys are always strings, so turn them back into integer token ids
# while reading the saved vocabulary.
with open("test_vocab.json") as f:
    vocab_rdbk = {int(token_id): token for token_id, token in json.load(f).items()}

In [48]:
# Turn each merge pair into text (malformed UTF-8 becomes U+FFFD), then write
# the repr of one pair per line.
merges_2save = [tuple(decode_w_replace(part) for part in merge) for merge in merges]
with open("test_merges.txt", "w") as f:
    for merge in merges_2save:
        f.write(str(tuple(merge)) + "\n")

In [50]:
"nd".encode()

b'nd'

In [15]:
import json

def decode_w_replace(x: bytes):
    """Decode ``x`` as UTF-8, substituting U+FFFD for malformed byte sequences."""
    text = x.decode(encoding="utf-8", errors="replace")
    return text

def save_vocab_and_merges(
    vocab: dict[int, bytes],
    merges: list[tuple[bytes, bytes]],
    vocab_path: str,
    merges_path: str
) -> None:
    """
    Write a human-readable dump of a BPE vocabulary and merge list.

    The vocabulary is written as a JSON object mapping token id to token text.
    The merges are written one per line, each line being the ``repr`` of a
    ``(left, right)`` tuple of strings.

    NOTE: tokens are converted with ``decode_w_replace``, so any token that is not
    valid UTF-8 on its own is stored with U+FFFD in place of the offending bytes.
    The dump is therefore lossy and is meant for inspection, not for reloading
    the exact byte-level vocabulary.

    Args:
        vocab: Mapping from token id to token bytes.
        merges: Merge pairs, in the order they should be written.
        vocab_path: Destination of the JSON vocabulary file.
        merges_path: Destination of the merges text file.
    """
    vocab_2save = {token_id: decode_w_replace(token) for token_id, token in vocab.items()}
    # Convert before opening the file so a conversion error cannot leave a
    # truncated merges file behind.
    merges_2save = [tuple(map(decode_w_replace, merge)) for merge in merges]

    with open(vocab_path, "w", encoding="utf-8") as f:
        json.dump(vocab_2save, f)

    # The merge text can contain U+FFFD and other non-ASCII characters, so the
    # encoding is pinned instead of relying on the platform's locale default.
    with open(merges_path, "w", encoding="utf-8") as f:
        for merge in merges_2save:
            f.write(f"{merge}\n")

save_vocab_and_merges(vocab, merges, "test_vocab.json", "test_merges.txt")

In [261]:
from typing import Iterable, Iterator
from ast import literal_eval
import regex as re
import json

PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

class Tokenizer:
    def __init__(
        self,
        vocab: dict[int, bytes],
        merges: Iterable[tuple[bytes, bytes]],
        special_tokens: list[str] | None = None
    ):
        self.vocab = vocab if vocab else {}
        self.merges = merges if merges else []
        self.special_tokens = special_tokens

    @classmethod
    def from_files(cls, vocab_filepath:str, merges_filepath:str, special_tokens: list[str] | None=None):
        gpt2_byte_decoder = {v: k for k, v in gpt2_bytes_to_unicode().items()}
        with open(vocab_filepath) as vocab_f:
            gpt2_vocab = json.load(vocab_f)
        gpt2_bpe_merges = []
        with open(merges_filepath) as f:
            for line in f:
                cleaned_line = line.rstrip()
                if cleaned_line and len(cleaned_line.split(" ")) == 2:
                    gpt2_bpe_merges.append(tuple(cleaned_line.split(" ")))
        # The GPT-2 tokenizer uses a remapped unicode encoding for bytes. Let's
        # just return the original bytes, so we don't force students to use
        # any particular encoding scheme.
        vocab = {
            gpt2_vocab_index: bytes([gpt2_byte_decoder[token] for token in gpt2_vocab_item])
            for gpt2_vocab_item, gpt2_vocab_index in gpt2_vocab.items()
        }
        # If any of the special tokens don't exist in the vocab, append them to the vocab.
        if special_tokens:
            for special_token in special_tokens:
                byte_encoded_special_token = special_token.encode("utf-8")
                if byte_encoded_special_token not in set(vocab.values()):
                    vocab[len(vocab)] = byte_encoded_special_token

        merges = [
            (
                bytes([gpt2_byte_decoder[token] for token in merge_token_1]),
                bytes([gpt2_byte_decoder[token] for token in merge_token_2]),
            )
            for merge_token_1, merge_token_2 in gpt2_bpe_merges
        ]
        return cls(vocab, merges, special_tokens)

    def encode(self, text: str) -> list[int]:
        vocab_reversed = {v:k for k,v in self.vocab.items()}
        pretokens = re.findall(PAT, text)
        tokens = []
        for pretoken in pretokens:
            pretoken  = [bytes([b]) for b in pretoken.encode()]
            token = pretoken[0]
            i = 1
            while i < len(pretoken):
                token_ = b"".join((token, pretoken[i]))
                if token_ in vocab_reversed:
                    token = token_
                    i += 1
                else:
                    tokens.append(token)
                    token = pretoken[i]
                    i += 1
            tokens.append(token)
        return [vocab_reversed[token] for token in tokens]

    def encode_iterable(self, iterable: Iterable[str]) -> Iterator[int]:
        for text in iterable:
            yield from self.encode(text)

    def decode(self, ids: list[int]):
        return b"".join([self.vocab[i] for i in ids]).decode("utf-8")

In [264]:
tokenizer = Tokenizer.from_files(VOCAB_PATH, MERGES_PATH, special_tokens=["<|endoftext|>"])

In [265]:
    reference_tokenizer = tiktoken.get_encoding("gpt2")
    test_string = "Héllò hôw are ü? 🙃"

    reference_ids = reference_tokenizer.encode(test_string)
    ids = tokenizer.encode(test_string)
    # assert ids == reference_ids

    # assert tokenizer.decode(ids) == test_string
    # assert reference_tokenizer.decode(reference_ids) == test_string

In [267]:
reference_ids

[39, 2634, 297, 127, 110, 289, 27083, 86, 389, 6184, 120, 30, 12520, 247, 225]

In [269]:
ids

[39,
 2634,
 297,
 127,
 110,
 289,
 27083,
 86,
 389,
 6184,
 120,
 30,
 220,
 8582,
 247,
 225]

In [272]:
vocab[220], vocab[8582]

(b' ', b'\xf0\x9f')

In [274]:
for i, merge in enumerate(MERGES):
    if merge == (b' ', b'\xf0\x9f'):
        print(i)

12264


In [273]:
vocab[12520]

b' \xf0\x9f'

In [263]:
tokenizer.encode("Hello, how are you?")

[15496, 11, 703, 389, 345, 30]

In [None]:
tokenizer.

First 20 token IDs:
11, 80, 111, 322, 420, 274, 258, 257, 334, 102, 294, 265, 102, 272, 301, 258, 298, 278, 117, 321, 

In [177]:
import sys
sys.path.append("../tests")
from common import gpt2_bytes_to_unicode

def get_tokenizer_from_vocab_merges_path(
    vocab_path: str | os.PathLike,
    merges_path: str | os.PathLike,
    special_tokens: list[str] | None = None,
):
    """
    Load a GPT-2 style vocabulary and merge list as raw bytes.

    The vocabulary file is a JSON object mapping token text to token id and the
    merges file holds one space-separated pair per line. Both use the
    printable-character byte encoding from ``gpt2_bytes_to_unicode``, which is
    undone here.

    Args:
        vocab_path: Path to the JSON vocabulary.
        merges_path: Path to the merges text file.
        special_tokens: Special tokens; any that are missing from the vocabulary
            are appended with the next free id.

    Returns:
        A ``(vocab, merges, special_tokens)`` tuple, where ``vocab`` maps token
        id to bytes and ``merges`` is a list of ``(bytes, bytes)`` pairs in file
        order.
    """
    gpt2_byte_decoder = {v: k for k, v in gpt2_bytes_to_unicode().items()}
    # The remapped byte characters are non-ASCII, so pin the encoding instead
    # of relying on the platform's locale default.
    with open(vocab_path, encoding="utf-8") as vocab_f:
        gpt2_vocab = json.load(vocab_f)
    gpt2_bpe_merges = []
    with open(merges_path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            # Only lines holding exactly two space-separated tokens are merges.
            if len(parts) == 2:
                gpt2_bpe_merges.append(tuple(parts))
    # The GPT-2 tokenizer uses a remapped unicode encoding for bytes. Let's
    # just return the original bytes, so we don't force students to use
    # any particular encoding scheme.
    vocab = {
        gpt2_vocab_index: bytes([gpt2_byte_decoder[token] for token in gpt2_vocab_item])
        for gpt2_vocab_item, gpt2_vocab_index in gpt2_vocab.items()
    }
    # If any of the special tokens don't exist in the vocab, append them to the vocab.
    if special_tokens:
        # Built once and kept in sync, rather than rebuilt for every special token.
        known_tokens = set(vocab.values())
        for special_token in special_tokens:
            byte_encoded_special_token = special_token.encode("utf-8")
            if byte_encoded_special_token not in known_tokens:
                vocab[len(vocab)] = byte_encoded_special_token
                known_tokens.add(byte_encoded_special_token)

    merges = [
        (
            bytes([gpt2_byte_decoder[token] for token in merge_token_1]),
            bytes([gpt2_byte_decoder[token] for token in merge_token_2]),
        )
        for merge_token_1, merge_token_2 in gpt2_bpe_merges
    ]
    # return Tokenizer(vocab, merges, special_tokens)
    return vocab, merges, special_tokens

In [219]:
VOCAB_PATH = "/home/azureuser/02-fun/cs336-assignment1-basics/tests/fixtures/gpt2_vocab.json"
MERGES_PATH = "/home/azureuser/02-fun/cs336-assignment1-basics/tests/fixtures/gpt2_merges.txt"
VOCAB, MERGES, sptok = get_tokenizer_from_vocab_merges_path(VOCAB_PATH, MERGES_PATH)

In [238]:
len(MERGES)

50000

In [242]:
for merge in MERGES:
    if b'Hel' in merge:
        print(merge)

(b'Hel', b'p')
(b'Hel', b'per')


In [246]:
def encode(text: str) -> list[int]:
    """
    Tracing variant of the greedy encoder, used to inspect how tokens are formed.

    Each pre-token is scanned left to right; the current token keeps absorbing
    the next byte for as long as the result is still a ``VOCAB`` entry. The
    merge list is not consulted. The pre-token list, every byte sequence and
    every emitted token are printed along the way.
    """
    id_by_token = {token: token_id for token_id, token in VOCAB.items()}
    words = re.findall(PAT, text)
    print(words)
    emitted = []
    for word in words:
        byte_seq = [bytes([value]) for value in word.encode()]
        print(byte_seq)
        current = byte_seq[0]
        for following in byte_seq[1:]:
            candidate = current + following
            if candidate in id_by_token:
                # Still a known token: keep growing it.
                current = candidate
                continue
            # Growing further would leave the vocabulary: flush and restart.
            emitted.append(current)
            print(current)
            current = following
        emitted.append(current)
        print(current)
    return [id_by_token[token] for token in emitted]

In [None]:
(b'Hel', b'l') 

In [247]:
encode("Hello, how are you?")

['Hello', ',', ' how', ' are', ' you', '?']
[b'H', b'e', b'l', b'l', b'o']
b'Hello'
[b',']
b','
[b' ', b'h', b'o', b'w']
b' how'
[b' ', b'a', b'r', b'e']
b' are'
[b' ', b'y', b'o', b'u']
b' you'
[b'?']
b'?'


['Hello', ',', ' how', ' are', ' you', '?']
[b'H', b'e', b'l', b'l', b'o']
b'Hello'
[b',']
b','
[b' ', b'h', b'o', b'w']
b' how'
[b' ', b'a', b'r', b'e']
b' are'
[b' ', b'y', b'o', b'u']
b' you'
[b'?']
b'?'


[15496, 11, 703, 389, 345, 30]

In [195]:
vr = {v:k for k,v in VOCAB.items()}
vr[b'He']

1544

In [216]:
VOCAB_PATH = "/home/azureuser/02-fun/cs336-assignment1-basics/tests/fixtures/gpt2_vocab.json"
MERGES_PATH = "/home/azureuser/02-fun/cs336-assignment1-basics/tests/fixtures/gpt2_merges.txt"

import tiktoken

def test_ascii_string_matches_tiktoken():
    """
    Encode a short ASCII sentence with tiktoken's GPT-2 encoding and with
    ``Tokenizer``, print the reference ids, and return our tokens as text.

    The comparisons against the reference are currently disabled (see the
    commented-out assertions), so this only reports values.
    """
    sample = "Hello, how are you?"
    gpt2_reference = tiktoken.get_encoding("gpt2")
    # tokenizer = get_tokenizer_from_vocab_merges_path(
    ours = Tokenizer.from_files(VOCAB_PATH, MERGES_PATH, ["<|endoftext|>"])

    expected_ids = gpt2_reference.encode(sample)
    print(expected_ids)
    actual_ids = ours.encode(sample)
    # assert actual_ids == expected_ids

    # Decode one id at a time to see which piece of text each token covers.
    pieces = []
    for token_id in actual_ids:
        pieces.append(ours.decode([token_id]))

    return pieces
    # assert pieces == ["Hello", ",", " how", " are", " you", "?"]

    # assert ours.decode(actual_ids) == sample
    # assert gpt2_reference.decode(expected_ids) == sample

In [217]:
tokenizer.encode("Hello, how are you?")

[1544, 18798, 11, 8169, 86, 610, 68, 27406, 84, 30]

In [218]:
test_ascii_string_matches_tiktoken()

[15496, 11, 703, 389, 345, 30]


['He', 'llo', ',', ' ho', 'w', ' ar', 'e', ' yo', 'u', '?']

In [163]:
tokenizer.vocab

{0: b'<|endoftext|>',
 1: b'\x00',
 2: b'\x01',
 3: b'\x02',
 4: b'\x03',
 5: b'\x04',
 6: b'\x05',
 7: b'\x06',
 8: b'\x07',
 9: b'\x08',
 10: b'\t',
 11: b'\n',
 12: b'\x0b',
 13: b'\x0c',
 14: b'\r',
 15: b'\x0e',
 16: b'\x0f',
 17: b'\x10',
 18: b'\x11',
 19: b'\x12',
 20: b'\x13',
 21: b'\x14',
 22: b'\x15',
 23: b'\x16',
 24: b'\x17',
 25: b'\x18',
 26: b'\x19',
 27: b'\x1a',
 28: b'\x1b',
 29: b'\x1c',
 30: b'\x1d',
 31: b'\x1e',
 32: b'\x1f',
 33: b' ',
 34: b'!',
 35: b'"',
 36: b'#',
 37: b'$',
 38: b'%',
 39: b'&',
 40: b"'",
 41: b'(',
 42: b')',
 43: b'*',
 44: b'+',
 45: b',',
 46: b'-',
 47: b'.',
 48: b'/',
 49: b'0',
 50: b'1',
 51: b'2',
 52: b'3',
 53: b'4',
 54: b'5',
 55: b'6',
 56: b'7',
 57: b'8',
 58: b'9',
 59: b':',
 60: b';',
 61: b'<',
 62: b'=',
 63: b'>',
 64: b'?',
 65: b'@',
 66: b'A',
 67: b'B',
 68: b'C',
 69: b'D',
 70: b'E',
 71: b'F',
 72: b'G',
 73: b'H',
 74: b'I',
 75: b'J',
 76: b'K',
 77: b'L',
 78: b'M',
 79: b'N',
 80: b'O',
 81: b'P',
 82: b

In [140]:
type(token_generator)

generator

In [133]:
tokenizer.decode(tokenizer.encode(" the bananas are green"))

' the bananas are green'

In [65]:
import os
os.listdir("../data/")

['owt_train.txt',
 'TinyStoriesV2-GPT4-train.txt',
 'TinyStoriesV2-GPT4-valid.txt']

In [69]:
special_token = "<|endoftext|>"
# Keep only the bytes that come before the first document separator.
with open("../data/TinyStoriesV2-GPT4-valid.txt", "rb") as f:
    doc = f.read().partition(special_token.encode())[0]

In [70]:
doc

b'u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n'

In [None]:
jkj

In [10]:
import json
json.loads(json.dumps({1:vocab[1].decode()}))

{'1': '\x00'}

In [2]:
# Base vocabulary: token id i stands for the single byte i.
vocab = {byte_value: bytes((byte_value,)) for byte_value in range(256)}
# for (p0, p1), idx in merges.items():
#     vocab[idx] = vocab[p0] + vocab[p1]

def decode(ids):
    """Return the text for ``ids``; bytes that are not valid UTF-8 become U+FFFD."""
    raw = bytearray()
    for token_id in ids:
        raw += vocab[token_id]
    return raw.decode("utf-8", errors="replace")

print(decode([128]))

�


In [15]:
ord(bytes([28]))

28

In [18]:
bytes([68])

b'D'

In [20]:
ord('&')

38

In [21]:
ord(b'&')

38

In [19]:
ord('D')

68

### 2.5
- `train_bpe_tinystories`
    - Current memory usage: 5.24 MB
    - Peak memory usage: 116.74 MB

In [4]:
b'a' in (b'a' + b'\x80')

True

In [10]:
b'abc'[:2]

b'ab'

In [9]:
list(b'bc')

[98, 99]

In [6]:
b'ab'.replace(b'a', b'e')

b'eb'

In [12]:
from typing import Iterable
def _update_byte_tuple(byte_tuple: Iterable[bytes], merge_loc: int):
    """
    Merge the byte tuple at the merge location.
    """
    assert len(byte_tuple) > 1, "Cannot merge a byte tuple with length less than 2."
    prefix = byte_tuple[:merge_loc]
    tomerge = byte_tuple[merge_loc:merge_loc+2]
    suffix = byte_tuple[merge_loc+2:]
    new_byte_tuple = prefix + (b"".join(tomerge),) + suffix
    return new_byte_tuple, prefix, suffix

In [19]:
byte_tuple = tuple(bytes([c]) for c in 'xyz'.encode())
# tuple(bytes([b]) for b in pretoken)
_update_byte_tuple(byte_tuple, 1)

((b'x', b'yz'), (b'x',), ())

In [17]:
byte_tuple

(b'x', b'y', b'z')

In [18]:
tuple(b'xyz')

(120, 121, 122)