In [1]:
import os
import json
import regex as re
import requests

import torch


def bytes_to_unicode():
    """
    Build the byte -> unicode-character table used for byte-level BPE.

    Returns a dict with exactly one entry for every integer 0..255. Bytes whose
    chr() already renders as a clean, visible character keep that character,
    e.g. d[33] -> "!". All remaining bytes (control characters, space, etc.)
    are assigned, in increasing byte order, the characters chr(256), chr(257), ...
    so that every byte has a single printable stand-in. For example
    d[0] -> 'Ā' (chr(256)) and the space byte maps as d[32] -> 'Ġ' (chr(288)).

    The mapping is one-to-one, so it can be inverted to recover the raw bytes.
    """
    # the 188 byte values that are kept as-is: "!".."~", "¡".."¬", "®".."ÿ"
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(printable)
    code_points = list(printable)  # kept bytes map to their own code point

    # the other 68 byte values are handed consecutive code points from 256 upward
    shifted = 0
    for byte in range(256):
        if byte in printable:
            continue
        byte_values.append(byte)
        code_points.append(256 + shifted)
        shifted += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}

def get_pairs(word):
    """
    Return all bigrams of consecutive elements in the sequence word, as a set of tuples.

    word must support slicing (e.g. a str, tuple or list). An empty or
    single-element word has no consecutive pairs, so an empty set is returned
    rather than raising on the empty case.
    """
    # zip stops at the shorter argument, so this pairs each element with its successor
    return set(zip(word, word[1:]))

In [2]:
encoder_local_file = "trained_tokenizer/tokenizer.json"

# Load the serialized tokenizer definition. The encoding is given explicitly
# because open() otherwise falls back to a locale-dependent default, which can
# mis-decode non-ASCII token characters on platforms whose default is not UTF-8.
with open(encoder_local_file, 'r', encoding="utf-8") as f:
    encoder = json.load(f)

# token -> bpe index mapping. This is a reference to the dict stored inside
# `encoder`, not a copy, so in-place additions are visible through both names.
encode = encoder["model"]["vocab"]
bpe_data = encoder["model"]["merges"]
# each merge rule is split on whitespace into a tuple of its parts
# NOTE(review): assumes every entry is a "left right" string — confirm against the file
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data]

In [3]:
# Lookup table from byte value (0..255) to its printable unicode stand-in;
# read by normalize_value below.
byte_encoder = bytes_to_unicode()

def normalize_value(value):
    """
    Rewrite the string value in the byte-level alphabet.

    value is encoded to UTF-8 and each resulting byte is replaced by its entry
    in the module-level byte_encoder table; the pieces are joined into one string.
    """
    utf8_bytes = value.encode("utf-8")
    return "".join(byte_encoder[byte] for byte in utf8_bytes)

In [5]:
import math

# Register the zero-padded two-digit strings "00" .. "10" as tokens.
# f"{i:02d}" produces the same strings as the earlier log10-based padding,
# including the "00" case that previously needed a special branch.
# The next free id is computed once instead of rescanning the vocab per token.
next_id = max(encode.values(), default=-1) + 1
known_merges = set(bpe_merges)
for i in range(11):
    value = normalize_value(f"{i:02d}")
    if value not in encode:
        # Only new tokens receive an id: overwriting an existing entry would
        # leave its previous id mapped to no token at all.
        encode[value] = next_id
        next_id += 1
    # merge rule: everything but the last character + the last character
    merge = (value[:-1], value[-1])
    if merge not in known_merges:
        # skipping rules that are already listed keeps the merge list free of
        # duplicates and makes re-running this cell a no-op
        bpe_merges.append(merge)
        known_merges.add(merge)

In [6]:
# Register the two-digit strings "11" .. "99" as tokens.
# f"{i:02d}" produces the same strings as the earlier log10-based padding.
# The next free id is computed once instead of rescanning the vocab per token.
next_id = max(encode.values(), default=-1) + 1
known_merges = set(bpe_merges)
for i in range(11, 100):
    value = normalize_value(f"{i:02d}")
    if value not in encode:
        # Only new tokens receive an id: overwriting an existing entry would
        # leave its previous id mapped to no token at all.
        encode[value] = next_id
        next_id += 1
    # merge rule: first digit + second digit
    merge = (value[:-1], value[-1])
    if merge not in known_merges:
        # skipping rules that are already listed keeps the merge list free of
        # duplicates and makes re-running this cell a no-op
        bpe_merges.append(merge)
        known_merges.add(merge)

In [7]:
# Register the zero-padded three-digit strings "000" .. "100" as tokens.
# f"{i:03d}" produces the same strings as the earlier log10-based padding,
# including the "000" case that previously needed a special branch.
# The next free id is computed once instead of rescanning the vocab per token.
next_id = max(encode.values(), default=-1) + 1
known_merges = set(bpe_merges)
for i in range(101):
    value = normalize_value(f"{i:03d}")
    if value not in encode:
        # Only new tokens receive an id: overwriting an existing entry would
        # leave its previous id mapped to no token at all.
        encode[value] = next_id
        next_id += 1
    # merge rule: leading two-digit string + last digit, e.g. ("00", "0")
    # NOTE(review): presumably the two-digit left part must already be a token
    # (the two-digit cells add them) — confirm against the tokenizer loader
    merge = (value[:-1], value[-1])
    if merge not in known_merges:
        # skipping rules that are already listed keeps the merge list free of
        # duplicates and makes re-running this cell a no-op
        bpe_merges.append(merge)
        known_merges.add(merge)

In [8]:
# Register the three-digit strings "101" .. "999" as tokens.
# f"{i:03d}" produces the same strings as the earlier log10-based padding.
# The next free id is computed once instead of rescanning the vocab per token.
next_id = max(encode.values(), default=-1) + 1
known_merges = set(bpe_merges)
for i in range(101, 1000):
    value = normalize_value(f"{i:03d}")
    if value not in encode:
        # Only new tokens receive an id: overwriting an existing entry would
        # leave its previous id mapped to no token at all.
        encode[value] = next_id
        next_id += 1
    # merge rule: leading two-digit string + last digit, e.g. ("10", "1")
    # NOTE(review): presumably the two-digit left part must already be a token
    # (the two-digit cells add them) — confirm against the tokenizer loader
    merge = (value[:-1], value[-1])
    if merge not in known_merges:
        # skipping rules that are already listed keeps the merge list free of
        # duplicates and makes re-running this cell a no-op
        bpe_merges.append(merge)
        known_merges.add(merge)

In [13]:
# Serialize every merge tuple back into a single space-separated string
# and store the list on the tokenizer definition.
encoder["model"]["merges"] = [" ".join(pair) for pair in bpe_merges]

In [15]:
# Store the extended vocabulary on the tokenizer definition.
# NOTE(review): `encode` was bound to encoder["model"]["vocab"] when the file was
# loaded, so unless some cell rebinds it this assigns the same dict object back.
encoder["model"]["vocab"] = encode

In [17]:
# Write the modified tokenizer definition to its own file, separate from the
# one that was loaded. json.dump keeps its default ASCII escaping, so non-ASCII
# token characters are emitted as \uXXXX escapes.
with open("trained_tokenizer/post_processed_tokenizer.json", 'w', encoding="utf-8") as f:
    json.dump(encoder, f)