In [2]:
#Imports
import regex as re


In [16]:
sample_text = """Petitioner John Angus Smith and his companion went from Tennessee to Florida to buy cocaine; they hoped to resell it at a profit. While in Florida, they met petitioner's acquaintance, Deborah Hoag. Hoag agreed to, and in fact did, purchase cocaine for petitioner.\
    She then accompanied petitioner and his friend to her motel room, where they were joined by a drug dealer. While Hoag listened, petitioner and the dealer discussed petitioner's MAC–10 firearm, which had been modified to operate as an automatic. The MAC–10 apparently is a \
    favorite among criminals. It is small and compact, lightweight, and can be equipped with a silencer. Most important of all, it can be devastating: A fully automatic MAC–10 can fire more than 1,000 rounds per minute. The dealer expressed his interest in becoming the owner \
    of a MAC–10, and petitioner promised that he would discuss selling the gun if his arrangement with another potential buyer fell through. Unfortunately for petitioner, Hoag had contacts not only with narcotics traffickers but also with law enforcement officials. \
    In fact, she was a confidential informant. Consistent with her post, she informed the Broward County Sheriff's Office of petitioner's activities. The Sheriff's Office responded quickly, sending an undercover officer to Hoag's motel room. Several others were assigned to \
    keep the motel under surveillance. Upon arriving at Hoag's motel room, the undercover officer presented himself to petitioner as a pawnshop dealer. Petitioner, in turn, presented the officer with a proposition: He had an automatic MAC–10 and silencer with which he might \
    be willing to part. Petitioner then pulled the MAC–10 out of a black canvas bag and showed it to the officer. The officer examined the gun and asked petitioner what he wanted for it. Rather than asking for money, however, petitioner asked for drugs. He was willing to trade his MAC–10, he said, \
    for two ounces of cocaine. The officer told petitioner that he was just a pawnshop dealer and did not distribute narcotics. Nonetheless, he indicated that he wanted the MAC–10 and would try to get the cocaine. The officer then left, promising to return within an hour."""

In [1]:
#Creating a byte tokenizer
def byte_tokenizer(text):
    ''' 
    Maps a string to a list of byte tokens
    '''
    tokens = text.encode('utf-8')
    tokens = list(map(int, tokens))
    return tokens

In [15]:
#Function to introduce the byte-pair encoding
def get_pairs(bts):
    """ 
    Returns a dictionary of all byte pairs in the input and their frequency
    """
    counts = {}
    for pair in zip(bts, bts[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

In [19]:
#Function to merge the most frequent pair
def merge_pairs(bts, pair, idx):
    """ Merges the most frequent pair in the input
    Args:
        bts: list of byte tokens
        pair: pair to merge
        idx: new byte token to replace the pair
    Returns:
        new_bts: list of byte tokens with the most frequent pair merged
    """
    new_bts = []
    i = 0
    while i < len(bts):
        if i < len(bts) - 1 and bts[i] == pair[0] and bts[i + 1] == pair[1]:
            new_bts.append(idx)
            i += 2
        else:
            new_bts.append(bts[i])
            i += 1
    return new_bts

In [20]:
sample_tokens = byte_tokenizer(sample_text)

In [28]:
#First try of an implementation of the BPE algorithm for 20 merges
#Hyperparameters
vocab_size = 276
num_merges = vocab_size - 256
bts = list(sample_tokens)

#Main loop
merges = {}
for i in range(num_merges):
    pairs = get_pairs(bts)
    if not pairs:
        break
    best_pair = max(pairs, key=pairs.get)
    btx = 256 + i
    print('##-------------------------------------------------##')
    print(f'  Merging pair {best_pair} into new byte token {btx}')
    bts = merge_pairs(bts, best_pair, btx)
    merges[best_pair] = btx


##-------------------------------------------------##
  Merging pair (101, 114) into new byte token 256
##-------------------------------------------------##
  Merging pair (101, 32) into new byte token 257
##-------------------------------------------------##
  Merging pair (32, 97) into new byte token 258
##-------------------------------------------------##
  Merging pair (116, 105) into new byte token 259
##-------------------------------------------------##
  Merging pair (100, 32) into new byte token 260
##-------------------------------------------------##
  Merging pair (116, 104) into new byte token 261
##-------------------------------------------------##
  Merging pair (32, 32) into new byte token 262
##-------------------------------------------------##
  Merging pair (111, 110) into new byte token 263
##-------------------------------------------------##
  Merging pair (105, 110) into new byte token 264
##-------------------------------------------------##
  Merging pair (

In [9]:
text = "batatinha, quando nasce, esparrama pelo chão. Menininha, quando morre, põe a mão no coração."
tokens = byte_tokenizer(text)
dic = bpe_lite(tokens)
dic

{(98, 97): 1,
 (97, 116): 2,
 (116, 97): 1,
 (116, 105): 1,
 (105, 110): 3,
 (110, 104): 2,
 (104, 97): 2,
 (97, 44): 2,
 (44, 32): 4,
 (32, 113): 2,
 (113, 117): 2,
 (117, 97): 2,
 (97, 110): 2,
 (110, 100): 2,
 (100, 111): 2,
 (111, 32): 5,
 (32, 110): 2,
 (110, 97): 1,
 (97, 115): 1,
 (115, 99): 1,
 (99, 101): 1,
 (101, 44): 2,
 (32, 101): 1,
 (101, 115): 1,
 (115, 112): 1,
 (112, 97): 1,
 (97, 114): 1,
 (114, 114): 2,
 (114, 97): 2,
 (97, 109): 1,
 (109, 97): 1,
 (97, 32): 2,
 (32, 112): 2,
 (112, 101): 1,
 (101, 108): 1,
 (108, 111): 1,
 (32, 99): 2,
 (99, 104): 1,
 (104, 195): 1,
 (195, 163): 3,
 (163, 111): 3,
 (111, 46): 2,
 (46, 32): 1,
 (32, 77): 1,
 (77, 101): 1,
 (101, 110): 1,
 (110, 105): 2,
 (32, 109): 2,
 (109, 111): 1,
 (111, 114): 2,
 (114, 101): 1,
 (112, 195): 1,
 (195, 181): 1,
 (181, 101): 1,
 (101, 32): 1,
 (32, 97): 1,
 (109, 195): 1,
 (110, 111): 1,
 (99, 111): 1,
 (97, 195): 1,
 (195, 167): 1,
 (167, 195): 1}

In [14]:
chr(97), chr(116)

('a', 't')

In [None]:
#GPT-4 style BPE tokenizer
class BasicTokenizer:

    def __init__(self):
        self.vocab = None
        self.merge_table = None
        
            
    def train(self, text, vocab_size, verbose=False):
        ''' Train the tokenizer on the given text. '''
        pass

    def encode(self, text):
        pass

    def decode(self, ids):
        pass