In [52]:

from collections import Counter
import pickle
import os

def byte_level_tokenize(text):
    byte_tokens = list(text.encode("utf-8")) 
    return byte_tokens

def byte_level_detokenize(byte_tokens):
    text = bytes(byte_tokens).decode("utf-8")  
    return text

def get_byte_vocab(tokens):
    vocab = Counter()
    for pair in zip(tokens[:-1], tokens[1:]):
        vocab[pair] += 1
    return vocab

def get_most_frequent_pair(vocab):
    most_common = vocab.most_common()
    if len(most_common) > 0:
        return most_common[0][0]
    return None

def merge_pairs(tokens, pair, next_id):
    updated_tokens = []
    a, b = pair
    idx = 0
    while idx < len(tokens):
        if idx < len(tokens) - 1 and tokens[idx] == a and tokens[idx + 1] == b:
            updated_tokens.append(next_id)
            idx += 2 
        else:
            updated_tokens.append(tokens[idx])
            idx += 1

    return updated_tokens


In [62]:
class BPE():
    def __init__(self, vocab_size, dest_folder):
        self.vocab_size = vocab_size
        self.tokens = []
        self.merges = []
        self.token_to_char = {i: bytes([i]) for i in range(256)}
        self.corpus = ""
        self.dest_folder = dest_folder

    def __call__(self, corpus):
        self.corpus = corpus
        self.tokens, self.merges =  self.byte_pair()
        self.save()
        
    def byte_pair(self):
        tokens = byte_level_tokenize(self.corpus)
        print(f"Current tokens are {tokens}")
        print(f"Length of the current tokens are {len(tokens)}")
        tokens_copy = tokens.copy()
        next_id = 256
        merges = []
        for idx in range(self.vocab_size-255):
            vocab = get_byte_vocab(tokens)
            pair = get_most_frequent_pair(vocab)
            if pair is None:
                break
            self.token_to_char[next_id] = self.token_to_char[pair[0]]+self.token_to_char[pair[1]]
            merges.append((pair, next_id))
            print(f"Most frequent pair is {pair}")
            tokens = merge_pairs(tokens, pair, next_id)
            next_id += 1
            print(f"Next id is {next_id}")
        print(f"Final Tokens are {tokens}.")
        print(f"Length of the final tokens are {len(tokens)}")
        return tokens, merges

    def save(self):
        model_data = {
            'vocab_size': self.vocab_size,
            'tokens': self.tokens,
            'merges': self.merges,
            'token_to_char': self.token_to_char
        }
        filepath = os.path.join(self.dest_folder, "bpe.pk1")
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"BPE model successfully saved to {filepath}")

    def encode(self, corpus):
        tokens = byte_level_tokenize(corpus)
        for merge in self.merges:
            tokens = merge_pairs(tokens, merge[0], merge[1])
        return tokens

    def decode(self, tokens):
        return "".join(
            self.token_to_char[token].decode("utf-8") 
            if isinstance(self.token_to_char[token], bytes) 
            else self.token_to_char[token]  
            for token in tokens
        )

    def save(self):
        model_data = {
            'vocab_size': self.vocab_size,
            'tokens': self.tokens,
            'merges': self.merges,
            'token_to_char': self.token_to_char
        }
        filepath = os.path.join(self.dest_folder, "bpe.pk1")
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"BPE model successfully saved to {filepath}")

class BPETokenizer:
    @staticmethod
    def load(path):
        
        with open(path, 'rb') as f:
            data = pickle.load(f)
        tokenizer = BPE(data['vocab_size'], data['merges'])
        tokenizer.tokens = data['tokens']
        tokenizer.token_to_char = data['token_to_char']
        return tokenizer
    


In [63]:
corpus = '''To deal with this unknown word problem, modern tokenizers automatically in-
duce sets of tokens that include tokens smaller than words, called subwords. Sub-subwords
words can be arbitrary substrings, or they can be meaning-bearing units like the
morphemes -est or -er. (A morpheme is the smallest meaning-bearing unit of a lan-
guage; for example the word unwashable has the morphemes un-, wash, and -able.)
In modern tokenization schemes, most tokens are words, but some tokens are fre-
quently occurring morphemes or other subwords like -er. Every unseen word like
lower can thus be represented by some sequence of known subword units, such as
low and er, or even as a sequence of individual letters if necessary'''
#corpus = '''To deal with this'''

In [64]:
bpe = BPE(270, r"D:\ML_And_DeepLearning\ML_And_DeepLearning\Implementing Byte Pair Encoding Algorithm")
bpe(corpus)

Current tokens are [84, 111, 32, 100, 101, 97, 108, 32, 119, 105, 116, 104, 32, 116, 104, 105, 115, 32, 117, 110, 107, 110, 111, 119, 110, 32, 119, 111, 114, 100, 32, 112, 114, 111, 98, 108, 101, 109, 44, 32, 109, 111, 100, 101, 114, 110, 32, 116, 111, 107, 101, 110, 105, 122, 101, 114, 115, 32, 97, 117, 116, 111, 109, 97, 116, 105, 99, 97, 108, 108, 121, 32, 105, 110, 45, 10, 100, 117, 99, 101, 32, 115, 101, 116, 115, 32, 111, 102, 32, 116, 111, 107, 101, 110, 115, 32, 116, 104, 97, 116, 32, 105, 110, 99, 108, 117, 100, 101, 32, 116, 111, 107, 101, 110, 115, 32, 115, 109, 97, 108, 108, 101, 114, 32, 116, 104, 97, 110, 32, 119, 111, 114, 100, 115, 44, 32, 99, 97, 108, 108, 101, 100, 32, 115, 117, 98, 119, 111, 114, 100, 115, 46, 32, 83, 117, 98, 45, 115, 117, 98, 119, 111, 114, 100, 115, 10, 119, 111, 114, 100, 115, 32, 99, 97, 110, 32, 98, 101, 32, 97, 114, 98, 105, 116, 114, 97, 114, 121, 32, 115, 117, 98, 115, 116, 114, 105, 110, 103, 115, 44, 32, 111, 114, 32, 116, 104, 101, 121, 3

In [49]:
model = BPETokenizer.load(r"D:\ML_And_DeepLearning\ML_And_DeepLearning\Implementing Byte Pair Encoding Algorithm\bpe.pk1")

In [50]:
tokens = model.encode("Modern words such as tokenization are not necessary, What happens if the words aren't in corpus")

In [51]:
tokens

[77,
 111,
 100,
 101,
 114,
 110,
 32,
 119,
 111,
 114,
 100,
 115,
 32,
 115,
 117,
 99,
 104,
 32,
 97,
 115,
 32,
 116,
 111,
 107,
 101,
 110,
 105,
 122,
 97,
 116,
 105,
 111,
 110,
 32,
 97,
 114,
 101,
 32,
 110,
 111,
 116,
 32,
 110,
 101,
 99,
 101,
 115,
 115,
 97,
 114,
 121,
 44,
 32,
 87,
 104,
 97,
 116,
 32,
 104,
 97,
 112,
 112,
 101,
 110,
 115,
 32,
 105,
 102,
 32,
 116,
 104,
 101,
 32,
 119,
 111,
 114,
 100,
 115,
 32,
 97,
 114,
 101,
 110,
 39,
 116,
 32,
 105,
 110,
 32,
 99,
 111,
 114,
 112,
 117,
 115]