# GPT-2 
---

In [8]:
import sys
sys.path.append('..')

In [1]:
import tensorflow as tf
import random

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
tf.enable_eager_execution()

In [3]:
def nprint(*args):
    print(*args, end='\n\n-----------------\n')

In [4]:
def cprint(things):
    print(*things, sep='\n')

In [14]:
import pprint
pp = pprint.PrettyPrinter(indent=2)

In [11]:
from src.encoder import get_pairs
from src.encoder import get_encoder
from src.encoder import bytes_to_unicode

In [5]:
import os
import json
import regex as re
from functools import lru_cache

---
### The Encoder class

In [24]:
class Encoder:
    """
    Attributes: 
    - encoder/decoder (dicts)
    - errors: option for bytearray.decode()
    - byte_encoder/decoder (dicts)
    - bpe_ranks
    """
                                            # errors='replace'
                                            # an option for the bytearray() conversion function used below.
                                            # cf Python doc: Replace with a suitable replacement marker; 
                                            # Python will use the official U+FFFD REPLACEMENT CHARACTER 
                                            # for the built-in codecs on decoding, and ‘?’ on encoding.  
    def __init__(self, encoder, bpe_merges, errors='replace'):
        self.encoder = encoder
        self.decoder = {v:k for k,v in self.encoder.items()}            # simply reversing from {k:v} to {v:k}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()                          # our look-up table function
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} # reversing again, for bytes
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))  # { x0: 0, x1: 1, ...}
        self.cache = {}

        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
                            # Regexes:
                            # contractions
                            # words: one or more of any letter (\p{L}), preceded by optional space
                            # numbers: (\p{N}), preceded by optional space
                            # no code: NOT a space followed by one letter & one or more of any number (code)
                            #          preceded by optional space, all this one or more times
                            # no single space: one or more spaces not followed non-whitespace, negative lookahead: (?!\S) 
                            # one or more spaces ok
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", flags=re.IGNORECASE)
                                                                                                                # adding ignorecase, as mentioned above

    def bpe(self, token):

        # don't do the work twice, save words on the go
        if token in self.cache:
            return self.cache[token]

        word = tuple(token)     # turn token to char tuple
        pairs = get_pairs(word) # get all char pairs: ('w','o','r','d') > { ('w', 'o'), ('o', 'r'), ('r', 'd') }

        # if word was only one symbol?
        if not pairs:
            return token

        while True:
                                                                            
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
                                                                            # float('inf'): It acts as an unbounded upper value for 
            if bigram not in self.bpe_ranks:                                # comparison. This is useful for finding lowest 
                break                                                       # values for something. 
                                                                            # https://stackoverflow.com/a/34264749
            first, second = bigram                                          
            new_word = []                                                   

            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)   # returns index of searched element (first), starting at i
                    new_word.extend(word[i:j]) # append items from iterable to the end of the array
                    i = j
                except:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):

        bpe_tokens = []                                 
        
        # for each token found by our regex (words, numbers, more than one space, punctuation)
        for token in re.findall(self.pat, text):     

            # encode to utf-8 (char > int), then encode to byte, then join in a string
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))

            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))

        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text

A bit of dissection... We can reuse the `117M` encoder we uploaded above.

In [68]:
encoder = json117

In [69]:
enc_self_encoder = encoder
enc_self_decoder = {v:k for k,v in enc_self_encoder.items()}            # simply reversing from {k:v} to {v:k}
enc_self_byte_encoder = bytes_to_unicode()                          # our look-up table function
enc_self_byte_decoder = {v:k for k, v in enc_self_byte_encoder.items()} # reversing again, for bytes
enc_self_bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))  # { x0: 0, x1: 1, ...}
enc_self_cache = {}

### Regexes

In [70]:
# contractions
# words: one or more of any letter (\p{L}), preceded by optional space
# numbers: (\p{N}), preceded by optional space
# punctuation: not a space, a letter or a number, one or more times, preceded by optional space
# no single space: one or more spaces not followed non-whitespace, negative lookahead: (?!\S) 
# one or more spaces ok
enc_self_pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", flags=re.IGNORECASE)

As a test:

In [71]:
reg_test = "Won't? I'm I'll I'd numbers 9012 also multiple spaces ' ?        no code a9877   b002 x0 b9d8"
print(re.findall(enc_self_pat, reg_test))

['Won', "'t", '?', ' I', "'m", ' I', "'ll", ' I', "'d", ' numbers', ' 9012', ' also', ' multiple', ' spaces', " '", ' ?', '       ', ' no', ' code', ' a', '9877', '  ', ' b', '002', ' x', '0', ' b', '9', 'd', '8']


We can also separate the regexes, to see what they do:

In [72]:
enc_self_reg1 = re.compile(r""" ?[^\s\p{L}\p{N}]+""", flags=re.IGNORECASE)
enc_self_reg2 = re.compile(r"""\s+(?!\S)""", flags=re.IGNORECASE)
enc_self_reg3 = re.compile(r"""\s+""", flags=re.IGNORECASE) # all spaces
print(re.findall(enc_self_reg1, reg_test))
print(re.findall(enc_self_reg2, reg_test))
print(re.findall(enc_self_reg3, reg_test))

["'", '?', "'", "'", "'", " '", ' ?']
['       ', '  ']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '        ', ' ', ' ', '   ', ' ', ' ']


---
### The bpe function

In [73]:
def enc_bpe(token):

    # don't do the work twice, save words on the go
    if token in enc_self_cache:
        return enc_self_cache[token]

    word = tuple(token)     # turn token to char tuple
    pairs = get_pairs(word) # get all char pairs: ('w','o','r','d') > { ('w', 'o'), ('o', 'r'), ('r', 'd') }

    # if word was only one symbol?
    if not pairs:
        return token

    while True:

        bigram = min(pairs, key = lambda pair: enc_self_bpe_ranks.get(pair, float('inf')))
                                                                        # float('inf'): It acts as an unbounded upper value for 
        if bigram not in enc_self_bpe_ranks:                            # comparison. This is useful for finding lowest 
            break                                                       # values for something. 
                                                                        # https://stackoverflow.com/a/34264749
        first, second = bigram                                          
        new_word = []                                                   

        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)   # returns index of searched element (first), starting at i
                new_word.extend(word[i:j]) # append items from iterable to the end of the array
                i = j
            except:
                new_word.extend(word[i:])
                break

            if word[i] == first and i < len(word)-1 and word[i+1] == second:
                new_word.append(first+second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        new_word = tuple(new_word)
        word = new_word
        if len(word) == 1:
            break
        else:
            pairs = get_pairs(word)
    
    # returns a string
    word = ' '.join(word)
    enc_self_cache[token] = word
    return word

Dissect!

In [74]:
txt = "antidystopianarianism"
token = re.findall(enc_self_pat, txt)[0]  
token = ''.join(enc_self_byte_encoder[b] for b in token.encode('utf-8'))
print(token)

antidystopianarianism


In [75]:
tktpl = tuple(token)
print(tktpl)
tkpairs = get_pairs(tktpl)
print(tkpairs)

('a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p', 'i', 'a', 'n', 'a', 'r', 'i', 'a', 'n', 'i', 's', 'm')
{('t', 'o'), ('p', 'i'), ('s', 'm'), ('d', 'y'), ('a', 'n'), ('n', 'a'), ('r', 'i'), ('i', 's'), ('n', 'i'), ('i', 'a'), ('i', 'd'), ('n', 't'), ('o', 'p'), ('s', 't'), ('a', 'r'), ('t', 'i'), ('y', 's')}


As a reminder, the `enc_self_bpe_ranks` is a `dict` containing pairs of elements.

In [76]:
print('First ten elements:', list(enc_self_bpe_ranks.items())[:10], sep='\n')
print()
index = random.randint(0,len(enc_self_bpe_ranks)-11)
print('Random ten elements:', list(enc_self_bpe_ranks.items())[index:index+10], sep='\n')

First ten elements:
[(('Ġ', 't'), 0), (('Ġ', 'a'), 1), (('h', 'e'), 2), (('i', 'n'), 3), (('r', 'e'), 4), (('o', 'n'), 5), (('Ġt', 'he'), 6), (('e', 'r'), 7), (('Ġ', 's'), 8), (('a', 't'), 9)]

Random ten elements:
[(('Ġtreasure', 'r'), 43172), (('I', 'AS'), 43173), (('Ġcolon', 'ists'), 43174), (('Ġin', 'und'), 43175), (('ĠWW', 'F'), 43176), (('ĠCon', 'verted'), 43177), (('6', '000'), 43178), (('out', 'side'), 43179), (('ĠApp', 'earance'), 43180), (('ĠRel', 'ic'), 43181)]


Method:
- take the `min()` pair according to its ranking in `enc_self_bpe_ranks`;
- for that, use a `lambda` function, that gets the appropriate ranking number, and if the pair is not found, return `float('inf')`, namely, don't select it.

Recap:   
`dict.get()` [documentation](https://docs.python.org/3/library/stdtypes.html?highlight=dict%20get#dict.get): 
> Return the value for key if key is in the dictionary, else default. If default is not given, it defaults to None, so that this method never raises a KeyError.

In [77]:
for tkpair in tkpairs:
    print('Pair:', tkpair, '| Rank:', enc_self_bpe_ranks.get(tkpair, float('inf')))

    tkbigram = min(tkpairs, key = lambda tkpair: enc_self_bpe_ranks.get(tkpair, float('inf')))
print()
print('Returned pair:', tkbigram, '| Rank:',  enc_self_bpe_ranks.get(tkbigram))

Pair: ('t', 'o') | Rank: 1206
Pair: ('p', 'i') | Rank: 14159
Pair: ('s', 'm') | Rank: 5540
Pair: ('d', 'y') | Rank: 9636
Pair: ('a', 'n') | Rank: 16
Pair: ('n', 'a') | Rank: 2360
Pair: ('r', 'i') | Rank: 124
Pair: ('i', 's') | Rank: 15
Pair: ('n', 'i') | Rank: 8205
Pair: ('i', 'a') | Rank: 288
Pair: ('i', 'd') | Rank: 56
Pair: ('n', 't') | Rank: 173
Pair: ('o', 'p') | Rank: 148
Pair: ('s', 't') | Rank: 45
Pair: ('a', 'r') | Rank: 27
Pair: ('t', 'i') | Rank: 20003
Pair: ('y', 's') | Rank: 637

Returned pair: ('i', 's') | Rank: 15


An error mechanism is used below to trigger the `except:` block. 

In [78]:
i = 0
while i < len(tktpl):
    i = tktpl.index('n', i)
    print(tktpl.index('n', i), '|', tktpl[:i]) # index will throw an error if nothing is found
    i += 1

1 | ('a',)
12 | ('a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p', 'i', 'a')
17 | ('a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p', 'i', 'a', 'n', 'a', 'r', 'i', 'a')


ValueError: tuple.index(x): x not in tuple

In [79]:
tktpl = tuple(token)
print('token tuple:', tktpl)
frst, scnd = tkbigram     
print('first, second:', frst, scnd)
nw_wd = []
print('\n-------------\n')

i = 0
while i < len(tktpl):
    try:
        print('try:')
        print('word before:', nw_wd)
        j = tktpl.index(frst, i)   # returns index of searched element (frst), starting at i
        print('adding tktpl[i:j]:', tktpl[i:j])
        nw_wd.extend(tktpl[i:j]) # append items from iterable to the end of the array
        print('word after:', nw_wd, '| indices:', i, j)
        i = j
        print('\n-------------\n')
    except:
        print('except:')
        print('word before:', nw_wd)
        print('adding the end, tktpl[i:]:', tktpl[i:])
        nw_wd.extend(tktpl[i:]) # add the end of the word
        print('word after', nw_wd, '| indices:', i, j)
        print('\n-------------\n')        
        print('breaking: word has now the first pair:', tkbigram, 'inside it')
        break # the end: the word has now the first pair in it, and the rest single tokens!

    # if
    # token i is first
    # i smaller than last index
    # token after i is scnd
    if tktpl[i] == frst and i < len(tktpl)-1 and tktpl[i+1] == scnd:
        print('if:')
        print('word before:', nw_wd)
        nw_wd.append(frst+scnd)
        print('word after:', nw_wd, '| indices:', i, j)
        print('\n-------------\nnow incrementing i by 2 to', i+2, '\n')
        i += 2
    else:
        print('else:')
        print('word before:', nw_wd, '| adding tktpl[i]:', tktpl[i])
        nw_wd.append(tktpl[i])
        print('word after:', nw_wd,'| indices:', i, j)
        print('\n-------------\nnow incrementing i by 1 to', i+1, '\n')
        i += 1

token tuple: ('a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p', 'i', 'a', 'n', 'a', 'r', 'i', 'a', 'n', 'i', 's', 'm')
first, second: i s

-------------

try:
word before: []
adding tktpl[i:j]: ('a', 'n', 't')
word after: ['a', 'n', 't'] | indices: 0 3

-------------

else:
word before: ['a', 'n', 't'] | adding tktpl[i]: i
word after: ['a', 'n', 't', 'i'] | indices: 3 3

-------------
now incrementing i by 1 to 4 

try:
word before: ['a', 'n', 't', 'i']
adding tktpl[i:j]: ('d', 'y', 's', 't', 'o', 'p')
word after: ['a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p'] | indices: 4 10

-------------

else:
word before: ['a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p'] | adding tktpl[i]: i
word after: ['a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p', 'i'] | indices: 10 10

-------------
now incrementing i by 1 to 11 

try:
word before: ['a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p', 'i']
adding tktpl[i:j]: ('a', 'n', 'a', 'r')
word after: ['a', 'n', 't', 'i', 'd', 'y', 's', 't', 'o', 'p

In [80]:
token = 'dreadful'
tktpl = tuple(token)
tkpairs = get_pairs(tktpl)
print('Token tuple:', tktpl)
print('Pairs:', tkpairs)
print('-'*30)

while True:
                                        # get pair rank, return 'inf' if not found
    bgrm = min(tkpairs, key = lambda pair: enc_self_bpe_ranks.get(pair, float('inf')))
    print('Chosen bigram:', bgrm, '| Rank:', enc_self_bpe_ranks.get(bgrm))
    
    # if not in ranks stop
    if bgrm not in enc_self_bpe_ranks: 
        print('---------------')
        print('Bigram ain\'t in ranks, bugger off!')
        break 
        
    # split the pair in two
    frst, scnd = bgrm                                          
    nw_wd = []                                                   
    i = 0
    
    while i < len(tktpl):
        
        try:
            j = tktpl.index(frst, i) # lowest index of frst, start at i
            nw_wd.extend(tktpl[i:j]) 
            i = j
        except:
            nw_wd.extend(tktpl[i:])
            break
            
        if tktpl[i] == frst and i < len(tktpl)-1 and tktpl[i+1] == scnd:
            nw_wd.append(frst+scnd)
            i += 2
        else:
            nw_wd.append(tktpl[i])
            i += 1
            
    nw_wd = tuple(nw_wd)
    tktpl = nw_wd
    print('Token tuple updated:', tktpl)
    
    if len(tktpl) == 1:
        print()
        print('Length now', len(tktpl), 'hence breaking')
        break
    else:
        print('Repairing, pairs:', get_pairs(tktpl), end='\n\n')
        tkpairs = get_pairs(tktpl)

Token tuple: ('d', 'r', 'e', 'a', 'd', 'f', 'u', 'l')
Pairs: {('f', 'u'), ('r', 'e'), ('u', 'l'), ('a', 'd'), ('e', 'a'), ('d', 'r'), ('d', 'f')}
------------------------------
Chosen bigram: ('r', 'e') | Rank: 4
Token tuple updated: ('d', 're', 'a', 'd', 'f', 'u', 'l')
Repairing, pairs: {('d', 're'), ('u', 'l'), ('a', 'd'), ('re', 'a'), ('f', 'u'), ('d', 'f')}

Chosen bigram: ('a', 'd') | Rank: 68
Token tuple updated: ('d', 're', 'ad', 'f', 'u', 'l')
Repairing, pairs: {('d', 're'), ('u', 'l'), ('ad', 'f'), ('re', 'ad'), ('f', 'u')}

Chosen bigram: ('u', 'l') | Rank: 121
Token tuple updated: ('d', 're', 'ad', 'f', 'ul')
Repairing, pairs: {('d', 're'), ('f', 'ul'), ('ad', 'f'), ('re', 'ad')}

Chosen bigram: ('f', 'ul') | Rank: 657
Token tuple updated: ('d', 're', 'ad', 'ful')
Repairing, pairs: {('d', 're'), ('ad', 'ful'), ('re', 'ad')}

Chosen bigram: ('re', 'ad') | Rank: 705
Token tuple updated: ('d', 'read', 'ful')
Repairing, pairs: {('read', 'ful'), ('d', 'read')}

Chosen bigram: ('r

The result is turned again into a string, and will be unpacked by the `encode` function below.

In [81]:
a = ' '.join(tktpl)
print(a)
for b in a:
    print(b, enc_self_encoder[b])

d read ful
d 67


KeyError: ' '

---
### The *encode* function

In [82]:
def enc_encode(text):
    bpe_tokens = []                                 
    # for each token found by our regex (words, numbers, more than one space, punctuation)
    for token in re.findall(enc_self_pat, text):     
        # encode to utf-8 (char > int), then encode to byte, then join in a string
        token = ''.join(enc_self_byte_encoder[b] for b in token.encode('utf-8'))
        bpe_tokens.extend(enc_self_encoder[bpe_token] for bpe_token in enc_bpe(token).split(' '))
    return bpe_tokens

As a reminder, our byte encoder:

In [83]:
list(enc_self_byte_encoder.items())[:10]

[(33, '!'),
 (34, '"'),
 (35, '#'),
 (36, '$'),
 (37, '%'),
 (38, '&'),
 (39, "'"),
 (40, '('),
 (41, ')'),
 (42, '*')]

Steps:
- parse text for regexes, return as list;
- encode each word as a list of utf-8 codes;
- take these codes and transfer to bytes;
- apply the `bpe` function;
- encode the result.

In [87]:
text = 'antidisestablishmentarianism'

In [88]:
bpe_tkns = []
for token in re.findall(enc_self_pat, text):
    print('Original token:', token)

    print('In UTF-8:', [b for b in token.encode('utf-8')])
    print('Now in bytes:', [enc_self_byte_encoder[b] for b in token.encode('utf-8')])
    token = ''.join(enc_self_byte_encoder[b] for b in token.encode('utf-8'))

    print('Result of the enc_bpe fn:', enc_bpe(token))
    print('Result of encoder:', [enc_self_encoder[bpe_token] for bpe_token in enc_bpe(token).split(' ')])
    bpe_tkns.extend(enc_self_encoder[bpe_token] for bpe_token in enc_bpe(token).split(' '))
    print()
nprint()
print('End result', bpe_tkns)

Original token: antidisestablishmentarianism
In UTF-8: [97, 110, 116, 105, 100, 105, 115, 101, 115, 116, 97, 98, 108, 105, 115, 104, 109, 101, 110, 116, 97, 114, 105, 97, 110, 105, 115, 109]
Now in bytes: ['a', 'n', 't', 'i', 'd', 'i', 's', 'e', 's', 't', 'a', 'b', 'l', 'i', 's', 'h', 'm', 'e', 'n', 't', 'a', 'r', 'i', 'a', 'n', 'i', 's', 'm']
Result of the enc_bpe fn: ant idis establishment arian ism
Result of encoder: [415, 29207, 44390, 3699, 1042]



-----------------
End result [415, 29207, 44390, 3699, 1042]


---
### The *decode* function

In [222]:
def enc_decode(tokens):
    # first decode from number to char
    # then 
    text = ''.join([enc_self_decoder[token] for token in tokens])
    text = bytearray([enc_self_byte_decoder[c] for c in text]).decode('utf-8', errors='replace')
    return text

Method:
- decode numbers to bytes;
- decode bytes to utf-8;
- decode utf-8.

In [228]:
txt = "Hullo hullo"
tkns = enc_encode(txt)
print(txt)
print(tkns)
print(*zip(list(txt), tkns))

Hullo hullo
[39, 84, 75, 75, 78, 220, 71, 84, 75, 75, 78]
('H', 39) ('u', 84) ('l', 75) ('l', 75) ('o', 78) (' ', 220) ('h', 71) ('u', 84) ('l', 75) ('l', 75) ('o', 78)


As a reminder, our decoder and byte decoder:

In [229]:
print(list(enc_self_decoder.items())[:10])
print(list(enc_self_byte_decoder.items())[:10])

[(0, '!'), (1, '"'), (2, '#'), (3, '$'), (4, '%'), (5, '&'), (6, "'"), (7, '('), (8, ')'), (9, '*')]
[('!', 33), ('"', 34), ('#', 35), ('$', 36), ('%', 37), ('&', 38), ("'", 39), ('(', 40), (')', 41), ('*', 42)]


In [225]:
print([enc_self_decoder[tkn] for tkn in tkns])

['H', 'u', 'l', 'l', 'o', 'Ġ', 'h', 'u', 'l', 'l', 'o']


In [234]:
tkstr = ''.join([enc_self_decoder[tkn] for tkn in tkns])
print(bytearray([enc_self_byte_decoder[c] for c in tkstr]))
print(bytearray([enc_self_byte_decoder[c] for c in tkstr]).decode('utf-8', errors='replace'))

bytearray(b'Hullo hullo')
Hullo hullo


In [316]:
enc_decode(tkns)

'Hullo hullo'

---

### The final wrap-up: loading the full encoder

In [17]:
enc117 = get_encoder('117M')

In [343]:
nprint(list(enc117.encoder.items())[:50])
nprint(list(enc117.encoder.items())[-50:])
nprint(list(enc117.decoder.items())[:50])
nprint(list(enc117.decoder.items())[-50:])

[('!', 0), ('"', 1), ('#', 2), ('$', 3), ('%', 4), ('&', 5), ("'", 6), ('(', 7), (')', 8), ('*', 9), ('+', 10), (',', 11), ('-', 12), ('.', 13), ('/', 14), ('0', 15), ('1', 16), ('2', 17), ('3', 18), ('4', 19), ('5', 20), ('6', 21), ('7', 22), ('8', 23), ('9', 24), (':', 25), (';', 26), ('<', 27), ('=', 28), ('>', 29), ('?', 30), ('@', 31), ('A', 32), ('B', 33), ('C', 34), ('D', 35), ('E', 36), ('F', 37), ('G', 38), ('H', 39), ('I', 40), ('J', 41), ('K', 42), ('L', 43), ('M', 44), ('N', 45), ('O', 46), ('P', 47), ('Q', 48), ('R', 49)]

-----------------
[('Ġkernels', 50207), ('ĠFranÃ§ois', 50208), ('ĠDuff', 50209), ('ĠPon', 50210), ('ĠLeica', 50211), ('ĠGarmin', 50212), ('Ġorphans', 50213), ('ĠClaudia', 50214), ('Ġcalendars', 50215), ('ĠLeilan', 50216), ('ento', 50217), ('Rocket', 50218), ('Ġbrunch', 50219), ('ĠHawking', 50220), ('ainers', 50221), ('Ġsensibilities', 50222), ('ĠkW', 50223), ('ĠKand', 50224), ('Ġreclaimed', 50225), ('Ġinterestingly', 50226), ('×©', 50227), ('romy', 50228

In [344]:
nprint(list(enc117.byte_encoder.items())[:50])
nprint(list(enc117.byte_encoder.items())[-50:])
nprint(list(enc117.byte_decoder.items())[:50])
nprint(list(enc117.byte_decoder.items())[-50:])

[(33, '!'), (34, '"'), (35, '#'), (36, '$'), (37, '%'), (38, '&'), (39, "'"), (40, '('), (41, ')'), (42, '*'), (43, '+'), (44, ','), (45, '-'), (46, '.'), (47, '/'), (48, '0'), (49, '1'), (50, '2'), (51, '3'), (52, '4'), (53, '5'), (54, '6'), (55, '7'), (56, '8'), (57, '9'), (58, ':'), (59, ';'), (60, '<'), (61, '='), (62, '>'), (63, '?'), (64, '@'), (65, 'A'), (66, 'B'), (67, 'C'), (68, 'D'), (69, 'E'), (70, 'F'), (71, 'G'), (72, 'H'), (73, 'I'), (74, 'J'), (75, 'K'), (76, 'L'), (77, 'M'), (78, 'N'), (79, 'O'), (80, 'P'), (81, 'Q'), (82, 'R')]

-----------------
[(18, 'Ē'), (19, 'ē'), (20, 'Ĕ'), (21, 'ĕ'), (22, 'Ė'), (23, 'ė'), (24, 'Ę'), (25, 'ę'), (26, 'Ě'), (27, 'ě'), (28, 'Ĝ'), (29, 'ĝ'), (30, 'Ğ'), (31, 'ğ'), (32, 'Ġ'), (127, 'ġ'), (128, 'Ģ'), (129, 'ģ'), (130, 'Ĥ'), (131, 'ĥ'), (132, 'Ħ'), (133, 'ħ'), (134, 'Ĩ'), (135, 'ĩ'), (136, 'Ī'), (137, 'ī'), (138, 'Ĭ'), (139, 'ĭ'), (140, 'Į'), (141, 'į'), (142, 'İ'), (143, 'ı'), (144, 'Ĳ'), (145, 'ĳ'), (146, 'Ĵ'), (147, 'ĵ'), (148, 'Ķ'), 

In [345]:
nprint(list(enc117.bpe_ranks.items())[:50])
nprint(list(enc117.bpe_ranks.items())[-50:])

[(('Ġ', 't'), 0), (('Ġ', 'a'), 1), (('h', 'e'), 2), (('i', 'n'), 3), (('r', 'e'), 4), (('o', 'n'), 5), (('Ġt', 'he'), 6), (('e', 'r'), 7), (('Ġ', 's'), 8), (('a', 't'), 9), (('Ġ', 'w'), 10), (('Ġ', 'o'), 11), (('e', 'n'), 12), (('Ġ', 'c'), 13), (('i', 't'), 14), (('i', 's'), 15), (('a', 'n'), 16), (('o', 'r'), 17), (('e', 's'), 18), (('Ġ', 'b'), 19), (('e', 'd'), 20), (('Ġ', 'f'), 21), (('in', 'g'), 22), (('Ġ', 'p'), 23), (('o', 'u'), 24), (('Ġa', 'n'), 25), (('a', 'l'), 26), (('a', 'r'), 27), (('Ġt', 'o'), 28), (('Ġ', 'm'), 29), (('Ġo', 'f'), 30), (('Ġ', 'in'), 31), (('Ġ', 'd'), 32), (('Ġ', 'h'), 33), (('Ġan', 'd'), 34), (('i', 'c'), 35), (('a', 's'), 36), (('l', 'e'), 37), (('Ġt', 'h'), 38), (('i', 'on'), 39), (('o', 'm'), 40), (('l', 'l'), 41), (('en', 't'), 42), (('Ġ', 'n'), 43), (('Ġ', 'l'), 44), (('s', 't'), 45), (('Ġ', 're'), 46), (('v', 'e'), 47), (('Ġ', 'e'), 48), (('r', 'o'), 49)]

-----------------
[(('ĠCan', 'ary'), 49950), (('Ġk', 'ernels'), 49951), (('ĠFranÃ§', 'ois'), 49

In [338]:
text = "In probability theory and statistics, the Jensen–Shannon divergence is a method of measuring the similarity between two probability distributions. It is also known as information radius (IRad)[1] or total divergence to the average.[2] It is based on the Kullback–Leibler divergence, with some notable (and useful) differences, including that it is symmetric and it always has a finite value. The square root of the Jensen–Shannon divergence is a metric often referred to as Jensen-Shannon distance.[3][4][5]"
tok117 = enc117.encode(text)
print(tok117)
print()
txt117 = enc117.decode(tok117)
print(txt117)

[818, 12867, 4583, 290, 7869, 11, 262, 32623, 1906, 2484, 8825, 43366, 318, 257, 2446, 286, 15964, 262, 26789, 1022, 734, 12867, 24570, 13, 632, 318, 635, 1900, 355, 1321, 16874, 357, 4663, 324, 38381, 16, 60, 393, 2472, 43366, 284, 262, 2811, 3693, 17, 60, 632, 318, 1912, 319, 262, 509, 724, 1891, 1906, 3123, 571, 1754, 43366, 11, 351, 617, 12411, 357, 392, 4465, 8, 5400, 11, 1390, 326, 340, 318, 23606, 19482, 290, 340, 1464, 468, 257, 27454, 1988, 13, 383, 6616, 6808, 286, 262, 32623, 1906, 2484, 8825, 43366, 318, 257, 18663, 1690, 6412, 284, 355, 32623, 12, 2484, 8825, 5253, 3693, 18, 7131, 19, 7131, 20, 60]

In probability theory and statistics, the Jensen–Shannon divergence is a method of measuring the similarity between two probability distributions. It is also known as information radius (IRad)[1] or total divergence to the average.[2] It is based on the Kullback–Leibler divergence, with some notable (and useful) differences, including that it is symmetric and it always has a fi

In [269]:
print(list(enc117.bpe_ranks.items())[:10])

[(('Ġ', 't'), 0), (('Ġ', 'a'), 1), (('h', 'e'), 2), (('i', 'n'), 3), (('r', 'e'), 4), (('o', 'n'), 5), (('Ġt', 'he'), 6), (('e', 'r'), 7), (('Ġ', 's'), 8), (('a', 't'), 9)]
