# GPT-2

In [8]:
import sys
sys.path.append('..')

In [2]:
def cprint(things):
    """Print every element of the iterable `things` on its own line."""
    rendered = [str(item) for item in things]
    print('\n'.join(rendered))

In [18]:
import os
import json
import random
from src.encoder import Encoder

---

## Byte Pair Encoding Utilities



---
### Get pairs

Decompose a word into the set of its ordered pairs of adjacent symbols.

In [3]:
def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    The word is given as a sequence of symbols: either a plain string (every
    character is then one symbol) or a tuple of variable-length strings.

    Thus, the word 'word' yields:
    {('o', 'r'), ('r', 'd'), ('w', 'o')}

    A word with fewer than two symbols (including an empty one) contains no
    pair, so an empty set is returned.
    """
    # pair every symbol with its successor; zip stops at the shorter input,
    # so empty and single-symbol words naturally produce no pairs
    return set(zip(word, word[1:]))

In [4]:
# a plain string works as input: each character is treated as one symbol
word = 'antidisestablishmentarianism'
# get_pairs returns a set, so the order of the printed pairs is arbitrary
cprint(get_pairs(word))

('t', 'a')
('a', 'n')
('h', 'm')
('e', 'n')
('s', 't')
('l', 'i')
('s', 'e')
('e', 's')
('i', 'd')
('n', 't')
('d', 'i')
('i', 's')
('a', 'b')
('t', 'i')
('n', 'i')
('a', 'r')
('s', 'h')
('b', 'l')
('m', 'e')
('i', 'a')
('s', 'm')
('r', 'i')


The function, however, will not take python strings directly after the first iteration, but a tuple of symbols (strings), those symbols being the byte-pair-encodings that the algorithm will have generated. 
Thus, instead of taking 'antidisestablishmentarianism', it will take, e.g. `('ant', 'idis', 'establishment', 'arian', 'ism')`, and look for pairs in that.

In [5]:
# the same word, now given as a tuple of multi-character symbols
word = ('ant', 'idis', 'establishment', 'arian', 'ism')
# pairs are formed between neighbouring symbols, not between characters
cprint(get_pairs(word))

('arian', 'ism')
('ant', 'idis')
('idis', 'establishment')
('establishment', 'arian')


---
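N.B: because this is a set, words that are composed of the same pairs become the same. For example, different windows onto the same repeating pattern (below, `abc` repeated) give the same result. Note that this does *not* hold for every *shift right or left* ([cyclic permutation](https://en.wikipedia.org/wiki/Cyclic_permutation)) of the letters: `abcd` and `bcda` yield different pair sets, because the wrap-around pair `('d', 'a')` only exists in the second one. Quite unproblematic for NLP, but still worth knowing. 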

In [6]:
# three different words that all contain exactly the pairs (c,a), (a,b), (b,c):
# the resulting sets are therefore equal
print(get_pairs('cabc'))
print(get_pairs('abca'))
print(get_pairs('bcab'))

{('c', 'a'), ('a', 'b'), ('b', 'c')}
{('c', 'a'), ('a', 'b'), ('b', 'c')}
{('c', 'a'), ('a', 'b'), ('b', 'c')}


---

### Get Encoder

Read from json, bpe & create Encoder object.

In [12]:
def get_encoder(model_name, models_dir='../models'):
    """
    Build an Encoder from the files stored for a given model.

    Reads two files from `<models_dir>/<model_name>/`:
      - `encoder.json`: the vocabulary, loaded with `json.load`
      - `vocab.bpe`: the BPE merges, one whitespace-separated merge per line,
        preceded by a one-line version header

    Args:
        model_name: name of the model sub-directory (e.g. '117M').
        models_dir: directory containing the model sub-directories.
            Defaults to '../models'.

    Returns:
        An Encoder created with the loaded `encoder` and `bpe_merges`.

    Raises:
        OSError: if one of the two files cannot be opened.
    """
    model_dir = os.path.join(models_dir, model_name)

    # get the vocabulary as a json file
    # (explicit utf-8 so decoding does not depend on the platform default)
    with open(os.path.join(model_dir, 'encoder.json'), 'r', encoding="utf-8") as f:
        encoder = json.load(f)
    # get the merges as a txt file
    with open(os.path.join(model_dir, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()

    # translates a string format with x y on each line to [(x,y),...]
    # see `vocab.bpe` for reference
    # - skip the first line, which holds the version
    # - skip blank lines (e.g. the empty element left by the trailing newline)
    #   rather than blindly dropping the last element, so the final merge is
    #   kept even if the file does not end with a newline
    merge_lines = bpe_data.split('\n')[1:]
    bpe_merges = [tuple(merge_str.split()) for merge_str in merge_lines if merge_str.strip()]

    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )

In [13]:
# sub-directory of ../models whose encoder.json and vocab.bpe are inspected below
model_name = '117M'

---
## The encoder (json)

In [16]:
# load the vocabulary; explicit utf-8 so decoding does not depend on the
# platform default (and matches how vocab.bpe is opened)
with open(os.path.join('../models', model_name, 'encoder.json'), 'r', encoding="utf-8") as f:
    json117 = json.load(f)

In [19]:
# number of entries in the vocabulary
print('Size of dict:', len(json117))
# unseeded random sample: the element shown changes on every run
print('A random element:', random.choice(list(json117.items())))

Size of dict: 50257
A random element: ('ĠShields', 30512)


---
## The bpe_encoder

In [20]:
# read the merges file and keep it as a list of lines (split on '\n')
with open(os.path.join('../models', model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
    bpe117 = f.read().split('\n')

In [21]:
# NOTE: bpe117 is a list of lines (result of split('\n')), so this is the
# number of lines, not the length of a string
print('Length of string:', len(bpe117))
print()
# first ten lines of the file
cprint(bpe117[:10])
print()
# last ten lines of the file
cprint(bpe117[-10:])

Length of string: 50002

#version: 0.2
Ġ t
Ġ a
h e
i n
r e
o n
Ġt he
e r
Ġ s

Ġ( /
âĢ¦ ."
Com par
Ġampl ification
om inated
Ġreg ress
ĠColl ider
Ġinform ants
Ġg azed



## The bpe merges
(pairs of symbols, e.g. ('trans', 'mission'))

In [22]:
# turn each 'x y' line into an (x, y) tuple;
# [1:-1] skips the first line and the last element of the split
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe117[1:-1]]
print('First ten elements:')
cprint(bpe_merges[:10])
print()
print('Last ten elements:')
cprint(bpe_merges[-10:])

First ten elements:
('Ġ', 't')
('Ġ', 'a')
('h', 'e')
('i', 'n')
('r', 'e')
('o', 'n')
('Ġt', 'he')
('e', 'r')
('Ġ', 's')
('a', 't')

Last ten elements:
('Comm', 'ission')
('Ġ(', '/')
('âĢ¦', '."')
('Com', 'par')
('Ġampl', 'ification')
('om', 'inated')
('Ġreg', 'ress')
('ĠColl', 'ider')
('Ġinform', 'ants')
('Ġg', 'azed')


## N.B.: the Ġ

For the Ġ, which represents the space, see `bytes-to-unicode.ipynb`.