### Loading and tokenizing the text

In [None]:
import re
import urllib.request

In [3]:
# Raw GitHub location of the text file used throughout this notebook.
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/"
    "ch02/01_main-chapter-code/the-verdict.txt"
)

# Save a copy into the current working directory. The (filename, headers)
# tuple returned by urlretrieve is what the cell displays.
urllib.request.urlretrieve(url, filename="the-verdict.txt")

('the-verdict.txt', <http.client.HTTPMessage at 0x10479a3d0>)

In [4]:
# Read the copy that the download cell saved into the current working
# directory ("the-verdict.txt"). Opening "../the-verdict.txt" instead would
# raise FileNotFoundError on a fresh run, because nothing writes that path.
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    verdict = f.read()

In [5]:
# Number of characters in the loaded text.
len(verdict)

20479

In [6]:
# Preview the first 99 characters of the text.
verdict[:99]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no '

In [7]:
# Split on commas, periods, exclamation marks and whitespace. The capture
# group makes re.split keep the separators in the returned list.
sample_sentence = "Some example sentence. Thanks for joining us!"
result = re.split(r'([,.!]|\s)', sample_sentence)

# Display only the pieces that are not empty or pure whitespace.
[piece for piece in result if piece.strip()]

['Some', 'example', 'sentence', '.', 'Thanks', 'for', 'joining', 'us', '!']

In [8]:
# Get a little more complex: more punctuation marks plus the double dash.

tmp = "Hello, matey! Here is some text? I think? Is this thing -- on?!"

# Precompile the separator pattern, then split and drop blank pieces in one pass.
punct_or_space = re.compile(r'([,.:;!_"()?]|--|\s)')
result = [piece for piece in punct_or_space.split(tmp) if piece.strip()]
result

['Hello',
 ',',
 'matey',
 '!',
 'Here',
 'is',
 'some',
 'text',
 '?',
 'I',
 'think',
 '?',
 'Is',
 'this',
 'thing',
 '--',
 'on',
 '?',
 '!']

In [9]:
# Let's apply this to the Edith Wharton text. The pattern now also treats the
# apostrophe as a separator.

token_pattern = re.compile(r'([,.:;!_"()?\']|--|\s)')
preprocessed = [piece for piece in token_pattern.split(verdict) if piece.strip()]
len(preprocessed)

4690

In [10]:
# Show the first 30 tokens as a quick check of the split.
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


### Converting tokens into numeric IDs

We have a bunch of tokens, but we need to convert them into unique IDs. Easy enough to do: take the tokens, remove duplicates, sort them, and give each one a number.

In [11]:
# De-duplicate the tokens, then sort the unique ones in place.
all_words = list(set(preprocessed))
all_words.sort()

unique_count = len(all_words)
print(f"{unique_count} unique words in the text")

1130 unique words in the text


In [12]:
# Map each token to its position in the sorted list; that position is its id.
vocab = dict(zip(all_words, range(len(all_words))))

In [13]:
# Print vocabulary entries in insertion order, stopping once the entry with
# id 21 has been shown.
LAST_ID_TO_SHOW = 21
for token, token_id in vocab.items():
    print(f"{token}: {token_id}")
    if token_id >= LAST_ID_TO_SHOW:
        break

!: 0
": 1
': 2
(: 3
): 4
,: 5
--: 6
.: 7
:: 8
;: 9
?: 10
A: 11
Ah: 12
Among: 13
And: 14
Are: 15
Arrt: 16
As: 17
At: 18
Be: 19
Begin: 20
Burlington: 21


### Put all this logic into a class

In [14]:
from tokenizer import SimpleTokenizerV1

In [15]:
# Instantiate the V1 tokenizer with the token -> id mapping built above.
tokenize = SimpleTokenizerV1(vocab)

In [16]:
# Convert a short phrase into token ids.
tokenize.encode("look at me")

[642, 180, 663]

In [17]:
# Convert a list of token ids back into text.
tokenize.decode([459, 123, 888, 1050])

'forehead absurdity sign unusual'

In [18]:
# Decode another arbitrary set of ids.
tokenize.decode([1035, 56, 837, 554])

'true It resented hooded'

However, we run into issues when a never-before-seen word shows up. We handle this by adding special tokens to the vocabulary. `<|unk|>` is used when we don't know a word (the tokenizer can fall back to it by looking words up with the dict `.get` method). In addition, we can tell the model that the end of a document has been reached using another special token, `<|endoftext|>`.

In [19]:
# Add the special tokens to the end of the word list. Appending only when a
# token is missing keeps this cell idempotent: a plain extend() would add
# duplicate entries every time the cell is re-run, which changes the ids
# assigned to the special tokens in the mapping below.
for special_token in ('<|endoftext|>', '<|unk|>'):
    if special_token not in all_words:
        all_words.append(special_token)

# Rebuild the token -> id mapping so it includes the special tokens.
vocab = {word: idx for idx, word in enumerate(all_words)}

In [20]:
# Show the last five vocabulary entries. The number printed with each token is
# its position within this five-item slice (0-4), not its token id.
last_five_tokens = list(vocab)[-5:]
for item in enumerate(last_five_tokens):
    print(item)

(0, 'younger')
(1, 'your')
(2, 'yourself')
(3, '<|endoftext|>')
(4, '<|unk|>')


In [21]:
from tokenizer import SimpleTokenizerV2

In [22]:
# Instantiate the V2 tokenizer with the vocabulary that now includes the special tokens.
tokenize_v2 = SimpleTokenizerV2(vocab)

In [23]:
# Both words come back as the same id; in the recorded output that is 1131,
# the last id in the vocabulary, which belongs to '<|unk|>'.
tokenize_v2.encode("BOOM POW")

[1131, 1131]

In [24]:
# Doing the end of text: join two unrelated snippets with the end-of-text marker.

text1 = "So here I am, it's in my head"
text2 = "Eating seeds is a pasttime activity"

EOT_SEPARATOR = " <|endoftext|> "
text = text1 + EOT_SEPARATOR + text2

In [25]:
# Display the joined text.
text

"So here I am, it's in my head <|endoftext|> Eating seeds is a pasttime activity"

In [26]:
# Round-trip the joined text through the V2 tokenizer: encode it to ids, then
# decode those ids back into a string.
round_trip_ids = tokenize_v2.encode(text)
tokenize_v2.decode(round_trip_ids)

"<|unk|> here I am, it' s in my head <|endoftext|> <|unk|> <|unk|> is a <|unk|> activity"

### Byte pair encoding (BPE)

Byte pair encoding is apparently a complex algorithm to implement, so we'll use an existing implementation from another library (`tiktoken`).

In [1]:
import tiktoken

In [27]:
# Load tiktoken's 'gpt2' encoding.
tiktokenize = tiktoken.get_encoding('gpt2')

In [31]:
text = "Hello, do  you like tea? <|endoftext|> In the sunlit terraces of someUnknownPlace."

# Permit the end-of-text marker in the input; in the recorded output it is
# encoded as the single id 50256.
special_tokens = {"<|endoftext|>"}
integers = tiktokenize.encode(text, allowed_special=special_tokens)
integers

[15496,
 11,
 466,
 220,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 286,
 617,
 20035,
 27271,
 13]

In [32]:
# Decode the ids back into a string and display it.
strings = tiktokenize.decode(integers)
strings

'Hello, do  you like tea? <|endoftext|> In the sunlit terraces of someUnknownPlace.'

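Note that the nonsense word `someUnknownPlace` made it back intact. This is due to the BPE algorithm, which iteratively builds up words from smaller subword units, so even a word it has never seen can be encoded and decoded without an unknown-word token.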

### section 2.6 Data Sampling with a Sliding Window