<a href="https://colab.research.google.com/github/itsarifworld/gpt_model/blob/main/LLM_Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**WORKING WITH THE TEXT DATA**

TOKENIZING TEXT

In [None]:
import os
import urllib.request
import requests

In [None]:
if not os.path.exists("the-verdict.txt"):
    url = (
        "https://raw.githubusercontent.com/rasbt/"
        "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
        "the-verdict.txt"
    )
    file_path = "the-verdict.txt"

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(file_path, "wb") as f:
        f.write(response.content)

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


TOKENIZING THE TEXT

In [None]:
import re

text = " Hello, It's Arif."
result = re.split(r'(\s)', text)

print(result)

['', ' ', 'Hello,', ' ', "It's", ' ', 'Arif.']


In [None]:
result = re.split(r'([,.]|\s)', text)
print(result)

['', ' ', 'Hello', ',', '', ' ', "It's", ' ', 'Arif', '.', '']


In [None]:
preprocessed = re.split(r'([,.]|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius--though', 'a', 'good', 'fellow', 'enough--so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"The', 'height', 'of', 'his', 'glory"--that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing--his', 'last', 'Chicago', 'sitter--deploring', 'his', 'unaccountable', 'abdication', '.', '"Of', 'course', "it's", 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'way", 'up;', 'but', 'I', "don't", 'think', 'of', 'that', ',', 'Mr', '.', 'Rickham--the', 'loss', 'to', 'Arrt', 'is', 'all', 'I', 'think', '

In [None]:
len(preprocessed)

4092

**CONVERTING THE TOKENS INTO TOKEN IDs**

In [None]:
# Removing the duplicates and sorting
all_words = sorted(set(preprocessed))
all_words

['"',
 '"Ah',
 '"Ah--I',
 '"Be',
 '"By',
 '"Come',
 '"Destroyed',
 '"Don\'t',
 '"Gisburns"',
 '"Grindles',
 '"Hang',
 '"Has',
 '"How',
 '"I',
 '"I\'d',
 '"If',
 '"It',
 '"It\'s',
 '"Jack',
 '"Money\'s',
 '"Moon-dancers"',
 '"Mr',
 '"Mrs',
 '"My',
 '"Never',
 '"Of',
 '"Oh',
 '"Once',
 '"Only',
 '"Or',
 '"That',
 '"The',
 '"Then',
 '"There',
 '"There:',
 '"This',
 '"We',
 '"Well',
 '"What',
 '"When',
 '"Why',
 '"Yes',
 '"Yes--quite',
 '"Yes--she\'s',
 '"You',
 '"deadening',
 '"dragged',
 '"effects";',
 '"interesting":',
 '"lift',
 '"obituary"',
 '"strongest',
 '"strongly"',
 '"sweetly"--and',
 "'",
 "'Are",
 "'It's",
 "'coming'",
 "'done'",
 "'subject",
 "'technique'",
 "'way",
 '(I',
 '(Though',
 ')',
 ',',
 '.',
 'A',
 'Among',
 'And',
 'Arrt',
 'As',
 'At',
 'Burlington',
 'But',
 'By',
 'Carlo',
 'Carlo;',
 'Chicago',
 'Claude',
 'Croft',
 'Croft)',
 'Devonshire',
 "Don't",
 'Dubarry_',
 'Emperors',
 'Florence',
 'For',
 'Gallery',
 'Gideon',
 'Gisburn',
 'Gisburn!',
 "Gisburn's",
 '

In [None]:
len(all_words)

1317

In [None]:
vocab = {token:integer for integer, token in enumerate(all_words)}

In [None]:
class SimpleTokenizerV1 :
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i:s for s,i in vocab.items()}

  def encode(self, text):
    preprocessed = re.split(r'([,:.;?_!"()\']|--|\s)', text)

    preprocessed = [
        item.strip() for item in preprocessed if item.strip()
    ]
    ids = [self.str_to_int[item] for item in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])

    text = re.sub(r'\s+([,.?!"()\])', r'\1', text)
    return text

CONVERTING TOKENS INTO TOKEN IDs

In [None]:
vocab["our"]

880

In [None]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1317


In [None]:
vocab = { token : integer for integer, token in enumerate(all_words)}

In [None]:
for i in enumerate(vocab.items()):
    print(i)  # i is a tuple
    if i[0] >= 50:
        break

(0, ('"', 0))
(1, ('"Ah', 1))
(2, ('"Ah--I', 2))
(3, ('"Be', 3))
(4, ('"By', 4))
(5, ('"Come', 5))
(6, ('"Destroyed', 6))
(7, ('"Don\'t', 7))
(8, ('"Gisburns"', 8))
(9, ('"Grindles', 9))
(10, ('"Hang', 10))
(11, ('"Has', 11))
(12, ('"How', 12))
(13, ('"I', 13))
(14, ('"I\'d', 14))
(15, ('"If', 15))
(16, ('"It', 16))
(17, ('"It\'s', 17))
(18, ('"Jack', 18))
(19, ('"Money\'s', 19))
(20, ('"Moon-dancers"', 20))
(21, ('"Mr', 21))
(22, ('"Mrs', 22))
(23, ('"My', 23))
(24, ('"Never', 24))
(25, ('"Of', 25))
(26, ('"Oh', 26))
(27, ('"Once', 27))
(28, ('"Only', 28))
(29, ('"Or', 29))
(30, ('"That', 30))
(31, ('"The', 31))
(32, ('"Then', 32))
(33, ('"There', 33))
(34, ('"There:', 34))
(35, ('"This', 35))
(36, ('"We', 36))
(37, ('"Well', 37))
(38, ('"What', 38))
(39, ('"When', 39))
(40, ('"Why', 40))
(41, ('"Yes', 41))
(42, ('"Yes--quite', 42))
(43, ('"Yes--she\'s', 43))
(44, ('"You', 44))
(45, ('"deadening', 45))
(46, ('"dragged', 46))
(47, ('"effects";', 47))
(48, ('"interesting":', 48))
(49, (

In [None]:
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab # ( Original Mappping : " Hey ": 0, It's : 1 )
    self.int_to_str = {i:s for s,i in vocab.items()} # ( Reverse Mapping : 0 : Hey, 1 : It's )


  def encode(self, text):
    preprocessed = re.split(r'([,:.;?_!"()\']|--|\s)', text)
#Split whenever you find: •	punctuation symbols ( , : . ; ? _ ! " ( ) ' )

    preprocessed = [
        items.strip() for items in preprocessed if items.strip()
    ]
    # 	•	removes extra spaces using .strip() •	removes empty strings using if item.strip()
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])

    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text


In [None]:
tokenizer = SimpleTokenizerV1(vocab)
# Modified test text to use words present in the original 'the-verdict.txt' vocabulary
# and compatible with the existing tokenization rules used for vocab creation.
text = "I HAD, always thought."
ids = tokenizer.encode(text)
print(ids)

[107, 100, 65, 236, 1171, 66]


In [None]:
tokenizer.decode(ids)

'I HAD, always thought.'

ADDING SPECIAL CONTEXT TOKENS

In [None]:
# ERROR BECAUSE HELLO IS NOT CONTAINED IN VOCABULARY, WE NEED TO ADD "<|UNK|>"
tokenizer = SimpleTokenizerV1(vocab)

text = "Hello, do you like tea. Is this-- a test?"

tokenizer.encode(text)

KeyError: 'Hello'

In [None]:
# ADDING UNK AND ENDOFTEXT TOKEN IN VOCAB
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext>|", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [None]:
len(vocab.items())

1319

In [None]:
for i, items in enumerate(list(vocab.items())[-5:]):
  print(items)

('younger', 1314)
('your', 1315)
('yourself', 1316)
('<|endoftext>|', 1317)
('<|unk|>', 1318)


In [None]:
class SimpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab # Correct: maps string token to integer ID
    self.int_to_str = {i:s for s,i in vocab.items()} # Correct: maps integer ID to string token

  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    preprocessed = [
        item if item in self.str_to_int
        else "<|unk|>" for item in preprocessed
    ]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [None]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = " Hello, It's Arif "
text2 = " Naam to suna hi hoga"

text = " <|endoftext|> ".join((text1,text2))
#text = text1 + text2
print(text)

 Hello, It's Arif  <|endoftext|>  Naam to suna hi hoga


In [None]:
tokenizer.encode(text)

[1318, 65, 112, 54, 1318, 1318, 1318, 1318, 1186, 1318, 1318, 1318]

In [None]:
tokenizer.decode(tokenizer.encode(text))

"<|unk|>, It' <|unk|> <|unk|> <|unk|> <|unk|> to <|unk|> <|unk|> <|unk|>"

BYTEPAIR ENCODING

In [None]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))


tiktoken version: 0.12.0


In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [None]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


DATA SAMPLING WITH A SLIDING WINDOW

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [None]:
enc_sample = enc_text[50:]

In [None]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a
