Chapter 2: Text processing
========================

In [2]:
from importlib.metadata import version

print("torch version:", version("torch"))

torch version: 2.3.0


In [3]:
# load the text
with open ('the-verdict.txt','r',encoding="utf-8") as f:
    raw_test = f.read()

In [4]:
import re

In [5]:
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [7]:
results = re.split(r'([,.]|\s)' ,text) 
#[,.] means any character that's either a comma , or a period .
# \s means any whitespace character (space, tab, newline)
# | means "OR"
# () are capturing parentheses — this means the matched character is also kept in the result (not just used to split).
print(results)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [8]:
results = [item for item in results if item.strip()]
print(results)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [9]:
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\\s)', text) # OR --
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is this', '--', 'a test', '?']


In [15]:
# apply to raw_test
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_test)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [16]:
all_words = set(preprocessed)
vocab_size = len(all_words)
print("Vocabulary size:", vocab_size)

Vocabulary size: 1130


In [19]:
vocab = {token:integer for integer,token in enumerate(all_words)}  # This kind of dictionary is super useful in NLP
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('discovery', 0)
('straight', 1)
('wish', 2)
('reason', 3)
('shade', 4)
('reassurance', 5)
('foreseen', 6)
('underlay', 7)
('central', 8)
('are', 9)
('no', 10)
('admirers', 11)
('just', 12)
('protest', 13)
('At', 14)
('begun', 15)
('sign', 16)
('my', 17)
('naive', 18)
('donkey', 19)
('amazement', 20)
('patient', 21)
('business', 22)
('language', 23)
('arm-chairs', 24)
('thither', 25)
('somebody', 26)
('from', 27)
('morbidly', 28)
('big', 29)
('straw', 30)
('art', 31)
('dozen', 32)
('me', 33)
('answered', 34)
('substantial', 35)
('set', 36)
('disguised', 37)
('Sevres', 38)
('tribute', 39)
('untouched', 40)
('led', 41)
('painting', 42)
('wondered', 43)
('Professional', 44)
('anything', 45)
('disdain', 46)
('attack', 47)
('everlasting', 48)
('rich', 49)
('To', 50)


In [28]:
if "Hashan ".strip():
    print("True")

True


In [30]:
if "\n ".strip():
    print("True")
else:
    print("False")

False


In [42]:
# Implementing Simple Text Tokenizer
# vocab is a dictionary

class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()  #if item.strip() check if the string is empty: False
        ]
        
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #\s+: matches one or more whitespace characters (spaces, tabs, newlines).
        return text

In [34]:
# Example for the replace spaces before punctuations
text = "Hello , world ! This is ( cool ) ."
result = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
print(result)

Hello, world! This is( cool).


In [43]:
# from raw_test
# vocab is a dictionary defined in the beginning
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[68, 417, 938, 816, 152, 439, 298, 414, 160, 91, 52, 160, 68, 846, 333, 490, 1080, 215, 318, 723, 333]


In [44]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


Adding unknowns and end of text tokens

In [46]:
all_tokens = sorted(list(set(preprocessed)))

# add unk and end of tokens
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

# creating a new VOCAB! 
vocab = {token:integer for integer,token in enumerate(all_tokens)}

print(len(vocab.items()))


1132


In [47]:
for i,item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


Tokenizer with unknowns

In [49]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed # replace unknown words with this tag
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [50]:
# test the new tokenizer
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

In [51]:
text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [52]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [53]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


Byte pair encoding (BPE)

In [55]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [56]:
tokenizer = tiktoken.get_encoding('gpt2')

Without allowed_special, it might error or tokenize <|endoftext|> as many subwords.

In [57]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

In [61]:
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [62]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


Data sampling with a sliding window

In [63]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [64]:
enc_text = tokenizer.encode(raw_text) #BPE Tokenizer used here
print(len(enc_text))

5145


In [65]:
# remove first 50
enc_sample = enc_text[50:]

In [66]:
# arr[start:stop] ---> This gives you a slice from start index up to (but not including) stop.

context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

In [67]:
print(f"x: {x}")
print(f"y: {y}")

x: [290, 4920, 2241, 287]
y: [4920, 2241, 287, 257]


In [68]:
# next word predition tasks,

for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)


[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [69]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [71]:
# A dataset for batched inputs and targets
import torch
from torch.utils.data import Dataset, DataLoader


# Let's define the class

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
    
        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # range(start, stop, step)
        for i in range(0, len(token_ids) - max_length, stride): 
            # max_length slices the long token list into chunks --> batch processing because can't handle all tokens at once
            # stride ontrols the overlap between consecutive input chunks
            # We subtract max_length so that we don’t run out of bounds when slicing!

            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i+1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [73]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    
    # max_length - Each input and target chunk will be 256 tokens long
    # stride - Each new chunk will start 128 tokens after the previous one (50% overlap)

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

test the dataloader with a batch size of 1 for an LLM with a context size of 4:

In [74]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [75]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [76]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


Increase the batch size

In [78]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

# max length is context length
# stride is defined in the GPTDatasetV1 class where 
# it decide the distance between two input tokens

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


Token embeddings

In [79]:
vocab_size = 6
output_dim = 3

In [83]:
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim) # this is randomly instialized by pytorch: not trained

In [81]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [82]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)
