In [1]:
from pathlib import Path 
from typing  import List 

In [2]:
path = '../the-verdict.txt'

In [3]:
with open(path,'r',encoding='utf-8') as f: 
    raw_text = f.read() 



In [4]:
print("Total number of characters:",len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [5]:
import re # regular expresion 

text = "Hello, world. This, is a test." 

result = re.split(r'(\s)',text)
""" 
The result is a list of individual words,whitespaces, and punctuation characters: 
"""
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [6]:
""" 
lets mdify the regular expression splits  on whitespaces(\s), commands and periods
([,.]) 
""" 
result = re.split(r'([,.]|\s)',text)

  lets mdify the regular expression splits  on whitespaces(\s), commands and periods


In [7]:
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [8]:
""" 
The resulting whitespace-free output looks like as follows 
""" 

result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [9]:
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [10]:
"""
Now that we have a basic tokenizer working, lets apply it to Edith Whartons entire 
short story 
""" 
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


In [11]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [12]:
all_words = sorted(set(preprocessed)) 
vocab_size = len(all_words)
print(vocab_size)

1130


In [13]:
vocab = {token: integer for integer, token in enumerate(all_words) }

In [14]:
for i , item in enumerate(vocab.items()): 
    print(item)
    if i >=50: 
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [15]:
""" 
Lets implement a complete tokenizer class in python with an encode method that 
splits text into tokens and carreis out the string-to-integer mapping to produce 
token ids via vocabulary. In addition, well imprlement decode method that carreis 
out the reverse integer-to-string mapping to convert the token ids back into text 
""" 

class SimpleTokenizerV1: 
    def __init__(self,vocab:dict) ->None: # vocab -> dict 
        self.str_to_int = vocab 
        self.int_to_str = {i:s for s,i in vocab.items()} 
    
    def encode(self,text) ->List: 
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text) 
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ] # remove whitespace 
        ids = [self.str_to_int[s] for s in preprocessed] 
        return ids 

    def decode(self,ids): 
        text = "".join([self.int_to_str[i] for i in ids]) 
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)# remove spaces before the 
        # specifed punctuaton 
        return text 



In [16]:
tokenizer = SimpleTokenizerV1(vocab) 


In [17]:
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [18]:
print(tokenizer.decode(ids))

"It'sthelasthepainted,youknow,"Mrs.Gisburnsaidwithpardonablepride.


In [19]:
# try to use the tokenizer on text not in the text 

text = "Hello, do you like tea?" 
print(tokenizer.encode(text))# we get error :(

KeyError: 'Hello'

In [20]:
""" 
Lets now modify the vocabulary to includde two new speical tokens <unk> and 
<|endoftext|> , by adding them to our list of all unique works: 
""" 

all_tokens = sorted(list(set(preprocessed)))

all_tokens.extend(["<|endoftext|>","<|unk|>"]) 

In [21]:
vocab = {token: integer for integer, token in enumerate(all_tokens)}

In [22]:
vocab['<|unk|>']

1131

In [23]:

print(len(vocab.items()))

1132


In [24]:

for i , item in enumerate(list(vocab.items())[-5:]): 
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [25]:
class SimpleTokenizerV2: 
    def __init__(self,vocab:dict) ->None: # vocab -> dict 
        self.str_to_int = vocab 
        self.int_to_str = {i:s for s,i in vocab.items()} 
    
    def encode(self,text) ->List: 
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text) 
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ] # remove whitespace 
        preprocessed = [item if item in self.str_to_int else f"<|unk|>" for item in preprocessed]
        
        ids = [self.str_to_int[s] for s in preprocessed] 
        return ids 

    def decode(self,ids): 
        text = "".join([self.int_to_str[i] for i in ids]) 
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)# remove spaces before the 
        # specifed punctuaton 
        return text 




In [26]:
text1 = "hello, do you like tea?" 
text2 = "In the sunlit terraces of the place." 

text = "<|endoftext|>".join((text1,text2))
text

'hello, do you like tea?<|endoftext|>In the sunlit terraces of the place.'

In [27]:

tokenizer = SimpleTokenizerV2(vocab) 

In [28]:
print(tokenizer.encode(text))

[1131, 5, 355, 1126, 628, 975, 10, 1131, 988, 956, 984, 722, 988, 773, 7]


In [29]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>,doyouliketea?<|unk|>thesunlitterracesoftheplace.


Depending on the LLM, some researhers also consider additional speical tokens such as following: 

- [BOS] (begging of sequence)--This token marks the start of a text. it signifies to the llm where a piece of context begins 
- [EOS] (end of sequence)--This token is positioned at teh end of a text and is especialy useful when concatenationg multiple unrelated texts, similear to <|endoftext|>. For instace , when combining two different wikipedia articles or books, the [EOS] token indicates where once ends and the next begins . 
- [PAD] (padding)-when training LLMs with batch size larger than one, the batch might contain texts of varying lengths . To ensure all texts has the same lenght , the shorter texts are extended or `padded` using the [PAD] token , up to the lenght of the longest text in the batch. 


In [30]:
import tiktoken 

In [31]:
tokenizer = tiktoken.get_encoding("gpt2") 

In [32]:
text = (
"Hello, do you like tea? <|endoftext|> In the sunlit terraces"
"of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [33]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [34]:
path = '../the-verdict.txt'

with open(path,"r",encoding="utf-8") as f: 
    raw_text = f.read() 

enc_text = tokenizer.encode(raw_text) 

print(len(enc_text))

5145


In [35]:
""" 
we will remove the first 50 tokens from the dataset for demostration purposes, 
aas it results in a sligtly more interesting text passage in the next steps 
""" 
enc_sample = enc_text[50:] 


In [36]:
""" 
One of the easierst and most intuitive ways to create the input-target pars for the 
text-word prediction task is to crate two variables ,x and y, where x contins the input 
tokens and y contains the targets, which are the inut shifted by 1: 
""" 

context_size = 4 # how many tokens are included in the input 

x = enc_sample[:context_size] 
y= enc_sample[1:context_size+1] 

print(f"x: {x}") 
print(f"y: {y}") 

x: [290, 4920, 2241, 287]
y: [4920, 2241, 287, 257]


In [37]:
for i in range(1,context_size +1): 
    context = enc_sample[:i] 
    desired = enc_sample[i] 
    print(context,"----->",desired) 

[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257


In [38]:
for i in range(1,context_size +1): 
    context = enc_sample[:i] # upto i but not i 
    desired = enc_sample[i]
    print(tokenizer.decode(context),"----->",tokenizer.decode([desired]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


In [40]:
""" 
Now lets imprement dataloader for this  in pytorch 
""" 
import torch 
from torch.utils.data import Dataset, DataLoader 

In [41]:
class GPTDatasetV1(Dataset): 
    def __init__(self,txt,tokenizer,max_length,stride): 
        self.input_ids = [] 
        self.target_ids = []

        token_ids = tokenizer.encode(txt) 

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
        
    def __len__(self): 
        return len(self.input_ids) 
    
    def __getitem__(self,idx): 
        return self.input_ids[idx] , self.target_ids[idx] 
    


In [42]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                    stride=128, shuffle=True, drop_last=True,
                    num_workers=0):
                    
                    tokenizer  = tiktoken.get_encoding('gpt2') 
                    dataset = GPTDatasetV1(txt,tokenizer,max_length,stride) 
                    dataloader = DataLoader(
                        dataset, 
                        batch_size=batch_size, 
                        shuffle=shuffle, 
                        drop_last=drop_last, 
                        num_workers=num_workers
                    )
                    return dataloader 

In [45]:
dataloader = create_dataloader_v1(
    raw_text, 
    batch_size=1, 
    max_length=4, 
    stride=1, 
    shuffle=False
)

In [46]:
data_iter = iter(dataloader) 

In [47]:
first_batch = next(data_iter) 
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [48]:
second_batch = next(data_iter) 
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [49]:
dataloader = create_dataloader_v1(
raw_text, batch_size=8, max_length=4, stride=4,
shuffle=False
)

In [50]:
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [60]:
# CREATING TOKEN EMBEDDINGS 

input_ids = torch.tensor([2,3,4,1]) 
vocab_size = 5 
output_dim = 3 
torch.manual_seed(123) 
embedding_layer = torch.nn.Embedding(vocab_size,output_dim)

In [61]:
input_ids

tensor([2, 3, 4, 1])

In [62]:
print(embedding_layer.weight)

Parameter containing:
tensor([[-0.1115,  0.1204, -0.3696],
        [-0.2404, -1.1969,  0.2093],
        [-0.9724, -0.7550,  0.3239],
        [-0.1085,  0.2103, -0.3908],
        [ 0.2350,  0.6653,  0.3528]], requires_grad=True)


In [63]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.1085,  0.2103, -0.3908]], grad_fn=<EmbeddingBackward0>)


In [64]:
print(embedding_layer(input_ids))

tensor([[-0.9724, -0.7550,  0.3239],
        [-0.1085,  0.2103, -0.3908],
        [ 0.2350,  0.6653,  0.3528],
        [-0.2404, -1.1969,  0.2093]], grad_fn=<EmbeddingBackward0>)


In [65]:
# Encoding word positons 
"""
to achieve this , we can use two braod categoreis of position-aware embeddings: 
relative positonal embeddings and absolute positona embeddings. absolute positonal 
embeddings are directly associated with speicifc positonas in a sequence. for each 
position in the input sequence, a unique embedding is added to the token embedding to 
convery its exact location . for instance , the first token will have a specific potion 
embedding, the second token another distict embedding, and so on  , Instead of focusing 
n the absolute position of a token, the empasis of relative positonal embeddings is on 
the relative position or distance between tokens. This means the mdoel learns the 
relationship in terms of how far apart rather than at which exact position . The advantage 
here is that model can genralize better to sequences of varying lenghts , even if 
it hast seen such lenghts during training , 
opensai gpt models use absolute postional embeddings that are optimized during the trainign 
process rather than the ixed or predefined like the positonal encding in the 
original transfer model. 
""" 
vocab_size = 50257 
output_dim  = 256 
token_embedding_layer  = torch.nn.Embedding(vocab_size,output_dim) 


In [66]:
max_length = 4 
dataloader = create_dataloader_v1(
    raw_text,batch_size=8,max_length=max_length, 
    stride=max_length,shuffle=False
)

In [67]:
data_iter = iter(dataloader) 
inputs,targets = next(data_iter) 

print("Token IDs:\n",inputs) 
print("\nInputs shape:\n",inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [68]:
token_embeddings = token_embedding_layer(inputs)

In [69]:
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [70]:
""" for a gpt model absolute embeeing approach, we just need to 
crate another embedding layer that has the same embedding dimenstion as the 
token_embedding_layer 
""" 
context_lenght = max_length 

pos_embedding_ayer = torch.nn.Embedding(context_lenght,output_dim) 
pos_embeddings = pos_embedding_ayer(torch.arange(context_lenght)) 
print(pos_embeddings.shape)

torch.Size([4, 256])


In [72]:
input_embeddings = token_embeddings + pos_embeddings

print(input_embeddings.shape)

torch.Size([8, 4, 256])
