In [1]:
# Read the whole text file into `raw_text` so it is available to later cells in the workspace.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [2]:
# Import the tokenizer library; the raw text itself is encoded in a later cell.
import tiktoken

# Instantiate the tokenizer using the "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Encode the raw text into a sequence of token IDs.
encoded_text = tokenizer.encode(raw_text)

# Print the number of tokens in the encoded text.
print(len(encoded_text))

5145


In [6]:
# Keep only the first 50 tokens as a small working sample.
# Note: the slice [:50] selects the first 50 tokens; it does not remove them.
encoded_sample = encoded_text[:50]
print(len(encoded_sample))

50


In [None]:
# Use a context size of 4, i.e. the number of input tokens the model looks at at a time.
context_size = 4
# The input x is the first four tokens (positions 0-3) and the target y is the same
# window shifted one position to the right (positions 1-4), so y[k] is the token that follows x[k].
x = encoded_sample[:context_size]
y = encoded_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:  {y}")

x: [40, 367, 2885, 1464]
y: [367, 2885, 1464, 1807]


In [9]:
# Generate the input-target structure used for next-token prediction:
# for each prefix length i, the first i tokens are the input and token i is the target.
for i in range(1, context_size+1):
    context = encoded_sample[:i] # input: the first i token IDs
    desired = encoded_sample[i] # target: the token ID that follows the input

    print(context, "----->", desired)

[40] -----> 367
[40, 367] -----> 2885
[40, 367, 2885] -----> 1464
[40, 367, 2885, 1464] -----> 1807


In [11]:
# Generate the same input-target structure for next-token prediction,
# but print the decoded text instead of the raw token IDs.
for i in range(1, context_size+1):
    context = encoded_sample[:i] # input: the first i token IDs
    desired = encoded_sample[i] # target: the token ID that follows the input
    # decode expects a sequence of IDs, so the single target ID is wrapped in a list
    print(tokenizer.decode(context), "----->", tokenizer.decode([desired]))

I ----->  H
I H -----> AD
I HAD ----->  always
I HAD always ----->  thought


Implementing a Data Loader

In [None]:
#creating the input-target tensors using data loaders
#import Dataset and Data Loaders
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors built from a text.

    The text is tokenized once, then cut into windows of ``max_length`` token
    IDs whose start positions are ``stride`` tokens apart. For every window the
    target is the same window shifted one token to the right, so each target
    position holds the token that follows the corresponding input position.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and precompute all input/target windows.

        Args:
            txt: The raw text to tokenize.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                that returns a sequence of token IDs.
            max_length: Number of token IDs per window (the context size).
            stride: Step between the start positions of consecutive windows.
        """
        # tokenize the entire text in a single pass
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Start offsets of every window. Stopping at len(ids) - max_length keeps
        # the shifted target window (which reaches one token further) in bounds.
        window_starts = range(0, len(ids) - max_length, stride)

        # inputs: max_length token IDs beginning at each start offset
        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        # targets: the same windows moved one position to the right
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

The GPTDatasetV1 class is based on the PyTorch Dataset class. <br> It defines how individual rows are fetched from the dataset. <br> Each row consists of a number of token IDs, determined by the context size (max_length), assigned to an input chunk tensor. <br> The target chunk tensor contains the corresponding targets.

In [14]:
#data loader function
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of (input, target) token-ID batches from raw text.

    The text is tokenized with tiktoken's 'gpt2' encoding and wrapped in a
    GPTDatasetV1, which is then handed to a PyTorch DataLoader.

    Args:
        txt: The raw text to turn into training samples.
        batch_size: Number of samples grouped into each batch.
        max_length: Number of token IDs per sample (the context size);
            passed on to GPTDatasetV1.
        stride: Step between the start positions of consecutive samples;
            passed on to GPTDatasetV1.
        shuffle: Forwarded to DataLoader; whether the samples are reshuffled.
        drop_last: Forwarded to DataLoader; when true, the final batch is
            dropped if it holds fewer than ``batch_size`` samples.
        num_workers: Forwarded to DataLoader; number of worker subprocesses
            used for loading (0 loads in the main process).

    Returns:
        The configured DataLoader.
    """
    # tokenizer using the 'gpt2' encoding
    gpt2_tokenizer = tiktoken.get_encoding('gpt2')

    # dataset of overlapping input/target windows over the tokenized text
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    # batching options are forwarded unchanged to the DataLoader
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windows, **loader_options)

In [None]:
# Create the data loader with small parameters so the batches are easy to inspect.
# A larger stride means less computation because the window moves over the data
# faster, producing fewer (and less overlapping) samples.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# Create an iterator over the data loader and fetch the first batch.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch) # a pair: the input tensor and the target tensor

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


A context size of 4 is very small and is used here only for illustration; LLMs are typically trained with a much larger context size, such as 256.

In [16]:
# Fetch the next batch from the same iterator; with stride=1 its window
# starts one token after the first batch's window.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
