In [1]:
import torch

In [2]:
# We will be using a relatively large vocabulary size, that of GPT-2.
vocab_size = 50257
# Width of each embedding vector produced by the layers below.
output_dim = 256

# Lookup table with `vocab_size` rows and `output_dim` columns: one vector per token ID.
# NOTE(review): no random seed is set in the visible cells, so the randomly
# initialized embedding weights will differ between kernel restarts.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [3]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. For every
    window, the target is the same window shifted one token to the right,
    i.e. the next-token labels for the input.

    Args:
        txt: The raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token IDs.
        max_length: Number of tokens per window (the context size).
        stride: Step between the start positions of consecutive windows.

    Attributes:
        input_ids: List of 1-D tensors, one per window.
        target_ids: List of 1-D tensors, each the matching input shifted by one.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the entire text in one pass.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop `max_length` tokens before the end so that every target window
        # (which reaches one token further than its input) stays in bounds.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair stored at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [5]:
import tiktoken

In [6]:
#data loader function
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Raw text to tokenize with the GPT-2 encoding.
        batch_size: Number of windows (samples) per batch.
        max_length: Number of tokens per window (the context size).
        stride: Step between the start positions of consecutive windows.
        shuffle: Whether the DataLoader shuffles the windows.
        drop_last: Whether to drop the final batch when it holds fewer than
            ``batch_size`` samples.
        num_workers: Number of worker subprocesses used for loading;
            0 loads in the main process.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    # GPT-2 byte-pair-encoding tokenizer.
    gpt2_tokenizer = tiktoken.get_encoding('gpt2')

    # Chunk the tokenized text into input/target windows.
    windowed_dataset = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windowed_dataset, **loader_options)

In [7]:
#import the data and read it into a variable avaliable in the workspace
# Read the full text file into `raw_text` so the following cells can use it.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [8]:
# Build a data loader and fetch its first batch. Each batch is a pair of
# tensors (inputs, targets); each tensor has one row per sample (batch size)
# and one column per token in the window (context size). The context size is
# the number of tokens the model processes at a time.
# stride=max_length makes consecutive windows non-overlapping, and
# shuffle=False keeps them in text order.
max_length = 4 #context size
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)

# Take the first (inputs, targets) batch from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [10]:
# Show the token IDs of the first input batch and confirm its shape.
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [12]:
#generate the token embeddings for each of the input
# Look up the embedding vector for every token ID in the input batch.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


The shape is (batch size, context size, embedding dimension): 8 sequences of 4 tokens each, with every token mapped to a 256-dimensional vector.

In [13]:
# Create a second embedding layer for the positional embeddings.
# Its size is the context size (number of rows) by the vector dimension
# (number of columns), so it holds one vector per position in the window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
# Recall that we are using absolute positional embeddings.
# torch.arange(max_length) produces the zero-based position indices
# 0, 1, ..., max_length - 1, which are then looked up in the positional layer.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [15]:
# Now add the positional embeddings to the token embeddings. The (4, 256)
# positional tensor is broadcast across the batch dimension of the
# (8, 4, 256) token tensor, so every sequence gets the same position vectors.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
