In [None]:
# Read the whole training text into memory as one string.
with open("the_verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [None]:
import tiktoken

Loading gpt2 tokenizer

## Creating input output pairs

We implement data loaders using a sliding-window approach.

Data loaders are an efficient, structured way of iterating over datasets.
We use PyTorch's built-in `Dataset` and `DataLoader` classes.

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

In [None]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token-prediction dataset built from a single text.

    The text is tokenized once. Each sample is a pair ``(input_ids, target_ids)``
    of 1-D tensors with ``max_length`` token ids each, where the target window
    is the input window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.
        max_length: Number of tokens in every input/target window (must be >= 1).
        stride: Offset, in tokens, between the starts of consecutive windows
            (must be >= 1).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject values that would otherwise silently yield an empty dataset
        # (negative stride) or empty/degenerate windows (max_length <= 0).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the text into sequences of max_length.
        # The target slice ends at i + max_length + 1, so the last valid start
        # is len(token_ids) - max_length - 1; texts with fewer than
        # max_length + 1 tokens therefore produce no samples.
        for i in range(0, len(token_ids) - max_length, stride):
            self.input_ids.append(torch.tensor(token_ids[i:i + max_length]))
            self.target_ids.append(torch.tensor(token_ids[i + 1: i + max_length + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        txt: Raw text to turn into training windows.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length in tokens, forwarded to ``GPTDatasetV1``.
        stride: Step between window starts, forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

In [None]:
# stride == max_length gives non-overlapping windows; shuffle=False keeps text order.
dataloader_2 = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

In [None]:
# Create an iterator so batches can be pulled one at a time with next().
data_iter_2 = iter(dataloader_2)

In [None]:
# Pull the next batch: an [inputs, targets] pair of (8, 4) tensors, where each
# target row is its input row shifted one token ahead.
# Re-running this cell advances the iterator, so it shows a different batch.
next(data_iter_2)

[tensor([[10970, 33310,    35, 18379],
         [  198, 15749, 40417,   628],
         [  198,    40,   550,  1464],
         [ 1807,  3619,   402,   271],
         [10899,  2138,   257,  7026],
         [15632,   438,  2016,   257],
         [  198, 11274,  5891,  1576],
         [  438,   568,   340,   373]]),
 tensor([[33310,    35, 18379,   198],
         [15749, 40417,   628,   198],
         [   40,   550,  1464,  1807],
         [ 3619,   402,   271, 10899],
         [ 2138,   257,  7026, 15632],
         [  438,  2016,   257,   198],
         [11274,  5891,  1576,   438],
         [  568,   340,   373,   645]])]

Let's create token embeddings now.

Let's start with the simplest possible example: a vocabulary of size 6 and an embedding vector of size 3.

In [None]:
# Four example token ids used to index the toy embedding table.
inputs = torch.tensor([2, 3, 5, 1])

In [None]:
# Toy sizes: 6 possible token ids, each mapped to a 3-dimensional vector.
vocab_size = 6
output_dim = 3

# Seed the global RNG so the randomly initialised weights are identical on
# every Restart & Run All.
torch.manual_seed(123)
embed_1 = torch.nn.Embedding(vocab_size, output_dim)

Randomly initialized embedding layer weights

In [None]:
# Weight matrix: one row per token id (vocab_size) and one column per embedding dimension (output_dim).
embed_1.weight

Parameter containing:
tensor([[ 0.6590,  1.8272,  0.9967],
        [ 0.7668,  0.0812, -0.2805],
        [ 0.3666, -0.4789, -0.9839],
        [ 0.9342,  1.6583, -0.8979],
        [ 0.0170, -1.2282,  0.9828],
        [-1.0764, -0.1559,  1.5065]], requires_grad=True)

When we train the model, the weights of this embedding layer (which sits at the very start of the model) are tuned together with the weights of the rest of the neural network, and the tuned weights are then used when predicting the next word.

In [None]:
# Embedding lookup: row k of the result is the weight-matrix row for token id inputs[k].
print(embed_1(inputs))

tensor([[ 0.3666, -0.4789, -0.9839],
        [ 0.9342,  1.6583, -0.8979],
        [-1.0764, -0.1559,  1.5065],
        [ 0.7668,  0.0812, -0.2805]], grad_fn=<EmbeddingBackward0>)


Positional encodings

In [None]:
# Rebinds vocab_size / output_dim from the toy example above.
# NOTE(review): 50257 is presumably the vocabulary size of the "gpt2" tokenizer — confirm.
vocab_size = 50257
# Width of each token's embedding vector.
output_dim = 256

# Token-embedding table with vocab_size rows and output_dim columns (randomly initialised).
embed = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
# Number of tokens per training window.
max_length = 4
# stride == max_length, so consecutive windows do not overlap; shuffle=False keeps text order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)
# Iterator used below to pull a single batch.
data_itr = iter(dataloader)

In [None]:
# Take one batch of token ids; note this rebinds `inputs` from the toy example above.
inputs, targets = next(data_itr)

In [None]:
# Replace every token id in the batch with its output_dim-sized embedding vector.
token_embedding = embed(inputs)

In [None]:
# Expected layout: (batch_size, max_length, output_dim).
token_embedding.shape

torch.Size([8, 4, 256])

Now we add positional embedding

- The positional embedding layer has only `context_length = 4` rows because we only need embeddings for positions 0, 1, 2 and 3; unlike the token embedding layer, it does not need 50257 rows.

- We then add these positional embeddings to the token embeddings to get the final input embeddings that are fed to the model.

In [None]:
# The positional table needs one row per position in a window, i.e. max_length rows.
context_length = max_length
# Same embedding width as the token embeddings so the two can be added.
embed_pos = torch.nn.Embedding(context_length, output_dim)

In [None]:
# torch.arange(max_length) yields the position indices 0..max_length-1
# (max_length equals context_length here), so every row of embed_pos is looked up once.
pos_embeddings = embed_pos(torch.arange(max_length))

In [None]:
# One row per position (max_length rows), each with output_dim values.
pos_embeddings

tensor([[ 0.4138,  0.1417, -1.0336,  ...,  1.0254,  1.0297,  0.3817],
        [ 0.5235, -0.4544, -0.3201,  ...,  0.8458, -2.0020,  0.8202],
        [ 0.2843, -0.6244, -1.6005,  ...,  0.2893,  1.2007,  1.5052],
        [ 1.1805, -1.9126,  0.9538,  ..., -1.1906, -1.4974,  0.0035]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
# Broadcasting adds the (4, 256) positional table to each of the 8 sequences
# in the (8, 4, 256) token-embedding batch.
inputs_embeddings = pos_embeddings + token_embedding

In [None]:
# Shape is unchanged by the addition: (batch_size, max_length, output_dim).
inputs_embeddings.shape

torch.Size([8, 4, 256])