<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>


# The Main Data Loading Pipeline Summarized

The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).

This notebook contains the main takeaway, the data loading pipeline without the intermediate steps.

In [2]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

    return dataloader


with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

vocab_size = 50257
output_dim = 256
context_length = 1024


token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\30616\anaconda3\envs\LLMs\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\30616\anaconda3\envs\LLMs\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\30616\anaconda3\envs\LLMs\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\30616\anaconda3\envs\LLMs\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  Fil

In [6]:
test_x = torch.Tensor([[[1, 2, 3, 4], [5, 6, 7, 8]]])
print(test_x.shape)

print(token_embedding_layer.weight.shape)
print(pos_embedding_layer.weight.shape)
test_o = token_embedding_layer(test_x.long())
test_p = pos_embedding_layer(test_x.long())
print(test_o.shape)
print(test_p.shape)

torch.Size([1, 2, 4])
torch.Size([50257, 256])
torch.Size([1024, 256])
torch.Size([1, 2, 4, 256])
torch.Size([1, 2, 4, 256])


In [7]:
print(encoded_text)

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536, 5469, 438, 14363, 938, 4842, 1650, 353, 438, 2934, 489, 3255, 465, 48422, 540, 450, 67, 3299, 13, 366, 5189, 1781, 340, 338, 1016, 284, 3758, 262, 1988, 286, 616, 4286, 705, 1014, 510, 26, 475, 314, 836, 470, 892, 286, 326, 11, 1770, 13, 8759, 2763, 438, 1169, 2994, 284, 943, 17034, 318, 477, 314, 892, 286, 526, 383, 1573, 11, 319, 9074, 13, 536, 5469, 338, 11914, 11, 33096, 663, 4808, 3808, 62, 355, 996, 484, 547, 12548, 287, 281, 13079, 410, 12523, 286, 

In [6]:
for input_ids, target_ids in dataloader:
    print("Input shape:", input_ids.shape)
    print(f"input data: {input_ids}")
    print("Target shape:", target_ids.shape)
    print(f"target data: {target_ids}")
    print(len(dataloader))
    break

Input shape: torch.Size([8, 4])
input data: tensor([[  691,   262,  9074,    13],
        [ 1021,    11,   618,   339],
        [ 1310,    25,   366,  5297],
        [17548,    11,   290,   665],
        [15632,   438,  2016,   257],
        [  326,  1785,   326,   262],
        [  683,  1969,  2157,   502],
        [   13,   764,   764,   764]])
Target shape: torch.Size([8, 4])
target data: tensor([[  262,  9074,    13,   536],
        [   11,   618,   339,   373],
        [   25,   366,  5297,    11],
        [   11,   290,   665, 24297],
        [  438,  2016,   257,   922],
        [ 1785,   326,   262,  4808],
        [ 1969,  2157,   502,    11],
        [  764,   764,   764,   314]])
160


In [7]:
for batch in dataloader:
    x, y = batch

    token_embeddings = token_embedding_layer(x)
    pos_embeddings = pos_embedding_layer(torch.arange(max_length))

    input_embeddings = token_embeddings + pos_embeddings

    break

In [8]:
print(input_embeddings.shape)
print(input_embeddings[0, 0, :])

torch.Size([8, 4, 256])
tensor([-2.9424e-01,  8.5753e-01,  9.5744e-01,  6.1082e-01,  1.1952e+00,
        -2.0120e+00, -1.3035e+00, -7.2532e-01, -2.0357e+00, -1.8518e-01,
        -4.0570e-01,  6.6317e-01, -2.0013e+00, -5.3268e-01,  3.4946e-01,
        -6.0442e-01,  3.2989e+00, -4.9880e-01,  4.3604e+00,  1.7857e-01,
        -2.4091e+00,  2.0193e+00,  7.1164e-01,  3.2518e-01, -1.0203e-01,
         2.3394e+00,  1.7746e+00,  1.9986e+00,  1.2190e+00,  2.0944e+00,
         4.1957e-01, -3.9419e-01,  4.8193e-01, -2.9612e-02,  1.7715e+00,
        -4.1220e-01, -1.1194e+00, -2.7399e+00,  6.0123e-01,  2.2393e+00,
        -3.9785e-01,  5.9969e-01, -9.3779e-01, -1.8952e+00,  1.6506e+00,
        -1.0535e+00, -7.4525e-01, -3.2800e+00,  1.9876e+00,  6.6153e-01,
        -6.9584e-01, -1.8579e-01,  1.7159e-01,  7.3297e-02,  4.8050e-01,
         1.3998e-01,  3.4670e+00, -4.9924e-01, -1.3674e+00, -4.5125e-01,
         2.9003e+00,  2.1064e+00,  7.8772e-01, -1.0797e+00, -2.5671e+00,
        -1.0506e-01,  2.012