# Build your own GPT

### 1. Asses Compute 

In [5]:
import torch, platform

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
print("CPU:", platform.processor())
print("Platform:", platform.platform())


Torch: 2.10.0+cpu
CUDA available: False
CPU: Intel64 Family 6 Model 78 Stepping 3, GenuineIntel
Platform: Windows-10-10.0.19045-SP0


### 2. Choose and Download Dataset (Project Gutenberg)

In [6]:
from urllib import request

BOOK_URLS = [
    "https://www.gutenberg.org/cache/epub/2701/pg2701.txt",  # Moby Dick; Or, The Whale
    "https://www.gutenberg.org/cache/epub/11/pg11.txt",    # Alice's Adventures in Wonderland
]

def download_text(url: str) -> str:
    raw = request.urlopen(url).read()
    text = raw.decode("utf-8", errors="replace")
    start = text.find("*** START OF")
    end = text.find("*** END OF")
    if start != -1 and end != -1:
        text = text[start:end]

    return text

text = "\n\n".join(download_text(u) for u in BOOK_URLS)

print("Dataset characters:", len(text))
print(text[:500])


Dataset characters: 1389023
*** START OF THE PROJECT GUTENBERG EBOOK MOBY DICK; OR, THE WHALE ***




MOBY-DICK;

or, THE WHALE.

By Herman Melville



CONTENTS

ETYMOLOGY.

EXTRACTS (Supplied by a Sub-Sub-Librarian).

CHAPTER 1. Loomings.

CHAPTER 2. The Carpet-Bag.

CHAPTER 3. The Spouter-Inn.

CHAPTER 4. The Counterpane.

CHAPTER 5. Breakfast.

CHAPTER 6. The Street.

CHAPTER 7. The Chapel.

CHAPTER 8. The Pulpit.

CHAPTER 9. The Sermon.

CHAPTER 10. A Bosom Friend.

CHAPTER 11. Ni


### 3. Preprocess: vocabulary + encode/decode

In [7]:
# Build vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Vocab size:", vocab_size)

# Create mappings
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

# Encode/decode functions
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

# Quick sanity test
test_str = "the princess smiled"
print(encode(test_str)[:40])
print(decode(encode(test_str)))


Vocab size: 103
[74, 62, 59, 2, 70, 72, 63, 68, 57, 59, 73, 73, 2, 73, 67, 63, 66, 59, 58]
the princess smiled


### 4. Tokenize entire dataset + train/val split

In [8]:
import torch

# Convert full text to integer tokens
data = torch.tensor(encode(text), dtype=torch.long)
print("Total tokens:", data.numel())

# Split train/val
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print("Train tokens:", train_data.numel())
print("Val tokens:", val_data.numel())


Total tokens: 1389023
Train tokens: 1250120
Val tokens: 138903


### 5. Batching

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define initial hyperparameters 
batch_size = 16
block_size = 32

def get_batch(split: str):
    data_split = train_data if split == "train" else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

# Sanity check
xb, yb = get_batch("train")
print("x batch shape:", xb.shape)
print("y batch shape:", yb.shape)


x batch shape: torch.Size([16, 32])
y batch shape: torch.Size([16, 32])
