In [None]:
# Look up the installed version of the `torch` distribution from package
# metadata (this does not import torch itself) and print it.
from importlib.metadata import version
print(version('torch'))

In [2]:
import os
import urllib.request

# Define the local path and source URL unconditionally so that `file_path`
# exists whether or not a download is needed on this run.
file_path = "the-verdict.txt"
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")

# Download only when there is no local copy yet; re-running the cell is a no-op.
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

In [None]:
# Read the whole text file into a single string.
with open('the-verdict.txt', mode='r', encoding='utf-8') as text_file:
    raw_text = text_file.read()

# Report the character count, then preview the first 99 characters.
print(f"File len: {len(raw_text)}", raw_text[:99], sep="\n")

In [None]:
import re

def remove_space_before_punctuation(text):
    """Drop whitespace that directly precedes ',', '.', ':' or ';'.

    Args:
        text: String to clean up.

    Returns:
        A copy of ``text`` in which every run of whitespace immediately
        followed by one of ``, . : ;`` has been removed (the punctuation
        character itself is kept).
    """
    return re.sub(r'\s+([,.:;])', r'\1', text)


# Keep the original short name so any cell calling f(...) still works.
f = remove_space_before_punctuation
f('hello , world')

In [None]:
import tiktoken
# Load the GPT-2 encoding and encode a sample that ends with the special
# end-of-text marker; the marker must be explicitly allowed.
tokenizer = tiktoken.get_encoding('gpt2')
a = tokenizer.encode(
    'How are you?<|endoftext|>',
    allowed_special={"<|endoftext|>"},
)
a

In [None]:
# Sliding-window sketch on the sample token ids in `a`
# (assumes `a` holds 5 token ids -- TODO confirm against the encode output):
#   max_length = 3, stride = 1
#   input windows  -> a[0:3], a[1:4]
#   target windows -> a[1:4], a[2:5]  (each input window shifted right by one)
# The expression below shows the two target windows.
a[1: 4], a[2:5]

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` token ids is then
    moved over the token sequence in steps of ``stride``; for each window the
    target is the same window shifted one position to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.
        max_length: Number of token ids in every input and target tensor.
        stride: Distance, in tokens, between the starts of consecutive windows.

    Attributes:
        input_ids: List of 1-D tensors, one per window.
        target_ids: List of 1-D tensors aligned with ``input_ids``.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        super().__init__()
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Bound the loop by the number of *tokens*, not characters: a text has
        # more characters than tokens, so len(txt) would run the window past
        # the end of token_ids and append truncated or empty samples.
        # Stopping at len(token_ids) - max_length leaves room for the target's
        # extra trailing token.
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[start:start + max_length]
            target_chunk = token_ids[start + 1:start + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [10]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over GPTDatasetV1 windows of ``txt``.

    Args:
        txt: Text passed to GPTDatasetV1 for tokenization.
        batch_size: Forwarded to DataLoader.
        max_length: Window length forwarded to GPTDatasetV1.
        stride: Window step forwarded to GPTDatasetV1.
        shuffle: Forwarded to DataLoader.
        drop_last: Forwarded to DataLoader.
        num_workers: Forwarded to DataLoader.

    Returns:
        A DataLoader wrapping a GPTDatasetV1 built with the 'gpt2' tiktoken
        encoding.
    """
    gpt2_encoding = tiktoken.get_encoding('gpt2')
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(
        GPTDatasetV1(txt, gpt2_encoding, max_length, stride),
        **loader_options,
    )

In [None]:
# Embedding table with 4 rows (one per index) of 5 values each;
# show the shape of its weight matrix.
embedding = torch.nn.Embedding(num_embeddings=4, embedding_dim=5)
embedding.weight.shape