In [1]:
# Load the whole training text into memory as one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw = f.read()

In [2]:
import tiktoken
# Load tiktoken's "gpt2" encoding; this tokenizer object is reused by later cells.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Encode the full raw text into token ids, then display how many tokens it produced.
txt = tokenizer.encode(raw)
len(txt)

5145

In [4]:
# Keep only the first 50 token ids as a small working sample for the cells below.
sample_txt = txt[:50]
len(sample_txt)

50

In [5]:
# Number of tokens in each input window.
context_size = 5

# x is the input window; y is the same window shifted one token to the right,
# so y[k] is the token that follows x[k] in the sample.
x = sample_txt[:context_size]
y = sample_txt[1:context_size+1]

In [6]:
# Print the input window and the shifted target window one above the other.
print(f"x : {x}")
print(f"y :      {y}")

x : [40, 367, 2885, 1464, 1807]
y :      [367, 2885, 1464, 1807, 3619]


In [7]:
# Grow the context one token at a time: the first i sample tokens are the
# context, and the token at position i is the one to be predicted.
for i in range(1, context_size + 1):
    x_i, y_i = sample_txt[:i], sample_txt[i]
    print(x_i, "---", y_i)

[40] --- 367
[40, 367] --- 2885
[40, 367, 2885] --- 1464
[40, 367, 2885, 1464] --- 1807
[40, 367, 2885, 1464, 1807] --- 3619


In [8]:
# Same growing-context walk as with the raw ids, but decoded back to text.
# decode() takes a sequence of ids, so the single target id is wrapped in a list.
for i in range(1, context_size + 1):
    x_i, y_i = sample_txt[:i], sample_txt[i]
    print(tokenizer.decode(x_i), "---", tokenizer.decode([y_i]))

I ---  H
I H --- AD
I HAD ---  always
I HAD always ---  thought
I HAD always thought ---  Jack


In [10]:
! pip install torch



In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

In [10]:
class GPTV1(Dataset):
    """Sliding-window dataset of (input, target) token-id chunks.

    The text is tokenized once. A window of ``max_len`` tokens is then moved
    over the token ids in steps of ``stride``. For each window position the
    dataset stores the window itself as the input and the same window shifted
    one token to the right as the target.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            that returns a sequence of integer token ids.
        max_len: Number of tokens in every input and target chunk.
        stride: Number of tokens the window advances between chunks.
    """

    def __init__(self, txt, tokenizer, max_len, stride):
        self.input_ids = []
        self.target_ids = []

        # The special token is spelled "<|endoftext|>" including the angle
        # brackets. Allow-listing the bare "|endoftext|" never matches it, so
        # text containing the real marker would be rejected by the tokenizer.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop max_len tokens before the end so the shifted target chunk
        # (which reaches index i + max_len) is always full length.
        for i in range(0, len(token_ids) - max_len, stride):
            ip_chunk = token_ids[i : i + max_len]
            target_chunk = token_ids[i + 1 : i + max_len + 1]
            self.input_ids.append(torch.tensor(ip_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of stored (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [11]:
def create_dataloader_v1(txt, batch_size=4, max_len=256, stride=128,
                         shuffle=True, drop_last=True, num_workers=0):
    """Tokenize ``txt`` with the "gpt2" encoding and wrap it in a DataLoader.

    Args:
        txt: Text to turn into sliding-window training pairs.
        batch_size: Passed through to ``DataLoader``.
        max_len: Window length handed to ``GPTV1``.
        stride: Window step handed to ``GPTV1``.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTV1`` dataset built from ``txt``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed = GPTV1(txt, gpt2_encoding, max_len, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed, **loader_options)

In [23]:
# Build a loader over windows of 4 tokens that advance 1 token at a time.
dataloader = create_dataloader_v1(
    raw, batch_size=1, max_len=4, stride=1, shuffle=True
)
# Read the dataset's stored target chunks directly (not through the loader's
# iterator); as the cell's last expression, the whole list is displayed.
dataloader.dataset.target_ids

[tensor([ 367, 2885, 1464, 1807]),
 tensor([2885, 1464, 1807, 3619]),
 tensor([1464, 1807, 3619,  402]),
 tensor([1807, 3619,  402,  271]),
 tensor([ 3619,   402,   271, 10899]),
 tensor([  402,   271, 10899,  2138]),
 tensor([  271, 10899,  2138,   257]),
 tensor([10899,  2138,   257,  7026]),
 tensor([ 2138,   257,  7026, 15632]),
 tensor([  257,  7026, 15632,   438]),
 tensor([ 7026, 15632,   438,  2016]),
 tensor([15632,   438,  2016,   257]),
 tensor([ 438, 2016,  257,  922]),
 tensor([2016,  257,  922, 5891]),
 tensor([ 257,  922, 5891, 1576]),
 tensor([ 922, 5891, 1576,  438]),
 tensor([5891, 1576,  438,  568]),
 tensor([1576,  438,  568,  340]),
 tensor([438, 568, 340, 373]),
 tensor([568, 340, 373, 645]),
 tensor([ 340,  373,  645, 1049]),
 tensor([ 373,  645, 1049, 5975]),
 tensor([ 645, 1049, 5975,  284]),
 tensor([1049, 5975,  284,  502]),
 tensor([5975,  284,  502,  284]),
 tensor([ 284,  502,  284, 3285]),
 tensor([ 502,  284, 3285,  326]),
 tensor([ 284, 3285,  326,   11

In [36]:
# Decode a single token: the third id (index 2) of the first stored input chunk.
tokenizer.decode([dataloader.dataset.input_ids[0][2]])

'AD'

In [23]:
# Show the first 50 characters of the raw text.
raw[:50]

'I HAD always thought Jack Gisburn rather a cheap g'

In [14]:
import torch
# Display the installed PyTorch version string.
torch.__version__

'2.9.0+cu128'

In [25]:
# Rebuild the loader without shuffling so batches come out in dataset order.
dataloader = create_dataloader_v1(
    raw, batch_size=1, max_len=4, stride=1, shuffle=False
)

# Keep an explicit iterator so the following cells can pull successive
# batches with next(); each batch is an [inputs, targets] pair.
data_iter = iter(dataloader)
next(data_iter)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]

In [26]:
# Second batch from the same iterator; with stride=1 it starts one token later.
next(data_iter)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]

In [27]:
# Third batch from the same iterator.
next(data_iter)

[tensor([[2885, 1464, 1807, 3619]]), tensor([[1464, 1807, 3619,  402]])]

In [28]:
# Batches of 8 windows; stride equals max_len, so consecutive windows do not overlap.
dataloader = create_dataloader_v1(
    raw, batch_size=8, max_len=4, stride=4, shuffle=False
)

# Take the first batch and unpack it into its input and target tensors.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("Targets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
