# Data Sampling with a Sliding Window on Numeric Data

In [1]:
# Print the installed PyTorch version, looked up from the package metadata.
from importlib.metadata import version
import torch
print(f"PyTorch version: {version('torch')}")

PyTorch version: 2.9.1


Prepare the numeric data from 0 to 1000:

In [2]:
# Write the integers 0 through 1000 to disk, each followed by a single space.
with open("number-data.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{number} " for number in range(1001))

Next, we will build a simple dataset class. Instead of using a tokenizer, we parse the integers directly from the text file:

In [3]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset over whitespace-separated integers.

    Each sample is an ``(input, target)`` pair of 1-D tensors of length
    ``max_length``; the target is the input window shifted one position
    to the right.

    Args:
        txt: String of integers separated by whitespace.
        tokenizer: Unused; kept so the constructor signature matches the
            tokenizer-based version of this class.
        max_length: Number of values in each window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # The integers themselves serve as token ids, so no tokenizer call is made.
        numbers = list(map(int, txt.split()))

        # Stop early enough that the shifted target window still fits in the data.
        window_starts = range(0, len(numbers) - max_length, stride)

        self.input_ids = [
            torch.tensor(numbers[start : start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(numbers[start + 1 : start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the input window and its shifted target window at ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets

In [4]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples parsed from ``txt``.

    Args:
        txt: String of whitespace-separated integers.
        batch_size: Number of windows per batch.
        max_length: Length of each input/target window.
        stride: Distance between the starts of consecutive windows.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # Numeric data needs no tokenizer, so the dataset's tokenizer slot gets None.
    windows = GPTDatasetV1(txt, None, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [5]:
# Read the generated file back as a single string (text mode is the default).
with open("number-data.txt", encoding="utf-8") as f:
    raw_text = f.read()

In [12]:
# Test the dataloader with a batch size of 1 and context size of 4
# (stride=1 starts each window one number after the previous one;
# shuffle=False keeps the windows in file order)
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Keep the iterator: the following cells pull successive batches from it
data_iter = iter(dataloader)

# A batch holds the input windows and the target windows shifted by one position
first_batch = next(data_iter)
print("First batch:", first_batch)

First batch: [tensor([[0, 1, 2, 3]]), tensor([[1, 2, 3, 4]])]


In [13]:
# Pull the next batch from the same iterator; with stride=1 the window starts one number later
second_batch = next(data_iter)
print("Second batch:", second_batch)

Second batch: [tensor([[1, 2, 3, 4]]), tensor([[2, 3, 4, 5]])]


In [14]:
# Pull one more batch from the same iterator; the window advances by one number again
third_batch = next(data_iter)
print("Third batch:", third_batch)

Third batch: [tensor([[2, 3, 4, 5]]), tensor([[3, 4, 5, 6]])]


In [15]:
# Run through the whole dataloader; after the loop `batch` still holds the final batch
for batch in dataloader:
    pass

last_batch = batch
print("Last batch:", last_batch)

Last batch: [tensor([[996, 997, 998, 999]]), tensor([[ 997,  998,  999, 1000]])]


We can also change the batch size and the stride. Here the stride equals the context size, so consecutive input windows do not overlap:

In [19]:
# stride == max_length, so consecutive input windows do not overlap
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=2,
    max_length=4,
    stride=4,
    shuffle=False
)

# Iterate to the end so `inputs` and `targets` hold the last batch
for inputs, targets in dataloader:
    pass

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[992, 993, 994, 995],
        [996, 997, 998, 999]])

Targets:
 tensor([[ 993,  994,  995,  996],
        [ 997,  998,  999, 1000]])


In [20]:
# Set `shuffle=True` to shuffle the data
# Seed PyTorch's random number generator first so the shuffled order is repeatable
torch.manual_seed(0)
dataloader = create_dataloader_v1(raw_text, batch_size=2, max_length=4, stride=4, shuffle=True)

# Iterate to the end so `inputs` and `targets` hold the last batch of the shuffled order
for inputs, targets in dataloader:
    pass

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[224, 225, 226, 227],
        [520, 521, 522, 523]])

Targets:
 tensor([[225, 226, 227, 228],
        [521, 522, 523, 524]])
