In [9]:
# Read the whole training corpus into memory as a single string.
try:
    with open("data.txt", "r") as file:
        data = file.read()
except OSError as e:
    # Report the problem, then re-raise: every later cell needs `data`, so
    # continuing would only produce a confusing NameError further down.
    print(e)
    raise

In [10]:
# Normalise the raw text: remove newlines and backslashes, then trim the ends.
# NOTE: newlines are removed rather than replaced by a space, so the text on
# either side of a line break is joined directly.
data = data.replace("\n", "").replace("\\", "").strip()


In [12]:
try :
  import tiktoken
except Exception:
  !pip install tiktoken
  import tiktoken


In [14]:
# GPT-2 byte-pair encoding; the same encoding create_data_loader requests.
tokenizer = tiktoken.get_encoding("gpt2")

In [18]:
# Encode the cleaned text into token ids and report how many were produced.
token_ids = tokenizer.encode(data)
print("Total tokens using BPT", len(token_ids))

Total tokens using BPT 5016


In [49]:
context_size = 10

print("Cumulative Input → Next Token Prediction\n" + "-"*40)

# Grow the context one token at a time: the first i tokens are the input and
# the token at position i is the one to be predicted.
for i in range(1, context_size + 1):
    seen_ids, upcoming_id = token_ids[:i], token_ids[i]
    input_text = tokenizer.decode(seen_ids)
    output_text = tokenizer.decode([upcoming_id])

    print(f"Input ({i} tokens): {input_text!r}  --->  Next token: {output_text!r}\n")


Cumulative Input → Next Token Prediction
----------------------------------------
Input (1 tokens): 'THE'  --->  Next token: ' VER'

Input (2 tokens): 'THE VER'  --->  Next token: 'D'

Input (3 tokens): 'THE VERD'  --->  Next token: 'ICT'

Input (4 tokens): 'THE VERDICT'  --->  Next token: 'June'

Input (5 tokens): 'THE VERDICTJune'  --->  Next token: ' 1908'

Input (6 tokens): 'THE VERDICTJune 1908'  --->  Next token: 'I'

Input (7 tokens): 'THE VERDICTJune 1908I'  --->  Next token: ' had'

Input (8 tokens): 'THE VERDICTJune 1908I had'  --->  Next token: ' always'

Input (9 tokens): 'THE VERDICTJune 1908I had always'  --->  Next token: ' thought'

Input (10 tokens): 'THE VERDICTJune 1908I had always thought'  --->  Next token: ' Jack'



# DATA LOADER

In [53]:
import torch
from torch.utils.data import Dataset , DataLoader

# 1️⃣ What stride means

Stride = how many tokens you move forward in your dataset when making the next input-output pair.

It controls how much overlap there is between consecutive input sequences.

# 2️⃣ Example

Suppose:

- token_ids = [0,1,2,3,4,5,6,7,8,9]
- max_length = 4
- stride = 1

# With stride = 1:
- i=0: input  = [0,1,2,3], output = [1,2,3,4]
- i=1: input  = [1,2,3,4], output = [2,3,4,5]
- i=2: input  = [2,3,4,5], output = [3,4,5,6]


✅ Consecutive inputs overlap by 3 tokens (because stride=1, only move 1 token ahead).

# With stride = 4:
- i=0: input = [0,1,2,3], output = [1,2,3,4]
- i=4: input = [4,5,6,7], output = [5,6,7,8]


✅ No overlap between input sequences (stride = max_length).

# 3️⃣ Visual way to think
- Tokens:  0 1 2 3 4 5 6 7 8 9
- Window: [0 1 2 3]  -> stride=1 -> next window starts at 1
- Window:   [1 2 3 4]  -> stride=1 -> next window starts at 2
- Window:     [2 3 4 5] ...


Smaller stride → more overlap → more training examples → slower but better learning.

Larger stride → fewer examples → faster, less redundancy.

# 4️⃣ TL;DR

- Stride = how many tokens to skip forward for the next input-output pair.

- Stride = 1 → move 1 token → inputs highly overlap.

- Stride = max_length → no overlap.

In [70]:
# Why i : i + max_length for input
# input_row_pair = token_ids[i : i + max_length]


# This takes max_length tokens starting at position i.

# Example:

# token_ids = [0,1,2,3,4,5,6,7,8]
# max_length = 4
# stride = 1

# i = 0  → token_ids[0:4] = [0,1,2,3]  (first input sequence)
# i = 1  → token_ids[1:5] = [1,2,3,4]  (next input sequence)
# ✅ So i shifts the window along the sequence.



In [71]:
from torch.utils.data import Dataset, DataLoader
import torch
import tiktoken

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    across the token ids in steps of ``stride``; each sample's target is its
    input window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sequence of int token ids.
        max_length: Number of tokens in every input/target window; must be positive.
        stride: Distance between the starts of consecutive windows; must be positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not a positive number.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail loudly instead of silently building an empty or degenerate dataset.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)
        # Stopping at len - max_length guarantees each window has one extra
        # token after it, so the shifted target is always full length.
        for start in range(0, len(token_ids) - max_length, stride):
            window = token_ids[start : start + max_length + 1]
            self.input_ids.append(torch.tensor(window[:-1]))
            self.target_ids.append(torch.tensor(window[1:]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_data_loader(txt, batch_size, stride=128, max_length=256, drop_last=True, shuffle=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDataset using the given ``max_length`` and ``stride``.

    Args:
        txt: Raw text to tokenize.
        batch_size: Number of windows per batch.
        stride: Step between the starts of consecutive windows.
        max_length: Number of tokens per window.
        drop_last: Passed through to DataLoader.
        shuffle: Passed through to DataLoader.
        num_workers: Passed through to DataLoader.

    Returns:
        A DataLoader over the GPTDataset built from ``txt``.
    """
    # get_encoding takes an encoding name directly, so no model-name lookup is needed.
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDataset(txt, tokenizer=gpt2_encoding, max_length=max_length, stride=stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        drop_last=drop_last,
    )


In [86]:


# Small demo: windows of 4 tokens, advanced 3 tokens at a time, so each window
# shares its first token with the last token of the previous one. Shuffling is
# off to keep the batch in corpus order.
data_loader = create_data_loader(
    data,
    batch_size=8,
    stride=3,
    max_length=4,
    shuffle=False,
)
data_iter = iter(data_loader)
first_batch = next(data_iter)

In [87]:
# Display the first batch: the input-id tensor followed by the target-id tensor.
first_batch

[tensor([[10970, 33310,    35, 18379],
         [18379, 15749, 40417,    40],
         [   40,   550,  1464,  1807],
         [ 1807,  3619,   402,   271],
         [  271, 10899,  2138,   257],
         [  257,  7026, 15632,   438],
         [  438,  2016,   556,   702],
         [  702,  5891,  1576,   438]]),
 tensor([[33310,    35, 18379, 15749],
         [15749, 40417,    40,   550],
         [  550,  1464,  1807,  3619],
         [ 3619,   402,   271, 10899],
         [10899,  2138,   257,  7026],
         [ 7026, 15632,   438,  2016],
         [ 2016,   556,   702,  5891],
         [ 5891,  1576,   438,   568]])]