## Exercise 2.1 - Byte pair encoding of unknown words
Try the BPE tokenizer from the tiktoken library on the unknown words “Akwirw ier” and print the individual token IDs. Then, call the decode function on each of the resulting integers in this list to reproduce the mapping shown in figure 2.11. Lastly, call the decode method on the token IDs to check whether it can reconstruct the original input, “Akwirw ier.”

In [1]:
import tiktoken
# Sample text made of words that are not in the vocabulary as whole words
unknown_text = "Akwirw ier"

# GPT-2 byte pair encoding provided by tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [2]:
# Encode the sample text into a list of integer token IDs and show them
token_ids = tokenizer.encode(unknown_text)
print("Individual token IDs: ", token_ids)

Individual token IDs:  [33901, 86, 343, 86, 220, 959]


In [3]:
# Decode every ID on its own to reveal which text piece each token stands for
for token_id in token_ids:
    piece = tokenizer.decode([token_id])
    print(token_id, ' ---> ', piece)

33901  --->  Ak
86  --->  w
343  --->  ir
86  --->  w
220  --->   
959  --->  ier


In [4]:
# Decode the complete ID sequence in a single call
reconstructed_text = tokenizer.decode(token_ids)
print("Reconstructed text through decode method: ", reconstructed_text)

# The exercise asks to check the reconstruction, so make that check explicit
# instead of relying on reading the printed output.
assert reconstructed_text == unknown_text, (
    "decode() did not reproduce the original input"
)

Reconstructed text through decode method:  Akwirw ier


## Exercise 2.2 - Data loaders with different strides and context sizes
To develop more intuition for how the data loader works, try to run it with different settings such as max_length=2 and stride=2, and max_length=8 and stride=2.

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Each window of ``max_length`` consecutive
    token IDs becomes an input sequence, and the same window shifted one
    position to the right becomes its target sequence.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token IDs (e.g. a tiktoken encoding).
        max_length: Number of tokens in every input/target sequence (>= 1).
        stride: Offset between the starts of consecutive windows (>= 1).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        If the text has ``max_length`` tokens or fewer, no complete
        input/target pair fits and the dataset is empty.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject settings that would otherwise fail silently: a negative
        # stride gives an empty range, and max_length <= 0 gives empty or
        # mismatched input/target chunks.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Slide a window over the token IDs; stopping at len - max_length
        # guarantees the target (shifted by one) never runs past the end.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            input_chunk = token_ids[start:end]
            target_chunk = token_ids[start + 1:end + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [10]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader that yields sliding-window (input, target) batches.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1`` using ``max_length`` and ``stride``; the remaining
    arguments are forwarded unchanged to ``torch.utils.data.DataLoader``.

    Args:
        txt: Raw text to turn into training windows.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Tokens per window, passed to ``GPTDatasetV1``.
        stride: Offset between consecutive windows, passed to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    # GPT-2 byte pair encoding
    bpe = tiktoken.get_encoding("gpt2")

    # Chunk the tokenized text into input/target windows
    windows = GPTDatasetV1(txt, bpe, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [11]:
# Load the complete text file into a single string
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

### With max_length = 2, stride = 2

In [33]:
# stride == max_length, so consecutive input windows do not overlap
dataloader = create_dataloader_v1(
    txt=raw_text,
    batch_size=2,
    max_length=2,
    stride=2,
    shuffle=False,
)

# Keep the iterator around so that the next cell can pull the following batch
data_iter = iter(dataloader)

first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367],
        [2885, 1464]]), tensor([[ 367, 2885],
        [1464, 1807]])]


In [34]:
# Advance the same iterator to fetch the batch that follows first_batch
second_batch = next(data_iter)
print(second_batch)

[tensor([[1807, 3619],
        [ 402,  271]]), tensor([[ 3619,   402],
        [  271, 10899]])]


### With max_length = 8, stride = 2

In [38]:
# stride < max_length, so each window starts 2 tokens after the previous one
# and consecutive windows share 6 tokens
dataloader = create_dataloader_v1(
    txt=raw_text,
    batch_size=1,
    max_length=8,
    stride=2,
    shuffle=False,
)

# Keep the iterator around so that the next cell can pull the following batch
data_iter = iter(dataloader)

first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]


In [37]:
# Advance the same iterator; with stride=2 this window starts two tokens later
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]
