In [1]:
import json

# Read the whole instruction dataset (a JSON document) into memory.
with open("instruction-data.json", mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Peek at the first record to see which fields each entry carries.
print(data[0])

{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.', 'input': 'freind --> friend', 'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'}


In [3]:
def format_input(item):
    """
    Build the prompt part (everything before the response) for one record.

    The prompt starts with a fixed header sentence pair, followed by an
    "### Instruction:" section. An "### Input:" section is appended only
    when the record carries a non-empty input.

    Args:
        item: Mapping with an 'instruction' key and an optional 'input' key.

    Returns:
        str: The formatted prompt text, without the response section.

    Raises:
        KeyError: If 'instruction' is missing from item.
    """
    instruction_text = (
        # The trailing space keeps the two header sentences from running
        # together ("...a task.Write a response...").
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{item['instruction']}"
    )

    # A missing 'input' key is treated the same as an empty input.
    input_value = item.get("input")
    input_text = f"\n\n### Input:\n{input_value}" if input_value else ""
    return instruction_text + input_text

# Preview the first record as one complete training text: the prompt part
# built by format_input, followed by a "### Response:" section that holds
# the record's 'output' field.
myinput = format_input(data[0])
response = f"\n\n### Response:\n{data[0]['output']}"

print(myinput + response)


Below is an instruction that describe a task.Write a response that appropriately completes the request.

### Instruction:
Evaluate the following phrase by transforming it into the spelling given.

### Input:
freind --> friend

### Response:
The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".


### Split Dataset

In [None]:
# 8:1:1 (dataset split)
# The split takes contiguous slices in file order (no shuffling): the first
# 80% for training, the next 10% for validation, and whatever remains
# (including any rounding leftovers) for testing.
train_part = int(len(data) * 0.8)
val_part = int(len(data) * 0.1)
test_part = len(data) - (train_part + val_part)

train_data, val_data, test_data = (
    data[:train_part],
    data[train_part:train_part + val_part],
    data[train_part + val_part:],
)

print("train set length: ", len(train_data))
print("val set length: ", len(val_data))
print("test set length: ", len(test_data))


train set length:  880
val set length:  110
test set length:  110


### Dataset

In [7]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """
    Dataset of pre-tokenized instruction/response texts.

    Every record is rendered as prompt (via format_input) plus a
    "### Response:" section containing the record's 'output', and the
    resulting text is encoded once, up front, with the given tokenizer.
    Indexing returns whatever tokenizer.encode produced for that record.
    """

    def __init__(self, data, tokenizer):
        # Keep a reference to the raw records; __len__ reports their count.
        self.data = data
        # Encode eagerly so __getitem__ is a plain list lookup.
        self.samples = [
            tokenizer.encode(
                format_input(record) + f"\n\n### Response:\n{record['output']}"
            )
            for record in data
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.samples[idx]


### Data Loader

In [10]:
import tiktoken

# Load the "gpt2" encoding from tiktoken; the same tokenizer object is later
# handed to InstructionDataset to encode every sample.
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the "<|endoftext|>" marker, explicitly allowing it as a special
# token, and print the resulting id list.
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [None]:
def my_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_token_id=-100,
    allowed_max_length=None,
    device="cpu",
):
    """
    Custom batch collation function for language model training.

    Appends one end-of-text token (pad_token_id) to every sequence, pads all
    sequences to a common length, and builds input/target pairs for
    next-token prediction (targets are the inputs shifted left by one).
    Padding positions in the targets are replaced by ignore_token_id so they
    can be skipped by the loss; the single appended end-of-text target is kept.

    Padding is located by sequence length rather than by token value, so a
    sequence that already contains pad_token_id is handled correctly.

    Args:
        batch: A batch of encoded token sequences (lists or tuples of ints).
        pad_token_id: Token ID used for the end-of-text marker and for padding,
            default is 50256 (GPT-2's <|endoftext|> token).
        ignore_token_id: Value written into padding positions of the targets,
            default is -100.
        allowed_max_length: Optional maximum length; inputs and targets are
            truncated to this many positions. If None, no truncation happens.
        device: Device both returned tensors are moved to, default is "cpu".

    Returns:
        tuple: A tuple containing:
            - inputs_tensor (torch.Tensor): Input sequences, shape
              (batch_size, length).
            - targets_tensor (torch.Tensor): Target sequences, same shape.
              Padding positions are set to ignore_token_id.
        Here length is the longest sequence in the batch, capped by
        allowed_max_length when given.

    Raises:
        ValueError: If batch is empty (raised by max()).
    """
    # +1 leaves room for the end-of-text token appended to every sequence.
    batch_max_length = max(len(seq) + 1 for seq in batch)
    input_list, target_list = [], []

    for seq in batch:
        tokens = list(seq)
        # One end-of-text token plus however much padding is needed to reach
        # the common length; both use pad_token_id.
        padded = tokens + [pad_token_id] * (batch_max_length - len(tokens))

        inputs = torch.tensor(padded[:-1], dtype=torch.long)  # drop the last token
        targets = torch.tensor(padded[1:], dtype=torch.long)  # shift left by one

        # padded  = [1 2 3 50256 50256 50256]
        # inputs  = [1 2 3 50256 50256]
        # targets = [2 3 50256 50256 50256]
        # final   = [2 3 50256  -100  -100]
        # The appended end-of-text token sits at targets[len(tokens) - 1];
        # everything after it is pure padding.
        targets[len(tokens):] = ignore_token_id

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        input_list.append(inputs)
        target_list.append(targets)

    # Stack into 2D tensors and move BOTH to the requested device so the
    # model output and the targets end up on the same device.
    inputs_tensor = torch.stack(input_list).to(device)
    targets_tensor = torch.stack(target_list).to(device)

    return inputs_tensor, targets_tensor

In [15]:
import torch

# Toy walk-through: find the indices where the mask is True, then overwrite
# every such position except the first with -100.
mask = [False, False, True, True, True]
slice = torch.nonzero(torch.tensor(mask)).squeeze()
print(slice)

targets = torch.tensor([1, 2, 50256, 50256, 50256])
targets[slice[1:]] = -100
print(targets)


tensor([2, 3, 4])
tensor([    1,     2, 50256,  -100,  -100])


In [None]:
from torch.utils.data import DataLoader

torch.manual_seed(123)
batch_size = 4

# Tokenize each split once, up front.
train_dataset = InstructionDataset(train_data, tokenizer)
val_dataset = InstructionDataset(val_data, tokenizer)
test_dataset = InstructionDataset(test_data, tokenizer)

# Training loader: shuffled, and the trailing partial batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=my_collate_fn,
)

# Validation loader: fixed order, every sample kept.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=my_collate_fn,
)

# Test loader: fixed order, every sample kept.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=my_collate_fn,
)

