In [1]:
import json
import os
import urllib

def download_and_load_file(file_path, url):
    """Return the parsed JSON stored at ``file_path``, downloading it first if needed.

    Args:
        file_path: Local path of the cached JSON file.
        url: Location the file is fetched from when ``file_path`` does not exist.

    Returns:
        The object produced by ``json.load`` on the cached file.
    """
    # A bare ``import urllib`` does not guarantee the ``request`` submodule is loaded.
    import urllib.request

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Parse on both paths so that a fresh download also returns the data
    # (previously the return lived in the "already cached" branch only).
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Local cache file and remote source of the instruction dataset
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print("Number of entries :", len(data))

    

Number of entries : 1100


In [2]:
def format_input(entry):
    """Render the instruction (and optional input) part of a prompt.

    Args:
        entry: Dict with an 'instruction' key and, optionally, an 'input' key.

    Returns:
        The prompt preamble followed by an "### Instruction:" section, plus an
        "### Input:" section when the entry has a non-empty 'input'.
    """
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n"
        f"\n ### Instruction:\n {entry['instruction']}"
    )

    # A missing 'input' key is treated the same as an empty one
    optional_input = entry.get("input")
    input_text = f"\n\n ### Input:\n {optional_input}" if optional_input else ""
    return instruction_text + input_text



In [3]:
# Render entry 50 as a prompt and show it together with its expected response
sample_input = format_input(data[50])
desired_response = f"\n\n ### Response:\n {data[50]['output']}"
print(sample_input, desired_response, sep="")

Below is an instruction that describes a task. Write a response that appropriately completes thes request.

 ### Instruction:
 Identify the correct spelling of the following word.

 ### Input:
 Ocassion

 ### Response:
 The correct spelling is 'Occasion.'


In [4]:
# Split sizes: 85% of the entries for training, 10% for testing, remainder for validation
train_portion = int(len(data)*0.85)
test_portion = int(len(data)*0.1)
val_portion = len(data) - train_portion - test_portion 

train_data = data[:train_portion]
test_data = data[train_portion: train_portion+test_portion]
# NOTE(review): this is an index, not a slice (no trailing ':'), so val_data is
# the single entry at that position rather than the remaining entries, and
# len(val_data) below counts that entry's keys. val_portion is never used here.
val_data = data[train_portion + test_portion]

print("Training set length:",len(train_data))
print("Validation set length:",len(val_data))
print("Test set length:",len(test_data))

Training set length: 935
Validation set length: 3
Test set length: 110


In [5]:
# Validation split: every entry after the train and test ranges. Taking the slice
# directly from `data` gives the full remainder and keeps this cell safe to re-run.
val_data = data[train_portion + test_portion:]

### Creating Dataset

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class InstructionDataset(Dataset):
    """Dataset of pre-tokenized instruction/response prompts.

    Each entry is rendered with ``format_input``, followed by a
    "### Response:" section holding ``entry['output']``; the resulting text is
    encoded once, up front, with the supplied tokenizer.
    """

    def __init__(self, data, tokenizer):
        """Encode every entry of ``data``.

        Args:
            data: Iterable of entries; each needs an 'output' key plus whatever
                ``format_input`` reads from it.
            tokenizer: Object exposing ``encode(text)``.
        """
        self.data = data
        self.encoded_text = []

        for entry in data:
            instruction_plus_input = format_input(entry)
            # Same header layout as the other prompt sections: no space
            # between the colon and the newline.
            response_text = f"\n\n ### Response:\n {entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_text.append(tokenizer.encode(full_text))

    def __getitem__(self, index):
        """Return the encoded prompt stored at ``index``."""
        return self.encoded_text[index]

    def __len__(self):
        """Return the number of encoded prompts."""
        # Measure the list __getitem__ actually serves from
        return len(self.encoded_text)

In [14]:
def custom_collate_fn(
        batch,
        pad_token_id = 50256,
        ignore_index = -100,
        allowed_max_length = None,
        device = "cpu"
):
    """Pad a batch of token-id lists and build shifted-by-one targets.

    Args:
        batch: Sequence of token-id lists.
        pad_token_id: Id used to right-pad every list to the longest one.
        ignore_index: Value written over all padding positions in the targets
            except the first one.
        allowed_max_length: When given, inputs and targets are cut to this length.
        device: Device the stacked tensors are moved to.

    Returns:
        ``(inputs, targets)``: two stacked tensors with one row per batch item.
    """
    longest = max(len(seq) for seq in batch)
    input_rows, target_rows = [], []

    for seq in batch:
        tokens = seq.copy()
        n_pad = longest - len(tokens)
        padded_tokens = tokens + [pad_token_id] * n_pad

        input_ids = torch.tensor(padded_tokens)
        # Targets are the inputs shifted left by one, closed with one pad token
        target_ids = torch.tensor(padded_tokens[1:] + [pad_token_id])

        # Keep the first pad token in the targets and overwrite every later
        # one with ignore_index; nothing to do with fewer than two pad tokens.
        pad_positions = torch.nonzero(target_ids == pad_token_id).squeeze()
        if pad_positions.numel() > 1:
            target_ids[pad_positions[1:]] = ignore_index

        if allowed_max_length is not None:
            input_ids = input_ids[:allowed_max_length]
            target_ids = target_ids[:allowed_max_length]

        input_rows.append(input_ids)
        target_rows.append(target_ids)

    return (
        torch.stack(input_rows).to(device),
        torch.stack(target_rows).to(device),
    )

# Three toy sequences of different lengths for trying out the collate function
inputs_1 = list(range(5))
inputs_2 = [5, 6]
inputs_3 = [5, 2, 4, 1]
batch = inputs_1, inputs_2, inputs_3


In [15]:
# Collate the toy batch and show the padded inputs next to the masked targets
inputs, targets = custom_collate_fn(batch)
print(inputs, targets, sep="\n")

New item type <class 'list'>
[1, 2, 3, 4] <class 'list'>
[50256] <class 'list'>
New item type <class 'list'>
[6, 50256, 50256, 50256] <class 'list'>
[50256] <class 'list'>
New item type <class 'list'>
[2, 4, 1, 50256] <class 'list'>
[50256] <class 'list'>
tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    5,     2,     4,     1, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    2,     4,     1, 50256,  -100]])


In [10]:
from functools import partial

# Use the GPU when one is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Collate function with the target device and the length cap filled in
customized_collate_fn = partial(custom_collate_fn, allowed_max_length=1024, device=device)

In [16]:
from torch.utils.data import DataLoader
import tiktoken

num_workers = 0
batch_size = 8

# Seed before the shuffling training loader is created
torch.manual_seed(123)
tokenizer = tiktoken.get_encoding("gpt2")

# Training: shuffled, and the incomplete final batch is dropped
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=True,
    collate_fn=customized_collate_fn,
    drop_last=True,
)

# Evaluation loaders keep every sample in a fixed order. With drop_last=True
# the trailing partial batch is discarded, and a split smaller than
# batch_size produces no batches at all.
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    collate_fn=customized_collate_fn,
    drop_last=False,
)
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    collate_fn=customized_collate_fn,
    drop_last=False,
)



In [17]:
# Report validation entries that are not dicts or lack one of the prompt fields
for i, entry in enumerate(val_data):
    if isinstance(entry, dict):
        if any(key not in entry for key in ('instruction', 'input', 'output')):
            print(f"Entry at index {i} is missing required keys: {entry}")
    else:
        print(f"Entry at index {i} is not a dictionary: {entry}")


In [18]:
# Walk the training loader once and show the shape of every collated batch
print("train loader  :", end="\n\n")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

train loader  :

[318, 281, 12064, 326, 8477, 257, 4876, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 82, 2581, 13, 628, 44386, 46486, 25, 198, 16140, 6525, 262, 1708, 6827, 523, 326, 340, 318, 287, 4075, 3809, 13, 628, 44386, 23412, 25, 198, 383, 12187, 373, 22979, 416, 10490, 13, 628, 44386, 18261, 25, 220, 198, 10490, 22979, 262, 12187, 13] <class 'list'>
[50256] <class 'list'>
[318, 281, 12064, 326, 8477, 257, 4876, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 82, 2581, 13, 628, 44386, 46486, 25, 198, 1867, 318, 262, 22918, 286, 26600, 30, 628, 44386, 18261, 25, 220, 198, 383, 22918, 286, 26600, 318, 6032, 1088, 362, 13, 20, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256] <class 'list'>
[50256] <class 'list'>
[318, 281, 12064, 326, 8477, 257, 4876, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 82, 2581, 13, 628, 44386, 46486, 25, 198, 1867, 318, 262, 22801, 1296, 286, 705, 65, 7321, 1505, 30960, 628, 44386, 18261, 25, 220, 198, 383, 22801, 

In [None]:
# Load tiktoken's "gpt2" encoding.
# NOTE(review): the DataLoader cell above already runs these same two
# statements; this unexecuted cell looks redundant.
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")


In [None]:
# Encode the training split into an InstructionDataset.
# NOTE(review): built from the same arguments as train_dataset in the DataLoader
# cell, and instruction_data is not used in the visible part of the notebook.
instruction_data = InstructionDataset(train_data, tokenizer)