In [5]:
import json
import os
import urllib

def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            print(response)
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding = "utf-8") as file:
            file.write(text_data)

    else: 
        with open(file_path, "r", encoding = "utf-8") as file:
            text_data = file.read()
        with open(file_path, "r") as file:
            data = json.load(file)
        return data

file_path = "instruction-data.json"
url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json")

data =  download_and_load_file(file_path, url)
print("Number of entries :", len(data))

    

Number of entries : 1100


In [6]:
def format_input(entry):
    instruction_text= (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes thes request.\n"
        f"\n ### Instruction:\n {entry['instruction']}")

    input_text=(
        f"\n\n ### Input:\n {entry['input']}" if entry['input'] else "")
    return instruction_text+ input_text



In [7]:
sample_input = format_input(data[50])
desired_response = f"\n\n ### Response:\n {data[50]['output']}"
print(sample_input+desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes thes request.

 ### Instruction:
 Identify the correct spelling of the following word.

 ### Input:
 Ocassion

 ### Response:
 The correct spelling is 'Occasion.'


In [8]:
train_portion = int(len(data)*0.85)
test_portion = int(len(data)*0.1)
val_portion = len(data) - train_portion - test_portion 

train_data = data[:train_portion]
test_data = data[train_portion: train_portion+test_portion]
val_data = data[train_portion + test_portion]

print("Training set length:",len(train_data))
print("Validation set length:",len(val_data))
print("Test set length:",len(test_data))

Training set length: 935
Validation set length: 3
Test set length: 110


### Creating Dataset

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_text = []

        for entry in data :
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n ### Response: \n {entry['output']}"
            full_text = instruction_plus_input+response_text
            self.encoded_text.append(tokenizer.encode(full_text))

    def __getitem__(self, index):
        return self.encoded_text[index]
    
    def __len__(self):
        return len(self.data)

In [40]:
def custom_collate_draft_1(batch, pad_token_id = 50256, device = "cpu"):
    batch_max_lenght = max(len(item) for item in batch)
    inputs_lst = []
    for item in batch:
        new_item = item.copy()
        #new_item+= [pad_token_id]
        print(f" length of the batch : {len(new_item)}")
        print(f"Difference : {batch_max_lenght - len(new_item)}")
        
        padded = (
            new_item+([pad_token_id]*(batch_max_lenght - len(new_item)))
        )
        print(padded)

        inputs = torch.tensor(padded)
        print(f"inputs : {inputs}")
        inputs_lst.append(inputs)

    print(len(inputs_lst))
    inputs_tensor = torch.stack(inputs_lst).to(device)
    return inputs_tensor

In [43]:
inputs_1 = [0,1,2,3,4]
inputs_2 = [5,6]
inputs_3 = [5,2,4,1]
batch = (inputs_1,
         inputs_2,
         inputs_3)

print(custom_collate_draft_1(batch))

 length of the batch : 5
Difference : 0
[0, 1, 2, 3, 4]
inputs : tensor([0, 1, 2, 3, 4])
 length of the batch : 2
Difference : 3
[5, 6, 50256, 50256, 50256]
inputs : tensor([    5,     6, 50256, 50256, 50256])
 length of the batch : 4
Difference : 1
[5, 2, 4, 1, 50256]
inputs : tensor([    5,     2,     4,     1, 50256])
3
tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    5,     2,     4,     1, 50256]])


In [48]:
def custom_collate_draft_2(
        batch, 
        pad_token_id = 50256,
        device = "cpu"
):
    batch_max_lenght = max(len(item) for item in batch)
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()

        padded = (
            new_item+([pad_token_id]*(batch_max_lenght - len(new_item)))
        )
        inputs = torch.tensor(padded)
        targets = torch.tensor(padded[1:] + [pad_token_id])

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

inputs_1 = [0,1,2,3,4]
inputs_2 = [5,6]
inputs_3 = [5,2,4,1]
batch = (inputs_1,
         inputs_2,
         inputs_3)
inputs, targets = custom_collate_draft_2(batch)

print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    5,     2,     4,     1, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    2,     4,     1, 50256, 50256]])


In [52]:
def custom_collate_draft_2(
        batch, 
        pad_token_id = 50256,
        ignore_index = -100,
        allowed_max_length = None,
        device = "cpu"
):
    batch_max_lenght = max(len(item) for item in batch)
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()

        padded = (
            new_item+([pad_token_id]*(batch_max_lenght - len(new_item)))
        )
        inputs = torch.tensor(padded)
        targets = torch.tensor(padded[1:] + [pad_token_id])

        mask = targets == pad_token_id
        print(mask)
        indices = torch.nonzero(mask).squeeze()
        print(indices)
        if indices.numel()>1:
            targets[indices[1:] == ignore_index]

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

inputs_1 = [0,1,2,3,4]
inputs_2 = [5,6]
inputs_3 = [5,2,4,1]
batch = (inputs_1,
         inputs_2,
         inputs_3)
inputs, targets = custom_collate_draft_2(batch)

print(inputs)
print(targets)

tensor([False, False, False, False,  True])
tensor(4)
tensor([False,  True,  True,  True,  True])
tensor([1, 2, 3, 4])


IndexError: The shape of the mask [3] at index 0 does not match the shape of the indexed tensor [5] at index 0

In [11]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")


In [10]:
instruction_data = InstructionDataset(train_data, tokenizer)

TypeError: __init__() missing 2 required positional arguments: 'data' and 'tokenizer'