In [3]:
import json
import os
import urllib
 
def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
    with open(file_path, "r") as file:
        data = json.load(file)
    return data
 
file_path = "instruction-data.json"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
 
data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [4]:
print(data[0])


{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.', 'input': 'freind --> friend', 'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'}


In [5]:
def format_input(entry):

    instruction_text = (
        f"Below is an instruction that describes a task."
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry['input'] else ""
    return instruction_text + input_text

def format_output(entry):
    return f"\n\n### Response:\n{entry['output']}"

In [6]:
input = format_input(data[50])
output = f"\n\n### Response:\n{data[50]['output']}"

print(input + output)

Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


# Setup dataloader

In [7]:
train_portion = int(len(data) * 0.85) 
valid_portion = int(len(data) * 0.05) 
test_portion = int(len(data) * 0.1) 

train_data = data[:train_portion]
test_data = data[train_portion: train_portion + test_portion]
valid_data = data[train_portion + test_portion:]

print('len train data = ', len(train_data))
print('len test data = ', len(test_data))
print('len valid data = ', len(valid_data))



len train data =  935
len test data =  110
len valid data =  55


In [31]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')
print(tokenizer.encode('<|endoftext|>', allowed_special={'<|endoftext|>'}))


class InstructionDataset(Dataset):
    def __init__(self,data, tokenizer):
        self.data = data
        self.encoded_text = []

        for entry in data:

            instruction_input = format_input(entry)
            instruction_output = format_output(entry)
            full_text = instruction_input + instruction_output
            self.encoded_text.append(tokenizer.encode(full_text))
    
    def __getitem__(self, index):
        return self.encoded_text[index]
    
    def __len__(self):
        return len(self.encoded_text)

dataset = InstructionDataset(data, tokenizer)

print(len(dataset))
print(dataset[0])



[50256]
1100
[21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 16594, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 36, 2100, 4985, 262, 1708, 9546, 416, 25449, 340, 656, 262, 24993, 1813, 13, 198, 198, 21017, 23412, 25, 198, 19503, 521, 14610, 1545, 198, 198, 21017, 18261, 25, 198, 464, 24993, 286, 262, 1813, 9546, 366, 19503, 521, 1, 318, 11491, 11, 262, 3376, 24993, 318, 366, 6726, 1911]


In [28]:
def collate_fn(batch, pad_token_id = 50256, ignore_index = -100, allowed_max_len = None, device = 'cpu'):
    #why ignore_index = -100? 

    batch_max_len = max(len(item) + 1 for item in batch)
    input_lst, target_lst = [], []

    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id] #this is for contructing target with 1 token shifted to the right
        padded = new_item + [pad_token_id] * (batch_max_len - len(new_item))

        input = torch.tensor(padded[:-1])
        target = torch.tensor(padded[1:])

        mask = target == pad_token_id

        # the reason we want this is because we want to keep the 1st
        # endoftext token but replace the rest. therefore, indices[1:]
        # other approach like a[a == mask] = replace_value 
        # or torch.where() works only if you want to replace ALL ELEMENTS

        indicies = torch.nonzero(mask).squeeze() 
        if indicies.numel() > 1:
            target[indicies[1:]] = ignore_index
        
        if allowed_max_len != None: 
            input = input[:allowed_max_len]
            target = target[:allowed_max_len]

        input_lst.append(input)
        target_lst.append(target)
    
    input_tensor = torch.stack(input_lst).to(device)
    target_tensor = torch.stack(target_lst).to(device)
    
    return input_tensor, target_tensor

input_tensor, target_tensor = collate_fn(dataset[:10])
print(input_tensor.shape, ', ', input_tensor[1])
print(target_tensor.shape, ', ', target_tensor[1])





torch.Size([10, 74]) ,  tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 16594,
          257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 18378,   262,  1708,  6827,   329, 23491,
           13,   198,   198, 21017, 23412,    25,   198,  1544,   467,   284,
          262,  3952,   790,  1110,    13,   198,   198, 21017, 18261,    25,
          198,  1544,  2925,   284,   262,  3952,   790,  1110,    13, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256])
torch.Size([10, 74]) ,  tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 16594,   257,
         2882,   326, 20431, 32543,   262,  2581,    13,   198,   198, 21017,
        46486,    25,   198, 18378,   262,  1708,  6827,   329, 23491,    13,
          198,   198, 21017, 23412,    25,   198,  1544,   467,   284,   262,
         3952,   790,  1110,    13,   198,   198, 21017, 

In [24]:
print(dataset[1])

[21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 16594, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 18378, 262, 1708, 6827, 329, 23491, 13, 198, 198, 21017, 23412, 25, 198, 1544, 467, 284, 262, 3952, 790, 1110, 13, 198, 198, 21017, 18261, 25, 198, 1544, 2925, 284, 262, 3952, 790, 1110, 13]


In [30]:
from functools import partial

customized_collate_fn = partial(collate_fn, device = 'cpu', allowed_max_len = 1024)

In [44]:
num_workers = 0
batch_size = 8

train_dataset = InstructionDataset(train_data, tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_fn,
    shuffle = True,
    #last entry will have different shape, like normal batch = (10,100), last entry will have (8,100). 
    # This cause training error in the middle of training
    drop_last = True, 
    num_workers=num_workers
)

val_dataset = InstructionDataset(valid_data, tokenizer)

valid_dataloader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_fn,
    shuffle = False,
    #last entry will have different shape, like normal batch = (10,100), last entry will have (8,100). 
    # This cause training error in the middle of training
    drop_last = False, 
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)

test_dataloader = DataLoader(
    test_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_fn,
    shuffle = False,
    #last entry will have different shape, like normal batch = (10,100), last entry will have (8,100). 
    # This cause training error in the middle of training
    drop_last = False, 
    num_workers=num_workers
)

print('len train dataset = ', len(train_dataset))
print('len valid dataset = ', len(val_dataset))
print('len test dataset = ', len(test_dataset))


#sample shape
for i, (X,y) in enumerate(train_dataloader):
    print('X.shape = ', X.shape, ', y.shape = ',y.shape)
    print()
    break




len train dataset =  935
len valid dataset =  55
len test dataset =  110
X.shape =  torch.Size([8, 65]) , y.shape =  torch.Size([8, 65])



# Download pretrained GPT2 Medium model

- 355M params, 1.42gb

In [None]:
# from gpt_download import download_and_load_gpt2
# from chapter04 import GPTModel
# from chapter05 import load_weights_into_gpt
 
# BASE_CONFIG = {
#     "vocab_size": 50257,     # Vocabulary size
#     "context_length": 1024,  # Context length
#     "drop_rate": 0.0,        # Dropout rate
#     "qkv_bias": True         # Query-key-value bias
# }
 
# model_configs = {
#     "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
#     "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
#     "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
#     "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
# }
 
# CHOOSE_MODEL = "gpt2-medium (355M)"
# BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
 
# model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
# settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
 
# model = GPTModel(BASE_CONFIG)
# load_weights_into_gpt(model, params)
# model.eval()