In [None]:
from importlib.metadata import version

# Distributions whose installed versions are reported below.
pkgs = ["matplotlib", "numpy", "tiktoken", "torch", "tensorflow", "pandas"]

# importlib.metadata.version looks up each installed distribution by name.
for p in pkgs:
    print(f"{p} version: {version(p)}")

In [None]:
import json
import os
import urllib

def download_and_load_file(file_path):
    """Parse the JSON document stored at ``file_path`` and return the result.

    Despite the name, nothing is downloaded here: the file is simply opened
    from disk as UTF-8 text, so it must already exist locally.
    """
    with open(file_path, mode="r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [None]:
# Relative path: download_and_load_file only opens the file, so it must
# already exist in the working directory.
file_path = "instruction-data.json"
data = download_and_load_file(file_path)
# Display the number of top-level items in the parsed JSON.
len(data)

In [None]:
def format_input(entry):
    """Render the prompt part of an entry (everything before the response).

    The prompt is a fixed preamble followed by an ``### Instruction:``
    section built from ``entry['instruction']``. An ``### Input:`` section is
    appended only when ``entry['input']`` is truthy.
    """
    prompt = (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    if entry['input']:
        prompt += f"\n\n### Input:\n{entry['input']}"
    return prompt

In [None]:
# Sequential (unshuffled) split: the first 85% of entries train the model,
# the next 10% are for testing, and whatever remains is for validation.
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.10)

train_data = data[:train_portion]
test_data = data[train_portion:][:test_portion]
val_data = data[train_portion:][test_portion:]

# Display the three split sizes.
tuple(len(split) for split in (train_data, test_data, val_data))

In [None]:
import tiktoken
# Load tiktoken's "gpt2" encoding; it is used below to tokenize every entry.
tokenizer = tiktoken.get_encoding('gpt2')
# Print the id(s) that "<|endoftext|>" encodes to. The special token is
# passed explicitly via allowed_special.
# NOTE(review): presumably prints [50256], matching the pad_token_id default
# of the collate functions; confirm against the cell output.
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [None]:
class InstructionDataset(Dataset):
    """Dataset of pre-tokenized instruction/response texts.

    Each entry is rendered as ``format_input(entry)`` followed by a
    ``### Response:`` section holding ``entry['output']``. Every text is
    encoded once in ``__init__`` with ``tokenizer.encode``, so indexing only
    returns the stored result.

    Args:
        data: Iterable of entries accepted by ``format_input`` that also
            provide an ``'output'`` key.
        tokenizer: Object exposing ``encode(text)``.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_text = []
        for entry in data:
            prompt = format_input(entry)
            # The header is spelled "### Response:" (with a space) so it is
            # consistent with the "### Instruction:" / "### Input:" headers
            # that format_input emits.
            response_text = f"\n\n### Response:\n{entry['output']}"
            self.encoded_text.append(tokenizer.encode(prompt + response_text))

    def __getitem__(self, index):
        """Return the encoded text stored at ``index``."""
        return self.encoded_text[index]

    def __len__(self):
        """Return the number of encoded entries."""
        return len(self.encoded_text)

In [None]:
def custom_collate_draft_1(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    """Pad a batch of token-id sequences to a common length and stack them.

    Args:
        batch: Sequence of token-id sequences (e.g. lists of ints).
        pad_token_id: Id appended to sequences shorter than the longest one.
        device: Device the stacked tensor is moved to.

    Returns:
        Tensor with one row per sequence; every row has the length of the
        longest sequence in the batch.

    Raises:
        ValueError: If ``batch`` is empty (raised by ``max``).
    """
    # Pad to (longest length + 1). The extra slot is dropped again below; it
    # only matters for a variant that also builds shifted targets.
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst = []
    for item in batch:
        # list(...) copies the sequence, so the caller's data is never mutated.
        new_item = list(item)
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        # padded[:-1] removes the extra padding token introduced by the +1.
        inputs_lst.append(torch.tensor(padded[:-1]))

    # Stack the rows into one tensor and transfer it to the target device.
    return torch.stack(inputs_lst).to(device)

In [None]:
# Three toy sequences of different lengths to exercise the padding logic.
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
# The longest sequence has 5 tokens, so every row is padded to length 5.
out = custom_collate_draft_1([inputs_1, inputs_2, inputs_3])

In [None]:
# NOTE: this first line displays nothing; only a cell's final expression is
# shown in the output.
out.shape; out
# Keep the rows from index 2 onward and drop size-1 dimensions. This rebinds
# `out`, so re-running the cell without re-running the previous one slices
# the already-sliced tensor again.
out = out[2:].squeeze()
out.shape

In [None]:
# Same value as the default pad_token_id of custom_collate_draft_1.
pad_token_id = 50256
# Boolean tensor: True wherever `out` holds the padding id.
mask = out == pad_token_id
mask

In [None]:
# Positions of the True entries in `mask`; squeeze() drops the size-1
# dimensions of nonzero's result.
indices = torch.nonzero(mask).squeeze()
indices.shape

In [None]:
# Number of padding positions found.
indices.numel()

In [None]:
# Shape of the index tensor once its first entry is skipped.
indices[1:].shape

In [None]:
# Overwrite every padding position except the first with -100.
# This mutates `out` in place.
out[indices[1:]] = -100

In [None]:
# Display `out` after the in-place overwrite.
out

In [None]:
def custom_collate_fn(batch, pad_token_id=50256, ignore_index=-100, allowed_max_len=None, device='cpu'):
    """Collate token-id sequences into padded input and target tensors.

    Every sequence is padded with ``pad_token_id`` to (longest length + 1).
    Inputs are the padded sequence without its last token; targets are the
    padded sequence shifted left by one. In the targets, only the first
    padding token after the real tokens is kept (as an end-of-sequence
    target); all later padding positions are set to ``ignore_index``.

    Padding positions are derived from each sequence's length, not from
    comparing values with ``pad_token_id``. A genuine token that happens to
    equal ``pad_token_id`` is therefore never mistaken for padding.

    Args:
        batch: Sequence of token-id sequences (e.g. lists of ints).
        pad_token_id: Id used for padding.
        ignore_index: Value written into the surplus padding positions of
            the targets.
        allowed_max_len: If not None, inputs and targets are truncated to at
            most this many positions.
        device: Device both tensors are moved to.

    Returns:
        Tuple ``(input_tensor, target_tensor)`` of identically shaped tensors
        with one row per sequence.

    Raises:
        ValueError: If ``batch`` is empty (raised by ``max``).
    """
    batch_max_len = max(len(entry) + 1 for entry in batch)
    input_lst, target_lst = [], []

    for item in batch:
        seq_len = len(item)
        # list(...) copies the sequence, so the caller's data is never mutated.
        padded = list(item) + [pad_token_id] * (batch_max_len - seq_len)
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # After the one-step shift, the first padding token sits at target
        # position seq_len - 1 (position 0 for an empty sequence). Keep it and
        # mask everything after it.
        first_pad = max(seq_len - 1, 0)
        targets[first_pad + 1:] = ignore_index

        if allowed_max_len is not None:
            inputs = inputs[:allowed_max_len]
            targets = targets[:allowed_max_len]

        input_lst.append(inputs)
        target_lst.append(targets)

    input_tensor = torch.stack(input_lst).to(device)
    target_tensor = torch.stack(target_lst).to(device)

    return input_tensor, target_tensor

In [None]:
# Run the full collate function on the three toy sequences with its default
# arguments, then display the resulting (inputs, targets) pair.
input_batch, target_batch = custom_collate_fn([inputs_1, inputs_2, inputs_3])
input_batch, target_batch

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
from functools import partial
# Pre-bind the target device and a 1024-position cap, so a DataLoader can call
# the collate function with nothing but the batch.
customized_collate_func = partial(
    custom_collate_fn,
    allowed_max_len=1024,
    device=device,
)

In [None]:
from torch.utils.data import DataLoader

# Seed torch's RNG before the shuffling loader is built.
torch.manual_seed(123)
batch_size = 8

train_dataset = InstructionDataset(train_data, tokenizer)
# NOTE: a bare expression in the middle of a cell is evaluated but not displayed.
len(train_dataset)
# Training loader: reshuffled batches, with a trailing incomplete batch dropped.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
    collate_fn=customized_collate_func,
)

In [None]:
# Pull one batch from the training loader and display the shapes of its two
# tensors (inputs and targets, as returned by the collate function).
a = next(iter(train_loader))
a[0].shape, a[1].shape

In [None]:
val_dataset = InstructionDataset(val_data, tokenizer)
# NOTE: a bare expression in the middle of a cell is evaluated but not displayed.
len(val_dataset)
# Validation loader: fixed order, and the final partial batch is kept.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    collate_fn=customized_collate_func,
)

In [None]:
# Pull the first batch from the validation loader and display the shapes of
# its two tensors (inputs and targets).
a = next(iter(val_loader))
a[0].shape, a[1].shape

In [None]:
test_dataset = InstructionDataset(test_data, tokenizer)
# NOTE: a bare expression in the middle of a cell is evaluated but not displayed.
len(test_dataset)
# Test loader: fixed order, and the final partial batch is kept.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    collate_fn=customized_collate_func,
)

In [None]:
# Pull the first batch from the test loader and display the shapes of its two
# tensors (inputs and targets). This cell follows the construction of
# test_loader, so that is the loader it must inspect, not val_loader again.
a = next(iter(test_loader))
a[0].shape, a[1].shape