In [14]:
import json
import os
import urllib
from tqdm import tqdm
 
def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
    with open(file_path, "r") as file:
        data = json.load(file)
    return data
 
file_path = "instruction-data.json"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
 
data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [15]:
print(data[0])

{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.', 'input': 'freind --> friend', 'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'}


In [16]:
def format_input(entry):

    instruction_text = (
        f"Below is an instruction that describes a task."
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry['input'] else ""
    return instruction_text + input_text

def format_output(entry):
    return f"\n\n### Response:\n{entry['output']}"

In [17]:
input = format_input(data[50])
output = f"\n\n### Response:\n{data[50]['output']}"

print(input + output)

Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


# Setup dataloader

In [18]:
train_portion = int(len(data) * 0.85) 
valid_portion = int(len(data) * 0.05) 
test_portion = int(len(data) * 0.1) 

train_data = data[:train_portion]
test_data = data[train_portion: train_portion + test_portion]
valid_data = data[train_portion + test_portion:]

print('len train data = ', len(train_data))
print('len test data = ', len(test_data))
print('len valid data = ', len(valid_data))



len train data =  935
len test data =  110
len valid data =  55


In [19]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')
print(tokenizer.encode('<|endoftext|>', allowed_special={'<|endoftext|>'}))


class InstructionDataset(Dataset):
    def __init__(self,data, tokenizer):
        self.data = data
        self.encoded_text = []

        for entry in data:

            instruction_input = format_input(entry)
            instruction_output = format_output(entry)
            full_text = instruction_input + instruction_output
            self.encoded_text.append(tokenizer.encode(full_text))
    
    def __getitem__(self, index):
        return self.encoded_text[index]
    
    def __len__(self):
        return len(self.encoded_text)

dataset = InstructionDataset(data, tokenizer)

print(len(dataset))
print(dataset[0])



[50256]
1100
[21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 16594, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 36, 2100, 4985, 262, 1708, 9546, 416, 25449, 340, 656, 262, 24993, 1813, 13, 198, 198, 21017, 23412, 25, 198, 19503, 521, 14610, 1545, 198, 198, 21017, 18261, 25, 198, 464, 24993, 286, 262, 1813, 9546, 366, 19503, 521, 1, 318, 11491, 11, 262, 3376, 24993, 318, 366, 6726, 1911]


In [20]:
def collate_fn(batch, pad_token_id = 50256, ignore_index = -100, allowed_max_len = None, device = 'cpu'):
    #why ignore_index = -100? 

    batch_max_len = max(len(item) + 1 for item in batch)
    input_lst, target_lst = [], []

    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id] #this is for contructing target with 1 token shifted to the right
        padded = new_item + [pad_token_id] * (batch_max_len - len(new_item))

        input = torch.tensor(padded[:-1])
        target = torch.tensor(padded[1:])

        mask = target == pad_token_id

        # the reason we want this is because we want to keep the 1st
        # endoftext token but replace the rest. therefore, indices[1:]
        # other approach like a[a == mask] = replace_value 
        # or torch.where() works only if you want to replace ALL ELEMENTS

        indicies = torch.nonzero(mask).squeeze() 
        if indicies.numel() > 1:
            target[indicies[1:]] = ignore_index
        
        if allowed_max_len != None: 
            input = input[:allowed_max_len]
            target = target[:allowed_max_len]

        input_lst.append(input)
        target_lst.append(target)
    
    input_tensor = torch.stack(input_lst).to(device)
    target_tensor = torch.stack(target_lst).to(device)
    
    return input_tensor, target_tensor

input_tensor, target_tensor = collate_fn(dataset[:10])
print(input_tensor.shape, ', ', input_tensor[1])
print(target_tensor.shape, ', ', target_tensor[1])





torch.Size([10, 74]) ,  tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 16594,
          257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 18378,   262,  1708,  6827,   329, 23491,
           13,   198,   198, 21017, 23412,    25,   198,  1544,   467,   284,
          262,  3952,   790,  1110,    13,   198,   198, 21017, 18261,    25,
          198,  1544,  2925,   284,   262,  3952,   790,  1110,    13, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256])
torch.Size([10, 74]) ,  tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 16594,   257,
         2882,   326, 20431, 32543,   262,  2581,    13,   198,   198, 21017,
        46486,    25,   198, 18378,   262,  1708,  6827,   329, 23491,    13,
          198,   198, 21017, 23412,    25,   198,  1544,   467,   284,   262,
         3952,   790,  1110,    13,   198,   198, 21017, 

In [21]:
print(dataset[1])

[21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 16594, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 18378, 262, 1708, 6827, 329, 23491, 13, 198, 198, 21017, 23412, 25, 198, 1544, 467, 284, 262, 3952, 790, 1110, 13, 198, 198, 21017, 18261, 25, 198, 1544, 2925, 284, 262, 3952, 790, 1110, 13]


In [22]:
from functools import partial

customized_collate_fn = partial(collate_fn, device = 'cpu', allowed_max_len = 1024)

In [23]:
num_workers = 0
batch_size = 8

train_dataset = InstructionDataset(train_data, tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_fn,
    shuffle = True,
    #last entry will have different shape, like normal batch = (10,100), last entry will have (8,100). 
    # This cause training error in the middle of training
    drop_last = True, 
    num_workers=num_workers
)

val_dataset = InstructionDataset(valid_data, tokenizer)

valid_dataloader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_fn,
    shuffle = False,
    #last entry will have different shape, like normal batch = (10,100), last entry will have (8,100). 
    # This cause training error in the middle of training
    drop_last = False, 
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)

test_dataloader = DataLoader(
    test_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_fn,
    shuffle = False,
    #last entry will have different shape, like normal batch = (10,100), last entry will have (8,100). 
    # This cause training error in the middle of training
    drop_last = False, 
    num_workers=num_workers
)

print('len train dataset = ', len(train_dataset))
print('len valid dataset = ', len(val_dataset))
print('len test dataset = ', len(test_dataset))


#sample shape
for i, (X,y) in enumerate(train_dataloader):
    print('X.shape = ', X.shape, ', y.shape = ',y.shape)
    print()
    break

len train dataset =  935
len valid dataset =  55
len test dataset =  110
X.shape =  torch.Size([8, 61]) , y.shape =  torch.Size([8, 61])



# Download pretrained GPT2 Medium model

- 355M params, 1.42gb

In [24]:
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

In [26]:
gpt = torch.load(os.path.join("output", 'gpt2-small.torch'))
print(gpt)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [27]:
input_text = format_input(valid_data[0])
print(input_text)


Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [28]:
from llm.previous_chapters import *

In [29]:
token_ids = generate(
    model = gpt,
    idx = text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=BASE_CONFIG['context_length'],
    eos_id=50256
)
print('input ', token_ids)

generated_text = token_ids_to_text(token_ids, tokenizer)

input  tensor([[21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 16594,
           257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
         21017, 46486,    25,   198,  3103,  1851,   262,  4075,  6827,   284,
         14513,    25,   705,   464, 21221, 38383,   262,  9799,   790,  1110,
          2637,   198,   198, 21017, 46486,    25,   198,   198,  3103,  1851,
           262,  4075,  6827,   284, 14513,    25,   705,   464, 21221, 38383,
           262,  9799,   790,  1110,  2637,   198,   198, 21017, 46486,    25,
           198,   198,  3103,  1851,   262,  4075]])


In [30]:
print(len(input_text))

199


In [31]:
output = generated_text[len(input_text):].strip()
print(output)

### Instruction:

Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Instruction:

Convert the active


# Training procedure

In [34]:
def train(
    model: nn.Module, 
    train_loader: DataLoader, 
    val_loader: DataLoader, 
    optimizer, 
    device: torch.device, 
    num_epochs, 
    eval_freq,
    eval_iter, 
    start_context: str, 
    tokenizer)->None:

    model.to(device)

    train_losses, val_losses, track_tokens_seen = [], [], []

    tokens_seen, global_steps = 0, -1 #NOTE: why need a token_seen?
    loop = tqdm(range(num_epochs))


    for epoch in loop:
        model.train()

        for x,y  in train_loader:
            optimizer.zero_grad()

            loss = calc_loss_batch(x, y, model, device)

            loss.backward()
            optimizer.step()

            tokens_seen += x.numel()
            global_steps += 1

            if global_steps % eval_freq == 0: 
                train_loss, val_loss = eval_model(model, train_loader, val_loader, device, eval_iter)

                train_losses.append(train_loss)
                val_losses.append(val_loss)

                track_tokens_seen.append(tokens_seen)

                loop.set_description(f"ep {epoch}, train_loss={train_loss:.3f}, val_loss={val_loss:.3f}")
        # generate a sample every epoch
        generate_and_print_sample(model, tokenizer, device, start_context)
    
    return train_losses, val_losses, tokens_seen

def generate_and_print_sample(model: GPTModel, tokenizer, device, start_context) -> None:

    model.eval()

    context_size = model.pos_emb.weight.shape[0]

    encoded = text_to_token_ids(start_context, tokenizer).to(device)

    with torch.no_grad():

        token_ids = generate(
            model, 
            encoded, 
            max_new_tokens=1000, 
            context_size=BASE_CONFIG['context_length'], 
            top_k = 50, 
            temperature=1.5, 
            eos_id = 50256)

        decoded_text = token_ids_to_text(token_ids, tokenizer)

        print('decoded text = ', decoded_text)
    
    model.train()

# Training

In [36]:
import time

start_time = time.time()
torch.manual_seed(123)
lr = 5e-5

optimizer = torch.optim.AdamW(gpt.parameters(), lr = lr, weight_decay=0.1)
num_epochs = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'


train_losses, val_losses, token_seen = train(
    gpt, 
    train_dataloader, 
    valid_dataloader, 
    optimizer, 
    device, num_epochs, eval_freq=5, eval_iter = 5, 
    start_context= format_input(valid_data[0]),
    tokenizer=tokenizer
)

end_time = time.time()

print(f"training completed in {((end_time - start_time)/60)}")



ep 0, train_loss=0.168, val_loss=0.880:   5%|▌         | 1/20 [00:22<07:13, 22.80s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal runs daily within the company's exclusive executive team.


ep 1, train_loss=0.159, val_loss=0.935:  10%|█         | 2/20 [00:45<06:46, 22.59s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day by the chef Cook the Meal is prepared by the chef.


ep 2, train_loss=0.149, val_loss=0.932:  15%|█▌        | 3/20 [01:07<06:22, 22.51s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is cooked by the chef.


ep 3, train_loss=0.150, val_loss=0.921:  20%|██        | 4/20 [01:30<06:00, 22.54s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day by the chef of the Michelin-starred restaurant 'The Michelin-starred restaurant' is composed entirely of chef chefs.


ep 4, train_loss=0.144, val_loss=0.970:  25%|██▌       | 5/20 [01:52<05:37, 22.51s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The chef cooks the meal every day.


ep 5, train_loss=0.139, val_loss=0.957:  30%|███       | 6/20 [02:15<05:16, 22.64s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal is cooked by the chef every day.


ep 6, train_loss=0.149, val_loss=0.974:  35%|███▌      | 7/20 [02:38<04:54, 22.63s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is called 'The Chef's Table' and consists of baked foods, vegetables, and fruits.


ep 7, train_loss=0.134, val_loss=0.962:  40%|████      | 8/20 [03:00<04:31, 22.60s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is prepared by the chef.


ep 8, train_loss=0.136, val_loss=0.972:  45%|████▌     | 9/20 [03:23<04:08, 22.61s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day by the chef is prepared by the chef.


ep 9, train_loss=0.138, val_loss=0.988:  50%|█████     | 10/20 [03:45<03:45, 22.59s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is composed of fruits and vegetables.


ep 10, train_loss=0.138, val_loss=0.995:  55%|█████▌    | 11/20 [04:08<03:24, 22.69s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day by the chef is 'The chef cooked the meal every day.'


ep 11, train_loss=0.139, val_loss=0.974:  60%|██████    | 12/20 [04:31<03:01, 22.66s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal is every day.


ep 12, train_loss=0.136, val_loss=0.979:  65%|██████▌   | 13/20 [04:54<02:38, 22.65s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is called the 'eating' mission.


ep 13, train_loss=0.129, val_loss=0.995:  70%|███████   | 14/20 [05:16<02:15, 22.63s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is cooked by the chef.


ep 14, train_loss=0.132, val_loss=1.004:  75%|███████▌  | 15/20 [05:39<01:53, 22.64s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is staged production, with participants watching as the chef cooked the meal.


ep 15, train_loss=0.129, val_loss=0.984:  80%|████████  | 16/20 [06:02<01:30, 22.71s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal is typically prepared by the chef.


ep 16, train_loss=0.129, val_loss=1.017:  85%|████████▌ | 17/20 [06:24<01:07, 22.66s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is composed of baked goods and artisanal meal choices.


ep 17, train_loss=0.133, val_loss=1.030:  90%|█████████ | 18/20 [06:47<00:45, 22.65s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal is every day by the chef every week.


ep 18, train_loss=0.135, val_loss=1.013:  95%|█████████▌| 19/20 [07:09<00:22, 22.64s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day by the chef is prepared by the chef.


ep 19, train_loss=0.134, val_loss=1.026: 100%|██████████| 20/20 [07:32<00:00, 22.63s/it]

decoded text =  Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:
The meal every day is cooked by the chef.
training completed in 7.542075554529826



