### Learnings
- collate function
- masking only target's padding tokens to ensure they are ignored during loss computation


#### torch
- torch.nonzero().squeeze()
- torch.numel()
- the default setting in `cross_entropy` by torch is ignore_index=100, that's why, it ignores this label while calculating the loss

In [1]:
!pip3 install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [2]:
from urllib.request import urlretrieve
import os
import json
def download_save_dataset(url,file_path):
  if not os.path.exists('./data'):
    os.makedirs('./data')
    urlretrieve(url,file_path)
    print('dataset downloaded and saved')



file_path = "./data/instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

download_save_dataset(url,file_path)

def load_file(file_path):
  with open(file_path,"r") as file:
    data=json.load(file)
  return data

data=load_file(file_path=file_path)
print(f"number of entries {len(data)}")
print(f"Example entry {data[10]}")



dataset downloaded and saved
number of entries 1100
Example entry {'instruction': 'What is the contraction for "will not"?', 'input': '', 'output': 'The contraction for "will not" is "won\'t".'}


### Alpaca style input format

In [3]:
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n \n### Instruction:\n{entry['instruction']}"
)
    input_text = (
        f"\n \n### Input:\n{entry['input']}" if entry["input"] else ""
    )
    return instruction_text + input_text

In [4]:
model_input=format_input(data[10])
desired_response=f"\n \n### Response:\n{data[10]['output']}"

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
### Instruction:
What is the contraction for "will not"?
 
### Response:
The contraction for "will not" is "won't".


In [5]:
model_input = format_input(data[999])
desired_response = f"\n \n### Response:\n{data[999]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
### Instruction:
What is an antonym of 'complicated'?
 
### Response:
An antonym of 'complicated' is 'simple'.


In [6]:
train_ratio=int(0.85*len(data))
test_ratio=int(len(data)*0.1)
val_ratio=len(data) - train_ratio - test_ratio

train_data=data[:train_ratio]
val_data=data[train_ratio + test_ratio:]
test_data=data[train_ratio:train_ratio + test_ratio]

print("trainining data", len(train_data))
print("val data", len(val_data))
print("test data", len(test_data))



trainining data 935
val data 55
test data 110


In [7]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):

    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_texts = []

        for entry in self.data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n \n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(tokenizer.encode(full_text))  # [[],[],[]...]

    def __getitem__(self, index):
        # Return the encoded text instead of the dictionary
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

In [8]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


`collate_function` - to make sure each seq within a batch has uniform length, whilst 2 differnt batches can have different lengths

In [9]:
torch.numel(torch.nonzero(torch.tensor([1,2,3,0,4])).squeeze())

4

In [10]:
def custom_collate_fn(batch,pad_token_id=50256,device="cpu",ignore_index=-100,allowed_max_length=None):
  batch_max_length=max(len(seq)+1 for seq in batch) # add +1 to account for the end_of_sequence(padding) id

  inputs_list,targets_list=[],[]

  for item in batch:
    new_item=item.copy()
    new_item= new_item + [pad_token_id]

    padded=(new_item + [pad_token_id]*(batch_max_length-len(new_item)))

    inputs=torch.tensor(padded[:-1])
    targets=torch.tensor(padded[1:])

    #mask padding tokens in the target to ensure they are ignored during loss computation
    mask= targets==pad_token_id
    indices=torch.nonzero(mask).squeeze()
    if indices.numel() > 1:
      targets[indices[1:]]=ignore_index

    if allowed_max_length is not None:
      inputs=inputs[:allowed_max_length]
      targets=targets[:allowed_max_length]

    inputs_list.append(inputs)
    targets_list.append(targets)
  inputs_tensor = torch.stack(inputs_list).to(device)
  targets_tensor = torch.stack(targets_list).to(device)
  return inputs_tensor, targets_tensor


In [11]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
    inputs_1,
    inputs_2,
    inputs_3
)
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")
print("Device:", device)

Device: cuda


In [13]:
from functools import partial
customized_collate_fn = partial(
    custom_collate_fn,
    device=device,
    allowed_max_length=1024
)

In [14]:
from torch.utils.data import DataLoader

num_workers=0
batch_size=8

torch.manual_seed(42)
train_dataset= InstructionDataset(train_data, tokenizer)
val_dataset= InstructionDataset(val_data, tokenizer)
test_dataset= InstructionDataset(test_data, tokenizer)

train_loader=DataLoader(train_dataset,batch_size=batch_size,num_workers=num_workers,shuffle=True,collate_fn=customized_collate_fn,drop_last=True)
val_loader=DataLoader(val_dataset,batch_size=batch_size,num_workers=num_workers,shuffle=False,collate_fn=customized_collate_fn,drop_last=False)
test_loader=DataLoader(test_dataset,batch_size=batch_size,num_workers=num_workers,shuffle=False,collate_fn=customized_collate_fn,drop_last=False)

In [15]:
print("train loader")
print(len(train_loader))

for inputs,targets in train_loader:
  break
print(inputs.shape)
print(targets.shape)

train loader
116
torch.Size([8, 65])
torch.Size([8, 65])


In [16]:
from transformers import GPT2LMHeadModel

model=GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [17]:
def calculate_loss_batch(input_batch,target_batch,model,device="cpu"):
  input_batch=input_batch.to(device)
  target_batch=target_batch.to(device)
  logits=model(input_batch).logits
  logits = logits.view(-1, logits.size(-1))  # Reshape to (batch_size * sequence_length, vocab_size)
  target_batch = target_batch.view(-1)  # Reshape to (batch_size * sequence_length)


  loss=torch.nn.functional.cross_entropy(logits,target_batch)
  return loss

In [18]:
def calculate_loss_loader(data_loader,model,device,num_batches=None):
  total_loss=0
  if len(data_loader)==0:
    return float("nan")
  elif num_batches is None:
    num_batches=len(data_loader)

  else:
    num_batches=min(num_batches,len(data_loader))

  for i, (input_batch,target_batch) in enumerate(data_loader):

    if i < num_batches:
      loss=calculate_loss_batch(input_batch,target_batch,model,device)
      total_loss+=loss.item()

    else:
      break

  return total_loss / len(data_loader)

In [19]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  model.eval()

  with torch.no_grad():
    train_loss=calculate_loss_loader(train_loader,model,device,num_batches=eval_iter)
    val_loss=calculate_loss_loader(val_loader,model,device,num_batches=eval_iter)

  model.train() #put the model in train mode since it has to continue
  return train_loss,val_loss



In [20]:
def train_model(model,device,train_loader,val_loader,optimizer,num_epochs,eval_freq,eval_iter,tokenizer):

  train_losses,val_losses,track_tokens_seen=[],[],[]
  tokens_seen,global_step=0,-1

  for epoch in range(num_epochs):
    model.train()

    for input_batch,target_batch in train_loader:
      loss=calculate_loss_batch(input_batch,target_batch,model,device)
      optimizer.zero_grad()
      loss.backward()

      optimizer.step()

      tokens_seen+=input_batch.numel()
      global_step+=1

      if global_step%eval_freq==0:
        train_loss,val_loss=evaluate_model( model, train_loader, val_loader, device, eval_iter)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        track_tokens_seen.append(tokens_seen)
        print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, "
                      f"Val loss {val_loss:.3f}"
)

    return train_losses, val_losses, track_tokens_seen



2 loss calulcating functions -
1. `calculate_loss_batch` - calcualtes loss for a specific (singular) batch and this loss is .backward()
2. `calculate_loss_loader` - makes use of calculate_loss_batch and calculates loss for whole of the data loader

In the training function,   
- `evaluate_model` - evaluates the model on the whole training_loader & the whole of validation_loader or atleast the number of eval_iter

Inside evaluate_model,
- since this is run for every single batch, use of `calculate_loss_batch` is done


In [40]:
import time

model=model.to(device)

start_time = time.time()
torch.manual_seed(42)

optimizer = torch.optim.AdamW(
    model.parameters(), lr=0.00005, weight_decay=0.1
)
num_epochs = 10

train_losses, val_losses, tokens_seen = train_model(
    model,device, train_loader, val_loader, optimizer,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    tokenizer=tokenizer
)
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 0.026, Val loss 0.570
Ep 1 (Step 000005): Train loss 0.029, Val loss 0.583
Ep 1 (Step 000010): Train loss 0.029, Val loss 0.579
Ep 1 (Step 000015): Train loss 0.030, Val loss 0.581
Ep 1 (Step 000020): Train loss 0.028, Val loss 0.579
Ep 1 (Step 000025): Train loss 0.029, Val loss 0.573
Ep 1 (Step 000030): Train loss 0.025, Val loss 0.571
Ep 1 (Step 000035): Train loss 0.025, Val loss 0.573
Ep 1 (Step 000040): Train loss 0.026, Val loss 0.570
Ep 1 (Step 000045): Train loss 0.029, Val loss 0.561
Ep 1 (Step 000050): Train loss 0.027, Val loss 0.557
Ep 1 (Step 000055): Train loss 0.023, Val loss 0.557
Ep 1 (Step 000060): Train loss 0.028, Val loss 0.553
Ep 1 (Step 000065): Train loss 0.027, Val loss 0.551
Ep 1 (Step 000070): Train loss 0.024, Val loss 0.551
Ep 1 (Step 000075): Train loss 0.028, Val loss 0.548
Ep 1 (Step 000080): Train loss 0.024, Val loss 0.544
Ep 1 (Step 000085): Train loss 0.023, Val loss 0.541
Ep 1 (Step 000090): Train loss 0.022, Val loss

In [41]:
def generate(model,idx,max_new_tokens,context_size,temperature=0.0,top_k=None,eos_id=None):
  for _ in range(max_new_tokens):
    idx_cond=idx[:,-context_size:]

    with torch.no_grad():
      logits=model(idx_cond).logits
    logits = logits[:, -1, :]
    if top_k is not None:


      top_logits, _ = torch.topk(logits, top_k)
      min_val = top_logits[:, -1]
      logits = torch.where(
          logits < min_val,
          torch.tensor(float('-inf')).to(logits.device),
          logits
      )
    if temperature > 0.0:
        logits = logits / temperature
        probs = torch.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
    else:
        idx_next = torch.argmax(logits, dim=-1, keepdim=True)
    if idx_next == eos_id:
        break
    idx = torch.cat((idx, idx_next), dim=1)
  return idx

In [42]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)    #1
    return encoded_tensor

In [43]:
def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

In [44]:
torch.manual_seed(123)
for entry in test_data[:3]:
    input_text = format_input(entry)
    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=1024,
        eos_id=50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )
    print(input_text)
    print(f"\nCorrect response:\\n>> {entry['output']}")
    print(f"\nModel response:\\n>> {response_text.strip()}")
    print("-------------------------------------")

Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
### Instruction:
Rewrite the sentence using a simile.
 
### Input:
The car is very fast.

Correct response:\n>> The car is as fast as lightning.

Model response:\n>> The car is so fast that it can travel at a very fast speed.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
### Instruction:
What type of cloud is typically associated with thunderstorms?

Correct response:\n>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model response:\n>> The type of cloud is a cloud of clouds.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
### Instruction:
Name the author of 'Pride and Prejudice'.

Correct response:\n>> Jane Austen.

Model response:\n>> The author of 'Prid