In [2]:
# Listing 7.1 Downloading the dataset
import json
import os
import urllib

def download_and_load_file(file_path, url):
    """Return the parsed JSON content of ``file_path``, downloading it first if needed.

    Args:
        file_path: Local path of the JSON file (used as a download cache).
        url: Location the file is fetched from when ``file_path`` does not exist.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    if not os.path.exists(file_path):  #A download only when no local copy exists
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Read with the same encoding the file was written with instead of the
    # platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Local cache location and remote source of the instruction dataset.
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# Fetch the file (or reuse the cached copy) and report how many entries it holds.
data = download_and_load_file(file_path, url)
print(f"Number of entries: {len(data)}")

#A 如果文件已经下载，就跳过下载过程

Number of entries: 1100


In [3]:
data[1]

{'instruction': 'Edit the following sentence for grammar.',
 'input': 'He go to the park every day.',
 'output': 'He goes to the park every day.'}

In [4]:
print("Another example entry:\n", data[999])

Another example entry:
 {'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


In [6]:
# Listing 7.2 Implementing the prompt formatting function
def format_input(entry):
    """Render a dataset entry as the prompt text given to the model.

    The prompt consists of a fixed preamble, an "### Instruction:" section and,
    only when the entry has a non-empty input, an "### Input:" section.

    Args:
        entry: Mapping with an ``"instruction"`` key and an optional
            ``"input"`` key.

    Returns:
        The formatted prompt string (without any response section).
    """
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    # An entry without an "input" field is treated like one whose input is
    # empty instead of raising KeyError.
    input_value = entry.get("input")
    input_text = f"\n\n### Input:\n{input_value}" if input_value else ""
    return instruction_text + input_text

In [7]:
model_input = format_input(data[50])
desired_response = f"\n\n### Response:\n{data[50]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [8]:
model_input = format_input(data[999])
desired_response = f"\n\n### Response:\n{data[999]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


In [9]:
# Listing 7.3 Partitioning the dataset
# Split sizes: 85% for training, 10% for testing, the remainder (about 5%) for validation.
train_portion = int(0.85 * len(data))
test_portion = int(0.1 * len(data))
val_portion = len(data) - (train_portion + test_portion)

# Contiguous, order-preserving slices: training first, then test, then validation.
train_data, test_data, val_data = (
    data[:train_portion],
    data[train_portion:train_portion + test_portion],
    data[train_portion + test_portion:],
)

print(f"Training set length: {len(train_data)}")
print(f"Validation set length: {len(val_data)}")
print(f"Test set length: {len(test_data)}")

Training set length: 935
Validation set length: 55
Test set length: 110


In [14]:
# Listing 7.4 Implementing an instruction dataset class
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Instruction examples that are tokenized once, when the dataset is built.

    Each entry is rendered as the prompt returned by ``format_input`` followed
    by a "### Response:" section holding the entry's ``"output"``; the combined
    text is encoded with ``tokenizer.encode`` and stored.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        # A: pre-tokenize every example up front so item access is a plain lookup.
        self.encoded_texts = [                                       #A
            tokenizer.encode(
                format_input(sample) + f"\n\n### Response:\n{sample['output']}"
            )
            for sample in data
        ]

    def __len__(self):
        """Number of entries in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the stored token IDs for the entry at ``index``."""
        return self.encoded_texts[index]

#A 预分词文本

In [15]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


在第 6 章中，我们使用的填充方式是将数据集中的所有示例填充到相同长度。在本章中，我们将采用一种更为精细的方法，开发一个自定义的collate函数并传递给数据加载器。该自定义collate函数会将每个批次中的训练样本填充到相同长度，同时允许不同批次中的样本具有不同的长度，如图 7.8 所示。这种方法通过仅将序列扩展到每个批次中最长的序列长度，从而减少了不必要的填充，避免了对整个数据集进行冗余填充。

In [26]:
def custom_collate(batch, pad_token_id=50256, device="cpu", ignore_index=-100, allow_max_length=None):
    """Pad a batch of token-ID sequences and build shifted next-token targets.

    One ``pad_token_id`` is appended to every sequence (it doubles as the
    end-of-text token), and the result is padded with ``pad_token_id`` up to
    the longest sequence in the batch. Inputs are the padded sequence without
    its last token; targets are the padded sequence shifted left by one. In
    the targets, the appended end-of-text token is kept and every padding
    position after it is replaced by ``ignore_index``.

    Padding is identified by position rather than by value, so a genuine
    token that happens to equal ``pad_token_id`` stays in the targets.

    Args:
        batch: Sequence of token-ID sequences (e.g. lists of ints).
        pad_token_id: Token ID used for the end-of-text marker and for padding.
        device: Device the stacked tensors are moved to.
        ignore_index: Value written into padded target positions.
        allow_max_length: If not None, inputs and targets are truncated to at
            most this many tokens.

    Returns:
        Tuple ``(inputs, targets)`` of stacked tensors on ``device``. Both have
        one row per batch item and as many columns as the longest sequence in
        the batch (or ``allow_max_length`` if that is smaller).
    """
    # One extra slot so that even the longest sequence is followed by an
    # end-of-text token.
    batch_max_length = max(len(item) for item in batch) + 1
    input_list = []
    target_list = []

    for item in batch:
        tokens = list(item)  # copy, so the caller's sequence is never modified
        num_tokens = len(tokens)
        padded = tokens + [pad_token_id] * (batch_max_length - num_tokens)

        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # targets[num_tokens - 1] is the appended end-of-text token; everything
        # after it is pure padding and must not contribute to the loss.
        targets[num_tokens:] = ignore_index

        if allow_max_length is not None:
            inputs = inputs[:allow_max_length]
            targets = targets[:allow_max_length]

        input_list.append(inputs)
        target_list.append(targets)

    input_tensor = torch.stack(input_list).to(device)
    target_tensor = torch.stack(target_list).to(device)
    return input_tensor, target_tensor

# input_list 里有batch_size个 [batch_max_length - 1] 的一维张量。
# torch.stack 默认在dim=0拼接，所以
# input_tensor的最终shape为 [batch_size, batch_max_length - 1]

In [27]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
    inputs_1,
    inputs_2,
    inputs_3
)
print(custom_collate(batch))

(tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]]), tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]]))


在目标序列中，我们会把填充 token 替换为占位值 -100（仅保留紧跟在文本之后的第一个 50256 作为结束符）。这个特殊值可以让填充 token 不参与训练损失的计算，从而确保只有有效数据会影响模型的学习。

In [28]:
mask = torch.tensor([
    [True, False],
    [False, True]
])
indices_tuple = torch.nonzero(mask, as_tuple=True)
print(indices_tuple)
# 输出: (tensor([0, 1]), tensor([0, 1]))
# 每个张量都代表一个维度（轴）上所有True元素的坐标。
#这里是一个二维tensor：
# 第一个张量：所有True元素的第0维（行）坐标。
# 第二个张量：所有True元素的第1维（列）坐标。
# 合并起来理解：
# (tensor([0, 1]), tensor([0, 1]))
# 表示：第0行0列为True（即[0,0]），第1行1列为True（即[1,1]）

(tensor([0, 1]), tensor([0, 1]))


In [29]:
logits_1 = torch.tensor(
    [[-1.0, 1.0], # predictions for 1st token
     [-0.5, 1.5]] # predictions for 2nd token
)
targets_1 = torch.tensor([0, 1]) # Correct token indices to generate
loss_1 = torch.nn.functional.cross_entropy(logits_1, targets_1)
print(loss_1)

tensor(1.1269)


In [None]:
logits_2 = torch.tensor(
    [[-1.0, 1.0],
     [-0.5, 1.5],
     [-0.5, 1.5]]                        #A
)
targets_2 = torch.tensor([0, 1, 1])
loss_2 = torch.nn.functional.cross_entropy(logits_2, targets_2)
print(loss_2)
# dim -1的时候，每一行进行softmax
# 然后选择归一化后的值， 【0，1，1】（对应行的正确的值） 表示 -1.0 1.5 1.5
#A 添加第三个 token ID

tensor(0.7936)


In [31]:
targets_3 = torch.tensor([0, 1, -100])
loss_3 = torch.nn.functional.cross_entropy(logits_2, targets_3)
print(loss_3)
print("loss_1 == loss_3:", loss_1 == loss_3)


tensor(1.1269)
loss_1 == loss_3: tensor(True)


在 PyTorch 中，cross_entropy 函数的默认设置是 cross_entropy(..., ignore_index=-100)，这意味着它会忽略标签为 -100 的目标。

我们希望在目标序列中保留一个 50256（结束符）token ID，因为这有助于 LLM 学习生成文本结束的标记，进而作为判断回复是否完成的标志。这也是 custom_collate 中 batch_max_length 取“批次内最长序列长度 + 1”的原因：多出的一个位置用来放置这个结束符。

在实践中，除了遮蔽填充 token 外，还常常将指令部分（instruction/input）对应的目标 token ID 一并遮蔽。

通过对指令部分对应的目标 token ID 进行掩码，交叉熵损失仅计算生成响应的目标 token ID，模型在训练时也会专注于生成准确的回答，而不是去记住指令内容，从而有助于减少过拟合。

把数据转移到 GPU 这一步提前到 collate 函数中完成（即在创建 batch 时就转移到 GPU，而不是等 DataLoader 生成数据之后再在训练循环中转移），可以让训练循环拿到的每个 batch 已经位于 GPU 上，有助于提升 GPU 利用率、减少训练瓶颈。但要注意防止显存不足（OOM）。

In [32]:
# Use a CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# if torch.backends.mps.is_available():       #A
#     device = torch.device("mps")            #A
print("Device:", device)

#A 取消这两行注释以在 Apple Silicon 芯片上启用 GPU

Device: cuda


In [None]:
from functools import partial
customized_collate_fn = partial(custom_collate,device = device,allow_max_length=1024)
# 预设参数的版本：之后调用时只需要传入 batch 参数，其他参数（device、allow_max_length）已经预先绑定好了。

In [None]:
# Listing 7.6 Initializing the data loaders
from torch.utils.data import DataLoader

num_workers = 0            #A
batch_size = 8

torch.manual_seed(123)


def _build_split(split_data, shuffle, drop_last):
    """Tokenize one data split and wrap it in a DataLoader.

    All splits share the batch size, collate function and worker count
    configured above; only shuffling and the handling of a final incomplete
    batch differ between them.

    Args:
        split_data: Entries of the split (passed to ``InstructionDataset``).
        shuffle: Forwarded to ``DataLoader(shuffle=...)``.
        drop_last: Forwarded to ``DataLoader(drop_last=...)``.

    Returns:
        Tuple ``(dataset, loader)`` for the split.
    """
    dataset = InstructionDataset(split_data, tokenizer)
    # DataLoader always passes the batch (a list of samples) to collate_fn as
    # its first argument; the remaining collate parameters were pre-bound with
    # functools.partial, so nothing else has to be supplied here.
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return dataset, loader


# Only the training split is shuffled and drops its last incomplete batch.
train_dataset, train_loader = _build_split(train_data, shuffle=True, drop_last=True)
val_dataset, val_loader = _build_split(val_data, shuffle=False, drop_last=False)
test_dataset, test_loader = _build_split(test_data, shuffle=False, drop_last=False)

#A 如果操作系统支持并行的 Python 进程，你可以尝试增加此数值。

In [35]:
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

Train loader:
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 76]) torch.Size([8, 76])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 83]) torch.Size([8, 83])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 68]) torch.

## 7.5 加载预训练的 LLM

In [36]:
# Listing 7.7 Loading the pretrained model
from gpt_download import download_and_load_gpt2
from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch05 import load_weights_into_gpt

# Settings shared by every model size listed below.
BASE_CONFIG = {
    "vocab_size": 50257, # Vocabulary size
    "context_length": 1024, # Context length
    "drop_rate": 0.0, # Dropout rate
    "qkv_bias": True # Query-key-value bias
}

# Size-specific settings, keyed by a display name that ends in "(<size>)".
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Merge the chosen size's settings into BASE_CONFIG (mutates it in place).
CHOOSE_MODEL = "gpt2-medium (355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Extract the size tag from the display name, e.g. "gpt2-medium (355M)" -> "355M".
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

# Build the model from the merged config and copy the downloaded weights into it.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# Last expression of the cell, so the notebook also displays the model's repr.
model.eval()

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 75.5kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.33MiB/s]
hparams.json: 100%|██████████| 91.0/91.0 [00:00<00:00, 91.5kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 1.42G/1.42G [05:42<00:00, 4.14MiB/s] 
model.ckpt.index: 100%|██████████| 10.4k/10.4k [00:00<00:00, 5.19MiB/s]
model.ckpt.meta: 100%|██████████| 927k/927k [00:00<00:00, 1.21MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 707kiB/s] 


GPTModel(
  (tok_emb): Embedding(50257, 1024)
  (pos_emb): Embedding(1024, 1024)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=1024, out_features=1024, bias=True)
        (W_key): Linear(in_features=1024, out_features=1024, bias=True)
        (W_value): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU()
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_f

In [37]:
torch.manual_seed(123)
input_text = format_input(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [39]:
from llms_from_scratch.ch05 import generate,text_to_token_ids,token_ids_to_text

generate??

[1;31mSignature:[0m
[0mgenerate[0m[1;33m([0m[1;33m
[0m    [0mmodel[0m[1;33m,[0m[1;33m
[0m    [0midx[0m[1;33m,[0m[1;33m
[0m    [0mmax_new_tokens[0m[1;33m,[0m[1;33m
[0m    [0mcontext_size[0m[1;33m,[0m[1;33m
[0m    [0mtemperature[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mtop_k[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0meos_id[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mSource:[0m   
[1;32mdef[0m [0mgenerate[0m[1;33m([0m[0mmodel[0m[1;33m,[0m [0midx[0m[1;33m,[0m [0mmax_new_tokens[0m[1;33m,[0m [0mcontext_size[0m[1;33m,[0m [0mtemperature[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m [0mtop_k[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0meos_id[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m[1;33m
[0m    [1;31m# For-loop is the same as before: Get logits, and only focus on last time step[

In [40]:
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=BASE_CONFIG["context_length"],
    eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [43]:
response_text = generated_text[len(input_text):].strip()
print(response_text)

### Response:

The chef cooks the meal every day.

### Instruction:

Convert the active sentence to passive: 'The chef cooks the


In [44]:
from llms_from_scratch.ch05 import calc_loss_loader,train_model_simple

In [45]:
calc_loss_loader??

[1;31mSignature:[0m [0mcalc_loss_loader[0m[1;33m([0m[0mdata_loader[0m[1;33m,[0m [0mmodel[0m[1;33m,[0m [0mdevice[0m[1;33m,[0m [0mnum_batches[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mSource:[0m   
[1;32mdef[0m [0mcalc_loss_loader[0m[1;33m([0m[0mdata_loader[0m[1;33m,[0m [0mmodel[0m[1;33m,[0m [0mdevice[0m[1;33m,[0m [0mnum_batches[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m    [0mtotal_loss[0m [1;33m=[0m [1;36m0.[0m[1;33m
[0m    [1;32mif[0m [0mlen[0m[1;33m([0m[0mdata_loader[0m[1;33m)[0m [1;33m==[0m [1;36m0[0m[1;33m:[0m[1;33m
[0m        [1;32mreturn[0m [0mfloat[0m[1;33m([0m[1;34m"nan"[0m[1;33m)[0m[1;33m
[0m    [1;32melif[0m [0mnum_batches[0m [1;32mis[0m [1;32mNone[0m[1;33m:[0m[1;33m
[0m        [0mnum_batches[0m [1;33m=[0m [0mlen[0m[1;33m([0m[0mdata_loader[0m[1;33m)[0m[1;33m
[0m    [1;32melse[0m

In [46]:
train_model_simple??

[1;31mSignature:[0m
[0mtrain_model_simple[0m[1;33m([0m[1;33m
[0m    [0mmodel[0m[1;33m,[0m[1;33m
[0m    [0mtrain_loader[0m[1;33m,[0m[1;33m
[0m    [0mval_loader[0m[1;33m,[0m[1;33m
[0m    [0moptimizer[0m[1;33m,[0m[1;33m
[0m    [0mdevice[0m[1;33m,[0m[1;33m
[0m    [0mnum_epochs[0m[1;33m,[0m[1;33m
[0m    [0meval_freq[0m[1;33m,[0m[1;33m
[0m    [0meval_iter[0m[1;33m,[0m[1;33m
[0m    [0mstart_context[0m[1;33m,[0m[1;33m
[0m    [0mtokenizer[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mSource:[0m   
[1;32mdef[0m [0mtrain_model_simple[0m[1;33m([0m[0mmodel[0m[1;33m,[0m [0mtrain_loader[0m[1;33m,[0m [0mval_loader[0m[1;33m,[0m [0moptimizer[0m[1;33m,[0m [0mdevice[0m[1;33m,[0m [0mnum_epochs[0m[1;33m,[0m[1;33m
[0m                       [0meval_freq[0m[1;33m,[0m [0meval_iter[0m[1;33m,[0m [0mstart_context[0m[1;33m,[0m [0mtokenizer[0m[

In [47]:
model.to(device)
torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 3.825909900665283
Validation loss: 3.7619343757629395


In [48]:
# Listing 7.8 Instruction finetuning the pretrained LLM
import time

start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
num_epochs = 2

train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 2.637, Val loss 2.626
Ep 1 (Step 000005): Train loss 1.174, Val loss 1.102
Ep 1 (Step 000010): Train loss 0.872, Val loss 0.944
Ep 1 (Step 000015): Train loss 0.857, Val loss 0.906
Ep 1 (Step 000020): Train loss 0.776, Val loss 0.881
Ep 1 (Step 000025): Train loss 0.754, Val loss 0.859
Ep 1 (Step 000030): Train loss 0.799, Val loss 0.836
Ep 1 (Step 000035): Train loss 0.714, Val loss 0.808
Ep 1 (Step 000040): Train loss 0.672, Val loss 0.806
Ep 1 (Step 000045): Train loss 0.633, Val loss 0.789
Ep 1 (Step 000050): Train loss 0.662, Val loss 0.783
Ep 1 (Step 000055): Train loss 0.760, Val loss 0.763
Ep 1 (Step 000060): Train loss 0.719, Val loss 0.743
Ep 1 (Step 000065): Train loss 0.652, Val loss 0.735
Ep 1 (Step 000070): Train loss 0.532, Val loss 0.729
Ep 1 (Step 000075): Train loss 0.569, Val loss 0.728
Ep 1 (Step 000080): Train loss 0.605, Val loss 0.725
Ep 1 (Step 000085): Train loss 0.509, Val loss 0.709
Ep 1 (Step 000090): Train loss 0.562, Val loss

In [None]:
from llms_from_scratch.ch05 import plot_losses
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

## 7.7 提取并保存响应

In [None]:
torch.manual_seed(123)
for entry in test_data[:3]:                #A
    input_text = format_input(entry)
    token_ids = generate(                  #B
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256,
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Skip the first len(input_text) characters (presumably the echoed prompt),
    # then remove the response header and surrounding whitespace.
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )

    # Prompt, reference answer and model answer, followed by a separator line.
    print(
        input_text,
        f"\nCorrect response:\n>> {entry['output']}",
        f"\nModel response:\n>> {response_text}",
        "-------------------------------------",
        sep="\n",
    )


#A 遍历测试集中的前三个样本
#B 使用在第 7.5 节导入的 generate 函数

In [None]:
# Listing 7.9 Generating test set responses
from tqdm import tqdm

# Generate a response for every test entry and store it on the entry itself.
for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
    input_text = format_input(entry)

    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256,
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    # Skip the first len(input_text) characters, then strip the response header.
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )
    # `entry` is the object stored at test_data[i], so this updates test_data in place.
    entry["model_response"] = response_text

# Persist the test entries together with the generated responses.
with open("instruction-data-with-response.json", "w") as file:
    json.dump(test_data, file, indent=4)  # "indent" for pretty-printing

In [None]:
print(test_data[0])

In [None]:
import re

# Derive the checkpoint name from the model name with spaces and parentheses removed.
file_name = re.sub(r'[ ()]', '', CHOOSE_MODEL) + "-sft.pth"
torch.save(model.state_dict(), file_name)
print("Model saved as " + file_name)
