In [1]:
import json
with open("instruction-data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(data[1])
print(len(data))

{'instruction': 'Edit the following sentence for grammar.', 'input': 'He go to the park every day.', 'output': 'He goes to the park every day.'}
1100


In [10]:
def format_input(item):
    instruction_text = (
        f"Below is an instruction that describes a task."
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{item['instruction']}"
    )

    input_text = f"\n\n### Input:\n{item['input']}" if item["input"] else ""

    return instruction_text + input_text

myinput = format_input(data[50])
response = f"\n\n### Response:\n{data[50]['output']}"
print(myinput+response)

Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'transparent'?

### Response:
An antonym of 'transparent' is 'opaque'.


In [3]:
#8:1:1
train_part = (int)(len(data) *0.8)
val_part = (int)(len(data)*0.1)
test_part = len(data)-train_part - val_part

train_data = data[:train_part]
val_data = data[train_part:train_part+val_part]
test_data = data[train_part+val_part:]

print("train set length:", len(train_data))
print("val set length:", len(val_data))
print("test set length:", len(test_data))

train set length: 880
val set length: 110
test set length: 110


In [4]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.samples = []
        for i in data:
            input = format_input(i)
            response = f"\n\n### Response:\n{i['output']}"
            full_text = input + response
            self.samples.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.samples[index]

    def __len__(self):
        return len(self.data)

In [5]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [6]:
def my_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_token_id=-100,
    allowed_max_length=None,
    device="cpu"
):
    #获取这个批次中最长样本的长度
    batch_max_len = max(len(i)+1 for i in batch)
    #bacth_max_len = max(len(i)+1 for i in batch)
    input_list, target_list = [], []

    for i in batch:
        #将这个批次中小于批次最大长度的所有样本进行填充
        #根据输入创建targets
        #将targets中填充的token_id替换成-100（除第一个填充的tokenid之外）
        new_item = i + [pad_token_id]
        padded = new_item + [pad_token_id] * (batch_max_len - len(new_item))
        #padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # targets = [1 2 3 50256 50256 ...],
        # mask = [False, False, False, True, True ...]
        mask = targets == pad_token_id
        slice = torch.nonzero(mask).squeeze()
        if slice.numel() > 1:
            targets[slice[1:]] = ignore_token_id
        
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
        
        input_list.append(inputs)
        target_list.append(targets)

    inputs_tensor = torch.stack(input_list).to(device)
    #targets_tensor = troch.stack(target_list)
    targets_tensor = torch.stack(target_list)

    return inputs_tensor, targets_tensor

In [7]:
import torch
mask = [False, False, True, True, True]
slice = torch.nonzero(torch.tensor(mask)).squeeze()
targets=torch.tensor([1,2,50256, 50256,50256])
targets[slice[1:]] = -100
print(targets)

tensor([    1,     2, 50256,  -100,  -100])


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from functools import partial
customized_collate_fn = partial(my_collate_fn, 
                                device=device,
                                allowed_max_length=1024)
print(device)

cuda


In [9]:
from torch.utils.data import DataLoader

batch_size=8

torch.manual_seed(123)
train_dataset = InstructionDataset(train_data, tokenizer)
train_dataloader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    collate_fn = customized_collate_fn,
    shuffle = True,
    drop_last = True
)

#验证数据集
val_dataset = InstructionDataset(val_data, tokenizer)
val_dataloader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    collate_fn = customized_collate_fn,
    shuffle = False,
    drop_last = False
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_dataloader = DataLoader(
    test_dataset,
    batch_size = batch_size,
    collate_fn = customized_collate_fn,
    shuffle = False,
    drop_last = False
)

In [10]:
from GPTModel import MyGPTModel, generate_new, text_to_tokenids, tokenids_to_text
from load_gpt2_model import load_gpt2_weights
import torch

GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "max_seq_length": 1024,
    "embedding_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": True
}
model = MyGPTModel(GPT_CONFIG_124M)

load_gpt2_weights(model, GPT_CONFIG_124M)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"模型已加载至：{device}")



[1/3] 正在从 Hugging Face 下载/加载 gpt2 模型...
[2/3] 开始权重移植...
  -> 正在加载 Embeddings (wte, wpe)...
  -> 正在加载 12 层 Transformer Block...
  -> 正在加载 Final LayerNorm & Head...
[3/3] 成功！GPT-2 权重已全部加载完成。

模型已加载至：cuda


In [11]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
prompt = "OpenAI is"
prompt = text_to_tokenids(prompt, tokenizer).to(device)
tokens = generate_new(model, 
             prompt,
             30,
             GPT_CONFIG_124M["max_seq_length"],
             25,
             1.2)

print(f"output:{tokenids_to_text(tokens, tokenizer)}")

output:OpenAI is the first game to be built specifically for the platform. It's based on a popular AI software, and includes tools for building games to be played on


In [12]:
from GPTModel import calc_loss
with torch.no_grad():
        train_loss = calc_loss(train_dataloader, model, device)
        val_loss = calc_loss(val_dataloader, model, device)


print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: tensor(4.1869, device='cuda:0')
Validation loss: tensor(4.2892, device='cuda:0')


In [13]:
import time
from GPTModel import train_model

start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
num_epochs = 2

train_losses, val_losses = train_model(
    model, 
    train_dataloader, 
    val_dataloader, 
    optimizer, 
    device,
    epochs=num_epochs, 
    prompt=format_input(val_data[0]), 
    tokenizer=tokenizer,
    eval_interval=5
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Epoch1 step:         5: Train loss 1.986, Eval loss2.043
Epoch1 step:        10: Train loss 1.254, Eval loss1.320
Epoch1 step:        15: Train loss 1.140, Eval loss1.213
Epoch1 step:        20: Train loss 1.048, Eval loss1.112
Epoch1 step:        25: Train loss 1.002, Eval loss1.066
Epoch1 step:        30: Train loss 0.964, Eval loss1.030
Epoch1 step:        35: Train loss 0.929, Eval loss0.994
Epoch1 step:        40: Train loss 0.899, Eval loss0.974
Epoch1 step:        45: Train loss 0.879, Eval loss0.964
Epoch1 step:        50: Train loss 0.851, Eval loss0.942
Epoch1 step:        55: Train loss 0.830, Eval loss0.932
Epoch1 step:        60: Train loss 0.818, Eval loss0.923
Epoch1 step:        65: Train loss 0.806, Eval loss0.914
Epoch1 step:        70: Train loss 0.790, Eval loss0.902
Epoch1 step:        75: Train loss 0.775, Eval loss0.899
Epoch1 step:        80: Train loss 0.759, Eval loss0.888
Epoch1 step:        85: Train loss 0.745, Eval loss0.878
Epoch1 step:        90: Train l

In [16]:
torch.save(model.state_dict(), "model_sft.pth")

In [17]:
torch.manual_seed(123)
for entry in test_data[:3]:                #A
    input_text = format_input(entry)
    token_ids = generate_new(model,
        text_to_tokenids(input_text, tokenizer).to(device),
        256,
        GPT_CONFIG_124M["max_seq_length"],
        25,
        1.0,
        50256)
    generated_text = tokenids_to_text(token_ids, tokenizer)
    response_text = generated_text[len(input_text):].replace("### Response:","").strip()

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")

Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Explain the primary function of the human heart.

Correct response:
>> The primary function of the human heart is to pump blood throughout the body, delivering oxygen and nutrients to tissues and removing carbon dioxide and other wastes.

Model response:
>> The primary function of the heart is to supply blood and to the heart cells. It functioned as a bridge between the blood and the heart.
-------------------------------------
Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Reword the following sentence to the future tense.

### Input:
He is reading a novel inspired by his grandmother.

Correct response:
>> He will be reading a novel inspired by his grandmother.

Model response:
>> He will read the book.


The novel inspired the poem.
-------------------------------------
Below is an instru

In [20]:
from tqdm import tqdm
from GPTModel import generate_new

for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
    input_text = format_input(entry)

    token_ids = generate_new(
        model=model,
        prompt=text_to_tokenids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_seq_size=GPT_CONFIG_124M["max_seq_length"],
        eos_id=50256
    )
    generated_text = tokenids_to_text(token_ids, tokenizer)
    response_text = generated_text[len(input_text):].replace("### Response:",
"").strip()
    test_data[i]["model_response"] = response_text

with open("instruction-data-with-response.json", "w") as file:
    json.dump(test_data, file, indent=4) # "indent" for pretty-printing

100%|██████████| 110/110 [00:19<00:00,  5.68it/s]


In [2]:
import json
import urllib.request

def query_model(prompt, model="qwen3:latest", url="http://127.0.0.1:11434/api/chat"):
    data = {                                                               
        "model": model,
        "option":{
            "seed": 123, # for deterministic responses
            "temperature": 0, # for deterministic responses
        },
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }

    payload = json.dumps(data).encode("utf-8")                             
    request = urllib.request.Request(url, data=payload, method="POST")     
    request.add_header("Content-Type", "application/json")                 

    response_data = ""
    with urllib.request.urlopen(request) as response:                      
        while True:
            line = response.readline().decode("utf-8")
            if not line:
                break
            response_json = json.loads(line)
            response_data += response_json["message"]["content"]
    return response_data

In [3]:
result = query_model("你好，你是谁？")
print(result)

你好！我是通义千问，是阿里巴巴集团旗下的通义实验室自主研发的超大规模语言模型。我能够帮助你回答问题、创作文字、逻辑推理、编程等，也可以进行多轮对话交流。如果你有任何问题或需要帮助，欢迎随时告诉我！


In [None]:
with open("instruction-data-with-response.json", "r", encoding="utf-8") as f:
    jsondata = json.load(f)
#print(data[0])  
for entry in jsondata[:3]:
    prompt = (
        f" 给定一个输入： `{format_input(entry)}` "
        f" 正确的输出为: `{entry['output']}`, "
        f" 模型给的输出为： `{entry['model_response']}`"
        f" 请为模型的输出打分，0表示最差，100表示最好，只给出分数。"
    )
    print("\nDataset response:")
    print(">>", entry['output'])
    print("\nModel response:")
    print(">>", entry["model_response"])
    print("\nScore:")
    print(">>", query_model(prompt))
    print("\n-------------------------")


Dataset response:
>> The primary function of the human heart is to pump blood throughout the body, delivering oxygen and nutrients to tissues and removing carbon dioxide and other wastes.

Model response:
>> The primary function of the human heart is to control blood flow through the body and to maintain oxygenation and function. It is the primary function of the heart to maintain blood flow through the body and to maintain oxygenation and function.

Score:
>> 60

-------------------------

Dataset response:
>> He will be reading a novel inspired by his grandmother.

Model response:
>> He is reading a novel inspired by his grandmother.

Score:
>> 0

-------------------------

Dataset response:
>> The government passed the law.

Model response:
>> The law was passed by the government.

Score:
>> 0

-------------------------


In [None]:
from tqdm import tqdm
def generate_model_scores(json_data):
    scores = []
    for entry in tqdm(json_data, desc="Scoring entries"):
        prompt = (
            f"给定一个输出： `{format_input(entry)}` "
            f"正确的输出为: `{entry['output']}`, "
            f"模型给的输出为： `{entry['model_response']}`"
            f" 请为模型的输出打分，0表示最差，100表示最好，只给出分数。"                       
        )
        score = query_model(prompt)
        try:
            scores.append(int(score))
        except ValueError:
            print(f"Could not convert score: {score}")
            continue

    return scores

scores = generate_model_scores(jsondata)
print(f"Number of scores: {len(scores)} of {len(jsondata)}")
print(f"Average score: {sum(scores)/len(scores):.2f}\n")

Scoring entries:  73%|███████▎  | 80/110 [09:21<45:42, 91.43s/it]

Could not convert score: 根据用户提供的上下文，模型输出为“Run!”，而用户期望的正确输出为“Run.”。这表明存在一个细微但关键的差异：**标点符号的使用**。

### 分析：
1. **用户期望的输出**：`Run.`（句号结尾）
2. **模型输出**：`Run!`（感叹号结尾）

### 可能的原因：
- **标点符号错误**：模型可能在生成文本时误用了感叹号（!）而非句号（.），这在某些上下文中（如指令或陈述句）是不合适的。
- **上下文理解偏差**：如果用户期望的是一个陈述句（例如“Run.”），而模型生成的是一个命令或强调（如“Run!”），这可能与用户的需求不匹配。

### 建议：
- **检查上下文**：确认用户是否需要特定的标点符号（如句号或感叹号）。例如：
  - 如果是程序指令或命令，`Run!` 可能更符合语气。
  - 如果是陈述或描述，`Run.` 可能更合适。
- **模型调整**：如果用户明确要求句号，可能需要调整模型的标点生成逻辑，确保其符合上下文需求。
- **用户澄清**：如果上下文不明确，建议用户进一步说明需求，以确保输出符合预期。

### 总结：
模型的输出与用户期望的正确输出存在标点符号差异，需根据具体场景判断是否需要修正。如果用户明确需要句号，则模型应调整生成策略以避免此类错误。


Scoring entries: 100%|██████████| 110/110 [11:07<00:00,  6.07s/it]


NameError: name 'test_data' is not defined

In [16]:
print(f"Number of scores: {len(scores)} of {len(jsondata)}")
print(f"Average score: {sum(scores)/len(scores):.2f}\n")

Number of scores: 109 of 110
Average score: 11.33

