# PJ6

## Data & model
* Dataset definition

In [None]:
import torch
from torch.utils.data import Dataset


class GSM8KDataset(Dataset):
    """Map-style dataset that turns GSM8K records into fixed-length model inputs.

    Each item is the instruction ``prompt`` immediately followed by the
    record's question, tokenized in a single pass and left-padded to
    ``max_length``. Left padding keeps the real tokens adjacent to the
    position where a decoder-only model starts generating.

    Args:
        dataset: Indexable collection of records exposing ``"question"`` and
            ``"answer"`` keys.
        tokenizer: Callable tokenizer that returns ``"input_ids"`` and
            ``"attention_mask"`` tensors and exposes ``pad_token_id`` /
            ``eos_token_id``.
        prompt: Instruction text prepended to every question.
        max_length: Exact length (in tokens) of every returned sequence.

    Raises:
        ValueError: If the tokenizer defines neither a pad nor an EOS token id.
    """

    def __init__(self, dataset, tokenizer, prompt="解答以下数学问题：", max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompt = prompt

        # Resolve the padding id once; fall back to EOS for tokenizers that
        # ship without a dedicated pad token.
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id
        if pad_id is None:
            raise ValueError("tokenizer must define pad_token_id or eos_token_id")
        self.pad_token_id = pad_id

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and the raw ``answer`` for one record."""
        item = self.dataset[idx]

        # Tokenize prompt and question together so no padding (or special
        # tokens) ends up between them; padding is applied manually below.
        encoded = self.tokenizer(
            self.prompt + item["question"],
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # Left-pad to a fixed length so items can be stacked into a batch and
        # the padded positions are masked out.
        pad_len = self.max_length - input_ids.size(0)
        if pad_len > 0:
            input_ids = torch.cat(
                (input_ids.new_full((pad_len,), self.pad_token_id), input_ids), dim=0
            )
            attention_mask = torch.cat(
                (attention_mask.new_zeros(pad_len), attention_mask), dim=0
            )

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "answer": item["answer"],
        }

In [None]:
import re

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the GSM8K "main" configuration, caching the files locally.
ds = load_dataset("openai/gsm8k", "main", cache_dir="F:/dataset")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前设备: {device}")

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
cache_directory = "F:/model"  # local cache directory for the model files

# Load the model onto the selected device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=device,
    cache_dir=cache_directory,
)

# The tokenizer holds no tensors, so it takes no device placement argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
# Batched generation with a decoder-only model needs padding on the left so
# that real tokens sit next to the position where generation starts.
tokenizer.padding_side = "left"

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

当前设备: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from torch.utils.data import DataLoader

prompt = "解答以下数学问题："
# Evaluate on the held-out GSM8K test split.
test_data = ds["test"]
# The instruction prompt is a dataset argument; DataLoader accepts no such keyword.
dataset = GSM8KDataset(test_data, tokenizer, prompt=prompt)
dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

# Evaluation helpers: compare the model's final answer with the reference answer.
# Matches integers and decimals, with optional thousands separators ("1,234.5").
_NUMBER_PATTERN = re.compile(r"-?\d+(?:,\d{3})*(?:\.\d+)?")


def _final_number(text):
    """Return the final numeric answer in *text* as a float, or None if absent.

    A number following the last ``####`` marker wins; otherwise the last
    number anywhere in the text is used.
    """
    _, marker, tail = text.rpartition("####")
    if marker:
        match = _NUMBER_PATTERN.search(tail)
        if match:
            return float(match.group().replace(",", ""))

    numbers = _NUMBER_PATTERN.findall(text)
    if not numbers:
        return None
    return float(numbers[-1].replace(",", ""))


def evaluate_accuracy(pred, true):
    """Return True when the prediction's final answer matches the reference.

    Args:
        pred: Text produced by the model.
        true: Reference answer text, e.g. a rationale ending in ``#### 18``.

    Returns:
        True if both texts yield the same final number. When either text
        contains no number, falls back to comparing the stripped strings.
    """
    pred_value = _final_number(pred)
    true_value = _final_number(true)
    if pred_value is None or true_value is None:
        return pred.strip() == true.strip()
    return pred_value == true_value

# Running totals for the accuracy computation.
correct = 0
total = 0

# Budget for newly generated tokens per question. This is independent of the
# prompt length, unlike max_length, which also counts the input tokens.
max_new_tokens = 256

# Run inference batch by batch and score each prediction.
for batch in dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    true_answers = batch["answer"]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
        )

    # The returned sequences start with the input tokens; keep only the
    # continuation so the prompt text is not scored as part of the answer.
    completions = outputs[:, input_ids.shape[1]:]
    generated_answers = tokenizer.batch_decode(completions, skip_special_tokens=True)

    # Count the predictions whose answer matches the reference.
    for generated_answer, true_answer in zip(generated_answers, true_answers):
        if evaluate_accuracy(generated_answer, true_answer):
            correct += 1
        total += 1

    # Progress report.
    print(f"已处理 {total} 条数据")

# Final accuracy; guard against an empty dataloader.
accuracy = correct / total if total else 0.0
print(f"模型准确率: {accuracy:.4f}")