# PJ6

## data & model 
* dataset define

In [None]:

import torch
from torch.utils.data import Dataset
class GSM8KDataset(Dataset):
    def __init__(self, dataset, tokenizer, prompt="Answer the following math problem:", max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompt = prompt

        # 预先对 prompt 进行编码
    
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        question =self.prompt + item["question"] 
        answer = item["answer"]

        # 对问题进行tokenization，保持最大长度
        inputs = self.tokenizer(question, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "answer": answer
        }

In [5]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import re

ds = load_dataset("openai/gsm8k", "main", cache_dir="F:/dataset")
test_ds = ds["test"]
train_ds = ds["train"]

# 创建一个设备对象
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前设备: {device}")
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
cache_directory = "F:/model"  # 指定你想要的缓存目录

# 加载模型和分词器
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=device,
    cache_dir=cache_directory  # 添加此行
)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory,device_map=device)  # 添加此行

当前设备: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 结果处理函数

In [None]:
from torch.utils.data import DataLoader
prompt = '''You are a highly skilled mathematician capable of solving complex grade-school math problems step by step. Please read the problem carefully and solve it with clear reasoning. Follow these instructions:

    - Identify the known information and the question being asked.
    - Break the solution into logical steps, providing clear explanations for each.
    - Show all calculations and intermediate results.
    - Conclude with a final answer.

Output your response in the following format:
[Explanation and calculations]
#### [Final numerical answer]   

Here is the problem:
'''
dataset = GSM8KDataset(train_ds, tokenizer,prompt=prompt)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"

def extract_answer(completion):
    match = ANS_RE.search(completion)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return INVALID_ANS
    
def is_correct(generated_answer, true_answer ):
    generated_answer = extract_answer(generated_answer)
    if generated_answer == INVALID_ANS :
        return False
    
    return extract_answer(true_answer) == gt_answer




# 计算总的准确率
correct = 0
total = 0

# 推理并评估准确率
for batch in dataloader:
    print(batch["input_ids"].size())
    Question = batch["input_ids"].to(device)
    
    attention_mask = batch["attention_mask"].to(device)
    true_answers = batch["answer"]
    
    # 推理
    with torch.no_grad():
        # print(Question)
        generated_ids = model.generate(
            Question,
            max_new_tokens=256
        )
        ## 删除输入信息
        outputs = [
            output_ids[len(Question):] for Question, output_ids in zip(Question, generated_ids)
        ]
        
    

    



torch.Size([1, 512])
["You are a highly skilled mathematician capable of solving complex grade-school math problems step by step. Please read the problem carefully and solve it with clear reasoning. Follow these instructions:\n\n    - Identify the known information and the question being asked.\n    - Break the solution into logical steps, providing clear explanations for each.\n    - Show all calculations and intermediate results.\n    - Conclude with a final answer.\n\nOutput your response in the following format:\n[Explanation and calculations]\n#### [Final numerical answer]   \n\nHere is the problem:\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?Human beings are like that. \n\nThey are very intelligent but also very stupid. \n\nI think they will always be like that. \n\nI don't know what I'm talking about because I have never met them. \n\nI wish I could understand why peop

KeyboardInterrupt: 

## SFT
* 全参数微调

In [None]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments
train_dataset = GSM8KDataset(ds['train'], tokenizer)
test_dataset = GSM8KDataset(ds['test'], tokenizer)

# 配置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=3,
    eval_steps=3,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# 训练
trainer.train()

# 保存模型
trainer.save_model("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

