# PJ6

## data & model 
* dataset define

In [3]:

import torch
from torch.utils.data import Dataset
class GSM8KDataset(Dataset):
    def __init__(self, dataset, tokenizer, prompt="Answer the following math problem:", max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompt = prompt

        # 预先对 prompt 进行编码
        self.prompt_ids = tokenizer(self.prompt, padding='do_not_pad', truncation=True, max_length=self.max_length, return_tensors="pt")["input_ids"].squeeze(0)
        print(self.prompt_ids)
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        question = item["question"]
        answer = item["answer"]

        # 对问题进行tokenization，保持最大长度
        inputs = self.tokenizer(question, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        # 拼接编码后的 prompt 和问题的 input_ids
        input_ids = torch.cat((self.prompt_ids, inputs["input_ids"].squeeze(0)), dim=0)
        attention_mask = torch.cat((torch.ones_like(self.prompt_ids), inputs["attention_mask"].squeeze(0)), dim=0)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "answer": answer
        }

In [4]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import re

ds = load_dataset("openai/gsm8k", "main", cache_dir="F:/dataset")
test_ds = ds["test"]
train_ds = ds["train"]

# 创建一个设备对象
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"当前设备: {device}")
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
cache_directory = "F:/model"  # 指定你想要的缓存目录

# 加载模型和分词器
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=device,
    cache_dir=cache_directory  # 添加此行
)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory,device_map=device)  # 添加此行

Using the latest cached version of the dataset since openai/gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'main' at F:\dataset\openai___gsm8k\main\0.0.0\e53f048856ff4f594e959d75785d2c2d37b678ee (last modified on Thu Nov 28 12:08:49 2024).


当前设备: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 结果处理函数

In [None]:
from torch.utils.data import DataLoader
prompt = '''You are a helpful math assistant. You will be given a math problem and you will answer it. 
Question : Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Answer : Natalia sold 48/2 = <<48/2=24>>24 clips in May. Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May. This answer is 72.

Question : What is the sum of the first 10 even numbers?
Answer : The sum of the first 10 even numbers is 2+4+6+8+10+12+14+16+18+20 = <<2+4+6+8+10+12+14+16+18+20=78>>78. This answer is 78.

'''
dataset = GSM8KDataset(train_ds, tokenizer,prompt=prompt)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"
COMPLETE_RE= re.compile(r"This answer is (\-?[0-9\.\,]+)")

def extract_answer(completion):
    match = ANS_RE.search(completion)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return INVALID_ANS
    
def is_correct(generated_answer, true_answer ):
    generated_answer = extract_answer(generated_answer)
    if generated_answer == INVALID_ANS :
        return False
    
    return extract_answer(true_answer) == gt_answer




# 计算总的准确率
correct = 0
total = 0

# 推理并评估准确率
for batch in dataloader:
    print(batch["input_ids"].size())
    Question = batch["input_ids"].to(device)
    
    attention_mask = batch["attention_mask"].to(device)
    true_answers = batch["answer"]
    
    # 推理
    with torch.no_grad():
        outputs = model.generate(Question, attention_mask=attention_mask, max_length=1024)
    
    # 解码生成的答案
    generated_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    # 检查生成的答案与真实答案是否匹配
    for generated_answer, true_answer in zip(generated_answers, true_answers):
        if evaluate_accuracy(generated_answer, true_answer):
            correct += 1
        total += 1

    



tensor([ 2610,   525,   264, 10950,  6888, 17847,    13,  1446,   686,   387,
         2661,   264,  6888,  3491,   323,   498,   686,  4226,   432,    13,
          715, 14582,   549, 41601,   685,  6088, 26111,   311,   220,    19,
           23,   315,  1059,  4780,   304,  5813,    11,   323,  1221,  1340,
         6088,  4279,   438,  1657, 26111,   304,  3217,    13,  2585,  1657,
        26111,  1521, 41601,   685,  4559, 30055,   304,  5813,   323,  3217,
         5267, 16141,   549, 41601,   685,  6088,   220,    19,    23,    14,
           17,   284,  1115,    19,    23,    14,    17,    28,    17,    19,
         2452,    17,    19, 26111,   304,  3217,    13, 41601,   685,  6088,
          220,    19,    23,    10,    17,    19,   284,  1115,    19,    23,
           10,    17,    19,    28,    22,    17,  2452,    22,    17, 26111,
        30055,   304,  5813,   323,  3217,    13,  1096,  4226,   374,   220,
           22,    17,   382, 14582,   549,  3555,   374,   279, 

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## SFT
* 全参数微调

In [None]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments
train_dataset = GSM8KDataset(ds['train'], tokenizer)
test_dataset = GSM8KDataset(ds['test'], tokenizer)

# 配置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=3,
    eval_steps=3,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# 训练
trainer.train()

# 保存模型
trainer.save_model("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

