# 基于LLM微调的数学推理任务

In [1]:
import os 
from tqdm import tqdm
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import one_hot
from sklearn.metrics import accuracy_score, f1_score
from evaluate import load
from datasets import load_dataset

os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

2024-12-07 20:31:05.651842: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# # 设置代理
# import subprocess
# import os

# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# output = result.stdout
# for line in output.splitlines():
#     if '=' in line:
#         var, value = line.split('=', 1)
#         os.environ[var] = value

In [6]:
from datasets import load_dataset



test_dataset = load_dataset('json', data_files='./MATH_test.jsonl', split='train')
train_dataset = load_dataset('json', data_files='./MATH_train.jsonl', split='train')



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
def get_dataloader(prompt, batch_size=1):
    def preprocess(dataset, shuffle=True):
        def collate_fn(batch):
            questions =  [[{"role": "user", "content": prompt.format(text=item['problem'])}] for item in batch]
            answers = [item["solution"] for item in batch]
            return questions, answers
        return DataLoader(
            dataset,
            shuffle=shuffle,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
    testset = test_dataset
    return preprocess(testset, shuffle=False)

In [15]:
#  ## setup math_equivalence ： 在math工作目录中 pip install .
# import math_equivalence
from modelscope import AutoModelForCausalLM, AutoTokenizer
import torch

from tqdm import tqdm
import json

def eval_process(model_name, tokenizer_name, prompt, batch_size=1, ratio=1):
    # 检查是否有可用的 GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # 加载模型和分词器到 GPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",         # 如果有多个 GPU，可以自动分配
        cache_dir='/autodl-tcachemp/'
    ).to(device)  # 确保模型在 GPU 上

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # 加载数据
    testloader = get_dataloader(prompt, batch_size)
    print(testloader)
    
    answers = []
    replies = []

    # 遍历数据集
    for idx, (texts, truths) in tqdm(enumerate(testloader), total=int(len(testloader) * ratio)):
        if idx >= int(len(testloader) * ratio):
            break

        # 应用模板并生成输入
        texts = [tokenizer.apply_chat_template(
            text,
            tokenize=False,
            add_generation_prompt=True
        ) for text in texts]

        # 将输入张量移到 GPU 上
        model_inputs = tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            padding_side='left'
        ).to(device)

        # 在 GPU 上生成文本
        with torch.no_grad():  # 禁用梯度计算，节省显存
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=256
            )

        # 截取生成的结果
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        # 解码生成的结果
        responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        replies.extend(responses)
        answers.extend(truths)
        print(len(replies))
    # 计算正确率
    # total = 0
    # correct = 0
    # for reply, answer in zip(replies, answers):
    #     total += 1
    #     if math_equivalence.is_equiv(reply, answer) :
    #         correct += 1
    # print(f"正确率: {(correct / total) * 100: .2f}%")
    results = [{"answer": a, "reply": r} for a, r in zip(answers, replies)]
    output_file = "results_lora.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"Results saved to {output_file}")

    return replies,answers

In [16]:
model_name = "output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-464"
tokenizer_name = "Qwen/Qwen2.5-0.5B-Instruct"
torch.cuda.empty_cache()
prompt = '''You are a highly skilled mathematician capable of solving complex grade-school math problems step by step. Please read the problem carefully and solve it with clear reasoning. Follow these instructions:

- Identify the known information and the question being asked.
- Break the solution into logical steps, providing clear explanations for each.
- Show all calculations and intermediate results.
- Conclude with a final answer.

Output your response in the following format:
[Explanation and calculations]
#### [Final numerical answer]   

Here is the problem:
{text}'''
eval_process(model_name, tokenizer_name, prompt, batch_size=32, ratio=1)

Using device: cuda
Downloading Model to directory: /mnt/workspace/.cache/modelscope/hub/Qwen/Qwen2.5-0.5B-Instruct


Downloading [config.json]:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading [configuration.json]:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading [generation_config.json]:   0%|          | 0.00/242 [00:00<?, ?B/s]

Downloading [LICENSE]:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Downloading [merges.txt]:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading [README.md]:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

Downloading [tokenizer.json]:   0%|          | 0.00/6.71M [00:00<?, ?B/s]

Downloading [tokenizer_config.json]:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

Downloading [vocab.json]:   0%|          | 0.00/2.65M [00:00<?, ?B/s]

[INFO:modelscope] Creating symbolic link /mnt/workspace/.cache/modelscope/hub/Qwen/Qwen2___5-0___5B-Instruct -> /mnt/workspace/.cache/modelscope/hub/Qwen/Qwen2.5-0.5B-Instruct.


<torch.utils.data.dataloader.DataLoader object at 0x7f537b6cbf70>


  1%|          | 1/157 [00:17<44:52, 17.26s/it]

32


  1%|▏         | 2/157 [00:37<48:36, 18.82s/it]

64


  2%|▏         | 3/157 [00:54<46:36, 18.16s/it]

96


  3%|▎         | 4/157 [01:14<47:58, 18.82s/it]

128


  3%|▎         | 5/157 [01:30<45:35, 18.00s/it]

160


  4%|▍         | 6/157 [01:49<45:40, 18.15s/it]

192


  4%|▍         | 7/157 [02:05<43:41, 17.48s/it]

224


  5%|▌         | 8/157 [02:21<42:14, 17.01s/it]

256


  6%|▌         | 9/157 [02:37<41:27, 16.80s/it]

288


  6%|▋         | 10/157 [02:54<41:03, 16.76s/it]

320


  7%|▋         | 11/157 [03:10<40:21, 16.59s/it]

352


  8%|▊         | 12/157 [03:37<47:56, 19.83s/it]

384


  8%|▊         | 13/157 [03:54<44:56, 18.73s/it]

416


  9%|▉         | 14/157 [04:10<43:03, 18.06s/it]

448


 10%|▉         | 15/157 [04:36<48:04, 20.32s/it]

480


 10%|█         | 16/157 [04:51<44:24, 18.89s/it]

512


 11%|█         | 17/157 [05:08<42:21, 18.16s/it]

544


 11%|█▏        | 18/157 [05:24<40:43, 17.58s/it]

576


 12%|█▏        | 19/157 [05:51<46:57, 20.42s/it]

608


 13%|█▎        | 20/157 [06:07<43:35, 19.09s/it]

640


 13%|█▎        | 21/157 [06:23<41:15, 18.20s/it]

672


 14%|█▍        | 22/157 [06:41<40:47, 18.13s/it]

704


 15%|█▍        | 23/157 [07:01<41:32, 18.60s/it]

736


 15%|█▌        | 24/157 [07:17<39:45, 17.94s/it]

768


 16%|█▌        | 25/157 [07:40<42:25, 19.29s/it]

800


 17%|█▋        | 26/157 [07:56<40:08, 18.39s/it]

832


 17%|█▋        | 27/157 [08:12<38:33, 17.80s/it]

864


 18%|█▊        | 28/157 [08:29<37:27, 17.42s/it]

896


 18%|█▊        | 29/157 [08:45<36:25, 17.07s/it]

928


 19%|█▉        | 30/157 [09:04<37:13, 17.59s/it]

960


 20%|█▉        | 31/157 [09:21<36:28, 17.37s/it]

992


 20%|██        | 32/157 [09:37<35:36, 17.09s/it]

1024


 21%|██        | 33/157 [09:53<34:46, 16.83s/it]

1056


 22%|██▏       | 34/157 [10:10<34:26, 16.80s/it]

1088


 22%|██▏       | 35/157 [10:26<33:44, 16.60s/it]

1120


 23%|██▎       | 36/157 [10:42<33:09, 16.44s/it]

1152


 24%|██▎       | 37/157 [11:01<34:11, 17.10s/it]

1184


 24%|██▍       | 38/157 [11:17<33:33, 16.92s/it]

1216


 25%|██▍       | 39/157 [11:37<34:35, 17.59s/it]

1248


 25%|██▌       | 40/157 [11:56<35:23, 18.15s/it]

1280


 26%|██▌       | 41/157 [12:21<38:54, 20.13s/it]

1312


 27%|██▋       | 42/157 [12:46<41:33, 21.68s/it]

1344


 27%|██▋       | 43/157 [13:03<38:15, 20.13s/it]

1376


 28%|██▊       | 44/157 [13:24<38:51, 20.64s/it]

1408


 29%|██▊       | 45/157 [13:41<36:06, 19.34s/it]

1440


 29%|██▉       | 46/157 [14:01<36:24, 19.68s/it]

1472


 30%|██▉       | 47/157 [14:19<34:54, 19.04s/it]

1504


 31%|███       | 48/157 [14:57<45:12, 24.89s/it]

1536


 31%|███       | 49/157 [15:16<41:20, 22.97s/it]

1568


 32%|███▏      | 50/157 [15:34<38:20, 21.50s/it]

1600


 32%|███▏      | 51/157 [15:50<35:09, 19.90s/it]

1632


 33%|███▎      | 52/157 [16:07<33:22, 19.07s/it]

1664


 34%|███▍      | 53/157 [16:30<34:44, 20.04s/it]

1696


 34%|███▍      | 54/157 [16:48<33:35, 19.57s/it]

1728


 35%|███▌      | 55/157 [17:06<32:14, 18.96s/it]

1760


 36%|███▌      | 56/157 [17:24<31:46, 18.88s/it]

1792


 36%|███▋      | 57/157 [17:43<31:14, 18.75s/it]

1824


 37%|███▋      | 58/157 [18:03<31:32, 19.12s/it]

1856


 38%|███▊      | 59/157 [18:35<37:52, 23.19s/it]

1888


 38%|███▊      | 60/157 [18:55<35:47, 22.14s/it]

1920


 39%|███▉      | 61/157 [19:18<36:02, 22.52s/it]

1952


 39%|███▉      | 62/157 [19:36<33:28, 21.15s/it]

1984


 40%|████      | 63/157 [19:57<32:50, 20.96s/it]

2016


 41%|████      | 64/157 [20:16<31:37, 20.40s/it]

2048


 41%|████▏     | 65/157 [20:34<30:19, 19.78s/it]

2080


 42%|████▏     | 66/157 [20:54<30:01, 19.80s/it]

2112


 43%|████▎     | 67/157 [21:14<29:39, 19.77s/it]

2144


 43%|████▎     | 68/157 [21:43<33:31, 22.60s/it]

2176


 44%|████▍     | 69/157 [22:00<30:31, 20.81s/it]

2208


 45%|████▍     | 70/157 [22:16<28:23, 19.59s/it]

2240


 45%|████▌     | 71/157 [22:33<26:43, 18.65s/it]

2272


 46%|████▌     | 72/157 [22:50<25:38, 18.10s/it]

2304


 46%|████▋     | 73/157 [23:06<24:42, 17.65s/it]

2336


 47%|████▋     | 74/157 [23:30<27:06, 19.59s/it]

2368


 48%|████▊     | 75/157 [23:48<25:59, 19.02s/it]

2400


 48%|████▊     | 76/157 [24:05<24:47, 18.36s/it]

2432


 49%|████▉     | 77/157 [24:22<23:54, 17.93s/it]

2464


 50%|████▉     | 78/157 [24:44<25:07, 19.08s/it]

2496


 50%|█████     | 79/157 [25:00<23:54, 18.40s/it]

2528


 51%|█████     | 80/157 [25:17<22:43, 17.71s/it]

2560


 52%|█████▏    | 81/157 [25:45<26:26, 20.87s/it]

2592


 52%|█████▏    | 82/157 [26:05<25:50, 20.67s/it]

2624


 53%|█████▎    | 83/157 [26:34<28:38, 23.22s/it]

2656


 54%|█████▎    | 84/157 [26:50<25:41, 21.11s/it]

2688


 54%|█████▍    | 85/157 [27:07<23:33, 19.64s/it]

2720


 55%|█████▍    | 86/157 [27:24<22:23, 18.93s/it]

2752


 55%|█████▌    | 87/157 [27:40<21:13, 18.19s/it]

2784


 56%|█████▌    | 88/157 [27:57<20:26, 17.77s/it]

2816


 57%|█████▋    | 89/157 [28:16<20:31, 18.11s/it]

2848


 57%|█████▋    | 90/157 [28:36<20:48, 18.63s/it]

2880


 58%|█████▊    | 91/157 [28:53<19:59, 18.17s/it]

2912


 59%|█████▊    | 92/157 [29:09<19:03, 17.59s/it]

2944


 59%|█████▉    | 93/157 [29:26<18:29, 17.33s/it]

2976


 60%|█████▉    | 94/157 [29:44<18:20, 17.47s/it]

3008


 61%|██████    | 95/157 [30:02<18:12, 17.63s/it]

3040


 61%|██████    | 96/157 [30:20<18:04, 17.78s/it]

3072


 62%|██████▏   | 97/157 [30:37<17:42, 17.70s/it]

3104


 62%|██████▏   | 98/157 [30:54<17:01, 17.31s/it]

3136


 63%|██████▎   | 99/157 [31:10<16:24, 16.97s/it]

3168


 64%|██████▎   | 100/157 [31:26<15:52, 16.71s/it]

3200


 64%|██████▍   | 101/157 [31:42<15:24, 16.50s/it]

3232


 65%|██████▍   | 102/157 [31:58<14:59, 16.36s/it]

3264


 66%|██████▌   | 103/157 [32:14<14:34, 16.19s/it]

3296


 66%|██████▌   | 104/157 [32:30<14:15, 16.14s/it]

3328


 67%|██████▋   | 105/157 [32:46<13:52, 16.02s/it]

3360


 68%|██████▊   | 106/157 [33:02<13:47, 16.22s/it]

3392


 68%|██████▊   | 107/157 [33:19<13:35, 16.30s/it]

3424


 69%|██████▉   | 108/157 [33:35<13:21, 16.35s/it]

3456


 69%|██████▉   | 109/157 [33:52<13:04, 16.34s/it]

3488


 70%|███████   | 110/157 [34:07<12:42, 16.21s/it]

3520


 71%|███████   | 111/157 [34:24<12:30, 16.32s/it]

3552


 71%|███████▏  | 112/157 [34:40<12:09, 16.22s/it]

3584


 72%|███████▏  | 113/157 [35:05<13:49, 18.86s/it]

3616


 73%|███████▎  | 114/157 [35:17<12:04, 16.85s/it]

3648


 73%|███████▎  | 115/157 [35:34<11:42, 16.72s/it]

3680


 74%|███████▍  | 116/157 [35:51<11:35, 16.96s/it]

3712


 75%|███████▍  | 117/157 [36:11<11:52, 17.81s/it]

3744


 75%|███████▌  | 118/157 [36:29<11:34, 17.81s/it]

3776


 76%|███████▌  | 119/157 [36:48<11:33, 18.26s/it]

3808


 76%|███████▋  | 120/157 [37:09<11:45, 19.06s/it]

3840


 77%|███████▋  | 121/157 [37:26<11:07, 18.53s/it]

3872


 78%|███████▊  | 122/157 [37:43<10:33, 18.10s/it]

3904


 78%|███████▊  | 123/157 [38:00<10:00, 17.65s/it]

3936


 79%|███████▉  | 124/157 [38:16<09:24, 17.10s/it]

3968


 80%|███████▉  | 125/157 [38:34<09:20, 17.52s/it]

4000


 80%|████████  | 126/157 [38:52<09:05, 17.59s/it]

4032


 81%|████████  | 127/157 [39:08<08:36, 17.22s/it]

4064


 82%|████████▏ | 128/157 [39:26<08:23, 17.36s/it]

4096


 82%|████████▏ | 129/157 [39:43<08:04, 17.31s/it]

4128


 83%|████████▎ | 130/157 [40:03<08:03, 17.91s/it]

4160


 83%|████████▎ | 131/157 [40:24<08:11, 18.90s/it]

4192


 84%|████████▍ | 132/157 [40:43<07:51, 18.87s/it]

4224


 85%|████████▍ | 133/157 [40:59<07:15, 18.13s/it]

4256


 85%|████████▌ | 134/157 [41:17<06:56, 18.11s/it]

4288


 86%|████████▌ | 135/157 [41:34<06:29, 17.71s/it]

4320


 87%|████████▋ | 136/157 [41:50<06:02, 17.28s/it]

4352


 87%|████████▋ | 137/157 [42:09<05:56, 17.82s/it]

4384


 88%|████████▊ | 138/157 [42:38<06:41, 21.12s/it]

4416


 89%|████████▊ | 139/157 [42:58<06:14, 20.80s/it]

4448


 89%|████████▉ | 140/157 [43:22<06:09, 21.72s/it]

4480


 90%|████████▉ | 141/157 [43:39<05:26, 20.39s/it]

4512


 90%|█████████ | 142/157 [43:56<04:51, 19.42s/it]

4544


 91%|█████████ | 143/157 [44:14<04:26, 19.00s/it]

4576


 92%|█████████▏| 144/157 [44:31<03:56, 18.16s/it]

4608


 92%|█████████▏| 145/157 [44:48<03:33, 17.82s/it]

4640


 93%|█████████▎| 146/157 [45:05<03:13, 17.56s/it]

4672


 94%|█████████▎| 147/157 [45:21<02:52, 17.27s/it]

4704


 94%|█████████▍| 148/157 [45:38<02:34, 17.14s/it]

4736


 95%|█████████▍| 149/157 [45:56<02:19, 17.38s/it]

4768


 96%|█████████▌| 150/157 [46:13<02:01, 17.32s/it]

4800


 96%|█████████▌| 151/157 [46:29<01:41, 16.93s/it]

4832


 97%|█████████▋| 152/157 [46:45<01:22, 16.52s/it]

4864


 97%|█████████▋| 153/157 [47:01<01:05, 16.50s/it]

4896


 98%|█████████▊| 154/157 [47:19<00:50, 16.83s/it]

4928


 99%|█████████▊| 155/157 [47:37<00:34, 17.11s/it]

4960


 99%|█████████▉| 156/157 [47:54<00:17, 17.10s/it]

4992


100%|██████████| 157/157 [48:09<00:00, 18.40s/it]

5000
Results saved to results_lora.json





(['We can rewrite the denominator as $(x+3)(x-2)$.  Thus, there are no vertical asymptotes where $x= \\pm 3$ or $x=-2$, so we look at the factors that are not equal to zero: $(x+2)$ and $(x-3)$.  These factors correspond to two vertical asymptotes.  Therefore, the number of vertical asymptotes is $\\boxed{2}$.',
  'First we calculate $120\\%$ of 30: \\begin{align*}\n120\\%&=120\\div100\\\\\n&=\\frac{6}{5}\\\\\n&=1.2\\\\\n&=120\\%\\text{ of }30\\\\\n&=1.2\\times30\\\\\n&=36.\n\\end{align*}Next we calculate $130\\%$ of 20: \\begin{align*}\n130\\%&=130\\div100\\\\\n&=\\frac{7}{5}\\\\\n&=\\frac{35}{25}\\\\\n&=\\frac{7}{5}\\times\\frac{25}{25}\\\\\n&=\\frac{175}{125}\\\\\n&=\\frac{7}{5}\\times\\frac{25}{25}\\\\\n&=\\frac{7}{5}\\times4\\\\\n&=8.\n\\end{align*}Finally, we find the positive difference between 36 and 8: \\begin{align*}\n36-8&=28.\n\\end{align',
  'Since $\\lceil x\\rceil$ represents the smallest integer greater than or equal to $x$, we have $x - \\lceil x\\rceil = \\dfrac{23}{7

In [9]:
# 导入swift框架进行微调
from swift.llm import (
    DatasetName, InferArguments, ModelType, SftArguments,
    infer_main, sft_main, app_ui_main
)

[INFO:swift] Successfully registered `/usr/local/lib/python3.10/site-packages/swift/llm/data/dataset_info.json`


In [10]:
model_type = ModelType.qwen2_5_0_5b_instruct
torch.cuda.empty_cache()
print(torch.cuda.memory_summary())
sft_args = SftArguments(
    model_type=model_type,
    dataset=['MATH_train_fineture.jsonl'],
    output_dir='output_lora',
    max_length=4096,
    # sft_type = SftType.full,
    
    )
result = sft_main(sft_args)
last_model_checkpoint = result['last_model_checkpoint']

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

[INFO:swift] Setting template_type: qwen2_5
[INFO:swift] Setting args.lazy_tokenize: False
[INFO:swift] Setting args.dataloader_num_workers: 1
[INFO:swift] output_dir: /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247
[INFO:swift] Start time of running main: 2024-12-07 20:32:47.412100
[INFO:swift] args: SftArguments(model_type='qwen2_5-0_5b-instruct', model_id_or_path='qwen/Qwen2.5-0.5B-Instruct', model_revision='master', full_determinism=False, sft_type='lora', freeze_parameters=[], freeze_vit=False, freeze_parameters_ratio=0.0, additional_trainable_parameters=[], tuner_backend='peft', template_type='qwen2_5', output_dir='/mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247', add_output_dir_suffix=True, ddp_backend=None, ddp_find_unused_parameters=None, ddp_broadcast_buffers=None, ddp_timeout=1800, seed=42, resume_from_checkpoint=None, resume_only_model=False, ignore_data_skip=False, dtype='bf16', packing=False, train_backend='transformers', tp=1, pp=1

device_count: 1
rank: -1, local_rank: -1, world_size: 1, local_world_size: 1
Downloading Model to directory: /mnt/workspace/.cache/modelscope/hub/qwen/Qwen2.5-0.5B-Instruct




Downloading [config.json]:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading [configuration.json]:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading [generation_config.json]:   0%|          | 0.00/242 [00:00<?, ?B/s]

Downloading [LICENSE]:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Downloading [merges.txt]:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading [model.safetensors]:   0%|          | 0.00/942M [00:00<?, ?B/s]

Downloading [README.md]:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

Downloading [tokenizer.json]:   0%|          | 0.00/6.71M [00:00<?, ?B/s]

Downloading [tokenizer_config.json]:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

Downloading [vocab.json]:   0%|          | 0.00/2.65M [00:00<?, ?B/s]

[INFO:modelscope] Creating symbolic link /mnt/workspace/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct -> /mnt/workspace/.cache/modelscope/hub/qwen/Qwen2.5-0.5B-Instruct.
[INFO:swift] Loading the model using model_dir: /mnt/workspace/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct
[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
[INFO:swift] model.max_model_len: 32768
[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
[INFO:swift] model_config: Qwen2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/mnt/workspace/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7500 [00:00<?, ? examples/s]

[INFO:swift] train_dataset: Dataset({
    features: ['query', 'response'],
    num_rows: 7425
})
[INFO:swift] val_dataset: Dataset({
    features: ['query', 'response'],
    num_rows: 75
})
[INFO:swift] [INPUT_IDS] [151644, 8948, 198, 2610, 525, 1207, 16948, 11, 3465, 553, 54364, 14817, 13, 1446, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 2610, 525, 264, 7548, 25530, 20976, 1103, 12875, 315, 21828, 6351, 11972, 34689, 6888, 5322, 3019, 553, 3019, 13, 5209, 1349, 279, 3491, 15516, 323, 11625, 432, 448, 2797, 32711, 13, 11112, 1493, 11221, 1447, 12, 64547, 279, 3881, 1995, 323, 279, 3405, 1660, 4588, 624, 12, 15623, 279, 6291, 1119, 19819, 7354, 11, 8241, 2797, 40841, 369, 1817, 624, 12, 6928, 678, 28117, 323, 28439, 3059, 624, 12, 1200, 857, 448, 264, 1590, 4226, 382, 5097, 697, 2033, 304, 279, 2701, 3561, 510, 58, 69769, 323, 28117, 921, 820, 508, 19357, 34776, 4226, 60, 33933, 8420, 374, 279, 3491, 510, 4340, 1657, 6785, 9363, 315, 220, 24, 21, 525, 1083, 65060, 315, 2

Map:   0%|          | 0/7425 [00:00<?, ?it/s]

Map:   0%|          | 0/75 [00:00<?, ?it/s]

[INFO:swift] Dataset Token Length: 434.309899±249.057953, min=163.000000, max=2849.000000, size=7425
[INFO:swift] Dataset Token Length: 441.853333±276.012497, min=182.000000, max=1915.000000, size=75
[INFO:swift] training_args: Seq2SeqTrainingArguments(
_n_gpu=1,
acc_strategy=token,
accelerator_config={'split_batches': False, 'dispatch_batches': False, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_epsilon=1e-08,
additional_saved_files=[],
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=42,
dataloader_drop_last=False,
dataloader_num_workers=1,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeou

[2024-12-07 20:33:27,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  super().__init__(
df: /root/.triton/autotune: 没有那个文件或目录
[INFO:swift] The SftArguments will be saved in: /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/sft_args.json
[INFO:swift] The Seq2SeqTrainingArguments will be saved in: /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/training_args.json
[INFO:swift] The logging file will be saved in: /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/logging.jsonl


Train:   0%|          | 0/464 [00:00<?, ?it/s]

{'loss': 0.91355658, 'acc': 0.74084908, 'grad_norm': 2.38017082, 'learning_rate': 4.17e-06, 'memory(GiB)': 4.29, 'train_speed(iter/s)': 0.136182, 'epoch': 0.0, 'global_step/max_steps': '1/464', 'percentage': '0.22%', 'elapsed_time': '6s', 'remaining_time': '51m 31s'}
{'loss': 0.81158853, 'acc': 0.77031773, 'grad_norm': 2.14467955, 'learning_rate': 2.083e-05, 'memory(GiB)': 7.78, 'train_speed(iter/s)': 0.145675, 'epoch': 0.01, 'global_step/max_steps': '5/464', 'percentage': '1.08%', 'elapsed_time': '33s', 'remaining_time': '51m 29s'}
{'loss': 0.72963877, 'acc': 0.77633357, 'grad_norm': 1.33716571, 'learning_rate': 4.167e-05, 'memory(GiB)': 13.98, 'train_speed(iter/s)': 0.146118, 'epoch': 0.02, 'global_step/max_steps': '10/464', 'percentage': '2.16%', 'elapsed_time': '1m 7s', 'remaining_time': '51m 16s'}
{'loss': 0.85885458, 'acc': 0.75902586, 'grad_norm': 1.18980491, 'learning_rate': 6.25e-05, 'memory(GiB)': 13.98, 'train_speed(iter/s)': 0.146984, 'epoch': 0.03, 'global_step/max_steps':

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-50


{'eval_loss': 0.73037171, 'eval_acc': 0.81223889, 'eval_runtime': 11.3508, 'eval_samples_per_second': 6.607, 'eval_steps_per_second': 6.607, 'epoch': 0.11, 'global_step/max_steps': '50/464', 'percentage': '10.78%', 'elapsed_time': '5m 59s', 'remaining_time': '49m 33s'}
{'loss': 0.77352223, 'acc': 0.78892064, 'grad_norm': 0.72599179, 'learning_rate': 9.878e-05, 'memory(GiB)': 10.47, 'train_speed(iter/s)': 0.139922, 'epoch': 0.12, 'global_step/max_steps': '55/464', 'percentage': '11.85%', 'elapsed_time': '6m 32s', 'remaining_time': '48m 38s'}
{'loss': 0.71456051, 'acc': 0.79539809, 'grad_norm': 0.51820445, 'learning_rate': 9.836e-05, 'memory(GiB)': 10.47, 'train_speed(iter/s)': 0.140646, 'epoch': 0.13, 'global_step/max_steps': '60/464', 'percentage': '12.93%', 'elapsed_time': '7m 5s', 'remaining_time': '47m 47s'}
{'loss': 0.72607727, 'acc': 0.78688092, 'grad_norm': 0.68249834, 'learning_rate': 9.787e-05, 'memory(GiB)': 10.47, 'train_speed(iter/s)': 0.141185, 'epoch': 0.14, 'global_step/m

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-100


{'eval_loss': 0.72036463, 'eval_acc': 0.81565166, 'eval_runtime': 11.4088, 'eval_samples_per_second': 6.574, 'eval_steps_per_second': 6.574, 'epoch': 0.22, 'global_step/max_steps': '100/464', 'percentage': '21.55%', 'elapsed_time': '11m 45s', 'remaining_time': '42m 49s'}
{'loss': 0.79397178, 'acc': 0.78052149, 'grad_norm': 0.57874644, 'learning_rate': 9.187e-05, 'memory(GiB)': 6.12, 'train_speed(iter/s)': 0.141801, 'epoch': 0.23, 'global_step/max_steps': '105/464', 'percentage': '22.63%', 'elapsed_time': '12m 19s', 'remaining_time': '42m 9s'}
{'loss': 0.7426538, 'acc': 0.77643104, 'grad_norm': 0.66820115, 'learning_rate': 9.087e-05, 'memory(GiB)': 6.12, 'train_speed(iter/s)': 0.141575, 'epoch': 0.24, 'global_step/max_steps': '110/464', 'percentage': '23.71%', 'elapsed_time': '12m 56s', 'remaining_time': '41m 38s'}
{'loss': 0.68767738, 'acc': 0.78864865, 'grad_norm': 0.61255258, 'learning_rate': 8.981e-05, 'memory(GiB)': 6.12, 'train_speed(iter/s)': 0.1416, 'epoch': 0.25, 'global_step/m

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-150


{'eval_loss': 0.71641868, 'eval_acc': 0.81512209, 'eval_runtime': 11.3331, 'eval_samples_per_second': 6.618, 'eval_steps_per_second': 6.618, 'epoch': 0.32, 'global_step/max_steps': '150/464', 'percentage': '32.33%', 'elapsed_time': '17m 36s', 'remaining_time': '36m 51s'}
{'loss': 0.74061103, 'acc': 0.78433986, 'grad_norm': 0.7693091, 'learning_rate': 7.968e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142103, 'epoch': 0.33, 'global_step/max_steps': '155/464', 'percentage': '33.41%', 'elapsed_time': '18m 10s', 'remaining_time': '36m 13s'}
{'loss': 0.74071627, 'acc': 0.78340116, 'grad_norm': 0.79010719, 'learning_rate': 7.822e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142316, 'epoch': 0.34, 'global_step/max_steps': '160/464', 'percentage': '34.48%', 'elapsed_time': '18m 43s', 'remaining_time': '35m 34s'}
{'loss': 0.78152738, 'acc': 0.78044887, 'grad_norm': 0.53151244, 'learning_rate': 7.673e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142406, 'epoch': 0.36, 'global_ste

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-200


{'eval_loss': 0.71333408, 'eval_acc': 0.81565166, 'eval_runtime': 11.3495, 'eval_samples_per_second': 6.608, 'eval_steps_per_second': 6.608, 'epoch': 0.43, 'global_step/max_steps': '200/464', 'percentage': '43.10%', 'elapsed_time': '23m 25s', 'remaining_time': '30m 55s'}
{'loss': 0.68194056, 'acc': 0.80190716, 'grad_norm': 0.51206481, 'learning_rate': 6.374e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142411, 'epoch': 0.44, 'global_step/max_steps': '205/464', 'percentage': '44.18%', 'elapsed_time': '23m 58s', 'remaining_time': '30m 17s'}
{'loss': 0.76752372, 'acc': 0.7783987, 'grad_norm': 0.74560529, 'learning_rate': 6.202e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142678, 'epoch': 0.45, 'global_step/max_steps': '210/464', 'percentage': '45.26%', 'elapsed_time': '24m 31s', 'remaining_time': '29m 39s'}
{'loss': 0.68575902, 'acc': 0.78837314, 'grad_norm': 0.50078559, 'learning_rate': 6.028e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142477, 'epoch': 0.46, 'global_ste

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-250


{'eval_loss': 0.71235025, 'eval_acc': 0.81547514, 'eval_runtime': 11.3845, 'eval_samples_per_second': 6.588, 'eval_steps_per_second': 6.588, 'epoch': 0.54, 'global_step/max_steps': '250/464', 'percentage': '53.88%', 'elapsed_time': '29m 17s', 'remaining_time': '25m 4s'}
{'loss': 0.75194349, 'acc': 0.77914896, 'grad_norm': 0.63546473, 'learning_rate': 4.608e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142215, 'epoch': 0.55, 'global_step/max_steps': '255/464', 'percentage': '54.96%', 'elapsed_time': '29m 52s', 'remaining_time': '24m 29s'}
{'loss': 0.68028584, 'acc': 0.78044305, 'grad_norm': 0.54739058, 'learning_rate': 4.43e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142369, 'epoch': 0.56, 'global_step/max_steps': '260/464', 'percentage': '56.03%', 'elapsed_time': '30m 25s', 'remaining_time': '23m 52s'}
{'loss': 0.68942895, 'acc': 0.79251385, 'grad_norm': 0.60076696, 'learning_rate': 4.253e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142438, 'epoch': 0.57, 'global_step

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-300


{'eval_loss': 0.71081698, 'eval_acc': 0.81582818, 'eval_runtime': 11.3441, 'eval_samples_per_second': 6.611, 'eval_steps_per_second': 6.611, 'epoch': 0.65, 'global_step/max_steps': '300/464', 'percentage': '64.66%', 'elapsed_time': '35m 5s', 'remaining_time': '19m 11s'}
{'loss': 0.70843954, 'acc': 0.7848114, 'grad_norm': 0.45110521, 'learning_rate': 2.891e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142465, 'epoch': 0.66, 'global_step/max_steps': '305/464', 'percentage': '65.73%', 'elapsed_time': '35m 40s', 'remaining_time': '18m 35s'}
{'loss': 0.73362336, 'acc': 0.78738465, 'grad_norm': 0.91797727, 'learning_rate': 2.73e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142547, 'epoch': 0.67, 'global_step/max_steps': '310/464', 'percentage': '66.81%', 'elapsed_time': '36m 14s', 'remaining_time': '18m 0s'}
{'loss': 0.69626832, 'acc': 0.7963336, 'grad_norm': 0.54932636, 'learning_rate': 2.572e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142794, 'epoch': 0.68, 'global_step/ma

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-350


{'eval_loss': 0.71001202, 'eval_acc': 0.81676964, 'eval_runtime': 11.3381, 'eval_samples_per_second': 6.615, 'eval_steps_per_second': 6.615, 'epoch': 0.75, 'global_step/max_steps': '350/464', 'percentage': '75.43%', 'elapsed_time': '40m 52s', 'remaining_time': '13m 18s'}
{'loss': 0.72513366, 'acc': 0.78516135, 'grad_norm': 0.68620241, 'learning_rate': 1.439e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142664, 'epoch': 0.76, 'global_step/max_steps': '355/464', 'percentage': '76.51%', 'elapsed_time': '41m 27s', 'remaining_time': '12m 43s'}
{'loss': 0.71346474, 'acc': 0.79310122, 'grad_norm': 0.55806381, 'learning_rate': 1.316e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142662, 'epoch': 0.78, 'global_step/max_steps': '360/464', 'percentage': '77.59%', 'elapsed_time': '42m 2s', 'remaining_time': '12m 8s'}
{'loss': 0.76218324, 'acc': 0.77003689, 'grad_norm': 0.45754746, 'learning_rate': 1.198e-05, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142788, 'epoch': 0.79, 'global_step

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-400


{'eval_loss': 0.70959073, 'eval_acc': 0.81653427, 'eval_runtime': 11.3457, 'eval_samples_per_second': 6.61, 'eval_steps_per_second': 6.61, 'epoch': 0.86, 'global_step/max_steps': '400/464', 'percentage': '86.21%', 'elapsed_time': '46m 41s', 'remaining_time': '7m 28s'}
{'loss': 0.67232885, 'acc': 0.79489188, 'grad_norm': 0.5494017, 'learning_rate': 4.37e-06, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142825, 'epoch': 0.87, 'global_step/max_steps': '405/464', 'percentage': '87.28%', 'elapsed_time': '47m 14s', 'remaining_time': '6m 52s'}
{'loss': 0.74536309, 'acc': 0.78196411, 'grad_norm': 0.7172007, 'learning_rate': 3.67e-06, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.14277, 'epoch': 0.88, 'global_step/max_steps': '410/464', 'percentage': '88.36%', 'elapsed_time': '47m 51s', 'remaining_time': '6m 18s'}
{'loss': 0.74554377, 'acc': 0.78539166, 'grad_norm': 0.73661417, 'learning_rate': 3.03e-06, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142827, 'epoch': 0.89, 'global_step/max_step

Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-450


{'eval_loss': 0.70992064, 'eval_acc': 0.81676964, 'eval_runtime': 11.301, 'eval_samples_per_second': 6.637, 'eval_steps_per_second': 6.637, 'epoch': 0.97, 'global_step/max_steps': '450/464', 'percentage': '96.98%', 'elapsed_time': '52m 29s', 'remaining_time': '1m 37s'}
{'loss': 0.73779354, 'acc': 0.77980127, 'grad_norm': 0.51511723, 'learning_rate': 1e-07, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.142879, 'epoch': 0.98, 'global_step/max_steps': '455/464', 'percentage': '98.06%', 'elapsed_time': '53m 3s', 'remaining_time': '1m 2s'}
{'loss': 0.74050436, 'acc': 0.7876061, 'grad_norm': 0.52593786, 'learning_rate': 2e-08, 'memory(GiB)': 6.13, 'train_speed(iter/s)': 0.143038, 'epoch': 0.99, 'global_step/max_steps': '460/464', 'percentage': '99.14%', 'elapsed_time': '53m 35s', 'remaining_time': '27s'}



Val:   0%|          | 0/75 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-464
[INFO:swift] last_model_checkpoint: /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-464


{'eval_loss': 0.70946109, 'eval_acc': 0.81659312, 'eval_runtime': 11.2344, 'eval_samples_per_second': 6.676, 'eval_steps_per_second': 6.676, 'epoch': 1.0, 'global_step/max_steps': '464/464', 'percentage': '100.00%', 'elapsed_time': '54m 14s', 'remaining_time': '0s'}
{'train_runtime': 3254.2059, 'train_samples_per_second': 2.282, 'train_steps_per_second': 0.143, 'train_loss': 0.73837142, 'epoch': 1.0, 'global_step/max_steps': '464/464', 'percentage': '100.00%', 'elapsed_time': '54m 14s', 'remaining_time': '0s'}


[INFO:swift] best_model_checkpoint: /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/checkpoint-464
[INFO:swift] images_dir: /mnt/workspace/output_lora/qwen2_5-0_5b-instruct/v0-20241207-203247/images
[INFO:swift] End time of running main: 2024-12-07 21:27:49.194640
