# 基于LLM微调的数学推理任务

In [1]:
import os 
from tqdm import tqdm
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import one_hot
from sklearn.metrics import accuracy_score, f1_score
from evaluate import load
from datasets import load_dataset

os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

2024-12-07 17:00:22.043479: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# # 设置代理
# import subprocess
# import os

# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# output = result.stdout
# for line in output.splitlines():
#     if '=' in line:
#         var, value = line.split('=', 1)
#         os.environ[var] = value

In [3]:
from datasets import load_dataset


test_dataset = load_dataset('json', data_files='./MATH_test.jsonl',split='train')
train_dataset = load_dataset('json', data_files='./MATH_train.jsonl', split='train')



In [4]:
def get_dataloader(prompt, batch_size=1):
    def preprocess(dataset, shuffle=True):
        def collate_fn(batch):
            questions =  [[{"role": "user", "content": prompt.format(text=item['problem'])}] for item in batch]
            answers = [item["solution"] for item in batch]
            return questions, answers
        return DataLoader(
            dataset,
            shuffle=shuffle,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
    testset = test_dataset
    return preprocess(testset, shuffle=False)

In [22]:
 ## setup math_equivalence ： 在math工作目录中 pip install .
import math_equivalence
from modelscope import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import json

def eval_process(model_name, tokenizer_name, prompt, batch_size=1, ratio=1):
    # 检查是否有可用的 GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # 加载模型和分词器到 GPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",         # 如果有多个 GPU，可以自动分配
        cache_dir='/autodl-tcachemp/'
    ).to(device)  # 确保模型在 GPU 上

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 加载数据
    testloader = get_dataloader(prompt, batch_size)
    print(testloader)
    
    answers = []
    replies = []

    # 遍历数据集
    for idx, (texts, truths) in tqdm(enumerate(testloader), total=int(len(testloader) * ratio)):
        if idx >= int(len(testloader) * ratio):
            break

        # 应用模板并生成输入
        texts = [tokenizer.apply_chat_template(
            text,
            tokenize=False,
            add_generation_prompt=True
        ) for text in texts]

        # 将输入张量移到 GPU 上
        model_inputs = tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            padding_side='left'
        ).to(device)

        # 在 GPU 上生成文本
        with torch.no_grad():  # 禁用梯度计算，节省显存
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=256
            )

        # 截取生成的结果
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        # 解码生成的结果
        responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        replies.extend(responses)
        answers.extend(truths)
        print(len(replies))
    # 计算正确率
    # total = 0
    # correct = 0
    # for reply, answer in zip(replies, answers):
    #     total += 1
    #     if math_equivalence.is_equiv(reply, answer) :
    #         correct += 1
    # print(f"正确率: {(correct / total) * 100: .2f}%")
    results = [{"answer": a, "reply": r} for a, r in zip(answers, replies)]
    output_file = "results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"Results saved to {output_file}")

    return replies,answers


In [23]:
model_name = "output/qwen2_5-0_5b-instruct/v1-20241207-153823/checkpoint-464"
tokenizer_name = "Qwen/Qwen2.5-0.5B-Instruct"

prompt = '''You are a highly skilled mathematician capable of solving complex grade-school math problems step by step. Please read the problem carefully and solve it with clear reasoning. Follow these instructions:

- Identify the known information and the question being asked.
- Break the solution into logical steps, providing clear explanations for each.
- Show all calculations and intermediate results.
- Conclude with a final answer.

Output your response in the following format:
[Explanation and calculations]
#### [Final numerical answer]   

Here is the problem:
{text}'''
replies,answers = eval_process(model_name, tokenizer_name, prompt, batch_size=32, ratio=1)

Using device: cuda
<torch.utils.data.dataloader.DataLoader object at 0x7f82cf997c40>


  1%|          | 1/157 [00:15<40:16, 15.49s/it]

32


  1%|▏         | 2/157 [00:34<44:49, 17.35s/it]

64


  2%|▏         | 3/157 [00:49<42:31, 16.57s/it]

96


  3%|▎         | 4/157 [01:08<44:15, 17.36s/it]

128


  3%|▎         | 5/157 [01:22<40:59, 16.18s/it]

160


  4%|▍         | 6/157 [01:39<41:20, 16.43s/it]

192


  4%|▍         | 7/157 [01:51<37:56, 15.17s/it]

224


  5%|▌         | 8/157 [02:04<35:55, 14.47s/it]

256


  6%|▌         | 9/157 [02:17<34:22, 13.94s/it]

288


  6%|▋         | 10/157 [02:30<33:25, 13.64s/it]

320


  7%|▋         | 11/157 [02:43<32:34, 13.39s/it]

352


  8%|▊         | 12/157 [03:09<41:36, 17.22s/it]

384


  8%|▊         | 13/157 [03:22<38:31, 16.05s/it]

416


  9%|▉         | 14/157 [03:36<36:26, 15.29s/it]

448


 10%|▉         | 15/157 [04:00<42:36, 18.00s/it]

480


 10%|█         | 16/157 [04:13<38:37, 16.44s/it]

512


 11%|█         | 17/157 [04:27<36:43, 15.74s/it]

544


 11%|█▏        | 18/157 [04:40<34:30, 14.90s/it]

576


 12%|█▏        | 19/157 [05:06<41:48, 18.18s/it]

608


 13%|█▎        | 20/157 [05:19<37:54, 16.60s/it]

640


 13%|█▎        | 21/157 [05:31<34:57, 15.42s/it]

672


 14%|█▍        | 22/157 [05:48<35:20, 15.71s/it]

704


 15%|█▍        | 23/157 [06:06<36:56, 16.54s/it]

736


 15%|█▌        | 24/157 [06:19<34:24, 15.53s/it]

768


 16%|█▌        | 25/157 [06:41<37:53, 17.23s/it]

800


 17%|█▋        | 26/157 [06:54<34:53, 15.98s/it]

832


 17%|█▋        | 27/157 [07:08<33:19, 15.38s/it]

864


 18%|█▊        | 28/157 [07:21<31:32, 14.67s/it]

896


 18%|█▊        | 29/157 [07:34<30:33, 14.33s/it]

928


 19%|█▉        | 30/157 [07:52<32:15, 15.24s/it]

960


 20%|█▉        | 31/157 [08:05<30:49, 14.68s/it]

992


 20%|██        | 32/157 [08:18<29:31, 14.17s/it]

1024


 21%|██        | 33/157 [08:31<28:45, 13.91s/it]

1056


 22%|██▏       | 34/157 [08:46<28:59, 14.14s/it]

1088


 22%|██▏       | 35/157 [08:59<27:57, 13.75s/it]

1120


 23%|██▎       | 36/157 [09:12<27:11, 13.48s/it]

1152


 24%|██▎       | 37/157 [09:29<29:14, 14.62s/it]

1184


 24%|██▍       | 38/157 [09:43<28:24, 14.32s/it]

1216


 25%|██▍       | 39/157 [10:00<30:18, 15.41s/it]

1248


 25%|██▌       | 40/157 [10:19<31:42, 16.26s/it]

1280


 26%|██▌       | 41/157 [10:42<35:36, 18.42s/it]

1312


 27%|██▋       | 42/157 [11:06<38:34, 20.12s/it]

1344


 27%|██▋       | 43/157 [11:20<34:28, 18.15s/it]

1376


 28%|██▊       | 44/157 [11:40<35:27, 18.82s/it]

1408


 29%|██▊       | 45/157 [11:54<32:05, 17.19s/it]

1440


 29%|██▉       | 46/157 [12:13<32:57, 17.82s/it]

1472


 30%|██▉       | 47/157 [12:29<31:33, 17.21s/it]

1504


 31%|███       | 48/157 [13:06<42:09, 23.20s/it]

1536


 31%|███       | 49/157 [13:23<38:25, 21.34s/it]

1568


 32%|███▏      | 50/157 [13:39<35:32, 19.93s/it]

1600


 32%|███▏      | 51/157 [13:53<31:48, 18.00s/it]

1632


 33%|███▎      | 52/157 [14:08<30:00, 17.15s/it]

1664


 34%|███▍      | 53/157 [14:29<31:47, 18.34s/it]

1696


 34%|███▍      | 54/157 [14:47<30:55, 18.01s/it]

1728


 35%|███▌      | 55/157 [15:02<29:27, 17.33s/it]

1760


 36%|███▌      | 56/157 [15:20<29:15, 17.38s/it]

1792


 36%|███▋      | 57/157 [15:37<28:50, 17.30s/it]

1824


 37%|███▋      | 58/157 [15:56<29:17, 17.76s/it]

1856


 38%|███▊      | 59/157 [16:27<35:40, 21.85s/it]

1888


 38%|███▊      | 60/157 [16:46<33:40, 20.83s/it]

1920


 39%|███▉      | 61/157 [17:08<33:58, 21.24s/it]

1952


 39%|███▉      | 62/157 [17:24<31:13, 19.72s/it]

1984


 40%|████      | 63/157 [17:43<30:40, 19.58s/it]

2016


 41%|████      | 64/157 [18:01<29:34, 19.08s/it]

2048


 41%|████▏     | 65/157 [18:18<28:20, 18.48s/it]

2080


 42%|████▏     | 66/157 [18:37<28:05, 18.52s/it]

2112


 43%|████▎     | 67/157 [18:55<27:44, 18.50s/it]

2144


 43%|████▎     | 68/157 [19:23<31:39, 21.34s/it]

2176


 44%|████▍     | 69/157 [19:37<27:56, 19.05s/it]

2208


 45%|████▍     | 70/157 [19:52<25:49, 17.81s/it]

2240


 45%|████▌     | 71/157 [20:06<23:54, 16.68s/it]

2272


 46%|████▌     | 72/157 [20:21<22:53, 16.16s/it]

2304


 46%|████▋     | 73/157 [20:35<21:44, 15.52s/it]

2336


 47%|████▋     | 74/157 [20:58<24:33, 17.76s/it]

2368


 48%|████▊     | 75/157 [21:14<23:38, 17.30s/it]

2400


 48%|████▊     | 76/157 [21:29<22:24, 16.60s/it]

2432


 49%|████▉     | 77/157 [21:44<21:25, 16.07s/it]

2464


 50%|████▉     | 78/157 [22:04<22:54, 17.40s/it]

2496


 50%|█████     | 79/157 [22:19<21:35, 16.61s/it]

2528


 51%|█████     | 80/157 [22:32<19:55, 15.53s/it]

2560


 52%|█████▏    | 81/157 [22:59<24:02, 18.97s/it]

2592


 52%|█████▏    | 82/157 [23:18<23:41, 18.95s/it]

2624


 53%|█████▎    | 83/157 [23:46<26:41, 21.65s/it]

2656


 54%|█████▎    | 84/157 [23:59<23:20, 19.18s/it]

2688


 54%|█████▍    | 85/157 [24:12<20:47, 17.33s/it]

2720


 55%|█████▍    | 86/157 [24:28<19:51, 16.78s/it]

2752


 55%|█████▌    | 87/157 [24:42<18:31, 15.87s/it]

2784


 56%|█████▌    | 88/157 [24:56<17:39, 15.36s/it]

2816


 57%|█████▋    | 89/157 [25:13<18:07, 16.00s/it]

2848


 57%|█████▋    | 90/157 [25:32<18:44, 16.79s/it]

2880


 58%|█████▊    | 91/157 [25:47<18:03, 16.41s/it]

2912


 59%|█████▊    | 92/157 [26:01<16:47, 15.50s/it]

2944


 59%|█████▉    | 93/157 [26:16<16:18, 15.29s/it]

2976


 60%|█████▉    | 94/157 [26:32<16:19, 15.55s/it]

3008


 61%|██████    | 95/157 [26:47<16:06, 15.59s/it]

3040


 61%|██████    | 96/157 [27:04<16:08, 15.88s/it]

3072


 62%|██████▏   | 97/157 [27:19<15:28, 15.48s/it]

3104


 62%|██████▏   | 98/157 [27:32<14:38, 14.88s/it]

3136


 63%|██████▎   | 99/157 [27:45<13:46, 14.26s/it]

3168


 64%|██████▎   | 100/157 [27:58<13:10, 13.87s/it]

3200


 64%|██████▍   | 101/157 [28:11<12:43, 13.64s/it]

3232


 65%|██████▍   | 102/157 [28:23<12:12, 13.31s/it]

3264


 66%|██████▌   | 103/157 [28:36<11:51, 13.17s/it]

3296


 66%|██████▌   | 104/157 [28:50<11:44, 13.30s/it]

3328


 67%|██████▋   | 105/157 [29:03<11:26, 13.21s/it]

3360


 68%|██████▊   | 106/157 [29:17<11:29, 13.51s/it]

3392


 68%|██████▊   | 107/157 [29:30<11:08, 13.37s/it]

3424


 69%|██████▉   | 108/157 [29:44<11:00, 13.48s/it]

3456


 69%|██████▉   | 109/157 [29:57<10:37, 13.28s/it]

3488


 70%|███████   | 110/157 [30:10<10:21, 13.21s/it]

3520


 71%|███████   | 111/157 [30:24<10:17, 13.43s/it]

3552


 71%|███████▏  | 112/157 [30:37<09:59, 13.33s/it]

3584


 72%|███████▏  | 113/157 [31:03<12:33, 17.12s/it]

3616


 73%|███████▎  | 114/157 [31:15<11:08, 15.54s/it]

3648


 73%|███████▎  | 115/157 [31:28<10:26, 14.92s/it]

3680


 74%|███████▍  | 116/157 [31:45<10:30, 15.37s/it]

3712


 75%|███████▍  | 117/157 [32:03<10:53, 16.34s/it]

3744


 75%|███████▌  | 118/157 [32:19<10:37, 16.34s/it]

3776


 76%|███████▌  | 119/157 [32:38<10:42, 16.92s/it]

3808


 76%|███████▋  | 120/157 [32:58<10:58, 17.79s/it]

3840


 77%|███████▋  | 121/157 [33:13<10:16, 17.14s/it]

3872


 78%|███████▊  | 122/157 [33:28<09:32, 16.34s/it]

3904


 78%|███████▊  | 123/157 [33:43<09:08, 16.14s/it]

3936


 79%|███████▉  | 124/157 [33:56<08:19, 15.14s/it]

3968


 80%|███████▉  | 125/157 [34:13<08:23, 15.72s/it]

4000


 80%|████████  | 126/157 [34:39<09:40, 18.73s/it]

4032


 81%|████████  | 127/157 [34:53<08:42, 17.41s/it]

4064


 82%|████████▏ | 128/157 [35:08<07:57, 16.47s/it]

4096


 82%|████████▏ | 129/157 [35:23<07:28, 16.01s/it]

4128


 83%|████████▎ | 130/157 [35:41<07:28, 16.60s/it]

4160


 83%|████████▎ | 131/157 [36:00<07:38, 17.62s/it]

4192


 84%|████████▍ | 132/157 [36:17<07:12, 17.30s/it]

4224


 85%|████████▍ | 133/157 [36:32<06:35, 16.48s/it]

4256


 85%|████████▌ | 134/157 [36:48<06:21, 16.59s/it]

4288


 86%|████████▌ | 135/157 [37:03<05:52, 16.01s/it]

4320


 87%|████████▋ | 136/157 [37:18<05:27, 15.62s/it]

4352


 87%|████████▋ | 137/157 [37:36<05:26, 16.34s/it]

4384


 88%|████████▊ | 138/157 [38:01<06:02, 19.10s/it]

4416


 89%|████████▊ | 139/157 [38:20<05:42, 19.05s/it]

4448


 89%|████████▉ | 140/157 [38:43<05:42, 20.17s/it]

4480


 90%|████████▉ | 141/157 [38:59<05:01, 18.87s/it]

4512


 90%|█████████ | 142/157 [39:13<04:22, 17.48s/it]

4544


 91%|█████████ | 143/157 [39:30<04:01, 17.22s/it]

4576


 92%|█████████▏| 144/157 [39:44<03:32, 16.36s/it]

4608


 92%|█████████▏| 145/157 [39:59<03:11, 15.92s/it]

4640


 93%|█████████▎| 146/157 [40:14<02:53, 15.78s/it]

4672


 94%|█████████▎| 147/157 [40:29<02:35, 15.53s/it]

4704


 94%|█████████▍| 148/157 [40:43<02:14, 14.95s/it]

4736


 95%|█████████▍| 149/157 [41:00<02:03, 15.47s/it]

4768


 96%|█████████▌| 150/157 [41:15<01:47, 15.35s/it]

4800


 96%|█████████▌| 151/157 [41:29<01:29, 14.94s/it]

4832


 97%|█████████▋| 152/157 [41:42<01:11, 14.28s/it]

4864


 97%|█████████▋| 153/157 [41:55<00:56, 14.05s/it]

4896


 98%|█████████▊| 154/157 [42:11<00:43, 14.61s/it]

4928


 99%|█████████▊| 155/157 [42:27<00:29, 14.93s/it]

4960


 99%|█████████▉| 156/157 [42:41<00:14, 14.89s/it]

4992


100%|██████████| 157/157 [42:52<00:00, 16.39s/it]

5000
Results saved to results.json





In [13]:
# 导入swift框架进行微调
from swift.llm import (
    DatasetName, InferArguments, ModelType, SftArguments,
    infer_main, sft_main, app_ui_main
)

In [15]:
model_type = ModelType.qwen2_5_0_5b_instruct
sft_args = SftArguments(
    model_type=model_type,
    dataset=['MATH_train_fineture.jsonl'],
    output_dir='output',
    max_length=4096,
    sft_type = 'full',
    
    )
result = sft_main(sft_args)
last_model_checkpoint = result['last_model_checkpoint']

[INFO:swift] Setting template_type: qwen2_5
[INFO:swift] Setting args.lazy_tokenize: False
[INFO:swift] Setting args.dataloader_num_workers: 1
[INFO:swift] output_dir: /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823
[INFO:swift] Start time of running main: 2024-12-07 15:38:23.736837
[INFO:swift] args: SftArguments(model_type='qwen2_5-0_5b-instruct', model_id_or_path='qwen/Qwen2.5-0.5B-Instruct', model_revision='master', full_determinism=False, sft_type='full', freeze_parameters=[], freeze_vit=False, freeze_parameters_ratio=0.0, additional_trainable_parameters=[], tuner_backend='peft', template_type='qwen2_5', output_dir='/mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823', add_output_dir_suffix=True, ddp_backend=None, ddp_find_unused_parameters=None, ddp_broadcast_buffers=None, ddp_timeout=1800, seed=42, resume_from_checkpoint=None, resume_only_model=False, ignore_data_skip=False, dtype='bf16', packing=False, train_backend='transformers', tp=1, pp=1, min_lr=N

device_count: 1
rank: -1, local_rank: -1, world_size: 1, local_world_size: 1
Downloading Model to directory: /mnt/workspace/.cache/modelscope/hub/qwen/Qwen2.5-0.5B-Instruct


[INFO:modelscope] Creating symbolic link /mnt/workspace/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct -> /mnt/workspace/.cache/modelscope/hub/qwen/Qwen2.5-0.5B-Instruct.
[INFO:swift] Loading the model using model_dir: /mnt/workspace/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct
[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
[INFO:swift] model.max_model_len: 32768
[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
[INFO:swift] model_config: Qwen2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/mnt/workspace/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7500 [00:00<?, ? examples/s]

[INFO:swift] train_dataset: Dataset({
    features: ['query', 'response'],
    num_rows: 7425
})
[INFO:swift] val_dataset: Dataset({
    features: ['query', 'response'],
    num_rows: 75
})
[INFO:swift] [INPUT_IDS] [151644, 8948, 198, 2610, 525, 1207, 16948, 11, 3465, 553, 54364, 14817, 13, 1446, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 2610, 525, 264, 7548, 25530, 20976, 1103, 12875, 315, 21828, 6351, 11972, 34689, 6888, 5322, 3019, 553, 3019, 13, 5209, 1349, 279, 3491, 15516, 323, 11625, 432, 448, 2797, 32711, 13, 11112, 1493, 11221, 1447, 12, 64547, 279, 3881, 1995, 323, 279, 3405, 1660, 4588, 624, 12, 15623, 279, 6291, 1119, 19819, 7354, 11, 8241, 2797, 40841, 369, 1817, 624, 12, 6928, 678, 28117, 323, 28439, 3059, 624, 12, 1200, 857, 448, 264, 1590, 4226, 382, 5097, 697, 2033, 304, 279, 2701, 3561, 510, 58, 69769, 323, 28117, 921, 820, 508, 19357, 34776, 4226, 60, 33933, 8420, 374, 279, 3491, 510, 4340, 1657, 6785, 9363, 315, 220, 24, 21, 525, 1083, 65060, 315, 2

Map:   0%|          | 0/7425 [00:00<?, ?it/s]

Map:   0%|          | 0/75 [00:00<?, ?it/s]

[INFO:swift] Dataset Token Length: 434.309899±249.057953, min=163.000000, max=2849.000000, size=7425
[INFO:swift] Dataset Token Length: 441.853333±276.012497, min=182.000000, max=1915.000000, size=75
[INFO:swift] training_args: Seq2SeqTrainingArguments(
_n_gpu=1,
acc_strategy=token,
accelerator_config={'split_batches': False, 'dispatch_batches': False, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_epsilon=1e-08,
additional_saved_files=[],
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=42,
dataloader_drop_last=False,
dataloader_num_workers=1,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeou

[2024-12-07 15:38:45,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  super().__init__(
df: /root/.triton/autotune: 没有那个文件或目录
[INFO:swift] The SftArguments will be saved in: /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/sft_args.json
[INFO:swift] The Seq2SeqTrainingArguments will be saved in: /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/training_args.json
[INFO:swift] The logging file will be saved in: /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/logging.jsonl


Train:   0%|          | 0/464 [00:00<?, ?it/s]

{'loss': 0.91355658, 'acc': 0.74084908, 'grad_norm': 17.625, 'learning_rate': 4.2e-07, 'memory(GiB)': 5.97, 'train_speed(iter/s)': 0.131104, 'epoch': 0.0, 'global_step/max_steps': '1/464', 'percentage': '0.22%', 'elapsed_time': '6s', 'remaining_time': '53m 53s'}
{'loss': 0.81045878, 'acc': 0.77156085, 'grad_norm': 15.625, 'learning_rate': 2.08e-06, 'memory(GiB)': 9.48, 'train_speed(iter/s)': 0.138411, 'epoch': 0.01, 'global_step/max_steps': '5/464', 'percentage': '1.08%', 'elapsed_time': '35s', 'remaining_time': '54m 17s'}
{'loss': 0.72366648, 'acc': 0.77797718, 'grad_norm': 9.4375, 'learning_rate': 4.17e-06, 'memory(GiB)': 14.56, 'train_speed(iter/s)': 0.138284, 'epoch': 0.02, 'global_step/max_steps': '10/464', 'percentage': '2.16%', 'elapsed_time': '1m 11s', 'remaining_time': '54m 13s'}
{'loss': 0.84002457, 'acc': 0.76303234, 'grad_norm': 6.59375, 'learning_rate': 6.25e-06, 'memory(GiB)': 8.65, 'train_speed(iter/s)': 0.139749, 'epoch': 0.03, 'global_step/max_steps': '15/464', 'percen

Val:   0%|          | 0/75 [00:00<?, ?it/s]

{'eval_loss': 0.6997422, 'eval_acc': 0.81847602, 'eval_runtime': 9.9539, 'eval_samples_per_second': 7.535, 'eval_steps_per_second': 7.535, 'epoch': 0.43, 'global_step/max_steps': '200/464', 'percentage': '43.10%', 'elapsed_time': '23m 51s', 'remaining_time': '31m 29s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/checkpoint-200


{'loss': 0.67135997, 'acc': 0.80582895, 'grad_norm': 4.4375, 'learning_rate': 6.37e-06, 'memory(GiB)': 10.61, 'train_speed(iter/s)': 0.139292, 'epoch': 0.44, 'global_step/max_steps': '205/464', 'percentage': '44.18%', 'elapsed_time': '24m 31s', 'remaining_time': '30m 58s'}
{'loss': 0.75457659, 'acc': 0.78382716, 'grad_norm': 6.125, 'learning_rate': 6.2e-06, 'memory(GiB)': 10.61, 'train_speed(iter/s)': 0.13952, 'epoch': 0.45, 'global_step/max_steps': '210/464', 'percentage': '45.26%', 'elapsed_time': '25m 4s', 'remaining_time': '30m 19s'}
{'loss': 0.67305217, 'acc': 0.79005198, 'grad_norm': 4.65625, 'learning_rate': 6.03e-06, 'memory(GiB)': 10.61, 'train_speed(iter/s)': 0.139205, 'epoch': 0.46, 'global_step/max_steps': '215/464', 'percentage': '46.34%', 'elapsed_time': '25m 43s', 'remaining_time': '29m 47s'}
{'loss': 0.72196145, 'acc': 0.78962088, 'grad_norm': 4.5, 'learning_rate': 5.85e-06, 'memory(GiB)': 10.61, 'train_speed(iter/s)': 0.13911, 'epoch': 0.47, 'global_step/max_steps': '2

Val:   0%|          | 0/75 [00:00<?, ?it/s]

{'eval_loss': 0.69657844, 'eval_acc': 0.82035893, 'eval_runtime': 10.0086, 'eval_samples_per_second': 7.494, 'eval_steps_per_second': 7.494, 'epoch': 0.86, 'global_step/max_steps': '400/464', 'percentage': '86.21%', 'elapsed_time': '47m 52s', 'remaining_time': '7m 39s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/checkpoint-400


{'loss': 0.65712028, 'acc': 0.79932156, 'grad_norm': 4.46875, 'learning_rate': 4.4e-07, 'memory(GiB)': 10.61, 'train_speed(iter/s)': 0.13904, 'epoch': 0.87, 'global_step/max_steps': '405/464', 'percentage': '87.28%', 'elapsed_time': '48m 32s', 'remaining_time': '7m 4s'}
{'loss': 0.73294439, 'acc': 0.78424091, 'grad_norm': 6.28125, 'learning_rate': 3.7e-07, 'memory(GiB)': 10.61, 'train_speed(iter/s)': 0.138907, 'epoch': 0.88, 'global_step/max_steps': '410/464', 'percentage': '88.36%', 'elapsed_time': '49m 10s', 'remaining_time': '6m 28s'}
{'loss': 0.73445945, 'acc': 0.78805285, 'grad_norm': 6.15625, 'learning_rate': 3e-07, 'memory(GiB)': 10.61, 'train_speed(iter/s)': 0.138901, 'epoch': 0.89, 'global_step/max_steps': '415/464', 'percentage': '89.44%', 'elapsed_time': '49m 47s', 'remaining_time': '5m 52s'}
{'loss': 0.67354326, 'acc': 0.78696442, 'grad_norm': 5.28125, 'learning_rate': 2.4e-07, 'memory(GiB)': 10.61, 'train_speed(iter/s)': 0.138971, 'epoch': 0.91, 'global_step/max_steps': '4

Val:   0%|          | 0/75 [00:00<?, ?it/s]

{'eval_loss': 0.69569194, 'eval_acc': 0.81994704, 'eval_runtime': 9.9632, 'eval_samples_per_second': 7.528, 'eval_steps_per_second': 7.528, 'epoch': 1.0, 'global_step/max_steps': '464/464', 'percentage': '100.00%', 'elapsed_time': '55m 43s', 'remaining_time': '0s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/checkpoint-464
[INFO:swift] last_model_checkpoint: /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/checkpoint-464
[INFO:swift] best_model_checkpoint: /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/checkpoint-464
[INFO:swift] images_dir: /mnt/workspace/output/qwen2_5-0_5b-instruct/v1-20241207-153823/images


{'train_runtime': 3348.4523, 'train_samples_per_second': 2.217, 'train_steps_per_second': 0.139, 'train_loss': 0.72626422, 'epoch': 1.0, 'global_step/max_steps': '464/464', 'percentage': '100.00%', 'elapsed_time': '55m 48s', 'remaining_time': '0s'}


[INFO:swift] End time of running main: 2024-12-07 16:34:40.772942
