# 基于LLM微调的数学推理任务

In [1]:
import os 
from tqdm import tqdm
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import one_hot
from sklearn.metrics import accuracy_score, f1_score
from evaluate import load
from datasets import load_dataset

os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 设置代理
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [3]:
dataset = load_dataset("openai/gsm8k", "main")

In [4]:
def get_dataloader(prompt, batch_size=1):
    def preprocess(dataset, shuffle=True):
        def collate_fn(batch):
            questions =  [[{"role": "user", "content": prompt.format(text=item['question'])}] for item in batch]
            answers = [item["answer"] for item in batch]
            return questions, answers
        return DataLoader(
            dataset,
            shuffle=shuffle,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
    testset = dataset['test']
    return preprocess(testset, shuffle=False)

In [5]:
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"

def extract_answer(answer):
    match = ANS_RE.search(answer)
    if match:
        match_str = match.group(1).strip()
        match_str = match_str.replace(",", "")
        return match_str
    else:
        return INVALID_ANS
    
def is_correct(reply, truth):
    answer = extract_answer(truth)
    assert answer != INVALID_ANS
    return extract_answer(reply) == answer

In [6]:
def eval_process(model_name, tokenizer_name, prompt, batch_size=1, ratio=1):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        cache_dir = '/autodl-tmp/cache'
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    testloader = get_dataloader(prompt, batch_size)
    answers = []
    replies = []
    # 以下过程参照huggingface上的qwen2.5模型示例
    for idx, (texts, truths) in tqdm(enumerate(testloader), total=int(len(testloader) * ratio)):
        if idx >= int(len(testloader) * ratio):
            break
        texts = [tokenizer.apply_chat_template(
            text,
            tokenize=False,
            add_generation_prompt=True
        ) for text in texts]
        model_inputs = tokenizer(texts, return_tensors="pt", padding=True, padding_side='left').to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=256
        )

        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        replies.extend(responses)
        answers.extend(truths)
    
    total = 0
    correct = 0
    for reply, answer in zip(replies, answers):
        total += 1
        correct += is_correct(reply, answer)
    print(f"正确率: {(correct / total) * 100: .2f}%")

In [11]:
model_name = "/root/autodl-tmp/checkpoint/math_output/full/qwen2_5-0_5b/v0-20241209-223648/checkpoint-462"
tokenizer_name = "Qwen/Qwen2.5-0.5B"

prompt = '''You are a highly skilled mathematician capable of solving complex grade-school math problems step by step. Please read the problem carefully and solve it with clear reasoning. Follow these instructions:

- Identify the known information and the question being asked.
- Break the solution into logical steps, providing clear explanations for each.
- Show all calculations and intermediate results.
- Conclude with a final answer.

Output your response in the following format:
[Explanation and calculations]
#### [Final numerical answer]   

Here is the problem:
{text}'''
eval_process(model_name, tokenizer_name, prompt, batch_size=8, ratio=1)

100%|██████████| 165/165 [15:50<00:00,  5.76s/it]

正确率:  27.75%





In [None]:
model_name = "/root/autodl-tmp/checkpoint/math_output/lora/qwen2_5-0_5b/v0-20241209-200154/checkpoint-462"
tokenizer_name = "Qwen/Qwen2.5-0.5B"

prompt = '''You are a highly skilled mathematician capable of solving complex grade-school math problems step by step. Please read the problem carefully and solve it with clear reasoning. Follow these instructions:

- Identify the known information and the question being asked.
- Break the solution into logical steps, providing clear explanations for each.
- Show all calculations and intermediate results.
- Conclude with a final answer.

Output your response in the following format:
[Explanation and calculations]
#### [Final numerical answer]   

Here is the problem:
{text}'''
eval_process(model_name, tokenizer_name, prompt, batch_size=8, ratio=1)

  0%|          | 0/165 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/165 [00:09<25:01,  9.15s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 2/165 [00:17<23:06,  8.51s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 3/165 [00:26<23:56,  8.87s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 4/165 [00:33<22:00,  8.20s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 5/165 [00:41<21:25,  8.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▎         | 6/165 [00:50<22:29,  8.49s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 7/165 [00:57<20:47,  7.89s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▍         | 8/165 [01:03<19:22,  7.40s/it]Setting `pad_toke

正确率:  28.73%





In [None]:
model_name = "/root/autodl-tmp/checkpoint/math_output/lora/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-462"
tokenizer_name = "Qwen/Qwen2.5-0.5B-Instruct"

prompt = '''You are a highly skilled mathematician capable of solving complex grade-school math problems step by step. Please read the problem carefully and solve it with clear reasoning. Follow these instructions:

- Identify the known information and the question being asked.
- Break the solution into logical steps, providing clear explanations for each.
- Show all calculations and intermediate results.
- Conclude with a final answer.

Output your response in the following format:
[Explanation and calculations]
#### [Final numerical answer]   

Here is the problem:
{text}'''
eval_process(model_name, tokenizer_name, prompt, batch_size=8, ratio=1)

100%|██████████| 165/165 [22:22<00:00,  8.14s/it]

正确率:  27.22%





In [None]:
model_name = "/root/autodl-tmp/checkpoint/math_output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-462"
tokenizer_name = "Qwen/Qwen2.5-0.5B-Instruct"

prompt = '''You are a highly skilled mathematician capable of solving complex grade-school math problems step by step. Please read the problem carefully and solve it with clear reasoning. Follow these instructions:

- Identify the known information and the question being asked.
- Break the solution into logical steps, providing clear explanations for each.
- Show all calculations and intermediate results.
- Conclude with a final answer.

Output your response in the following format:
[Explanation and calculations]
#### [Final numerical answer]   

Here is the problem:
{text}'''
eval_process(model_name, tokenizer_name, prompt, batch_size=8, ratio=1)

100%|██████████| 165/165 [22:17<00:00,  8.11s/it]

正确率:  25.55%





In [1]:
# 导入swift框架进行微调
from swift.llm import (
    DatasetName, InferArguments, ModelType, SftArguments,
    infer_main, sft_main, app_ui_main
)

  from .autonotebook import tqdm as notebook_tqdm
[INFO:swift] Successfully registered `/autodl-tmp/conda/Base/lib/python3.10/site-packages/swift/llm/data/dataset_info.json`
[INFO:swift] No vLLM installed, if you are using vLLM, you will get `ImportError: cannot import name 'get_vllm_engine' from 'swift.llm'`
[INFO:swift] No LMDeploy installed, if you are using LMDeploy, you will get `ImportError: cannot import name 'prepare_lmdeploy_engine_template' from 'swift.llm'`


In [None]:
model_type = ModelType.qwen2_5_0_5b
sft_args = SftArguments(
    model_type=model_type,
    dataset=['gsm8k-train.jsonl'],
    output_dir='/root/autodl-tmp/checkpoint/math_output/full',
    sft_type='full',
    max_length=4096)
result = sft_main(sft_args)

[INFO:swift] Setting template_type: default-generation
[INFO:swift] Setting args.lazy_tokenize: False
[INFO:swift] Setting args.dataloader_num_workers: 1


[INFO:swift] output_dir: /root/autodl-fs/code/nlp_pj/pj6/output/full/qwen2_5-0_5b/v0-20241209-223648
[INFO:swift] Start time of running main: 2024-12-09 22:36:48.802550
[INFO:swift] args: SftArguments(model_type='qwen2_5-0_5b', model_id_or_path='qwen/Qwen2.5-0.5B', model_revision='master', full_determinism=False, sft_type='full', freeze_parameters=[], freeze_vit=False, freeze_parameters_ratio=0.0, additional_trainable_parameters=[], tuner_backend='peft', template_type='default-generation', output_dir='/root/autodl-fs/code/nlp_pj/pj6/output/full/qwen2_5-0_5b/v0-20241209-223648', add_output_dir_suffix=True, ddp_backend=None, ddp_find_unused_parameters=None, ddp_broadcast_buffers=None, ddp_timeout=1800, seed=42, resume_from_checkpoint=None, resume_only_model=False, ignore_data_skip=False, dtype='bf16', packing=False, train_backend='transformers', tp=1, pp=1, min_lr=None, sequence_parallel=False, model_kwargs={}, loss_name=None, dataset=['gsm8k-train.jsonl'], val_dataset=[], dataset_seed=4

device_count: 1
rank: -1, local_rank: -1, world_size: 1, local_world_size: 1


[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/qwen/Qwen2___5-0___5B
[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
[INFO:swift] model.max_model_len: 32768
[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
[INFO:swift] model_config: Qwen2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/root/.cache/modelscope/hub/qwen/Qwen2___5-0___5B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_versi

{'loss': 0.64669138, 'acc': 0.84823072, 'grad_norm': 16.25, 'learning_rate': 4.2e-07, 'memory(GiB)': 6.62, 'train_speed(iter/s)': 0.339616, 'epoch': 0.0, 'global_step/max_steps': '1/462', 'percentage': '0.22%', 'elapsed_time': '2s', 'remaining_time': '19m 30s'}


Train:   1%|          | 5/462 [00:11<16:35,  2.18s/it]

{'loss': 0.71370238, 'acc': 0.81628406, 'grad_norm': 14.8125, 'learning_rate': 2.08e-06, 'memory(GiB)': 8.44, 'train_speed(iter/s)': 0.425108, 'epoch': 0.01, 'global_step/max_steps': '5/462', 'percentage': '1.08%', 'elapsed_time': '11s', 'remaining_time': '17m 15s'}


Train:   2%|▏         | 10/462 [00:21<15:55,  2.11s/it]

{'loss': 0.67155805, 'acc': 0.83056583, 'grad_norm': 10.125, 'learning_rate': 4.17e-06, 'memory(GiB)': 8.57, 'train_speed(iter/s)': 0.447288, 'epoch': 0.02, 'global_step/max_steps': '10/462', 'percentage': '2.16%', 'elapsed_time': '21s', 'remaining_time': '16m 30s'}


Train:   3%|▎         | 15/462 [00:32<15:43,  2.11s/it]

{'loss': 0.62042365, 'acc': 0.84549875, 'grad_norm': 11.0625, 'learning_rate': 6.25e-06, 'memory(GiB)': 7.53, 'train_speed(iter/s)': 0.454736, 'epoch': 0.03, 'global_step/max_steps': '15/462', 'percentage': '3.25%', 'elapsed_time': '32s', 'remaining_time': '16m 10s'}


Train:   4%|▍         | 20/462 [00:43<15:25,  2.09s/it]

{'loss': 0.57176838, 'acc': 0.84585199, 'grad_norm': 7.40625, 'learning_rate': 8.33e-06, 'memory(GiB)': 7.81, 'train_speed(iter/s)': 0.459006, 'epoch': 0.04, 'global_step/max_steps': '20/462', 'percentage': '4.33%', 'elapsed_time': '43s', 'remaining_time': '15m 52s'}


Train:   5%|▌         | 25/462 [00:53<15:16,  2.10s/it]

{'loss': 0.49685359, 'acc': 0.86110678, 'grad_norm': 7.21875, 'learning_rate': 1e-05, 'memory(GiB)': 7.81, 'train_speed(iter/s)': 0.46144, 'epoch': 0.05, 'global_step/max_steps': '25/462', 'percentage': '5.41%', 'elapsed_time': '53s', 'remaining_time': '15m 39s'}


Train:   6%|▋         | 30/462 [01:04<15:20,  2.13s/it]

{'loss': 0.50976362, 'acc': 0.86174116, 'grad_norm': 6.375, 'learning_rate': 1e-05, 'memory(GiB)': 8.44, 'train_speed(iter/s)': 0.461917, 'epoch': 0.06, 'global_step/max_steps': '30/462', 'percentage': '6.49%', 'elapsed_time': '1m 4s', 'remaining_time': '15m 28s'}


Train:   8%|▊         | 35/462 [01:15<15:25,  2.17s/it]

{'loss': 0.44555011, 'acc': 0.87243195, 'grad_norm': 5.59375, 'learning_rate': 9.98e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.460704, 'epoch': 0.08, 'global_step/max_steps': '35/462', 'percentage': '7.58%', 'elapsed_time': '1m 15s', 'remaining_time': '15m 21s'}


Train:   9%|▊         | 40/462 [01:26<15:00,  2.13s/it]

{'loss': 0.43459654, 'acc': 0.88209066, 'grad_norm': 6.0, 'learning_rate': 9.97e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.461274, 'epoch': 0.09, 'global_step/max_steps': '40/462', 'percentage': '8.66%', 'elapsed_time': '1m 26s', 'remaining_time': '15m 10s'}


Train:  10%|▉         | 45/462 [01:36<14:37,  2.10s/it]

{'loss': 0.45834103, 'acc': 0.87052431, 'grad_norm': 6.40625, 'learning_rate': 9.94e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.462461, 'epoch': 0.1, 'global_step/max_steps': '45/462', 'percentage': '9.74%', 'elapsed_time': '1m 36s', 'remaining_time': '14m 57s'}


Train:  11%|█         | 50/462 [01:47<14:29,  2.11s/it]

{'loss': 0.44799557, 'acc': 0.87346563, 'grad_norm': 5.59375, 'learning_rate': 9.91e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.463209, 'epoch': 0.11, 'global_step/max_steps': '50/462', 'percentage': '10.82%', 'elapsed_time': '1m 47s', 'remaining_time': '14m 45s'}


Train:  12%|█▏        | 55/462 [01:58<14:08,  2.08s/it]

{'loss': 0.45544596, 'acc': 0.87684069, 'grad_norm': 5.8125, 'learning_rate': 9.88e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.464392, 'epoch': 0.12, 'global_step/max_steps': '55/462', 'percentage': '11.90%', 'elapsed_time': '1m 57s', 'remaining_time': '14m 33s'}


Train:  13%|█▎        | 60/462 [02:08<14:03,  2.10s/it]

{'loss': 0.46149259, 'acc': 0.86939964, 'grad_norm': 5.875, 'learning_rate': 9.83e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.465067, 'epoch': 0.13, 'global_step/max_steps': '60/462', 'percentage': '12.99%', 'elapsed_time': '2m 8s', 'remaining_time': '14m 21s'}


Train:  14%|█▍        | 65/462 [02:19<13:42,  2.07s/it]

{'loss': 0.4420846, 'acc': 0.87537603, 'grad_norm': 5.75, 'learning_rate': 9.79e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.466107, 'epoch': 0.14, 'global_step/max_steps': '65/462', 'percentage': '14.07%', 'elapsed_time': '2m 19s', 'remaining_time': '14m 9s'}


Train:  15%|█▌        | 70/462 [02:29<13:41,  2.10s/it]

{'loss': 0.44415421, 'acc': 0.87889004, 'grad_norm': 5.09375, 'learning_rate': 9.73e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.466503, 'epoch': 0.15, 'global_step/max_steps': '70/462', 'percentage': '15.15%', 'elapsed_time': '2m 29s', 'remaining_time': '13m 57s'}


Train:  16%|█▌        | 75/462 [02:40<13:26,  2.08s/it]

{'loss': 0.43474293, 'acc': 0.86928854, 'grad_norm': 6.25, 'learning_rate': 9.67e-06, 'memory(GiB)': 7.65, 'train_speed(iter/s)': 0.46717, 'epoch': 0.16, 'global_step/max_steps': '75/462', 'percentage': '16.23%', 'elapsed_time': '2m 40s', 'remaining_time': '13m 46s'}


Train:  17%|█▋        | 80/462 [02:50<13:27,  2.11s/it]

{'loss': 0.44239836, 'acc': 0.8706172, 'grad_norm': 6.0625, 'learning_rate': 9.6e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.467116, 'epoch': 0.17, 'global_step/max_steps': '80/462', 'percentage': '17.32%', 'elapsed_time': '2m 50s', 'remaining_time': '13m 35s'}


Train:  18%|█▊        | 85/462 [03:01<13:02,  2.08s/it]

{'loss': 0.46587768, 'acc': 0.86895056, 'grad_norm': 5.3125, 'learning_rate': 9.53e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.467829, 'epoch': 0.18, 'global_step/max_steps': '85/462', 'percentage': '18.40%', 'elapsed_time': '3m 1s', 'remaining_time': '13m 23s'}


Train:  19%|█▉        | 90/462 [03:11<12:52,  2.08s/it]

{'loss': 0.42981424, 'acc': 0.87043667, 'grad_norm': 5.84375, 'learning_rate': 9.45e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.468231, 'epoch': 0.19, 'global_step/max_steps': '90/462', 'percentage': '19.48%', 'elapsed_time': '3m 11s', 'remaining_time': '13m 12s'}


Train:  21%|██        | 95/462 [03:22<12:48,  2.09s/it]

{'loss': 0.46626439, 'acc': 0.87075911, 'grad_norm': 6.28125, 'learning_rate': 9.37e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.468555, 'epoch': 0.21, 'global_step/max_steps': '95/462', 'percentage': '20.56%', 'elapsed_time': '3m 22s', 'remaining_time': '13m 1s'}


Train:  22%|██▏       | 100/462 [03:32<12:34,  2.08s/it]

{'loss': 0.4100431, 'acc': 0.8801486, 'grad_norm': 6.4375, 'learning_rate': 9.28e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.468921, 'epoch': 0.22, 'global_step/max_steps': '100/462', 'percentage': '21.65%', 'elapsed_time': '3m 32s', 'remaining_time': '12m 50s'}


Train:  23%|██▎       | 105/462 [03:43<12:30,  2.10s/it]

{'loss': 0.41883187, 'acc': 0.88255129, 'grad_norm': 6.59375, 'learning_rate': 9.18e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.469051, 'epoch': 0.23, 'global_step/max_steps': '105/462', 'percentage': '22.73%', 'elapsed_time': '3m 43s', 'remaining_time': '12m 39s'}


Train:  24%|██▍       | 110/462 [03:53<12:14,  2.09s/it]

{'loss': 0.45312738, 'acc': 0.87183895, 'grad_norm': 5.96875, 'learning_rate': 9.08e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.469305, 'epoch': 0.24, 'global_step/max_steps': '110/462', 'percentage': '23.81%', 'elapsed_time': '3m 53s', 'remaining_time': '12m 28s'}


Train:  25%|██▍       | 115/462 [04:04<12:10,  2.10s/it]

{'loss': 0.46741114, 'acc': 0.86633101, 'grad_norm': 6.46875, 'learning_rate': 8.97e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.46933, 'epoch': 0.25, 'global_step/max_steps': '115/462', 'percentage': '24.89%', 'elapsed_time': '4m 4s', 'remaining_time': '12m 18s'}


Train:  26%|██▌       | 120/462 [04:15<11:53,  2.09s/it]

{'loss': 0.44653382, 'acc': 0.87735529, 'grad_norm': 5.8125, 'learning_rate': 8.86e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.469623, 'epoch': 0.26, 'global_step/max_steps': '120/462', 'percentage': '25.97%', 'elapsed_time': '4m 15s', 'remaining_time': '12m 6s'}


Train:  27%|██▋       | 125/462 [04:26<12:12,  2.17s/it]

{'loss': 0.47672405, 'acc': 0.86772871, 'grad_norm': 7.4375, 'learning_rate': 8.74e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.468854, 'epoch': 0.27, 'global_step/max_steps': '125/462', 'percentage': '27.06%', 'elapsed_time': '4m 26s', 'remaining_time': '11m 57s'}


Train:  28%|██▊       | 130/462 [04:37<12:05,  2.19s/it]

{'loss': 0.39014809, 'acc': 0.89325294, 'grad_norm': 5.6875, 'learning_rate': 8.62e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.468271, 'epoch': 0.28, 'global_step/max_steps': '130/462', 'percentage': '28.14%', 'elapsed_time': '4m 37s', 'remaining_time': '11m 47s'}


Train:  29%|██▉       | 135/462 [04:48<11:57,  2.19s/it]

{'loss': 0.44596066, 'acc': 0.87293949, 'grad_norm': 5.84375, 'learning_rate': 8.5e-06, 'memory(GiB)': 8.61, 'train_speed(iter/s)': 0.467539, 'epoch': 0.29, 'global_step/max_steps': '135/462', 'percentage': '29.22%', 'elapsed_time': '4m 48s', 'remaining_time': '11m 38s'}


Train:  30%|███       | 140/462 [04:59<11:39,  2.17s/it]

{'loss': 0.4383285, 'acc': 0.88091154, 'grad_norm': 5.34375, 'learning_rate': 8.37e-06, 'memory(GiB)': 7.22, 'train_speed(iter/s)': 0.467026, 'epoch': 0.3, 'global_step/max_steps': '140/462', 'percentage': '30.30%', 'elapsed_time': '4m 59s', 'remaining_time': '11m 28s'}


Train:  31%|███▏      | 145/462 [05:09<11:04,  2.10s/it]

{'loss': 0.44853454, 'acc': 0.87310038, 'grad_norm': 7.0625, 'learning_rate': 8.23e-06, 'memory(GiB)': 7.22, 'train_speed(iter/s)': 0.46732, 'epoch': 0.31, 'global_step/max_steps': '145/462', 'percentage': '31.39%', 'elapsed_time': '5m 9s', 'remaining_time': '11m 17s'}


Train:  32%|███▏      | 150/462 [05:20<10:52,  2.09s/it]

{'loss': 0.45327096, 'acc': 0.86870804, 'grad_norm': 6.0, 'learning_rate': 8.09e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.46757, 'epoch': 0.32, 'global_step/max_steps': '150/462', 'percentage': '32.47%', 'elapsed_time': '5m 20s', 'remaining_time': '11m 6s'}


Train:  34%|███▎      | 155/462 [05:30<10:36,  2.07s/it]

{'loss': 0.3989665, 'acc': 0.88605251, 'grad_norm': 5.9375, 'learning_rate': 7.95e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.467911, 'epoch': 0.34, 'global_step/max_steps': '155/462', 'percentage': '33.55%', 'elapsed_time': '5m 30s', 'remaining_time': '10m 55s'}


Train:  35%|███▍      | 160/462 [05:41<10:29,  2.08s/it]

{'loss': 0.4486958, 'acc': 0.87948952, 'grad_norm': 6.40625, 'learning_rate': 7.8e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.468095, 'epoch': 0.35, 'global_step/max_steps': '160/462', 'percentage': '34.63%', 'elapsed_time': '5m 41s', 'remaining_time': '10m 44s'}


Train:  36%|███▌      | 165/462 [05:51<10:13,  2.07s/it]

{'loss': 0.44073763, 'acc': 0.87208757, 'grad_norm': 6.5625, 'learning_rate': 7.65e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.468449, 'epoch': 0.36, 'global_step/max_steps': '165/462', 'percentage': '35.71%', 'elapsed_time': '5m 51s', 'remaining_time': '10m 33s'}


Train:  37%|███▋      | 170/462 [06:02<10:07,  2.08s/it]

{'loss': 0.42892756, 'acc': 0.88267298, 'grad_norm': 5.5, 'learning_rate': 7.5e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.468609, 'epoch': 0.37, 'global_step/max_steps': '170/462', 'percentage': '36.80%', 'elapsed_time': '6m 2s', 'remaining_time': '10m 22s'}


Train:  38%|███▊      | 175/462 [06:12<09:56,  2.08s/it]

{'loss': 0.47985568, 'acc': 0.86757927, 'grad_norm': 7.8125, 'learning_rate': 7.34e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.468888, 'epoch': 0.38, 'global_step/max_steps': '175/462', 'percentage': '37.88%', 'elapsed_time': '6m 12s', 'remaining_time': '10m 11s'}


Train:  39%|███▉      | 180/462 [06:23<09:46,  2.08s/it]

{'loss': 0.46823626, 'acc': 0.8671526, 'grad_norm': 6.8125, 'learning_rate': 7.18e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.469047, 'epoch': 0.39, 'global_step/max_steps': '180/462', 'percentage': '38.96%', 'elapsed_time': '6m 23s', 'remaining_time': '10m 0s'}


Train:  40%|████      | 185/462 [06:33<09:40,  2.09s/it]

{'loss': 0.47024808, 'acc': 0.87103701, 'grad_norm': 5.875, 'learning_rate': 7.02e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.469172, 'epoch': 0.4, 'global_step/max_steps': '185/462', 'percentage': '40.04%', 'elapsed_time': '6m 33s', 'remaining_time': '9m 49s'}


Train:  41%|████      | 190/462 [06:44<09:29,  2.09s/it]

{'loss': 0.40786028, 'acc': 0.88973007, 'grad_norm': 6.09375, 'learning_rate': 6.86e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.469266, 'epoch': 0.41, 'global_step/max_steps': '190/462', 'percentage': '41.13%', 'elapsed_time': '6m 44s', 'remaining_time': '9m 39s'}


Train:  42%|████▏     | 195/462 [06:54<09:18,  2.09s/it]

{'loss': 0.42017059, 'acc': 0.88008738, 'grad_norm': 6.3125, 'learning_rate': 6.69e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.4694, 'epoch': 0.42, 'global_step/max_steps': '195/462', 'percentage': '42.21%', 'elapsed_time': '6m 54s', 'remaining_time': '9m 28s'}


Train:  43%|████▎     | 200/462 [07:05<09:03,  2.07s/it]

{'loss': 0.41872044, 'acc': 0.88367443, 'grad_norm': 6.625, 'learning_rate': 6.52e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.469624, 'epoch': 0.43, 'global_step/max_steps': '200/462', 'percentage': '43.29%', 'elapsed_time': '7m 5s', 'remaining_time': '9m 17s'}



Val: 100%|██████████| 74/74 [00:02<00:00, 29.01it/s]/it]


{'eval_loss': 0.43698326, 'eval_acc': 0.86903418, 'eval_runtime': 2.5614, 'eval_samples_per_second': 28.89, 'eval_steps_per_second': 28.89, 'epoch': 0.43, 'global_step/max_steps': '200/462', 'percentage': '43.29%', 'elapsed_time': '7m 8s', 'remaining_time': '9m 20s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/full/qwen2_5-0_5b/v0-20241209-223648/checkpoint-200
Train:  44%|████▍     | 205/462 [07:40<16:36,  3.88s/it]

{'loss': 0.42253075, 'acc': 0.87786732, 'grad_norm': 6.0, 'learning_rate': 6.35e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.445128, 'epoch': 0.44, 'global_step/max_steps': '205/462', 'percentage': '44.37%', 'elapsed_time': '7m 40s', 'remaining_time': '9m 36s'}


Train:  45%|████▌     | 210/462 [07:51<10:28,  2.50s/it]

{'loss': 0.44077153, 'acc': 0.87265043, 'grad_norm': 6.125, 'learning_rate': 6.17e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.445207, 'epoch': 0.45, 'global_step/max_steps': '210/462', 'percentage': '45.45%', 'elapsed_time': '7m 51s', 'remaining_time': '9m 25s'}


Train:  47%|████▋     | 215/462 [08:01<08:50,  2.15s/it]

{'loss': 0.42533712, 'acc': 0.88091545, 'grad_norm': 5.5625, 'learning_rate': 6e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.445881, 'epoch': 0.46, 'global_step/max_steps': '215/462', 'percentage': '46.54%', 'elapsed_time': '8m 1s', 'remaining_time': '9m 13s'}


Train:  48%|████▊     | 220/462 [08:12<08:29,  2.10s/it]

{'loss': 0.41124663, 'acc': 0.88325043, 'grad_norm': 6.25, 'learning_rate': 5.82e-06, 'memory(GiB)': 8.24, 'train_speed(iter/s)': 0.446472, 'epoch': 0.48, 'global_step/max_steps': '220/462', 'percentage': '47.62%', 'elapsed_time': '8m 12s', 'remaining_time': '9m 1s'}


Train:  49%|████▊     | 225/462 [08:22<08:14,  2.09s/it]

{'loss': 0.45162144, 'acc': 0.87530546, 'grad_norm': 5.59375, 'learning_rate': 5.64e-06, 'memory(GiB)': 8.25, 'train_speed(iter/s)': 0.447073, 'epoch': 0.49, 'global_step/max_steps': '225/462', 'percentage': '48.70%', 'elapsed_time': '8m 22s', 'remaining_time': '8m 49s'}


Train:  50%|████▉     | 230/462 [08:33<08:06,  2.10s/it]

{'loss': 0.44505281, 'acc': 0.8808383, 'grad_norm': 6.28125, 'learning_rate': 5.47e-06, 'memory(GiB)': 8.25, 'train_speed(iter/s)': 0.44761, 'epoch': 0.5, 'global_step/max_steps': '230/462', 'percentage': '49.78%', 'elapsed_time': '8m 33s', 'remaining_time': '8m 37s'}


Train:  51%|█████     | 235/462 [08:43<07:50,  2.07s/it]

{'loss': 0.46902652, 'acc': 0.86529016, 'grad_norm': 6.34375, 'learning_rate': 5.29e-06, 'memory(GiB)': 8.25, 'train_speed(iter/s)': 0.448256, 'epoch': 0.51, 'global_step/max_steps': '235/462', 'percentage': '50.87%', 'elapsed_time': '8m 43s', 'remaining_time': '8m 25s'}


Train:  52%|█████▏    | 240/462 [08:54<07:46,  2.10s/it]

{'loss': 0.4009481, 'acc': 0.88493366, 'grad_norm': 5.8125, 'learning_rate': 5.11e-06, 'memory(GiB)': 8.25, 'train_speed(iter/s)': 0.448704, 'epoch': 0.52, 'global_step/max_steps': '240/462', 'percentage': '51.95%', 'elapsed_time': '8m 54s', 'remaining_time': '8m 14s'}


Train:  53%|█████▎    | 245/462 [09:05<08:12,  2.27s/it]

{'loss': 0.40101929, 'acc': 0.88684225, 'grad_norm': 6.125, 'learning_rate': 4.93e-06, 'memory(GiB)': 8.59, 'train_speed(iter/s)': 0.448662, 'epoch': 0.53, 'global_step/max_steps': '245/462', 'percentage': '53.03%', 'elapsed_time': '9m 5s', 'remaining_time': '8m 3s'}


Train:  54%|█████▍    | 250/462 [09:18<08:43,  2.47s/it]

{'loss': 0.41742001, 'acc': 0.88454447, 'grad_norm': 6.0625, 'learning_rate': 4.75e-06, 'memory(GiB)': 7.26, 'train_speed(iter/s)': 0.447407, 'epoch': 0.54, 'global_step/max_steps': '250/462', 'percentage': '54.11%', 'elapsed_time': '9m 18s', 'remaining_time': '7m 53s'}


Train:  55%|█████▌    | 255/462 [09:29<07:50,  2.27s/it]

{'loss': 0.4004281, 'acc': 0.88458242, 'grad_norm': 6.78125, 'learning_rate': 4.57e-06, 'memory(GiB)': 7.26, 'train_speed(iter/s)': 0.447374, 'epoch': 0.55, 'global_step/max_steps': '255/462', 'percentage': '55.19%', 'elapsed_time': '9m 29s', 'remaining_time': '7m 42s'}


Train:  56%|█████▋    | 260/462 [09:39<07:03,  2.10s/it]

{'loss': 0.43661718, 'acc': 0.87753553, 'grad_norm': 5.59375, 'learning_rate': 4.39e-06, 'memory(GiB)': 7.26, 'train_speed(iter/s)': 0.447968, 'epoch': 0.56, 'global_step/max_steps': '260/462', 'percentage': '56.28%', 'elapsed_time': '9m 39s', 'remaining_time': '7m 30s'}


Train:  57%|█████▋    | 265/462 [09:50<06:52,  2.09s/it]

{'loss': 0.421841, 'acc': 0.87645102, 'grad_norm': 6.46875, 'learning_rate': 4.21e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.448427, 'epoch': 0.57, 'global_step/max_steps': '265/462', 'percentage': '57.36%', 'elapsed_time': '9m 50s', 'remaining_time': '7m 19s'}


Train:  58%|█████▊    | 270/462 [10:01<06:39,  2.08s/it]

{'loss': 0.40569253, 'acc': 0.88353338, 'grad_norm': 6.4375, 'learning_rate': 4.04e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.448875, 'epoch': 0.58, 'global_step/max_steps': '270/462', 'percentage': '58.44%', 'elapsed_time': '10m 1s', 'remaining_time': '7m 7s'}


Train:  60%|█████▉    | 275/462 [10:11<06:30,  2.09s/it]

{'loss': 0.47775807, 'acc': 0.86866789, 'grad_norm': 7.09375, 'learning_rate': 3.86e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.449267, 'epoch': 0.59, 'global_step/max_steps': '275/462', 'percentage': '59.52%', 'elapsed_time': '10m 11s', 'remaining_time': '6m 55s'}


Train:  61%|██████    | 280/462 [10:22<06:19,  2.08s/it]

{'loss': 0.43962159, 'acc': 0.87524185, 'grad_norm': 7.4375, 'learning_rate': 3.69e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.449686, 'epoch': 0.61, 'global_step/max_steps': '280/462', 'percentage': '60.61%', 'elapsed_time': '10m 22s', 'remaining_time': '6m 44s'}


Train:  62%|██████▏   | 285/462 [10:32<06:10,  2.09s/it]

{'loss': 0.3806664, 'acc': 0.88917246, 'grad_norm': 5.8125, 'learning_rate': 3.52e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.450042, 'epoch': 0.62, 'global_step/max_steps': '285/462', 'percentage': '61.69%', 'elapsed_time': '10m 32s', 'remaining_time': '6m 33s'}


Train:  63%|██████▎   | 290/462 [10:43<05:59,  2.09s/it]

{'loss': 0.4054492, 'acc': 0.87858601, 'grad_norm': 5.96875, 'learning_rate': 3.35e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.450442, 'epoch': 0.63, 'global_step/max_steps': '290/462', 'percentage': '62.77%', 'elapsed_time': '10m 43s', 'remaining_time': '6m 21s'}


Train:  64%|██████▍   | 295/462 [10:53<05:46,  2.08s/it]

{'loss': 0.40784254, 'acc': 0.88881702, 'grad_norm': 6.15625, 'learning_rate': 3.18e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.450858, 'epoch': 0.64, 'global_step/max_steps': '295/462', 'percentage': '63.85%', 'elapsed_time': '10m 53s', 'remaining_time': '6m 10s'}


Train:  65%|██████▍   | 300/462 [11:04<05:48,  2.15s/it]

{'loss': 0.44177575, 'acc': 0.86967659, 'grad_norm': 6.71875, 'learning_rate': 3.01e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.451091, 'epoch': 0.65, 'global_step/max_steps': '300/462', 'percentage': '64.94%', 'elapsed_time': '11m 4s', 'remaining_time': '5m 58s'}


Train:  66%|██████▌   | 305/462 [11:15<05:44,  2.19s/it]

{'loss': 0.44521537, 'acc': 0.87901726, 'grad_norm': 6.46875, 'learning_rate': 2.85e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.451069, 'epoch': 0.66, 'global_step/max_steps': '305/462', 'percentage': '66.02%', 'elapsed_time': '11m 15s', 'remaining_time': '5m 47s'}


Train:  67%|██████▋   | 310/462 [11:26<05:24,  2.13s/it]

{'loss': 0.38186073, 'acc': 0.89235945, 'grad_norm': 7.125, 'learning_rate': 2.69e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.451259, 'epoch': 0.67, 'global_step/max_steps': '310/462', 'percentage': '67.10%', 'elapsed_time': '11m 26s', 'remaining_time': '5m 36s'}


Train:  68%|██████▊   | 315/462 [11:37<05:05,  2.08s/it]

{'loss': 0.43760486, 'acc': 0.87895746, 'grad_norm': 6.96875, 'learning_rate': 2.53e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.451644, 'epoch': 0.68, 'global_step/max_steps': '315/462', 'percentage': '68.18%', 'elapsed_time': '11m 36s', 'remaining_time': '5m 25s'}


Train:  69%|██████▉   | 320/462 [11:47<04:55,  2.08s/it]

{'loss': 0.43825426, 'acc': 0.87439137, 'grad_norm': 5.21875, 'learning_rate': 2.38e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.452002, 'epoch': 0.69, 'global_step/max_steps': '320/462', 'percentage': '69.26%', 'elapsed_time': '11m 47s', 'remaining_time': '5m 13s'}


Train:  70%|███████   | 325/462 [11:58<04:45,  2.08s/it]

{'loss': 0.41606593, 'acc': 0.87982016, 'grad_norm': 10.625, 'learning_rate': 2.23e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.452357, 'epoch': 0.7, 'global_step/max_steps': '325/462', 'percentage': '70.35%', 'elapsed_time': '11m 58s', 'remaining_time': '5m 2s'}


Train:  71%|███████▏  | 330/462 [12:08<04:35,  2.09s/it]

{'loss': 0.43278565, 'acc': 0.87517748, 'grad_norm': 6.90625, 'learning_rate': 2.08e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.452657, 'epoch': 0.71, 'global_step/max_steps': '330/462', 'percentage': '71.43%', 'elapsed_time': '12m 8s', 'remaining_time': '4m 51s'}


Train:  73%|███████▎  | 335/462 [12:19<04:25,  2.09s/it]

{'loss': 0.45245466, 'acc': 0.87211742, 'grad_norm': 8.1875, 'learning_rate': 1.93e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.452972, 'epoch': 0.72, 'global_step/max_steps': '335/462', 'percentage': '72.51%', 'elapsed_time': '12m 19s', 'remaining_time': '4m 40s'}


Train:  74%|███████▎  | 340/462 [12:29<04:14,  2.09s/it]

{'loss': 0.37482979, 'acc': 0.89149399, 'grad_norm': 5.78125, 'learning_rate': 1.8e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.453228, 'epoch': 0.74, 'global_step/max_steps': '340/462', 'percentage': '73.59%', 'elapsed_time': '12m 29s', 'remaining_time': '4m 29s'}


Train:  75%|███████▍  | 345/462 [12:40<04:07,  2.11s/it]

{'loss': 0.43801274, 'acc': 0.87929878, 'grad_norm': 8.25, 'learning_rate': 1.66e-06, 'memory(GiB)': 7.27, 'train_speed(iter/s)': 0.453481, 'epoch': 0.75, 'global_step/max_steps': '345/462', 'percentage': '74.68%', 'elapsed_time': '12m 40s', 'remaining_time': '4m 17s'}


Train:  76%|███████▌  | 350/462 [12:50<03:53,  2.09s/it]

{'loss': 0.40552182, 'acc': 0.87954445, 'grad_norm': 6.125, 'learning_rate': 1.53e-06, 'memory(GiB)': 8.32, 'train_speed(iter/s)': 0.453782, 'epoch': 0.76, 'global_step/max_steps': '350/462', 'percentage': '75.76%', 'elapsed_time': '12m 50s', 'remaining_time': '4m 6s'}


Train:  77%|███████▋  | 355/462 [13:01<03:43,  2.09s/it]

{'loss': 0.41433969, 'acc': 0.88076334, 'grad_norm': 7.21875, 'learning_rate': 1.4e-06, 'memory(GiB)': 8.32, 'train_speed(iter/s)': 0.454063, 'epoch': 0.77, 'global_step/max_steps': '355/462', 'percentage': '76.84%', 'elapsed_time': '13m 1s', 'remaining_time': '3m 55s'}


Train:  78%|███████▊  | 360/462 [13:11<03:31,  2.07s/it]

{'loss': 0.44364929, 'acc': 0.87276258, 'grad_norm': 6.8125, 'learning_rate': 1.28e-06, 'memory(GiB)': 8.32, 'train_speed(iter/s)': 0.454396, 'epoch': 0.78, 'global_step/max_steps': '360/462', 'percentage': '77.92%', 'elapsed_time': '13m 11s', 'remaining_time': '3m 44s'}


Train:  79%|███████▉  | 365/462 [13:22<03:22,  2.08s/it]

{'loss': 0.46198277, 'acc': 0.87207022, 'grad_norm': 6.59375, 'learning_rate': 1.16e-06, 'memory(GiB)': 8.32, 'train_speed(iter/s)': 0.454652, 'epoch': 0.79, 'global_step/max_steps': '365/462', 'percentage': '79.00%', 'elapsed_time': '13m 22s', 'remaining_time': '3m 33s'}


Train:  80%|████████  | 370/462 [13:32<03:11,  2.08s/it]

{'loss': 0.38902857, 'acc': 0.88763561, 'grad_norm': 7.125, 'learning_rate': 1.05e-06, 'memory(GiB)': 8.32, 'train_speed(iter/s)': 0.454922, 'epoch': 0.8, 'global_step/max_steps': '370/462', 'percentage': '80.09%', 'elapsed_time': '13m 32s', 'remaining_time': '3m 22s'}


Train:  81%|████████  | 375/462 [13:43<03:01,  2.09s/it]

{'loss': 0.40151787, 'acc': 0.88665905, 'grad_norm': 5.65625, 'learning_rate': 9.4e-07, 'memory(GiB)': 8.32, 'train_speed(iter/s)': 0.455133, 'epoch': 0.81, 'global_step/max_steps': '375/462', 'percentage': '81.17%', 'elapsed_time': '13m 43s', 'remaining_time': '3m 11s'}


Train:  82%|████████▏ | 380/462 [13:54<02:55,  2.14s/it]

{'loss': 0.40462136, 'acc': 0.88763514, 'grad_norm': 5.84375, 'learning_rate': 8.4e-07, 'memory(GiB)': 8.32, 'train_speed(iter/s)': 0.455271, 'epoch': 0.82, 'global_step/max_steps': '380/462', 'percentage': '82.25%', 'elapsed_time': '13m 54s', 'remaining_time': '3m 0s'}


Train:  83%|████████▎ | 385/462 [14:04<02:42,  2.12s/it]

{'loss': 0.40592098, 'acc': 0.88324242, 'grad_norm': 5.96875, 'learning_rate': 7.4e-07, 'memory(GiB)': 8.32, 'train_speed(iter/s)': 0.455418, 'epoch': 0.83, 'global_step/max_steps': '385/462', 'percentage': '83.33%', 'elapsed_time': '14m 4s', 'remaining_time': '2m 48s'}


Train:  84%|████████▍ | 390/462 [14:15<02:33,  2.13s/it]

{'loss': 0.43130865, 'acc': 0.87413568, 'grad_norm': 6.25, 'learning_rate': 6.5e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.455538, 'epoch': 0.84, 'global_step/max_steps': '390/462', 'percentage': '84.42%', 'elapsed_time': '14m 15s', 'remaining_time': '2m 37s'}


Train:  85%|████████▌ | 395/462 [14:26<02:20,  2.09s/it]

{'loss': 0.40472817, 'acc': 0.88169823, 'grad_norm': 5.09375, 'learning_rate': 5.7e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.455779, 'epoch': 0.85, 'global_step/max_steps': '395/462', 'percentage': '85.50%', 'elapsed_time': '14m 26s', 'remaining_time': '2m 26s'}


Train:  87%|████████▋ | 400/462 [14:36<02:10,  2.10s/it]

{'loss': 0.42208033, 'acc': 0.8804883, 'grad_norm': 5.375, 'learning_rate': 4.9e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.455949, 'epoch': 0.86, 'global_step/max_steps': '400/462', 'percentage': '86.58%', 'elapsed_time': '14m 36s', 'remaining_time': '2m 15s'}



Val: 100%|██████████| 74/74 [00:02<00:00, 28.85it/s]/it]


{'eval_loss': 0.43378726, 'eval_acc': 0.87108693, 'eval_runtime': 2.5513, 'eval_samples_per_second': 29.005, 'eval_steps_per_second': 29.005, 'epoch': 0.86, 'global_step/max_steps': '400/462', 'percentage': '86.58%', 'elapsed_time': '14m 39s', 'remaining_time': '2m 16s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/full/qwen2_5-0_5b/v0-20241209-223648/checkpoint-400
Train:  88%|████████▊ | 405/462 [15:10<03:34,  3.76s/it]

{'loss': 0.40805464, 'acc': 0.88281212, 'grad_norm': 6.375, 'learning_rate': 4.1e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.444587, 'epoch': 0.88, 'global_step/max_steps': '405/462', 'percentage': '87.66%', 'elapsed_time': '15m 10s', 'remaining_time': '2m 8s'}


Train:  89%|████████▊ | 410/462 [15:21<02:03,  2.37s/it]

{'loss': 0.42256579, 'acc': 0.87914848, 'grad_norm': 6.59375, 'learning_rate': 3.4e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.444886, 'epoch': 0.89, 'global_step/max_steps': '410/462', 'percentage': '88.74%', 'elapsed_time': '15m 21s', 'remaining_time': '1m 56s'}


Train:  90%|████████▉ | 415/462 [15:31<01:40,  2.14s/it]

{'loss': 0.43028092, 'acc': 0.87979279, 'grad_norm': 6.25, 'learning_rate': 2.8e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.445225, 'epoch': 0.9, 'global_step/max_steps': '415/462', 'percentage': '89.83%', 'elapsed_time': '15m 31s', 'remaining_time': '1m 45s'}


Train:  91%|█████████ | 420/462 [15:42<01:27,  2.09s/it]

{'loss': 0.41020551, 'acc': 0.88423643, 'grad_norm': 6.15625, 'learning_rate': 2.3e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.445551, 'epoch': 0.91, 'global_step/max_steps': '420/462', 'percentage': '90.91%', 'elapsed_time': '15m 42s', 'remaining_time': '1m 34s'}


Train:  92%|█████████▏| 425/462 [15:52<01:17,  2.10s/it]

{'loss': 0.43993287, 'acc': 0.87606812, 'grad_norm': 8.375, 'learning_rate': 1.8e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.44585, 'epoch': 0.92, 'global_step/max_steps': '425/462', 'percentage': '91.99%', 'elapsed_time': '15m 52s', 'remaining_time': '1m 22s'}


Train:  93%|█████████▎| 430/462 [16:03<01:06,  2.08s/it]

{'loss': 0.48591123, 'acc': 0.86472979, 'grad_norm': 6.21875, 'learning_rate': 1.3e-07, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.446189, 'epoch': 0.93, 'global_step/max_steps': '430/462', 'percentage': '93.07%', 'elapsed_time': '16m 3s', 'remaining_time': '1m 11s'}


Train:  94%|█████████▍| 435/462 [16:14<00:58,  2.16s/it]

{'loss': 0.43820033, 'acc': 0.87203703, 'grad_norm': 5.8125, 'learning_rate': 9e-08, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.446311, 'epoch': 0.94, 'global_step/max_steps': '435/462', 'percentage': '94.16%', 'elapsed_time': '16m 14s', 'remaining_time': '1m 0s'}


Train:  95%|█████████▌| 440/462 [16:24<00:46,  2.12s/it]

{'loss': 0.47062893, 'acc': 0.86871014, 'grad_norm': 6.78125, 'learning_rate': 6e-08, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.446547, 'epoch': 0.95, 'global_step/max_steps': '440/462', 'percentage': '95.24%', 'elapsed_time': '16m 24s', 'remaining_time': '49s'}


Train:  96%|█████████▋| 445/462 [16:35<00:35,  2.11s/it]

{'loss': 0.41968679, 'acc': 0.87889013, 'grad_norm': 8.5625, 'learning_rate': 4e-08, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.446767, 'epoch': 0.96, 'global_step/max_steps': '445/462', 'percentage': '96.32%', 'elapsed_time': '16m 35s', 'remaining_time': '38s'}


Train:  97%|█████████▋| 450/462 [16:46<00:25,  2.10s/it]

{'loss': 0.42376394, 'acc': 0.88604107, 'grad_norm': 6.78125, 'learning_rate': 2e-08, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.447038, 'epoch': 0.97, 'global_step/max_steps': '450/462', 'percentage': '97.40%', 'elapsed_time': '16m 46s', 'remaining_time': '26s'}


Train:  98%|█████████▊| 455/462 [16:56<00:14,  2.09s/it]

{'loss': 0.41826539, 'acc': 0.87786999, 'grad_norm': 6.65625, 'learning_rate': 1e-08, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.447319, 'epoch': 0.98, 'global_step/max_steps': '455/462', 'percentage': '98.48%', 'elapsed_time': '16m 56s', 'remaining_time': '15s'}


Train: 100%|█████████▉| 460/462 [17:07<00:04,  2.09s/it]

{'loss': 0.43608446, 'acc': 0.87566576, 'grad_norm': 5.71875, 'learning_rate': 0.0, 'memory(GiB)': 7.33, 'train_speed(iter/s)': 0.447598, 'epoch': 0.99, 'global_step/max_steps': '460/462', 'percentage': '99.57%', 'elapsed_time': '17m 7s', 'remaining_time': '4s'}


Train: 100%|██████████| 462/462 [17:11<00:00,  2.11s/it]
Val: 100%|██████████| 74/74 [00:02<00:00, 29.26it/s]/it]


{'eval_loss': 0.43423527, 'eval_acc': 0.87036847, 'eval_runtime': 2.545, 'eval_samples_per_second': 29.076, 'eval_steps_per_second': 29.076, 'epoch': 1.0, 'global_step/max_steps': '462/462', 'percentage': '100.00%', 'elapsed_time': '17m 14s', 'remaining_time': '0s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/full/qwen2_5-0_5b/v0-20241209-223648/checkpoint-462
Train: 100%|██████████| 462/462 [17:34<00:00,  2.28s/it]
[INFO:swift] last_model_checkpoint: /root/autodl-fs/code/nlp_pj/pj6/output/full/qwen2_5-0_5b/v0-20241209-223648/checkpoint-462
[INFO:swift] best_model_checkpoint: /root/autodl-fs/code/nlp_pj/pj6/output/full/qwen2_5-0_5b/v0-20241209-223648/checkpoint-400
[INFO:swift] images_dir: /root/autodl-fs/code/nlp_pj/pj6/output/full/qwen2_5-0_5b/v0-20241209-223648/images


{'train_runtime': 1054.1679, 'train_samples_per_second': 7.019, 'train_steps_per_second': 0.438, 'train_loss': 0.44293416, 'epoch': 1.0, 'global_step/max_steps': '462/462', 'percentage': '100.00%', 'elapsed_time': '17m 34s', 'remaining_time': '0s'}


[INFO:swift] End time of running main: 2024-12-09 22:54:36.960249


In [None]:
model_type = ModelType.qwen2_5_0_5b
sft_args = SftArguments(
    model_type=model_type,
    dataset=['gsm8k-train.jsonl'],
    output_dir='root/autodl-tmp/checkpoint/math_output/lora',
    max_length=4096)
result = sft_main(sft_args)

[INFO:swift] Setting template_type: default-generation
[INFO:swift] Setting args.lazy_tokenize: False
[INFO:swift] Setting args.dataloader_num_workers: 1
[INFO:swift] output_dir: /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154
[INFO:swift] Start time of running main: 2024-12-09 20:01:54.103115
[INFO:swift] args: SftArguments(model_type='qwen2_5-0_5b', model_id_or_path='qwen/Qwen2.5-0.5B', model_revision='master', full_determinism=False, sft_type='lora', freeze_parameters=[], freeze_vit=False, freeze_parameters_ratio=0.0, additional_trainable_parameters=[], tuner_backend='peft', template_type='default-generation', output_dir='/root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154', add_output_dir_suffix=True, ddp_backend=None, ddp_find_unused_parameters=None, ddp_broadcast_buffers=None, ddp_timeout=1800, seed=42, resume_from_checkpoint=None, resume_only_model=False, ignore_data_skip=False, dtype='bf16', packing=False, train_backend='transformers', 

device_count: 1
rank: -1, local_rank: -1, world_size: 1, local_world_size: 1


Downloading [config.json]: 100%|██████████| 681/681 [00:00<00:00, 2.66kB/s]
Downloading [configuration.json]: 100%|██████████| 2.00/2.00 [00:00<00:00, 7.19B/s]
Downloading [generation_config.json]: 100%|██████████| 138/138 [00:00<00:00, 511B/s]
Downloading [LICENSE]: 100%|██████████| 11.1k/11.1k [00:00<00:00, 45.0kB/s]
Downloading [merges.txt]: 100%|██████████| 1.59M/1.59M [00:00<00:00, 4.67MB/s]
Downloading [model.safetensors]: 100%|██████████| 942M/942M [00:16<00:00, 60.1MB/s] 
Downloading [README.md]: 100%|██████████| 3.76k/3.76k [00:00<00:00, 13.6kB/s]
Downloading [tokenizer.json]: 100%|██████████| 6.71M/6.71M [00:00<00:00, 16.4MB/s]
Downloading [tokenizer_config.json]: 100%|██████████| 7.06k/7.06k [00:00<00:00, 22.5kB/s]
Downloading [vocab.json]: 100%|██████████| 2.65M/2.65M [00:00<00:00, 6.87MB/s]
[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/qwen/Qwen2___5-0___5B
[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
[INFO:swift] model.max_model_len: 3

{'loss': 0.64669138, 'acc': 0.84823072, 'grad_norm': 2.0086987, 'learning_rate': 4.17e-06, 'memory(GiB)': 3.05, 'train_speed(iter/s)': 0.160726, 'epoch': 0.0, 'global_step/max_steps': '1/462', 'percentage': '0.22%', 'elapsed_time': '6s', 'remaining_time': '50m 14s'}


Train:   1%|          | 5/462 [00:20<24:56,  3.28s/it]

{'loss': 0.714706, 'acc': 0.81494027, 'grad_norm': 1.7821455, 'learning_rate': 2.083e-05, 'memory(GiB)': 6.06, 'train_speed(iter/s)': 0.253563, 'epoch': 0.01, 'global_step/max_steps': '5/462', 'percentage': '1.08%', 'elapsed_time': '20s', 'remaining_time': '31m 35s'}


Train:   2%|▏         | 10/462 [00:36<22:45,  3.02s/it]

{'loss': 0.68415051, 'acc': 0.82539072, 'grad_norm': 1.36415637, 'learning_rate': 4.167e-05, 'memory(GiB)': 7.79, 'train_speed(iter/s)': 0.277177, 'epoch': 0.02, 'global_step/max_steps': '10/462', 'percentage': '2.16%', 'elapsed_time': '35s', 'remaining_time': '27m 6s'}


Train:   3%|▎         | 15/462 [00:54<21:43,  2.92s/it]

{'loss': 0.6651732, 'acc': 0.83450327, 'grad_norm': 1.49173295, 'learning_rate': 6.25e-05, 'memory(GiB)': 7.79, 'train_speed(iter/s)': 0.290908, 'epoch': 0.03, 'global_step/max_steps': '15/462', 'percentage': '3.25%', 'elapsed_time': '54s', 'remaining_time': '26m 55s'}


Train:   4%|▍         | 20/462 [01:09<22:49,  3.10s/it]

{'loss': 0.65492306, 'acc': 0.83486719, 'grad_norm': 1.13358951, 'learning_rate': 8.333e-05, 'memory(GiB)': 7.79, 'train_speed(iter/s)': 0.287359, 'epoch': 0.04, 'global_step/max_steps': '20/462', 'percentage': '4.33%', 'elapsed_time': '1m 9s', 'remaining_time': '25m 34s'}


Train:   5%|▌         | 25/462 [01:24<21:19,  2.93s/it]

{'loss': 0.58878369, 'acc': 0.84718914, 'grad_norm': 1.20448768, 'learning_rate': 0.0001, 'memory(GiB)': 7.79, 'train_speed(iter/s)': 0.294742, 'epoch': 0.05, 'global_step/max_steps': '25/462', 'percentage': '5.41%', 'elapsed_time': '1m 24s', 'remaining_time': '24m 43s'}


Train:   6%|▋         | 30/462 [01:39<20:58,  2.91s/it]

{'loss': 0.58080516, 'acc': 0.84343548, 'grad_norm': 1.13208544, 'learning_rate': 9.995e-05, 'memory(GiB)': 8.41, 'train_speed(iter/s)': 0.299473, 'epoch': 0.06, 'global_step/max_steps': '30/462', 'percentage': '6.49%', 'elapsed_time': '1m 39s', 'remaining_time': '23m 47s'}


Train:   8%|▊         | 35/462 [01:56<20:05,  2.82s/it]

{'loss': 0.4958427, 'acc': 0.85696096, 'grad_norm': 0.90168065, 'learning_rate': 9.984e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.305439, 'epoch': 0.08, 'global_step/max_steps': '35/462', 'percentage': '7.58%', 'elapsed_time': '1m 56s', 'remaining_time': '23m 42s'}


Train:   9%|▊         | 40/462 [02:12<21:09,  3.01s/it]

{'loss': 0.47437372, 'acc': 0.8707262, 'grad_norm': 1.08216894, 'learning_rate': 9.967e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.303424, 'epoch': 0.09, 'global_step/max_steps': '40/462', 'percentage': '8.66%', 'elapsed_time': '2m 12s', 'remaining_time': '23m 16s'}


Train:  10%|▉         | 45/462 [02:26<20:32,  2.96s/it]

{'loss': 0.49369736, 'acc': 0.85884161, 'grad_norm': 0.97431314, 'learning_rate': 9.943e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.304562, 'epoch': 0.1, 'global_step/max_steps': '45/462', 'percentage': '9.74%', 'elapsed_time': '2m 26s', 'remaining_time': '22m 39s'}


Train:  11%|█         | 50/462 [02:41<19:27,  2.83s/it]

{'loss': 0.47300501, 'acc': 0.86700554, 'grad_norm': 0.83477801, 'learning_rate': 9.913e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.308591, 'epoch': 0.11, 'global_step/max_steps': '50/462', 'percentage': '10.82%', 'elapsed_time': '2m 41s', 'remaining_time': '22m 11s'}



Val: 100%|██████████| 74/74 [00:04<00:00, 16.47it/s]it]


{'eval_loss': 0.48627183, 'eval_acc': 0.85733347, 'eval_runtime': 3.5689, 'eval_samples_per_second': 20.735, 'eval_steps_per_second': 20.735, 'epoch': 0.11, 'global_step/max_steps': '50/462', 'percentage': '10.82%', 'elapsed_time': '2m 46s', 'remaining_time': '22m 49s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-50
Train:  12%|█▏        | 55/462 [03:07<24:32,  3.62s/it]

{'loss': 0.47813077, 'acc': 0.86757412, 'grad_norm': 0.97646165, 'learning_rate': 9.877e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.293228, 'epoch': 0.12, 'global_step/max_steps': '55/462', 'percentage': '11.90%', 'elapsed_time': '3m 7s', 'remaining_time': '23m 4s'}


Train:  13%|█▎        | 60/462 [03:21<20:16,  3.03s/it]

{'loss': 0.48017311, 'acc': 0.8661067, 'grad_norm': 0.88099188, 'learning_rate': 9.834e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.295815, 'epoch': 0.13, 'global_step/max_steps': '60/462', 'percentage': '12.99%', 'elapsed_time': '3m 21s', 'remaining_time': '22m 32s'}


Train:  14%|█▍        | 65/462 [03:37<18:46,  2.84s/it]

{'loss': 0.46036024, 'acc': 0.87222214, 'grad_norm': 0.94213021, 'learning_rate': 9.785e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.299454, 'epoch': 0.14, 'global_step/max_steps': '65/462', 'percentage': '14.07%', 'elapsed_time': '3m 37s', 'remaining_time': '22m 6s'}


Train:  15%|█▌        | 70/462 [03:51<18:58,  2.90s/it]

{'loss': 0.46135287, 'acc': 0.87448988, 'grad_norm': 0.82518834, 'learning_rate': 9.73e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.300942, 'epoch': 0.15, 'global_step/max_steps': '70/462', 'percentage': '15.15%', 'elapsed_time': '3m 51s', 'remaining_time': '21m 35s'}


Train:  16%|█▌        | 75/462 [04:06<18:03,  2.80s/it]

{'loss': 0.45075178, 'acc': 0.86804428, 'grad_norm': 0.94393569, 'learning_rate': 9.669e-05, 'memory(GiB)': 1.98, 'train_speed(iter/s)': 0.304074, 'epoch': 0.16, 'global_step/max_steps': '75/462', 'percentage': '16.23%', 'elapsed_time': '4m 5s', 'remaining_time': '21m 8s'}


Train:  17%|█▋        | 80/462 [04:20<18:07,  2.85s/it]

{'loss': 0.46144013, 'acc': 0.86455011, 'grad_norm': 0.94380099, 'learning_rate': 9.602e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.305957, 'epoch': 0.17, 'global_step/max_steps': '80/462', 'percentage': '17.32%', 'elapsed_time': '4m 20s', 'remaining_time': '20m 43s'}


Train:  18%|█▊        | 85/462 [04:34<17:49,  2.84s/it]

{'loss': 0.48379588, 'acc': 0.86569462, 'grad_norm': 0.72243994, 'learning_rate': 9.529e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.308141, 'epoch': 0.18, 'global_step/max_steps': '85/462', 'percentage': '18.40%', 'elapsed_time': '4m 34s', 'remaining_time': '20m 18s'}


Train:  19%|█▉        | 90/462 [04:50<17:15,  2.78s/it]

{'loss': 0.44568563, 'acc': 0.86443644, 'grad_norm': 0.888924, 'learning_rate': 9.45e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.310443, 'epoch': 0.19, 'global_step/max_steps': '90/462', 'percentage': '19.48%', 'elapsed_time': '4m 50s', 'remaining_time': '20m 1s'}


Train:  21%|██        | 95/462 [05:05<17:56,  2.93s/it]

{'loss': 0.47841311, 'acc': 0.86848812, 'grad_norm': 0.9373464, 'learning_rate': 9.366e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.310338, 'epoch': 0.21, 'global_step/max_steps': '95/462', 'percentage': '20.56%', 'elapsed_time': '5m 5s', 'remaining_time': '19m 39s'}


Train:  22%|██▏       | 100/462 [05:19<17:04,  2.83s/it]

{'loss': 0.42688527, 'acc': 0.87530088, 'grad_norm': 0.9090054, 'learning_rate': 9.275e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.311986, 'epoch': 0.22, 'global_step/max_steps': '100/462', 'percentage': '21.65%', 'elapsed_time': '5m 19s', 'remaining_time': '19m 15s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.15it/s]/it]


{'eval_loss': 0.46564874, 'eval_acc': 0.86226008, 'eval_runtime': 3.6115, 'eval_samples_per_second': 20.49, 'eval_steps_per_second': 20.49, 'epoch': 0.22, 'global_step/max_steps': '100/462', 'percentage': '21.65%', 'elapsed_time': '5m 23s', 'remaining_time': '19m 29s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-100
Train:  23%|██▎       | 105/462 [05:37<18:34,  3.12s/it]

{'loss': 0.43544521, 'acc': 0.87695875, 'grad_norm': 0.79864532, 'learning_rate': 9.18e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.309579, 'epoch': 0.23, 'global_step/max_steps': '105/462', 'percentage': '22.73%', 'elapsed_time': '5m 37s', 'remaining_time': '19m 8s'}


Train:  24%|██▍       | 110/462 [05:52<16:41,  2.85s/it]

{'loss': 0.46713777, 'acc': 0.86941576, 'grad_norm': 0.84010398, 'learning_rate': 9.079e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.311455, 'epoch': 0.24, 'global_step/max_steps': '110/462', 'percentage': '23.81%', 'elapsed_time': '5m 52s', 'remaining_time': '18m 46s'}


Train:  25%|██▍       | 115/462 [06:06<16:06,  2.78s/it]

{'loss': 0.48105063, 'acc': 0.86585875, 'grad_norm': 0.8928982, 'learning_rate': 8.972e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.313175, 'epoch': 0.25, 'global_step/max_steps': '115/462', 'percentage': '24.89%', 'elapsed_time': '6m 6s', 'remaining_time': '18m 24s'}


Train:  26%|██▌       | 120/462 [06:20<15:56,  2.80s/it]

{'loss': 0.46342454, 'acc': 0.87158241, 'grad_norm': 0.87573981, 'learning_rate': 8.861e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.314597, 'epoch': 0.26, 'global_step/max_steps': '120/462', 'percentage': '25.97%', 'elapsed_time': '6m 20s', 'remaining_time': '18m 3s'}


Train:  27%|██▋       | 125/462 [06:34<15:37,  2.78s/it]

{'loss': 0.49233608, 'acc': 0.86489944, 'grad_norm': 1.0534817, 'learning_rate': 8.744e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.316083, 'epoch': 0.27, 'global_step/max_steps': '125/462', 'percentage': '27.06%', 'elapsed_time': '6m 34s', 'remaining_time': '17m 42s'}


Train:  28%|██▊       | 130/462 [06:48<15:22,  2.78s/it]

{'loss': 0.4039556, 'acc': 0.88824272, 'grad_norm': 0.7858181, 'learning_rate': 8.623e-05, 'memory(GiB)': 2.94, 'train_speed(iter/s)': 0.317472, 'epoch': 0.28, 'global_step/max_steps': '130/462', 'percentage': '28.14%', 'elapsed_time': '6m 48s', 'remaining_time': '17m 22s'}


Train:  29%|██▉       | 135/462 [07:02<15:03,  2.76s/it]

{'loss': 0.45524631, 'acc': 0.86819143, 'grad_norm': 0.79240441, 'learning_rate': 8.497e-05, 'memory(GiB)': 3.93, 'train_speed(iter/s)': 0.31884, 'epoch': 0.29, 'global_step/max_steps': '135/462', 'percentage': '29.22%', 'elapsed_time': '7m 2s', 'remaining_time': '17m 2s'}


Train:  30%|███       | 140/462 [07:16<14:51,  2.77s/it]

{'loss': 0.45018353, 'acc': 0.87703114, 'grad_norm': 0.74190682, 'learning_rate': 8.367e-05, 'memory(GiB)': 3.93, 'train_speed(iter/s)': 0.320081, 'epoch': 0.3, 'global_step/max_steps': '140/462', 'percentage': '30.30%', 'elapsed_time': '7m 16s', 'remaining_time': '16m 43s'}


Train:  31%|███▏      | 145/462 [07:30<14:44,  2.79s/it]

{'loss': 0.46508842, 'acc': 0.86955166, 'grad_norm': 1.00345099, 'learning_rate': 8.232e-05, 'memory(GiB)': 3.93, 'train_speed(iter/s)': 0.321197, 'epoch': 0.31, 'global_step/max_steps': '145/462', 'percentage': '31.39%', 'elapsed_time': '7m 30s', 'remaining_time': '16m 25s'}


Train:  32%|███▏      | 150/462 [07:45<15:19,  2.95s/it]

{'loss': 0.46658978, 'acc': 0.86609144, 'grad_norm': 0.9078747, 'learning_rate': 8.093e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.321369, 'epoch': 0.32, 'global_step/max_steps': '150/462', 'percentage': '32.47%', 'elapsed_time': '7m 45s', 'remaining_time': '16m 8s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.27it/s]/it]


{'eval_loss': 0.45862415, 'eval_acc': 0.86267064, 'eval_runtime': 3.6041, 'eval_samples_per_second': 20.532, 'eval_steps_per_second': 20.532, 'epoch': 0.32, 'global_step/max_steps': '150/462', 'percentage': '32.47%', 'elapsed_time': '7m 49s', 'remaining_time': '16m 16s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-150
Train:  34%|███▎      | 155/462 [08:04<16:09,  3.16s/it]

{'loss': 0.41232243, 'acc': 0.88226328, 'grad_norm': 0.77657998, 'learning_rate': 7.951e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.319317, 'epoch': 0.34, 'global_step/max_steps': '155/462', 'percentage': '33.55%', 'elapsed_time': '8m 4s', 'remaining_time': '15m 58s'}


Train:  35%|███▍      | 160/462 [08:18<14:23,  2.86s/it]

{'loss': 0.4577868, 'acc': 0.87565641, 'grad_norm': 0.86099184, 'learning_rate': 7.804e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.320271, 'epoch': 0.35, 'global_step/max_steps': '160/462', 'percentage': '34.63%', 'elapsed_time': '8m 18s', 'remaining_time': '15m 40s'}


Train:  36%|███▌      | 165/462 [08:32<14:02,  2.84s/it]

{'loss': 0.45356698, 'acc': 0.86931543, 'grad_norm': 0.80427647, 'learning_rate': 7.654e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.321122, 'epoch': 0.36, 'global_step/max_steps': '165/462', 'percentage': '35.71%', 'elapsed_time': '8m 32s', 'remaining_time': '15m 22s'}


Train:  37%|███▋      | 170/462 [08:46<13:38,  2.80s/it]

{'loss': 0.44441433, 'acc': 0.87875233, 'grad_norm': 0.71877062, 'learning_rate': 7.5e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.322043, 'epoch': 0.37, 'global_step/max_steps': '170/462', 'percentage': '36.80%', 'elapsed_time': '8m 46s', 'remaining_time': '15m 4s'}


Train:  38%|███▊      | 175/462 [09:00<13:17,  2.78s/it]

{'loss': 0.49264894, 'acc': 0.86412764, 'grad_norm': 0.81035668, 'learning_rate': 7.343e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.322968, 'epoch': 0.38, 'global_step/max_steps': '175/462', 'percentage': '37.88%', 'elapsed_time': '9m 0s', 'remaining_time': '14m 46s'}


Train:  39%|███▉      | 180/462 [09:15<13:22,  2.85s/it]

{'loss': 0.48128438, 'acc': 0.86295261, 'grad_norm': 0.86677521, 'learning_rate': 7.183e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.323573, 'epoch': 0.39, 'global_step/max_steps': '180/462', 'percentage': '38.96%', 'elapsed_time': '9m 15s', 'remaining_time': '14m 29s'}


Train:  40%|████      | 185/462 [09:29<13:04,  2.83s/it]

{'loss': 0.4829803, 'acc': 0.86565599, 'grad_norm': 0.94713742, 'learning_rate': 7.021e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.324235, 'epoch': 0.4, 'global_step/max_steps': '185/462', 'percentage': '40.04%', 'elapsed_time': '9m 29s', 'remaining_time': '14m 12s'}


Train:  41%|████      | 190/462 [09:43<12:40,  2.80s/it]

{'loss': 0.41784925, 'acc': 0.88546629, 'grad_norm': 0.89966154, 'learning_rate': 6.855e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.324981, 'epoch': 0.41, 'global_step/max_steps': '190/462', 'percentage': '41.13%', 'elapsed_time': '9m 43s', 'remaining_time': '13m 55s'}


Train:  42%|████▏     | 195/462 [09:57<12:21,  2.78s/it]

{'loss': 0.4338788, 'acc': 0.87502069, 'grad_norm': 0.77161109, 'learning_rate': 6.688e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.325747, 'epoch': 0.42, 'global_step/max_steps': '195/462', 'percentage': '42.21%', 'elapsed_time': '9m 57s', 'remaining_time': '13m 37s'}


Train:  43%|████▎     | 200/462 [10:11<12:06,  2.77s/it]

{'loss': 0.43451471, 'acc': 0.879953, 'grad_norm': 0.94900841, 'learning_rate': 6.518e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.326435, 'epoch': 0.43, 'global_step/max_steps': '200/462', 'percentage': '43.29%', 'elapsed_time': '10m 11s', 'remaining_time': '13m 20s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.64it/s]/it]


{'eval_loss': 0.45454073, 'eval_acc': 0.86359438, 'eval_runtime': 3.5396, 'eval_samples_per_second': 20.906, 'eval_steps_per_second': 20.906, 'epoch': 0.43, 'global_step/max_steps': '200/462', 'percentage': '43.29%', 'elapsed_time': '10m 15s', 'remaining_time': '13m 25s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-200
Train:  44%|████▍     | 205/462 [10:30<13:24,  3.13s/it]

{'loss': 0.43486443, 'acc': 0.8744298, 'grad_norm': 0.81035799, 'learning_rate': 6.346e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.324604, 'epoch': 0.44, 'global_step/max_steps': '205/462', 'percentage': '44.37%', 'elapsed_time': '10m 30s', 'remaining_time': '13m 10s'}


Train:  45%|████▌     | 210/462 [10:44<12:05,  2.88s/it]

{'loss': 0.45451179, 'acc': 0.86837101, 'grad_norm': 0.85172153, 'learning_rate': 6.172e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.325196, 'epoch': 0.45, 'global_step/max_steps': '210/462', 'percentage': '45.45%', 'elapsed_time': '10m 44s', 'remaining_time': '12m 53s'}


Train:  47%|████▋     | 215/462 [10:58<11:33,  2.81s/it]

{'loss': 0.43858304, 'acc': 0.87574587, 'grad_norm': 0.84444386, 'learning_rate': 5.997e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.32583, 'epoch': 0.46, 'global_step/max_steps': '215/462', 'percentage': '46.54%', 'elapsed_time': '10m 58s', 'remaining_time': '12m 36s'}


Train:  48%|████▊     | 220/462 [11:12<11:08,  2.76s/it]

{'loss': 0.42369251, 'acc': 0.88056555, 'grad_norm': 0.86249882, 'learning_rate': 5.821e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.326541, 'epoch': 0.48, 'global_step/max_steps': '220/462', 'percentage': '47.62%', 'elapsed_time': '11m 12s', 'remaining_time': '12m 19s'}


Train:  49%|████▊     | 225/462 [11:26<10:58,  2.78s/it]

{'loss': 0.46278057, 'acc': 0.87239656, 'grad_norm': 0.6562593, 'learning_rate': 5.644e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.327155, 'epoch': 0.49, 'global_step/max_steps': '225/462', 'percentage': '48.70%', 'elapsed_time': '11m 26s', 'remaining_time': '12m 3s'}


Train:  50%|████▉     | 230/462 [11:41<11:05,  2.87s/it]

{'loss': 0.45831828, 'acc': 0.87466955, 'grad_norm': 0.89374644, 'learning_rate': 5.466e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.327408, 'epoch': 0.5, 'global_step/max_steps': '230/462', 'percentage': '49.78%', 'elapsed_time': '11m 41s', 'remaining_time': '11m 47s'}


Train:  51%|█████     | 235/462 [11:55<10:34,  2.79s/it]

{'loss': 0.47934237, 'acc': 0.86449909, 'grad_norm': 0.90054852, 'learning_rate': 5.287e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.32799, 'epoch': 0.51, 'global_step/max_steps': '235/462', 'percentage': '50.87%', 'elapsed_time': '11m 55s', 'remaining_time': '11m 30s'}


Train:  52%|█████▏    | 240/462 [12:09<10:15,  2.77s/it]

{'loss': 0.41099205, 'acc': 0.88262863, 'grad_norm': 0.80421811, 'learning_rate': 5.108e-05, 'memory(GiB)': 4.96, 'train_speed(iter/s)': 0.328508, 'epoch': 0.52, 'global_step/max_steps': '240/462', 'percentage': '51.95%', 'elapsed_time': '12m 9s', 'remaining_time': '11m 14s'}


Train:  53%|█████▎    | 245/462 [12:24<10:03,  2.78s/it]

{'loss': 0.41155672, 'acc': 0.88486528, 'grad_norm': 0.82848525, 'learning_rate': 4.928e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.329004, 'epoch': 0.53, 'global_step/max_steps': '245/462', 'percentage': '53.03%', 'elapsed_time': '12m 24s', 'remaining_time': '10m 59s'}


Train:  54%|█████▍    | 250/462 [12:38<09:58,  2.82s/it]

{'loss': 0.4293654, 'acc': 0.87973824, 'grad_norm': 0.8676607, 'learning_rate': 4.749e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.329255, 'epoch': 0.54, 'global_step/max_steps': '250/462', 'percentage': '54.11%', 'elapsed_time': '12m 38s', 'remaining_time': '10m 43s'}



Val: 100%|██████████| 74/74 [00:04<00:00, 15.33it/s]/it]


{'eval_loss': 0.45267919, 'eval_acc': 0.86441548, 'eval_runtime': 3.545, 'eval_samples_per_second': 20.874, 'eval_steps_per_second': 20.874, 'epoch': 0.54, 'global_step/max_steps': '250/462', 'percentage': '54.11%', 'elapsed_time': '12m 43s', 'remaining_time': '10m 47s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-250
Train:  55%|█████▌    | 255/462 [13:01<11:54,  3.45s/it]

{'loss': 0.41384778, 'acc': 0.88330288, 'grad_norm': 0.89736456, 'learning_rate': 4.57e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.325813, 'epoch': 0.55, 'global_step/max_steps': '255/462', 'percentage': '55.19%', 'elapsed_time': '13m 1s', 'remaining_time': '10m 34s'}


Train:  56%|█████▋    | 260/462 [13:16<09:50,  2.92s/it]

{'loss': 0.44874706, 'acc': 0.87440767, 'grad_norm': 0.73983675, 'learning_rate': 4.392e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326201, 'epoch': 0.56, 'global_step/max_steps': '260/462', 'percentage': '56.28%', 'elapsed_time': '13m 16s', 'remaining_time': '10m 19s'}


Train:  57%|█████▋    | 265/462 [13:31<09:25,  2.87s/it]

{'loss': 0.43684888, 'acc': 0.87140226, 'grad_norm': 0.82314706, 'learning_rate': 4.214e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326302, 'epoch': 0.57, 'global_step/max_steps': '265/462', 'percentage': '57.36%', 'elapsed_time': '13m 31s', 'remaining_time': '10m 3s'}


Train:  58%|█████▊    | 270/462 [13:45<09:03,  2.83s/it]

{'loss': 0.41799417, 'acc': 0.88107004, 'grad_norm': 0.78966284, 'learning_rate': 4.038e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326542, 'epoch': 0.58, 'global_step/max_steps': '270/462', 'percentage': '58.44%', 'elapsed_time': '13m 45s', 'remaining_time': '9m 47s'}


Train:  60%|█████▉    | 275/462 [14:00<08:44,  2.80s/it]

{'loss': 0.49037638, 'acc': 0.86397104, 'grad_norm': 0.92960393, 'learning_rate': 3.862e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.32695, 'epoch': 0.59, 'global_step/max_steps': '275/462', 'percentage': '59.52%', 'elapsed_time': '14m 0s', 'remaining_time': '9m 31s'}


Train:  61%|██████    | 280/462 [14:18<08:42,  2.87s/it]

{'loss': 0.45247407, 'acc': 0.86912947, 'grad_norm': 0.9832198, 'learning_rate': 3.689e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.327044, 'epoch': 0.61, 'global_step/max_steps': '280/462', 'percentage': '60.61%', 'elapsed_time': '14m 18s', 'remaining_time': '9m 18s'}


Train:  62%|██████▏   | 285/462 [14:33<09:07,  3.09s/it]

{'loss': 0.39554353, 'acc': 0.88429337, 'grad_norm': 0.83830005, 'learning_rate': 3.516e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326069, 'epoch': 0.62, 'global_step/max_steps': '285/462', 'percentage': '61.69%', 'elapsed_time': '14m 33s', 'remaining_time': '9m 2s'}


Train:  63%|██████▎   | 290/462 [14:47<08:15,  2.88s/it]

{'loss': 0.41882849, 'acc': 0.87370663, 'grad_norm': 0.83813149, 'learning_rate': 3.346e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326353, 'epoch': 0.63, 'global_step/max_steps': '290/462', 'percentage': '62.77%', 'elapsed_time': '14m 47s', 'remaining_time': '8m 46s'}


Train:  64%|██████▍   | 295/462 [15:02<07:57,  2.86s/it]

{'loss': 0.42070322, 'acc': 0.88504257, 'grad_norm': 0.79871434, 'learning_rate': 3.178e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326612, 'epoch': 0.64, 'global_step/max_steps': '295/462', 'percentage': '63.85%', 'elapsed_time': '15m 2s', 'remaining_time': '8m 30s'}


Train:  65%|██████▍   | 300/462 [15:16<07:36,  2.82s/it]

{'loss': 0.45554323, 'acc': 0.86995192, 'grad_norm': 0.83433414, 'learning_rate': 3.012e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326953, 'epoch': 0.65, 'global_step/max_steps': '300/462', 'percentage': '64.94%', 'elapsed_time': '15m 16s', 'remaining_time': '8m 14s'}



Val: 100%|██████████| 74/74 [00:04<00:00, 14.93it/s]/it]


{'eval_loss': 0.44948393, 'eval_acc': 0.86410756, 'eval_runtime': 3.5634, 'eval_samples_per_second': 20.767, 'eval_steps_per_second': 20.767, 'epoch': 0.65, 'global_step/max_steps': '300/462', 'percentage': '64.94%', 'elapsed_time': '15m 21s', 'remaining_time': '8m 17s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-300
Train:  66%|██████▌   | 305/462 [15:37<08:45,  3.35s/it]

{'loss': 0.45805297, 'acc': 0.87442236, 'grad_norm': 0.80275017, 'learning_rate': 2.849e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.324743, 'epoch': 0.66, 'global_step/max_steps': '305/462', 'percentage': '66.02%', 'elapsed_time': '15m 37s', 'remaining_time': '8m 2s'}


Train:  67%|██████▋   | 310/462 [15:52<07:21,  2.90s/it]

{'loss': 0.39276385, 'acc': 0.88942699, 'grad_norm': 0.79138041, 'learning_rate': 2.689e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.325167, 'epoch': 0.67, 'global_step/max_steps': '310/462', 'percentage': '67.10%', 'elapsed_time': '15m 52s', 'remaining_time': '7m 46s'}


Train:  68%|██████▊   | 315/462 [16:06<06:55,  2.83s/it]

{'loss': 0.44892654, 'acc': 0.87609653, 'grad_norm': 0.77245605, 'learning_rate': 2.531e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.325574, 'epoch': 0.68, 'global_step/max_steps': '315/462', 'percentage': '68.18%', 'elapsed_time': '16m 6s', 'remaining_time': '7m 30s'}


Train:  69%|██████▉   | 320/462 [16:20<06:46,  2.86s/it]

{'loss': 0.45102978, 'acc': 0.87379656, 'grad_norm': 0.72597778, 'learning_rate': 2.377e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.32589, 'epoch': 0.69, 'global_step/max_steps': '320/462', 'percentage': '69.26%', 'elapsed_time': '16m 20s', 'remaining_time': '7m 15s'}


Train:  70%|███████   | 325/462 [16:35<06:32,  2.86s/it]

{'loss': 0.42847252, 'acc': 0.87621737, 'grad_norm': 1.03951418, 'learning_rate': 2.226e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326186, 'epoch': 0.7, 'global_step/max_steps': '325/462', 'percentage': '70.35%', 'elapsed_time': '16m 35s', 'remaining_time': '6m 59s'}


Train:  71%|███████▏  | 330/462 [16:49<06:15,  2.84s/it]

{'loss': 0.44293671, 'acc': 0.87288494, 'grad_norm': 0.89350039, 'learning_rate': 2.079e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326519, 'epoch': 0.71, 'global_step/max_steps': '330/462', 'percentage': '71.43%', 'elapsed_time': '16m 49s', 'remaining_time': '6m 43s'}


Train:  73%|███████▎  | 335/462 [17:03<06:01,  2.85s/it]

{'loss': 0.4648181, 'acc': 0.86981449, 'grad_norm': 0.93625551, 'learning_rate': 1.935e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.326829, 'epoch': 0.72, 'global_step/max_steps': '335/462', 'percentage': '72.51%', 'elapsed_time': '17m 3s', 'remaining_time': '6m 28s'}


Train:  74%|███████▎  | 340/462 [17:17<05:41,  2.80s/it]

{'loss': 0.38564885, 'acc': 0.8882205, 'grad_norm': 0.79711336, 'learning_rate': 1.795e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.32722, 'epoch': 0.74, 'global_step/max_steps': '340/462', 'percentage': '73.59%', 'elapsed_time': '17m 17s', 'remaining_time': '6m 12s'}


Train:  75%|███████▍  | 345/462 [17:32<05:29,  2.82s/it]

{'loss': 0.45106816, 'acc': 0.8776475, 'grad_norm': 0.93513364, 'learning_rate': 1.66e-05, 'memory(GiB)': 5.99, 'train_speed(iter/s)': 0.327555, 'epoch': 0.75, 'global_step/max_steps': '345/462', 'percentage': '74.68%', 'elapsed_time': '17m 31s', 'remaining_time': '5m 56s'}


Train:  76%|███████▌  | 350/462 [17:46<05:24,  2.90s/it]

{'loss': 0.41372476, 'acc': 0.87605734, 'grad_norm': 0.8170594, 'learning_rate': 1.528e-05, 'memory(GiB)': 7.04, 'train_speed(iter/s)': 0.327749, 'epoch': 0.76, 'global_step/max_steps': '350/462', 'percentage': '75.76%', 'elapsed_time': '17m 46s', 'remaining_time': '5m 41s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.31it/s]/it]


{'eval_loss': 0.44942757, 'eval_acc': 0.86441548, 'eval_runtime': 3.6862, 'eval_samples_per_second': 20.075, 'eval_steps_per_second': 20.075, 'epoch': 0.76, 'global_step/max_steps': '350/462', 'percentage': '75.76%', 'elapsed_time': '17m 50s', 'remaining_time': '5m 42s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-350
Train:  77%|███████▋  | 355/462 [18:07<06:17,  3.53s/it]

{'loss': 0.42562232, 'acc': 0.878335, 'grad_norm': 0.8670544, 'learning_rate': 1.402e-05, 'memory(GiB)': 7.04, 'train_speed(iter/s)': 0.32614, 'epoch': 0.77, 'global_step/max_steps': '355/462', 'percentage': '76.84%', 'elapsed_time': '18m 7s', 'remaining_time': '5m 27s'}


Train:  78%|███████▊  | 360/462 [18:23<05:24,  3.18s/it]

{'loss': 0.45272636, 'acc': 0.86888514, 'grad_norm': 0.86007923, 'learning_rate': 1.279e-05, 'memory(GiB)': 7.04, 'train_speed(iter/s)': 0.32592, 'epoch': 0.78, 'global_step/max_steps': '360/462', 'percentage': '77.92%', 'elapsed_time': '18m 23s', 'remaining_time': '5m 12s'}


Train:  79%|███████▉  | 365/462 [18:37<04:35,  2.84s/it]

{'loss': 0.47216077, 'acc': 0.86881132, 'grad_norm': 0.84929258, 'learning_rate': 1.162e-05, 'memory(GiB)': 7.04, 'train_speed(iter/s)': 0.326319, 'epoch': 0.79, 'global_step/max_steps': '365/462', 'percentage': '79.00%', 'elapsed_time': '18m 37s', 'remaining_time': '4m 56s'}


Train:  80%|████████  | 370/462 [18:51<04:15,  2.78s/it]

{'loss': 0.39788301, 'acc': 0.88545036, 'grad_norm': 0.85750693, 'learning_rate': 1.05e-05, 'memory(GiB)': 7.04, 'train_speed(iter/s)': 0.32671, 'epoch': 0.8, 'global_step/max_steps': '370/462', 'percentage': '80.09%', 'elapsed_time': '18m 51s', 'remaining_time': '4m 41s'}


Train:  81%|████████  | 375/462 [19:05<04:01,  2.77s/it]

{'loss': 0.41152182, 'acc': 0.88407459, 'grad_norm': 0.77784276, 'learning_rate': 9.42e-06, 'memory(GiB)': 7.04, 'train_speed(iter/s)': 0.327089, 'epoch': 0.81, 'global_step/max_steps': '375/462', 'percentage': '81.17%', 'elapsed_time': '19m 5s', 'remaining_time': '4m 25s'}


Train:  82%|████████▏ | 380/462 [19:19<03:47,  2.77s/it]

{'loss': 0.41543274, 'acc': 0.88494864, 'grad_norm': 0.76440758, 'learning_rate': 8.4e-06, 'memory(GiB)': 7.04, 'train_speed(iter/s)': 0.327478, 'epoch': 0.82, 'global_step/max_steps': '380/462', 'percentage': '82.25%', 'elapsed_time': '19m 19s', 'remaining_time': '4m 10s'}


Train:  83%|████████▎ | 385/462 [19:33<03:33,  2.77s/it]

{'loss': 0.41569796, 'acc': 0.88099747, 'grad_norm': 0.88810503, 'learning_rate': 7.43e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.327851, 'epoch': 0.83, 'global_step/max_steps': '385/462', 'percentage': '83.33%', 'elapsed_time': '19m 33s', 'remaining_time': '3m 54s'}


Train:  84%|████████▍ | 390/462 [19:47<03:19,  2.77s/it]

{'loss': 0.43997359, 'acc': 0.87218256, 'grad_norm': 0.82061017, 'learning_rate': 6.52e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.328179, 'epoch': 0.84, 'global_step/max_steps': '390/462', 'percentage': '84.42%', 'elapsed_time': '19m 47s', 'remaining_time': '3m 39s'}


Train:  85%|████████▌ | 395/462 [20:01<03:06,  2.78s/it]

{'loss': 0.41263394, 'acc': 0.87784357, 'grad_norm': 0.66379392, 'learning_rate': 5.66e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.328499, 'epoch': 0.85, 'global_step/max_steps': '395/462', 'percentage': '85.50%', 'elapsed_time': '20m 1s', 'remaining_time': '3m 23s'}


Train:  87%|████████▋ | 400/462 [20:15<02:51,  2.77s/it]

{'loss': 0.43517132, 'acc': 0.87868872, 'grad_norm': 0.7541675, 'learning_rate': 4.86e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.32884, 'epoch': 0.86, 'global_step/max_steps': '400/462', 'percentage': '86.58%', 'elapsed_time': '20m 15s', 'remaining_time': '3m 8s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.55it/s]/it]


{'eval_loss': 0.44902447, 'eval_acc': 0.8650313, 'eval_runtime': 3.5999, 'eval_samples_per_second': 20.556, 'eval_steps_per_second': 20.556, 'epoch': 0.86, 'global_step/max_steps': '400/462', 'percentage': '86.58%', 'elapsed_time': '20m 18s', 'remaining_time': '3m 8s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-400
Train:  88%|████████▊ | 405/462 [20:33<02:56,  3.10s/it]

{'loss': 0.42072554, 'acc': 0.88055868, 'grad_norm': 0.81293446, 'learning_rate': 4.12e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.327954, 'epoch': 0.88, 'global_step/max_steps': '405/462', 'percentage': '87.66%', 'elapsed_time': '20m 33s', 'remaining_time': '2m 53s'}


Train:  89%|████████▊ | 410/462 [20:47<02:28,  2.85s/it]

{'loss': 0.43501506, 'acc': 0.87733364, 'grad_norm': 0.84120256, 'learning_rate': 3.44e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.328252, 'epoch': 0.89, 'global_step/max_steps': '410/462', 'percentage': '88.74%', 'elapsed_time': '20m 47s', 'remaining_time': '2m 38s'}


Train:  90%|████████▉ | 415/462 [21:01<02:11,  2.80s/it]

{'loss': 0.44037571, 'acc': 0.87648678, 'grad_norm': 0.88391101, 'learning_rate': 2.81e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.328541, 'epoch': 0.9, 'global_step/max_steps': '415/462', 'percentage': '89.83%', 'elapsed_time': '21m 1s', 'remaining_time': '2m 22s'}


Train:  91%|█████████ | 420/462 [21:15<01:57,  2.80s/it]

{'loss': 0.41836319, 'acc': 0.88281155, 'grad_norm': 0.82025713, 'learning_rate': 2.25e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.328836, 'epoch': 0.91, 'global_step/max_steps': '420/462', 'percentage': '90.91%', 'elapsed_time': '21m 15s', 'remaining_time': '2m 7s'}


Train:  92%|█████████▏| 425/462 [21:29<01:42,  2.78s/it]

{'loss': 0.44686894, 'acc': 0.87442207, 'grad_norm': 0.9092629, 'learning_rate': 1.75e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.329156, 'epoch': 0.92, 'global_step/max_steps': '425/462', 'percentage': '91.99%', 'elapsed_time': '21m 29s', 'remaining_time': '1m 52s'}


Train:  93%|█████████▎| 430/462 [21:43<01:28,  2.77s/it]

{'loss': 0.49413586, 'acc': 0.86436834, 'grad_norm': 0.88411641, 'learning_rate': 1.31e-06, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.329455, 'epoch': 0.93, 'global_step/max_steps': '430/462', 'percentage': '93.07%', 'elapsed_time': '21m 43s', 'remaining_time': '1m 37s'}


Train:  94%|█████████▍| 435/462 [21:57<01:15,  2.80s/it]

{'loss': 0.44494367, 'acc': 0.86962299, 'grad_norm': 0.82634658, 'learning_rate': 9.3e-07, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.329735, 'epoch': 0.94, 'global_step/max_steps': '435/462', 'percentage': '94.16%', 'elapsed_time': '21m 57s', 'remaining_time': '1m 21s'}


Train:  95%|█████████▌| 440/462 [22:12<01:01,  2.79s/it]

{'loss': 0.48167076, 'acc': 0.86775751, 'grad_norm': 0.89959383, 'learning_rate': 6.2e-07, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.330019, 'epoch': 0.95, 'global_step/max_steps': '440/462', 'percentage': '95.24%', 'elapsed_time': '22m 11s', 'remaining_time': '1m 6s'}


Train:  96%|█████████▋| 445/462 [22:26<00:47,  2.78s/it]

{'loss': 0.42936602, 'acc': 0.87807789, 'grad_norm': 0.87200803, 'learning_rate': 3.7e-07, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.330291, 'epoch': 0.96, 'global_step/max_steps': '445/462', 'percentage': '96.32%', 'elapsed_time': '22m 26s', 'remaining_time': '51s'}


Train:  97%|█████████▋| 450/462 [22:40<00:33,  2.78s/it]

{'loss': 0.4344192, 'acc': 0.88379622, 'grad_norm': 0.79209602, 'learning_rate': 1.9e-07, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.330564, 'epoch': 0.97, 'global_step/max_steps': '450/462', 'percentage': '97.40%', 'elapsed_time': '22m 40s', 'remaining_time': '36s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.51it/s]/it]


{'eval_loss': 0.44871092, 'eval_acc': 0.86544186, 'eval_runtime': 3.5688, 'eval_samples_per_second': 20.735, 'eval_steps_per_second': 20.735, 'epoch': 0.97, 'global_step/max_steps': '450/462', 'percentage': '97.40%', 'elapsed_time': '22m 43s', 'remaining_time': '36s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-450
Train:  98%|█████████▊| 455/462 [22:58<00:21,  3.11s/it]

{'loss': 0.42834392, 'acc': 0.8773242, 'grad_norm': 0.83610994, 'learning_rate': 6e-08, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.329722, 'epoch': 0.98, 'global_step/max_steps': '455/462', 'percentage': '98.48%', 'elapsed_time': '22m 58s', 'remaining_time': '21s'}


Train: 100%|█████████▉| 460/462 [23:12<00:05,  2.86s/it]

{'loss': 0.44826298, 'acc': 0.87240105, 'grad_norm': 0.83181483, 'learning_rate': 1e-08, 'memory(GiB)': 8.12, 'train_speed(iter/s)': 0.329931, 'epoch': 0.99, 'global_step/max_steps': '460/462', 'percentage': '99.57%', 'elapsed_time': '23m 12s', 'remaining_time': '6s'}


Train: 100%|██████████| 462/462 [23:18<00:00,  2.84s/it]
Val: 100%|██████████| 74/74 [00:03<00:00, 20.96it/s]/it]


{'eval_loss': 0.44809243, 'eval_acc': 0.86513394, 'eval_runtime': 3.541, 'eval_samples_per_second': 20.898, 'eval_steps_per_second': 20.898, 'epoch': 1.0, 'global_step/max_steps': '462/462', 'percentage': '100.00%', 'elapsed_time': '23m 22s', 'remaining_time': '0s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-462
Train: 100%|██████████| 462/462 [23:23<00:00,  3.04s/it]
[INFO:swift] last_model_checkpoint: /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-462
[INFO:swift] best_model_checkpoint: /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/checkpoint-462
[INFO:swift] images_dir: /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b/v0-20241209-200154/images


{'train_runtime': 1403.3955, 'train_samples_per_second': 5.272, 'train_steps_per_second': 0.329, 'train_loss': 0.45906963, 'epoch': 1.0, 'global_step/max_steps': '462/462', 'percentage': '100.00%', 'elapsed_time': '23m 23s', 'remaining_time': '0s'}


[INFO:swift] End time of running main: 2024-12-09 20:25:58.669333


In [None]:
model_type = ModelType.qwen2_5_0_5b_instruct
sft_args = SftArguments(
    model_type=model_type,
    dataset=['gsm8k-train.jsonl'],
    output_dir='/root/autodl-tmp/checkpoint/math_output/lora',
    max_length=4096)
result = sft_main(sft_args)

[INFO:swift] Setting template_type: qwen2_5
[INFO:swift] Setting args.lazy_tokenize: False
[INFO:swift] Setting args.dataloader_num_workers: 1
[INFO:swift] output_dir: /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429
[INFO:swift] Start time of running main: 2024-11-25 23:24:29.426308
[INFO:swift] args: SftArguments(model_type='qwen2_5-0_5b-instruct', model_id_or_path='qwen/Qwen2.5-0.5B-Instruct', model_revision='master', full_determinism=False, sft_type='lora', freeze_parameters=[], freeze_vit=False, freeze_parameters_ratio=0.0, additional_trainable_parameters=[], tuner_backend='peft', template_type='qwen2_5', output_dir='/root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429', add_output_dir_suffix=True, ddp_backend=None, ddp_find_unused_parameters=None, ddp_broadcast_buffers=None, ddp_timeout=1800, seed=42, resume_from_checkpoint=None, resume_only_model=False, ignore_data_skip=False, dtype='bf16', packing=False, train_backend='t

device_count: 1
rank: -1, local_rank: -1, world_size: 1, local_world_size: 1


[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct
[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
[INFO:swift] model.max_model_len: 32768
[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
[INFO:swift] model_config: Qwen2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/root/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "

{'loss': 0.61197513, 'acc': 0.85235035, 'grad_norm': 3.15869617, 'learning_rate': 4.17e-06, 'memory(GiB)': 3.21, 'train_speed(iter/s)': 0.202167, 'epoch': 0.0, 'global_step/max_steps': '1/462', 'percentage': '0.22%', 'elapsed_time': '4s', 'remaining_time': '35m 29s'}


Train:   1%|          | 5/462 [00:15<22:23,  2.94s/it]

{'loss': 0.68376583, 'acc': 0.82469958, 'grad_norm': 2.84423923, 'learning_rate': 2.083e-05, 'memory(GiB)': 6.4, 'train_speed(iter/s)': 0.307612, 'epoch': 0.01, 'global_step/max_steps': '5/462', 'percentage': '1.08%', 'elapsed_time': '15s', 'remaining_time': '24m 13s'}


Train:   2%|▏         | 10/462 [00:29<21:07,  2.80s/it]

{'loss': 0.65204229, 'acc': 0.83305178, 'grad_norm': 2.19942641, 'learning_rate': 4.167e-05, 'memory(GiB)': 8.23, 'train_speed(iter/s)': 0.330452, 'epoch': 0.02, 'global_step/max_steps': '10/462', 'percentage': '2.16%', 'elapsed_time': '29s', 'remaining_time': '22m 32s'}


Train:   3%|▎         | 15/462 [00:43<20:44,  2.78s/it]

{'loss': 0.61694822, 'acc': 0.84145708, 'grad_norm': 2.2097621, 'learning_rate': 6.25e-05, 'memory(GiB)': 8.23, 'train_speed(iter/s)': 0.339328, 'epoch': 0.03, 'global_step/max_steps': '15/462', 'percentage': '3.25%', 'elapsed_time': '43s', 'remaining_time': '21m 47s'}


Train:   4%|▍         | 20/462 [00:57<20:18,  2.76s/it]

{'loss': 0.58666663, 'acc': 0.8430233, 'grad_norm': 1.22232699, 'learning_rate': 8.333e-05, 'memory(GiB)': 8.23, 'train_speed(iter/s)': 0.344239, 'epoch': 0.04, 'global_step/max_steps': '20/462', 'percentage': '4.33%', 'elapsed_time': '57s', 'remaining_time': '21m 16s'}


Train:   5%|▌         | 25/462 [01:11<20:08,  2.77s/it]

{'loss': 0.53799725, 'acc': 0.85153818, 'grad_norm': 0.98092151, 'learning_rate': 0.0001, 'memory(GiB)': 8.23, 'train_speed(iter/s)': 0.346866, 'epoch': 0.05, 'global_step/max_steps': '25/462', 'percentage': '5.41%', 'elapsed_time': '1m 11s', 'remaining_time': '20m 53s'}


Train:   6%|▋         | 30/462 [01:25<20:10,  2.80s/it]

{'loss': 0.54924064, 'acc': 0.84976196, 'grad_norm': 1.02524292, 'learning_rate': 9.995e-05, 'memory(GiB)': 8.56, 'train_speed(iter/s)': 0.348136, 'epoch': 0.06, 'global_step/max_steps': '30/462', 'percentage': '6.49%', 'elapsed_time': '1m 25s', 'remaining_time': '20m 37s'}


Train:   8%|▊         | 35/462 [01:39<19:50,  2.79s/it]

{'loss': 0.47256923, 'acc': 0.86476908, 'grad_norm': 0.80619228, 'learning_rate': 9.984e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.34919, 'epoch': 0.08, 'global_step/max_steps': '35/462', 'percentage': '7.58%', 'elapsed_time': '1m 39s', 'remaining_time': '20m 18s'}


Train:   9%|▊         | 40/462 [01:53<19:33,  2.78s/it]

{'loss': 0.44885225, 'acc': 0.87634258, 'grad_norm': 0.97581089, 'learning_rate': 9.967e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.350191, 'epoch': 0.09, 'global_step/max_steps': '40/462', 'percentage': '8.66%', 'elapsed_time': '1m 53s', 'remaining_time': '20m 1s'}


Train:  10%|▉         | 45/462 [02:08<19:26,  2.80s/it]

{'loss': 0.47702298, 'acc': 0.86619434, 'grad_norm': 1.0663271, 'learning_rate': 9.943e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.350685, 'epoch': 0.1, 'global_step/max_steps': '45/462', 'percentage': '9.74%', 'elapsed_time': '2m 7s', 'remaining_time': '19m 45s'}


Train:  11%|█         | 50/462 [02:21<19:06,  2.78s/it]

{'loss': 0.45771489, 'acc': 0.87052937, 'grad_norm': 0.91966009, 'learning_rate': 9.913e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.351409, 'epoch': 0.11, 'global_step/max_steps': '50/462', 'percentage': '10.82%', 'elapsed_time': '2m 21s', 'remaining_time': '19m 29s'}



Val: 100%|██████████| 74/74 [00:04<00:00, 18.20it/s]it]


{'eval_loss': 0.47185791, 'eval_acc': 0.86164426, 'eval_runtime': 4.0512, 'eval_samples_per_second': 18.266, 'eval_steps_per_second': 18.266, 'epoch': 0.11, 'global_step/max_steps': '50/462', 'percentage': '10.82%', 'elapsed_time': '2m 26s', 'remaining_time': '20m 4s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-50
Train:  12%|█▏        | 55/462 [02:41<21:32,  3.18s/it]

{'loss': 0.46087322, 'acc': 0.87051802, 'grad_norm': 1.02117682, 'learning_rate': 9.877e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.340924, 'epoch': 0.12, 'global_step/max_steps': '55/462', 'percentage': '11.90%', 'elapsed_time': '2m 40s', 'remaining_time': '19m 51s'}


Train:  13%|█▎        | 60/462 [02:55<19:00,  2.84s/it]

{'loss': 0.46935225, 'acc': 0.86795778, 'grad_norm': 0.9024871, 'learning_rate': 9.834e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.34226, 'epoch': 0.13, 'global_step/max_steps': '60/462', 'percentage': '12.99%', 'elapsed_time': '2m 55s', 'remaining_time': '19m 32s'}


Train:  14%|█▍        | 65/462 [03:09<18:26,  2.79s/it]

{'loss': 0.44819369, 'acc': 0.87220449, 'grad_norm': 0.94080293, 'learning_rate': 9.785e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.343261, 'epoch': 0.14, 'global_step/max_steps': '65/462', 'percentage': '14.07%', 'elapsed_time': '3m 9s', 'remaining_time': '19m 14s'}


Train:  15%|█▌        | 70/462 [03:23<18:34,  2.84s/it]

{'loss': 0.45091658, 'acc': 0.87489376, 'grad_norm': 0.76448703, 'learning_rate': 9.73e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.343683, 'epoch': 0.15, 'global_step/max_steps': '70/462', 'percentage': '15.15%', 'elapsed_time': '3m 23s', 'remaining_time': '18m 58s'}


Train:  16%|█▌        | 75/462 [03:37<18:14,  2.83s/it]

{'loss': 0.44608216, 'acc': 0.8660264, 'grad_norm': 0.92078537, 'learning_rate': 9.669e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.344105, 'epoch': 0.16, 'global_step/max_steps': '75/462', 'percentage': '16.23%', 'elapsed_time': '3m 37s', 'remaining_time': '18m 42s'}


Train:  17%|█▋        | 80/462 [03:51<17:52,  2.81s/it]

{'loss': 0.44821129, 'acc': 0.87080097, 'grad_norm': 0.94582897, 'learning_rate': 9.602e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.344694, 'epoch': 0.17, 'global_step/max_steps': '80/462', 'percentage': '17.32%', 'elapsed_time': '3m 51s', 'remaining_time': '18m 26s'}


Train:  18%|█▊        | 85/462 [04:05<17:36,  2.80s/it]

{'loss': 0.47013569, 'acc': 0.86901646, 'grad_norm': 0.74878061, 'learning_rate': 9.529e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.345246, 'epoch': 0.18, 'global_step/max_steps': '85/462', 'percentage': '18.40%', 'elapsed_time': '4m 5s', 'remaining_time': '18m 10s'}


Train:  19%|█▉        | 90/462 [04:19<17:17,  2.79s/it]

{'loss': 0.43488936, 'acc': 0.86879435, 'grad_norm': 0.84726042, 'learning_rate': 9.45e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.345882, 'epoch': 0.19, 'global_step/max_steps': '90/462', 'percentage': '19.48%', 'elapsed_time': '4m 19s', 'remaining_time': '17m 54s'}


Train:  21%|██        | 95/462 [04:33<16:59,  2.78s/it]

{'loss': 0.46490049, 'acc': 0.87071428, 'grad_norm': 0.93968445, 'learning_rate': 9.366e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.346499, 'epoch': 0.21, 'global_step/max_steps': '95/462', 'percentage': '20.56%', 'elapsed_time': '4m 33s', 'remaining_time': '17m 37s'}


Train:  22%|██▏       | 100/462 [04:47<16:43,  2.77s/it]

{'loss': 0.41392817, 'acc': 0.87934923, 'grad_norm': 0.89749056, 'learning_rate': 9.275e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.347046, 'epoch': 0.22, 'global_step/max_steps': '100/462', 'percentage': '21.65%', 'elapsed_time': '4m 47s', 'remaining_time': '17m 21s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 21.14it/s]/it]


{'eval_loss': 0.4542239, 'eval_acc': 0.86626296, 'eval_runtime': 3.5, 'eval_samples_per_second': 21.143, 'eval_steps_per_second': 21.143, 'epoch': 0.22, 'global_step/max_steps': '100/462', 'percentage': '21.65%', 'elapsed_time': '4m 51s', 'remaining_time': '17m 35s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-100
Train:  23%|██▎       | 105/462 [05:06<18:14,  3.07s/it]

{'loss': 0.42202902, 'acc': 0.88121691, 'grad_norm': 0.84174085, 'learning_rate': 9.18e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.3427, 'epoch': 0.23, 'global_step/max_steps': '105/462', 'percentage': '22.73%', 'elapsed_time': '5m 6s', 'remaining_time': '17m 20s'}


Train:  24%|██▍       | 110/462 [05:20<16:41,  2.85s/it]

{'loss': 0.45957813, 'acc': 0.87050238, 'grad_norm': 0.82862681, 'learning_rate': 9.079e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.343285, 'epoch': 0.24, 'global_step/max_steps': '110/462', 'percentage': '23.81%', 'elapsed_time': '5m 20s', 'remaining_time': '17m 4s'}


Train:  25%|██▍       | 115/462 [05:34<16:31,  2.86s/it]

{'loss': 0.4728725, 'acc': 0.86399555, 'grad_norm': 0.89626223, 'learning_rate': 8.972e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.343477, 'epoch': 0.25, 'global_step/max_steps': '115/462', 'percentage': '24.89%', 'elapsed_time': '5m 34s', 'remaining_time': '16m 49s'}


Train:  26%|██▌       | 120/462 [05:48<15:59,  2.80s/it]

{'loss': 0.44849319, 'acc': 0.8743638, 'grad_norm': 0.8927837, 'learning_rate': 8.861e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.343881, 'epoch': 0.26, 'global_step/max_steps': '120/462', 'percentage': '25.97%', 'elapsed_time': '5m 48s', 'remaining_time': '16m 33s'}


Train:  27%|██▋       | 125/462 [06:02<15:50,  2.82s/it]

{'loss': 0.48100319, 'acc': 0.86610785, 'grad_norm': 0.90798312, 'learning_rate': 8.744e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.344264, 'epoch': 0.27, 'global_step/max_steps': '125/462', 'percentage': '27.06%', 'elapsed_time': '6m 2s', 'remaining_time': '16m 18s'}


Train:  28%|██▊       | 130/462 [06:16<15:26,  2.79s/it]

{'loss': 0.39482653, 'acc': 0.8937439, 'grad_norm': 0.81374145, 'learning_rate': 8.623e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.344776, 'epoch': 0.28, 'global_step/max_steps': '130/462', 'percentage': '28.14%', 'elapsed_time': '6m 16s', 'remaining_time': '16m 2s'}


Train:  29%|██▉       | 135/462 [06:30<15:06,  2.77s/it]

{'loss': 0.44934692, 'acc': 0.87189255, 'grad_norm': 1.0380441, 'learning_rate': 8.497e-05, 'memory(GiB)': 4.29, 'train_speed(iter/s)': 0.345276, 'epoch': 0.29, 'global_step/max_steps': '135/462', 'percentage': '29.22%', 'elapsed_time': '6m 30s', 'remaining_time': '15m 46s'}


Train:  30%|███       | 140/462 [06:44<14:56,  2.78s/it]

{'loss': 0.44342308, 'acc': 0.87749319, 'grad_norm': 0.76591295, 'learning_rate': 8.367e-05, 'memory(GiB)': 4.29, 'train_speed(iter/s)': 0.345643, 'epoch': 0.3, 'global_step/max_steps': '140/462', 'percentage': '30.30%', 'elapsed_time': '6m 44s', 'remaining_time': '15m 30s'}


Train:  31%|███▏      | 145/462 [06:58<14:37,  2.77s/it]

{'loss': 0.45470243, 'acc': 0.86910038, 'grad_norm': 0.97608829, 'learning_rate': 8.232e-05, 'memory(GiB)': 4.29, 'train_speed(iter/s)': 0.346024, 'epoch': 0.31, 'global_step/max_steps': '145/462', 'percentage': '31.39%', 'elapsed_time': '6m 58s', 'remaining_time': '15m 15s'}


Train:  32%|███▏      | 150/462 [07:12<14:21,  2.76s/it]

{'loss': 0.4540978, 'acc': 0.86919689, 'grad_norm': 0.9767282, 'learning_rate': 8.093e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.346508, 'epoch': 0.32, 'global_step/max_steps': '150/462', 'percentage': '32.47%', 'elapsed_time': '7m 12s', 'remaining_time': '14m 59s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.55it/s]/it]


{'eval_loss': 0.44738784, 'eval_acc': 0.86698142, 'eval_runtime': 3.5597, 'eval_samples_per_second': 20.788, 'eval_steps_per_second': 20.788, 'epoch': 0.32, 'global_step/max_steps': '150/462', 'percentage': '32.47%', 'elapsed_time': '7m 16s', 'remaining_time': '15m 7s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-150
Train:  34%|███▎      | 155/462 [07:31<15:48,  3.09s/it]

{'loss': 0.39690335, 'acc': 0.8837882, 'grad_norm': 0.7612046, 'learning_rate': 7.951e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.343433, 'epoch': 0.34, 'global_step/max_steps': '155/462', 'percentage': '33.55%', 'elapsed_time': '7m 30s', 'remaining_time': '14m 53s'}


Train:  35%|███▍      | 160/462 [07:45<14:20,  2.85s/it]

{'loss': 0.4511507, 'acc': 0.87748556, 'grad_norm': 0.88686621, 'learning_rate': 7.804e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.343753, 'epoch': 0.35, 'global_step/max_steps': '160/462', 'percentage': '34.63%', 'elapsed_time': '7m 45s', 'remaining_time': '14m 37s'}


Train:  36%|███▌      | 165/462 [07:59<13:53,  2.81s/it]

{'loss': 0.44010935, 'acc': 0.87279558, 'grad_norm': 0.81993908, 'learning_rate': 7.654e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.344062, 'epoch': 0.36, 'global_step/max_steps': '165/462', 'percentage': '35.71%', 'elapsed_time': '7m 59s', 'remaining_time': '14m 22s'}


Train:  37%|███▋      | 170/462 [08:13<13:39,  2.81s/it]

{'loss': 0.43185329, 'acc': 0.87980089, 'grad_norm': 0.79096407, 'learning_rate': 7.5e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.34434, 'epoch': 0.37, 'global_step/max_steps': '170/462', 'percentage': '36.80%', 'elapsed_time': '8m 13s', 'remaining_time': '14m 7s'}


Train:  38%|███▊      | 175/462 [08:27<13:14,  2.77s/it]

{'loss': 0.48183436, 'acc': 0.86629438, 'grad_norm': 0.91397375, 'learning_rate': 7.343e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.344763, 'epoch': 0.38, 'global_step/max_steps': '175/462', 'percentage': '37.88%', 'elapsed_time': '8m 27s', 'remaining_time': '13m 51s'}


Train:  39%|███▉      | 180/462 [08:41<13:18,  2.83s/it]

{'loss': 0.47169733, 'acc': 0.86861687, 'grad_norm': 0.93128216, 'learning_rate': 7.183e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.344975, 'epoch': 0.39, 'global_step/max_steps': '180/462', 'percentage': '38.96%', 'elapsed_time': '8m 41s', 'remaining_time': '13m 36s'}


Train:  40%|████      | 185/462 [08:55<12:54,  2.80s/it]

{'loss': 0.47367945, 'acc': 0.87004833, 'grad_norm': 0.84356594, 'learning_rate': 7.021e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.345246, 'epoch': 0.4, 'global_step/max_steps': '185/462', 'percentage': '40.04%', 'elapsed_time': '8m 55s', 'remaining_time': '13m 21s'}


Train:  41%|████      | 190/462 [09:09<12:39,  2.79s/it]

{'loss': 0.40551424, 'acc': 0.88912659, 'grad_norm': 0.9212898, 'learning_rate': 6.855e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.345564, 'epoch': 0.41, 'global_step/max_steps': '190/462', 'percentage': '41.13%', 'elapsed_time': '9m 9s', 'remaining_time': '13m 6s'}


Train:  42%|████▏     | 195/462 [09:23<12:15,  2.76s/it]

{'loss': 0.41866422, 'acc': 0.87702904, 'grad_norm': 0.83646154, 'learning_rate': 6.688e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.345945, 'epoch': 0.42, 'global_step/max_steps': '195/462', 'percentage': '42.21%', 'elapsed_time': '9m 23s', 'remaining_time': '12m 51s'}


Train:  43%|████▎     | 200/462 [09:37<12:03,  2.76s/it]

{'loss': 0.42139664, 'acc': 0.88415709, 'grad_norm': 1.05838919, 'learning_rate': 6.518e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.346271, 'epoch': 0.43, 'global_step/max_steps': '200/462', 'percentage': '43.29%', 'elapsed_time': '9m 37s', 'remaining_time': '12m 36s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 21.31it/s]/it]


{'eval_loss': 0.44340181, 'eval_acc': 0.86872627, 'eval_runtime': 3.4747, 'eval_samples_per_second': 21.297, 'eval_steps_per_second': 21.297, 'epoch': 0.43, 'global_step/max_steps': '200/462', 'percentage': '43.29%', 'elapsed_time': '9m 40s', 'remaining_time': '12m 40s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-200
Train:  44%|████▍     | 205/462 [09:55<13:17,  3.10s/it]

{'loss': 0.42414532, 'acc': 0.87762299, 'grad_norm': 0.8267619, 'learning_rate': 6.346e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.343862, 'epoch': 0.44, 'global_step/max_steps': '205/462', 'percentage': '44.37%', 'elapsed_time': '9m 55s', 'remaining_time': '12m 26s'}


Train:  45%|████▌     | 210/462 [10:09<11:56,  2.84s/it]

{'loss': 0.43905234, 'acc': 0.87239313, 'grad_norm': 0.88912994, 'learning_rate': 6.172e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.344119, 'epoch': 0.45, 'global_step/max_steps': '210/462', 'percentage': '45.45%', 'elapsed_time': '10m 9s', 'remaining_time': '12m 11s'}


Train:  47%|████▋     | 215/462 [10:24<11:38,  2.83s/it]

{'loss': 0.42614274, 'acc': 0.87787828, 'grad_norm': 0.8290385, 'learning_rate': 5.997e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.344339, 'epoch': 0.46, 'global_step/max_steps': '215/462', 'percentage': '46.54%', 'elapsed_time': '10m 24s', 'remaining_time': '11m 56s'}


Train:  48%|████▊     | 220/462 [10:38<11:12,  2.78s/it]

{'loss': 0.41509504, 'acc': 0.88102503, 'grad_norm': 0.87730086, 'learning_rate': 5.821e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.344622, 'epoch': 0.48, 'global_step/max_steps': '220/462', 'percentage': '47.62%', 'elapsed_time': '10m 38s', 'remaining_time': '11m 41s'}


Train:  49%|████▊     | 225/462 [10:52<11:01,  2.79s/it]

{'loss': 0.45513611, 'acc': 0.87567902, 'grad_norm': 0.68040472, 'learning_rate': 5.644e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.344855, 'epoch': 0.49, 'global_step/max_steps': '225/462', 'percentage': '48.70%', 'elapsed_time': '10m 52s', 'remaining_time': '11m 26s'}


Train:  50%|████▉     | 230/462 [11:06<10:49,  2.80s/it]

{'loss': 0.44402838, 'acc': 0.87901001, 'grad_norm': 0.88692886, 'learning_rate': 5.466e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.345067, 'epoch': 0.5, 'global_step/max_steps': '230/462', 'percentage': '49.78%', 'elapsed_time': '11m 6s', 'remaining_time': '11m 11s'}


Train:  51%|█████     | 235/462 [11:20<10:33,  2.79s/it]

{'loss': 0.46622286, 'acc': 0.86730671, 'grad_norm': 0.92842913, 'learning_rate': 5.287e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.345296, 'epoch': 0.51, 'global_step/max_steps': '235/462', 'percentage': '50.87%', 'elapsed_time': '11m 20s', 'remaining_time': '10m 57s'}


Train:  52%|█████▏    | 240/462 [11:34<10:19,  2.79s/it]

{'loss': 0.39747093, 'acc': 0.88348618, 'grad_norm': 0.84517038, 'learning_rate': 5.108e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.345488, 'epoch': 0.52, 'global_step/max_steps': '240/462', 'percentage': '51.95%', 'elapsed_time': '11m 34s', 'remaining_time': '10m 42s'}


Train:  53%|█████▎    | 245/462 [11:48<10:07,  2.80s/it]

{'loss': 0.40088949, 'acc': 0.88557453, 'grad_norm': 0.78929359, 'learning_rate': 4.928e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.345648, 'epoch': 0.53, 'global_step/max_steps': '245/462', 'percentage': '53.03%', 'elapsed_time': '11m 48s', 'remaining_time': '10m 27s'}


Train:  54%|█████▍    | 250/462 [12:02<09:52,  2.80s/it]

{'loss': 0.4196878, 'acc': 0.87997398, 'grad_norm': 1.03608644, 'learning_rate': 4.749e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.34587, 'epoch': 0.54, 'global_step/max_steps': '250/462', 'percentage': '54.11%', 'elapsed_time': '12m 2s', 'remaining_time': '10m 12s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.77it/s]/it]


{'eval_loss': 0.44020447, 'eval_acc': 0.86965001, 'eval_runtime': 3.5521, 'eval_samples_per_second': 20.833, 'eval_steps_per_second': 20.833, 'epoch': 0.54, 'global_step/max_steps': '250/462', 'percentage': '54.11%', 'elapsed_time': '12m 6s', 'remaining_time': '10m 15s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-250
Train:  55%|█████▌    | 255/462 [12:21<10:53,  3.16s/it]

{'loss': 0.40466113, 'acc': 0.88457556, 'grad_norm': 1.01353824, 'learning_rate': 4.57e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.343713, 'epoch': 0.55, 'global_step/max_steps': '255/462', 'percentage': '55.19%', 'elapsed_time': '12m 21s', 'remaining_time': '10m 1s'}


Train:  56%|█████▋    | 260/462 [12:35<09:37,  2.86s/it]

{'loss': 0.43538513, 'acc': 0.87554073, 'grad_norm': 0.76405221, 'learning_rate': 4.392e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.343935, 'epoch': 0.56, 'global_step/max_steps': '260/462', 'percentage': '56.28%', 'elapsed_time': '12m 35s', 'remaining_time': '9m 47s'}


Train:  57%|█████▋    | 265/462 [12:49<09:12,  2.80s/it]

{'loss': 0.42398634, 'acc': 0.87528315, 'grad_norm': 0.89756083, 'learning_rate': 4.214e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344129, 'epoch': 0.57, 'global_step/max_steps': '265/462', 'percentage': '57.36%', 'elapsed_time': '12m 49s', 'remaining_time': '9m 32s'}


Train:  58%|█████▊    | 270/462 [13:03<08:55,  2.79s/it]

{'loss': 0.41076317, 'acc': 0.88450174, 'grad_norm': 0.83872974, 'learning_rate': 4.038e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344344, 'epoch': 0.58, 'global_step/max_steps': '270/462', 'percentage': '58.44%', 'elapsed_time': '13m 3s', 'remaining_time': '9m 17s'}


Train:  60%|█████▉    | 275/462 [13:17<08:37,  2.77s/it]

{'loss': 0.48236232, 'acc': 0.86480303, 'grad_norm': 1.00179327, 'learning_rate': 3.862e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.34461, 'epoch': 0.59, 'global_step/max_steps': '275/462', 'percentage': '59.52%', 'elapsed_time': '13m 17s', 'remaining_time': '9m 2s'}


Train:  61%|██████    | 280/462 [13:31<08:22,  2.76s/it]

{'loss': 0.44441042, 'acc': 0.87482719, 'grad_norm': 1.00647306, 'learning_rate': 3.689e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344853, 'epoch': 0.61, 'global_step/max_steps': '280/462', 'percentage': '60.61%', 'elapsed_time': '13m 31s', 'remaining_time': '8m 47s'}


Train:  62%|██████▏   | 285/462 [13:45<08:09,  2.77s/it]

{'loss': 0.38297296, 'acc': 0.88957491, 'grad_norm': 0.83786893, 'learning_rate': 3.516e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.345071, 'epoch': 0.62, 'global_step/max_steps': '285/462', 'percentage': '61.69%', 'elapsed_time': '13m 45s', 'remaining_time': '8m 32s'}


Train:  63%|██████▎   | 290/462 [13:59<07:57,  2.78s/it]

{'loss': 0.40660067, 'acc': 0.87633524, 'grad_norm': 0.86365932, 'learning_rate': 3.346e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.345296, 'epoch': 0.63, 'global_step/max_steps': '290/462', 'percentage': '62.77%', 'elapsed_time': '13m 59s', 'remaining_time': '8m 17s'}


Train:  64%|██████▍   | 295/462 [14:13<07:44,  2.78s/it]

{'loss': 0.40582957, 'acc': 0.88830137, 'grad_norm': 0.84162724, 'learning_rate': 3.178e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.345502, 'epoch': 0.64, 'global_step/max_steps': '295/462', 'percentage': '63.85%', 'elapsed_time': '14m 13s', 'remaining_time': '8m 3s'}


Train:  65%|██████▍   | 300/462 [14:27<07:29,  2.78s/it]

{'loss': 0.4449903, 'acc': 0.87143526, 'grad_norm': 0.83260709, 'learning_rate': 3.012e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.345687, 'epoch': 0.65, 'global_step/max_steps': '300/462', 'percentage': '64.94%', 'elapsed_time': '14m 27s', 'remaining_time': '7m 48s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.35it/s]/it]


{'eval_loss': 0.4376891, 'eval_acc': 0.87026583, 'eval_runtime': 3.6171, 'eval_samples_per_second': 20.458, 'eval_steps_per_second': 20.458, 'epoch': 0.65, 'global_step/max_steps': '300/462', 'percentage': '64.94%', 'elapsed_time': '14m 31s', 'remaining_time': '7m 50s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-300
Train:  66%|██████▌   | 305/462 [14:46<08:07,  3.10s/it]

{'loss': 0.4477201, 'acc': 0.87735186, 'grad_norm': 0.88744503, 'learning_rate': 2.849e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344087, 'epoch': 0.66, 'global_step/max_steps': '305/462', 'percentage': '66.02%', 'elapsed_time': '14m 46s', 'remaining_time': '7m 36s'}


Train:  67%|██████▋   | 310/462 [15:00<07:10,  2.83s/it]

{'loss': 0.38456025, 'acc': 0.89157305, 'grad_norm': 0.86914623, 'learning_rate': 2.689e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344284, 'epoch': 0.67, 'global_step/max_steps': '310/462', 'percentage': '67.10%', 'elapsed_time': '15m 0s', 'remaining_time': '7m 21s'}


Train:  68%|██████▊   | 315/462 [15:14<06:49,  2.79s/it]

{'loss': 0.43926272, 'acc': 0.87863998, 'grad_norm': 0.85795516, 'learning_rate': 2.531e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344499, 'epoch': 0.68, 'global_step/max_steps': '315/462', 'percentage': '68.18%', 'elapsed_time': '15m 14s', 'remaining_time': '7m 6s'}


Train:  69%|██████▉   | 320/462 [15:28<06:42,  2.83s/it]

{'loss': 0.43764243, 'acc': 0.87474461, 'grad_norm': 0.84792829, 'learning_rate': 2.377e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344611, 'epoch': 0.69, 'global_step/max_steps': '320/462', 'percentage': '69.26%', 'elapsed_time': '15m 28s', 'remaining_time': '6m 51s'}


Train:  70%|███████   | 325/462 [15:43<06:34,  2.88s/it]

{'loss': 0.41729293, 'acc': 0.87732449, 'grad_norm': 1.03248012, 'learning_rate': 2.226e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344524, 'epoch': 0.7, 'global_step/max_steps': '325/462', 'percentage': '70.35%', 'elapsed_time': '15m 42s', 'remaining_time': '6m 37s'}


Train:  71%|███████▏  | 330/462 [15:57<06:08,  2.79s/it]

{'loss': 0.43344598, 'acc': 0.87657528, 'grad_norm': 0.98176873, 'learning_rate': 2.079e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344715, 'epoch': 0.71, 'global_step/max_steps': '330/462', 'percentage': '71.43%', 'elapsed_time': '15m 56s', 'remaining_time': '6m 22s'}


Train:  73%|███████▎  | 335/462 [16:10<05:52,  2.78s/it]

{'loss': 0.45383439, 'acc': 0.87211895, 'grad_norm': 1.00486231, 'learning_rate': 1.935e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344901, 'epoch': 0.72, 'global_step/max_steps': '335/462', 'percentage': '72.51%', 'elapsed_time': '16m 10s', 'remaining_time': '6m 8s'}


Train:  74%|███████▎  | 340/462 [16:25<06:04,  2.99s/it]

{'loss': 0.37483313, 'acc': 0.89192867, 'grad_norm': 0.93388575, 'learning_rate': 1.795e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.344725, 'epoch': 0.74, 'global_step/max_steps': '340/462', 'percentage': '73.59%', 'elapsed_time': '16m 25s', 'remaining_time': '5m 53s'}


Train:  75%|███████▍  | 345/462 [16:40<05:32,  2.84s/it]

{'loss': 0.42937264, 'acc': 0.8813839, 'grad_norm': 1.0539397, 'learning_rate': 1.66e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.34486, 'epoch': 0.75, 'global_step/max_steps': '345/462', 'percentage': '74.68%', 'elapsed_time': '16m 40s', 'remaining_time': '5m 39s'}


Train:  76%|███████▌  | 350/462 [16:54<05:14,  2.81s/it]

{'loss': 0.40393028, 'acc': 0.88046932, 'grad_norm': 0.85154933, 'learning_rate': 1.528e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.344999, 'epoch': 0.76, 'global_step/max_steps': '350/462', 'percentage': '75.76%', 'elapsed_time': '16m 54s', 'remaining_time': '5m 24s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.91it/s]/it]


{'eval_loss': 0.43777975, 'eval_acc': 0.86995792, 'eval_runtime': 3.5303, 'eval_samples_per_second': 20.961, 'eval_steps_per_second': 20.961, 'epoch': 0.76, 'global_step/max_steps': '350/462', 'percentage': '75.76%', 'elapsed_time': '16m 57s', 'remaining_time': '5m 25s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-350
Train:  77%|███████▋  | 355/462 [17:12<05:36,  3.14s/it]

{'loss': 0.41759043, 'acc': 0.88197184, 'grad_norm': 0.96114713, 'learning_rate': 1.402e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.343581, 'epoch': 0.77, 'global_step/max_steps': '355/462', 'percentage': '76.84%', 'elapsed_time': '17m 12s', 'remaining_time': '5m 11s'}


Train:  78%|███████▊  | 360/462 [17:26<04:48,  2.83s/it]

{'loss': 0.44254646, 'acc': 0.87168818, 'grad_norm': 0.96762151, 'learning_rate': 1.279e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.343785, 'epoch': 0.78, 'global_step/max_steps': '360/462', 'percentage': '77.92%', 'elapsed_time': '17m 26s', 'remaining_time': '4m 56s'}


Train:  79%|███████▉  | 365/462 [17:40<04:29,  2.78s/it]

{'loss': 0.46085806, 'acc': 0.87282066, 'grad_norm': 0.99597031, 'learning_rate': 1.162e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.343967, 'epoch': 0.79, 'global_step/max_steps': '365/462', 'percentage': '79.00%', 'elapsed_time': '17m 40s', 'remaining_time': '4m 41s'}


Train:  80%|████████  | 370/462 [17:54<04:14,  2.77s/it]

{'loss': 0.39014974, 'acc': 0.88847075, 'grad_norm': 0.94709653, 'learning_rate': 1.05e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.344162, 'epoch': 0.8, 'global_step/max_steps': '370/462', 'percentage': '80.09%', 'elapsed_time': '17m 54s', 'remaining_time': '4m 27s'}


Train:  81%|████████  | 375/462 [18:09<04:04,  2.81s/it]

{'loss': 0.4033742, 'acc': 0.88656559, 'grad_norm': 0.8539477, 'learning_rate': 9.42e-06, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.344253, 'epoch': 0.81, 'global_step/max_steps': '375/462', 'percentage': '81.17%', 'elapsed_time': '18m 9s', 'remaining_time': '4m 12s'}


Train:  82%|████████▏ | 380/462 [18:23<03:49,  2.80s/it]

{'loss': 0.40679183, 'acc': 0.88613443, 'grad_norm': 0.84505373, 'learning_rate': 8.4e-06, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.344385, 'epoch': 0.82, 'global_step/max_steps': '380/462', 'percentage': '82.25%', 'elapsed_time': '18m 23s', 'remaining_time': '3m 58s'}


Train:  83%|████████▎ | 385/462 [18:37<03:35,  2.80s/it]

{'loss': 0.40197597, 'acc': 0.88090611, 'grad_norm': 1.01427925, 'learning_rate': 7.43e-06, 'memory(GiB)': 8.31, 'train_speed(iter/s)': 0.344476, 'epoch': 0.83, 'global_step/max_steps': '385/462', 'percentage': '83.33%', 'elapsed_time': '18m 37s', 'remaining_time': '3m 43s'}


Train:  84%|████████▍ | 390/462 [18:51<03:20,  2.78s/it]

{'loss': 0.43132739, 'acc': 0.87566309, 'grad_norm': 0.98452187, 'learning_rate': 6.52e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.344633, 'epoch': 0.84, 'global_step/max_steps': '390/462', 'percentage': '84.42%', 'elapsed_time': '18m 51s', 'remaining_time': '3m 28s'}


Train:  85%|████████▌ | 395/462 [19:05<03:12,  2.88s/it]

{'loss': 0.40652628, 'acc': 0.87975693, 'grad_norm': 0.76527208, 'learning_rate': 5.66e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.344692, 'epoch': 0.85, 'global_step/max_steps': '395/462', 'percentage': '85.50%', 'elapsed_time': '19m 5s', 'remaining_time': '3m 14s'}


Train:  87%|████████▋ | 400/462 [19:19<02:53,  2.80s/it]

{'loss': 0.42356701, 'acc': 0.8779748, 'grad_norm': 0.75377649, 'learning_rate': 4.86e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.344826, 'epoch': 0.86, 'global_step/max_steps': '400/462', 'percentage': '86.58%', 'elapsed_time': '19m 19s', 'remaining_time': '2m 59s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.63it/s]/it]


{'eval_loss': 0.43722194, 'eval_acc': 0.86954737, 'eval_runtime': 3.5577, 'eval_samples_per_second': 20.8, 'eval_steps_per_second': 20.8, 'epoch': 0.86, 'global_step/max_steps': '400/462', 'percentage': '86.58%', 'elapsed_time': '19m 23s', 'remaining_time': '3m 0s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-400
Train:  88%|████████▊ | 405/462 [19:38<03:01,  3.18s/it]

{'loss': 0.41375279, 'acc': 0.88059292, 'grad_norm': 0.87284458, 'learning_rate': 4.12e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343584, 'epoch': 0.88, 'global_step/max_steps': '405/462', 'percentage': '87.66%', 'elapsed_time': '19m 38s', 'remaining_time': '2m 45s'}


Train:  89%|████████▊ | 410/462 [19:52<02:30,  2.90s/it]

{'loss': 0.41930542, 'acc': 0.87877216, 'grad_norm': 0.8474654, 'learning_rate': 3.44e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343592, 'epoch': 0.89, 'global_step/max_steps': '410/462', 'percentage': '88.74%', 'elapsed_time': '19m 52s', 'remaining_time': '2m 31s'}


Train:  90%|████████▉ | 415/462 [20:06<02:11,  2.79s/it]

{'loss': 0.42810163, 'acc': 0.87611351, 'grad_norm': 0.85608298, 'learning_rate': 2.81e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343759, 'epoch': 0.9, 'global_step/max_steps': '415/462', 'percentage': '89.83%', 'elapsed_time': '20m 6s', 'remaining_time': '2m 16s'}


Train:  91%|█████████ | 420/462 [20:20<01:57,  2.79s/it]

{'loss': 0.40582781, 'acc': 0.88450832, 'grad_norm': 0.90657645, 'learning_rate': 2.25e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343911, 'epoch': 0.91, 'global_step/max_steps': '420/462', 'percentage': '90.91%', 'elapsed_time': '20m 20s', 'remaining_time': '2m 2s'}


Train:  92%|█████████▏| 425/462 [20:35<01:47,  2.91s/it]

{'loss': 0.43628626, 'acc': 0.87747288, 'grad_norm': 0.84353089, 'learning_rate': 1.75e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343838, 'epoch': 0.92, 'global_step/max_steps': '425/462', 'percentage': '91.99%', 'elapsed_time': '20m 35s', 'remaining_time': '1m 47s'}


Train:  93%|█████████▎| 430/462 [20:50<01:34,  2.95s/it]

{'loss': 0.48727078, 'acc': 0.8668726, 'grad_norm': 0.91388267, 'learning_rate': 1.31e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343737, 'epoch': 0.93, 'global_step/max_steps': '430/462', 'percentage': '93.07%', 'elapsed_time': '20m 50s', 'remaining_time': '1m 33s'}


Train:  94%|█████████▍| 435/462 [21:05<01:18,  2.91s/it]

{'loss': 0.43495641, 'acc': 0.87271986, 'grad_norm': 0.85459065, 'learning_rate': 9.3e-07, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343614, 'epoch': 0.94, 'global_step/max_steps': '435/462', 'percentage': '94.16%', 'elapsed_time': '21m 5s', 'remaining_time': '1m 18s'}


Train:  95%|█████████▌| 440/462 [21:19<01:02,  2.84s/it]

{'loss': 0.47182937, 'acc': 0.86950817, 'grad_norm': 0.96713126, 'learning_rate': 6.2e-07, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343685, 'epoch': 0.95, 'global_step/max_steps': '440/462', 'percentage': '95.24%', 'elapsed_time': '21m 19s', 'remaining_time': '1m 3s'}


Train:  96%|█████████▋| 445/462 [21:33<00:47,  2.77s/it]

{'loss': 0.41961122, 'acc': 0.8789732, 'grad_norm': 0.97197586, 'learning_rate': 3.7e-07, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343861, 'epoch': 0.96, 'global_step/max_steps': '445/462', 'percentage': '96.32%', 'elapsed_time': '21m 33s', 'remaining_time': '49s'}


Train:  97%|█████████▋| 450/462 [21:47<00:33,  2.76s/it]

{'loss': 0.42666392, 'acc': 0.88467884, 'grad_norm': 0.86686599, 'learning_rate': 1.9e-07, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.344039, 'epoch': 0.97, 'global_step/max_steps': '450/462', 'percentage': '97.40%', 'elapsed_time': '21m 47s', 'remaining_time': '34s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 21.03it/s]/it]


{'eval_loss': 0.43659654, 'eval_acc': 0.86954737, 'eval_runtime': 3.5105, 'eval_samples_per_second': 21.08, 'eval_steps_per_second': 21.08, 'epoch': 0.97, 'global_step/max_steps': '450/462', 'percentage': '97.40%', 'elapsed_time': '21m 51s', 'remaining_time': '34s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-450
Train:  98%|█████████▊| 455/462 [22:06<00:21,  3.08s/it]

{'loss': 0.41813192, 'acc': 0.88008442, 'grad_norm': 0.86753607, 'learning_rate': 6e-08, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343032, 'epoch': 0.98, 'global_step/max_steps': '455/462', 'percentage': '98.48%', 'elapsed_time': '22m 6s', 'remaining_time': '20s'}


Train: 100%|█████████▉| 460/462 [22:19<00:05,  2.82s/it]

{'loss': 0.43385692, 'acc': 0.87394648, 'grad_norm': 0.89694482, 'learning_rate': 1e-08, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.343202, 'epoch': 0.99, 'global_step/max_steps': '460/462', 'percentage': '99.57%', 'elapsed_time': '22m 19s', 'remaining_time': '5s'}


Train: 100%|██████████| 462/462 [22:25<00:00,  2.80s/it]
Val: 100%|██████████| 74/74 [00:03<00:00, 21.11it/s]/it]


{'eval_loss': 0.43656504, 'eval_acc': 0.86965001, 'eval_runtime': 3.4886, 'eval_samples_per_second': 21.212, 'eval_steps_per_second': 21.212, 'epoch': 1.0, 'global_step/max_steps': '462/462', 'percentage': '100.00%', 'elapsed_time': '22m 29s', 'remaining_time': '0s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-462
Train: 100%|██████████| 462/462 [22:30<00:00,  2.92s/it]
[INFO:swift] last_model_checkpoint: /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-462
[INFO:swift] best_model_checkpoint: /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-462
[INFO:swift] images_dir: /root/autodl-fs/code/nlp_pj/pj6/output/qwen2_5-0_5b-instruct/v1-20241125-232429/images


{'train_runtime': 1349.9878, 'train_samples_per_second': 5.481, 'train_steps_per_second': 0.342, 'train_loss': 0.44557389, 'epoch': 1.0, 'global_step/max_steps': '462/462', 'percentage': '100.00%', 'elapsed_time': '22m 30s', 'remaining_time': '0s'}


[INFO:swift] End time of running main: 2024-11-25 23:47:16.806765


In [None]:
model_type = ModelType.qwen2_5_0_5b_instruct
sft_args = SftArguments(
    model_type=model_type,
    resume_from_checkpoint='~/autodl-tmp/checkpoint/math_output/lora/qwen2_5-0_5b-instruct/v1-20241125-232429/checkpoint-462',
    resume_only_model=True,
    dataset=['gsm8k-train.jsonl'],
    output_dir='/root/autodl-tmp/checkpoint/math_output/deepseek',
    max_length=4096)
result = sft_main(sft_args)

[INFO:swift] Setting model_info['revision']: master
[INFO:swift] Setting args.lazy_tokenize: False
[INFO:swift] Setting args.dataloader_num_workers: 1
[INFO:swift] output_dir: /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329
[INFO:swift] Start time of running main: 2024-12-09 21:43:29.709374
[INFO:swift] args: SftArguments(model_type='qwen2_5-0_5b-instruct', model_id_or_path='qwen/Qwen2.5-0.5B-Instruct', model_revision='master', full_determinism=False, sft_type='lora', freeze_parameters=[], freeze_vit=False, freeze_parameters_ratio=0.0, additional_trainable_parameters=[], tuner_backend='peft', template_type='qwen2_5', output_dir='/root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329', add_output_dir_suffix=True, ddp_backend=None, ddp_find_unused_parameters=None, ddp_broadcast_buffers=None, ddp_timeout=1800, seed=42, resume_from_checkpoint='/root/autodl-fs/code/nlp_pj/pj6/output/lora/qwen2_5-0_5b-instruct/v1-2024

device_count: 1
rank: -1, local_rank: -1, world_size: 1, local_world_size: 1


[INFO:swift] Loading the model using model_dir: /root/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct
[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
[INFO:swift] model.max_model_len: 32768
[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
[INFO:swift] model_config: Qwen2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/root/.cache/modelscope/hub/qwen/Qwen2___5-0___5B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "

{'loss': 0.33236668, 'acc': 0.90519732, 'grad_norm': 0.73773295, 'learning_rate': 4.17e-06, 'memory(GiB)': 3.21, 'train_speed(iter/s)': 0.181089, 'epoch': 0.0, 'global_step/max_steps': '1/462', 'percentage': '0.22%', 'elapsed_time': '5s', 'remaining_time': '40m 57s'}


Train:   1%|          | 5/462 [00:18<26:39,  3.50s/it]

{'loss': 0.42283389, 'acc': 0.87544459, 'grad_norm': 0.85670811, 'learning_rate': 2.083e-05, 'memory(GiB)': 6.4, 'train_speed(iter/s)': 0.257955, 'epoch': 0.01, 'global_step/max_steps': '5/462', 'percentage': '1.08%', 'elapsed_time': '18s', 'remaining_time': '28m 39s'}


Train:   2%|▏         | 10/462 [00:36<25:18,  3.36s/it]

{'loss': 0.40474439, 'acc': 0.87984924, 'grad_norm': 0.73833615, 'learning_rate': 4.167e-05, 'memory(GiB)': 8.23, 'train_speed(iter/s)': 0.271674, 'epoch': 0.02, 'global_step/max_steps': '10/462', 'percentage': '2.16%', 'elapsed_time': '36s', 'remaining_time': '27m 27s'}


Train:   3%|▎         | 15/462 [00:52<24:02,  3.23s/it]

{'loss': 0.41364803, 'acc': 0.88559504, 'grad_norm': 1.07721686, 'learning_rate': 6.25e-05, 'memory(GiB)': 8.23, 'train_speed(iter/s)': 0.282352, 'epoch': 0.03, 'global_step/max_steps': '15/462', 'percentage': '3.25%', 'elapsed_time': '52s', 'remaining_time': '26m 10s'}


Train:   4%|▍         | 20/462 [01:08<22:38,  3.07s/it]

{'loss': 0.43422971, 'acc': 0.87387571, 'grad_norm': 0.85951394, 'learning_rate': 8.333e-05, 'memory(GiB)': 8.23, 'train_speed(iter/s)': 0.291144, 'epoch': 0.04, 'global_step/max_steps': '20/462', 'percentage': '4.33%', 'elapsed_time': '1m 8s', 'remaining_time': '25m 16s'}


Train:   5%|▌         | 25/462 [01:25<22:42,  3.12s/it]

{'loss': 0.42334652, 'acc': 0.87678614, 'grad_norm': 0.84932297, 'learning_rate': 0.0001, 'memory(GiB)': 8.23, 'train_speed(iter/s)': 0.294425, 'epoch': 0.05, 'global_step/max_steps': '25/462', 'percentage': '5.41%', 'elapsed_time': '1m 25s', 'remaining_time': '24m 51s'}


Train:   6%|▋         | 30/462 [01:41<23:07,  3.21s/it]

{'loss': 0.46091256, 'acc': 0.87140656, 'grad_norm': 0.82068378, 'learning_rate': 9.995e-05, 'memory(GiB)': 8.56, 'train_speed(iter/s)': 0.293263, 'epoch': 0.06, 'global_step/max_steps': '30/462', 'percentage': '6.49%', 'elapsed_time': '1m 41s', 'remaining_time': '24m 28s'}


Train:   8%|▊         | 35/462 [01:57<21:21,  3.00s/it]

{'loss': 0.40109081, 'acc': 0.88266258, 'grad_norm': 0.87952006, 'learning_rate': 9.984e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.298349, 'epoch': 0.08, 'global_step/max_steps': '35/462', 'percentage': '7.58%', 'elapsed_time': '1m 57s', 'remaining_time': '23m 53s'}


Train:   9%|▊         | 40/462 [02:12<21:17,  3.03s/it]

{'loss': 0.39334357, 'acc': 0.89007187, 'grad_norm': 0.78826314, 'learning_rate': 9.967e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.300668, 'epoch': 0.09, 'global_step/max_steps': '40/462', 'percentage': '8.66%', 'elapsed_time': '2m 12s', 'remaining_time': '23m 17s'}


Train:  10%|▉         | 45/462 [02:27<20:16,  2.92s/it]

{'loss': 0.42644873, 'acc': 0.87783232, 'grad_norm': 1.00170255, 'learning_rate': 9.943e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.304543, 'epoch': 0.1, 'global_step/max_steps': '45/462', 'percentage': '9.74%', 'elapsed_time': '2m 27s', 'remaining_time': '22m 44s'}


Train:  11%|█         | 50/462 [02:42<20:11,  2.94s/it]

{'loss': 0.41294065, 'acc': 0.8829073, 'grad_norm': 0.76707244, 'learning_rate': 9.913e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.307252, 'epoch': 0.11, 'global_step/max_steps': '50/462', 'percentage': '10.82%', 'elapsed_time': '2m 42s', 'remaining_time': '22m 16s'}



Val: 100%|██████████| 74/74 [00:04<00:00, 16.53it/s]it]


{'eval_loss': 0.43534681, 'eval_acc': 0.87057375, 'eval_runtime': 3.7866, 'eval_samples_per_second': 19.543, 'eval_steps_per_second': 19.543, 'epoch': 0.11, 'global_step/max_steps': '50/462', 'percentage': '10.82%', 'elapsed_time': '2m 46s', 'remaining_time': '22m 50s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-50
Train:  12%|█▏        | 55/462 [03:03<22:28,  3.31s/it]

{'loss': 0.42360897, 'acc': 0.8814579, 'grad_norm': 0.83831376, 'learning_rate': 9.877e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.300367, 'epoch': 0.12, 'global_step/max_steps': '55/462', 'percentage': '11.90%', 'elapsed_time': '3m 2s', 'remaining_time': '22m 32s'}


Train:  13%|█▎        | 60/462 [03:18<19:53,  2.97s/it]

{'loss': 0.43248372, 'acc': 0.87614365, 'grad_norm': 0.78866976, 'learning_rate': 9.834e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.303024, 'epoch': 0.13, 'global_step/max_steps': '60/462', 'percentage': '12.99%', 'elapsed_time': '3m 18s', 'remaining_time': '22m 7s'}


Train:  14%|█▍        | 65/462 [03:32<19:35,  2.96s/it]

{'loss': 0.41223135, 'acc': 0.88128643, 'grad_norm': 0.80654854, 'learning_rate': 9.785e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.304734, 'epoch': 0.14, 'global_step/max_steps': '65/462', 'percentage': '14.07%', 'elapsed_time': '3m 32s', 'remaining_time': '21m 40s'}


Train:  15%|█▌        | 70/462 [03:48<18:54,  2.89s/it]

{'loss': 0.41704745, 'acc': 0.88304768, 'grad_norm': 0.77472585, 'learning_rate': 9.73e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.307171, 'epoch': 0.15, 'global_step/max_steps': '70/462', 'percentage': '15.15%', 'elapsed_time': '3m 48s', 'remaining_time': '21m 20s'}


Train:  16%|█▌        | 75/462 [04:03<18:56,  2.94s/it]

{'loss': 0.40996842, 'acc': 0.87455759, 'grad_norm': 0.88464546, 'learning_rate': 9.669e-05, 'memory(GiB)': 2.25, 'train_speed(iter/s)': 0.30796, 'epoch': 0.16, 'global_step/max_steps': '75/462', 'percentage': '16.23%', 'elapsed_time': '4m 3s', 'remaining_time': '20m 54s'}


Train:  17%|█▋        | 80/462 [04:19<18:21,  2.88s/it]

{'loss': 0.41297402, 'acc': 0.8799592, 'grad_norm': 0.90926129, 'learning_rate': 9.602e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.309982, 'epoch': 0.17, 'global_step/max_steps': '80/462', 'percentage': '17.32%', 'elapsed_time': '4m 18s', 'remaining_time': '20m 36s'}


Train:  18%|█▊        | 85/462 [04:34<18:50,  3.00s/it]

{'loss': 0.43766923, 'acc': 0.87736473, 'grad_norm': 0.73013598, 'learning_rate': 9.529e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.309945, 'epoch': 0.18, 'global_step/max_steps': '85/462', 'percentage': '18.40%', 'elapsed_time': '4m 34s', 'remaining_time': '20m 16s'}


Train:  19%|█▉        | 90/462 [04:48<18:17,  2.95s/it]

{'loss': 0.40822554, 'acc': 0.87526474, 'grad_norm': 0.87729508, 'learning_rate': 9.45e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.310956, 'epoch': 0.19, 'global_step/max_steps': '90/462', 'percentage': '19.48%', 'elapsed_time': '4m 48s', 'remaining_time': '19m 53s'}


Train:  21%|██        | 95/462 [05:03<17:39,  2.89s/it]

{'loss': 0.43870111, 'acc': 0.87625427, 'grad_norm': 0.91428322, 'learning_rate': 9.366e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.312625, 'epoch': 0.21, 'global_step/max_steps': '95/462', 'percentage': '20.56%', 'elapsed_time': '5m 3s', 'remaining_time': '19m 32s'}


Train:  22%|██▏       | 100/462 [05:18<17:15,  2.86s/it]

{'loss': 0.38483315, 'acc': 0.8876647, 'grad_norm': 0.79695523, 'learning_rate': 9.275e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.314038, 'epoch': 0.22, 'global_step/max_steps': '100/462', 'percentage': '21.65%', 'elapsed_time': '5m 18s', 'remaining_time': '19m 11s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.16it/s]/it]


{'eval_loss': 0.43611097, 'eval_acc': 0.86913682, 'eval_runtime': 3.6787, 'eval_samples_per_second': 20.116, 'eval_steps_per_second': 20.116, 'epoch': 0.22, 'global_step/max_steps': '100/462', 'percentage': '21.65%', 'elapsed_time': '5m 21s', 'remaining_time': '19m 25s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-100
Train:  23%|██▎       | 105/462 [05:37<19:12,  3.23s/it]

{'loss': 0.39544749, 'acc': 0.88961658, 'grad_norm': 0.84849834, 'learning_rate': 9.18e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.31087, 'epoch': 0.23, 'global_step/max_steps': '105/462', 'percentage': '22.73%', 'elapsed_time': '5m 37s', 'remaining_time': '19m 6s'}


Train:  24%|██▍       | 110/462 [05:51<17:19,  2.95s/it]

{'loss': 0.43400269, 'acc': 0.87625694, 'grad_norm': 0.857858, 'learning_rate': 9.079e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.312111, 'epoch': 0.24, 'global_step/max_steps': '110/462', 'percentage': '23.81%', 'elapsed_time': '5m 51s', 'remaining_time': '18m 45s'}


Train:  25%|██▍       | 115/462 [06:06<16:34,  2.87s/it]

{'loss': 0.44738936, 'acc': 0.87166119, 'grad_norm': 0.88968444, 'learning_rate': 8.972e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.313473, 'epoch': 0.25, 'global_step/max_steps': '115/462', 'percentage': '24.89%', 'elapsed_time': '6m 6s', 'remaining_time': '18m 25s'}


Train:  26%|██▌       | 120/462 [06:20<16:09,  2.84s/it]

{'loss': 0.41862922, 'acc': 0.88188124, 'grad_norm': 0.83744591, 'learning_rate': 8.861e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.314834, 'epoch': 0.26, 'global_step/max_steps': '120/462', 'percentage': '25.97%', 'elapsed_time': '6m 20s', 'remaining_time': '18m 4s'}


Train:  27%|██▋       | 125/462 [06:35<16:09,  2.88s/it]

{'loss': 0.45534334, 'acc': 0.8731019, 'grad_norm': 0.9925164, 'learning_rate': 8.744e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.315855, 'epoch': 0.27, 'global_step/max_steps': '125/462', 'percentage': '27.06%', 'elapsed_time': '6m 35s', 'remaining_time': '17m 45s'}


Train:  28%|██▊       | 130/462 [06:49<15:48,  2.86s/it]

{'loss': 0.37152703, 'acc': 0.90047159, 'grad_norm': 0.82462174, 'learning_rate': 8.623e-05, 'memory(GiB)': 3.25, 'train_speed(iter/s)': 0.316957, 'epoch': 0.28, 'global_step/max_steps': '130/462', 'percentage': '28.14%', 'elapsed_time': '6m 49s', 'remaining_time': '17m 25s'}


Train:  29%|██▉       | 135/462 [07:03<15:29,  2.84s/it]

{'loss': 0.42962303, 'acc': 0.87585735, 'grad_norm': 0.96719414, 'learning_rate': 8.497e-05, 'memory(GiB)': 4.29, 'train_speed(iter/s)': 0.318064, 'epoch': 0.29, 'global_step/max_steps': '135/462', 'percentage': '29.22%', 'elapsed_time': '7m 3s', 'remaining_time': '17m 6s'}


Train:  30%|███       | 140/462 [07:18<15:22,  2.87s/it]

{'loss': 0.42159557, 'acc': 0.88450317, 'grad_norm': 0.78089923, 'learning_rate': 8.367e-05, 'memory(GiB)': 4.29, 'train_speed(iter/s)': 0.319028, 'epoch': 0.3, 'global_step/max_steps': '140/462', 'percentage': '30.30%', 'elapsed_time': '7m 18s', 'remaining_time': '16m 48s'}


Train:  31%|███▏      | 145/462 [07:32<15:00,  2.84s/it]

{'loss': 0.43039632, 'acc': 0.87500105, 'grad_norm': 0.95277458, 'learning_rate': 8.232e-05, 'memory(GiB)': 4.29, 'train_speed(iter/s)': 0.319981, 'epoch': 0.31, 'global_step/max_steps': '145/462', 'percentage': '31.39%', 'elapsed_time': '7m 32s', 'remaining_time': '16m 29s'}


Train:  32%|███▏      | 150/462 [07:47<15:29,  2.98s/it]

{'loss': 0.4333138, 'acc': 0.87403994, 'grad_norm': 0.90271562, 'learning_rate': 8.093e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.320404, 'epoch': 0.32, 'global_step/max_steps': '150/462', 'percentage': '32.47%', 'elapsed_time': '7m 47s', 'remaining_time': '16m 12s'}



Val: 100%|██████████| 74/74 [00:04<00:00, 16.65it/s]/it]


{'eval_loss': 0.4331795, 'eval_acc': 0.8718054, 'eval_runtime': 4.4608, 'eval_samples_per_second': 16.589, 'eval_steps_per_second': 16.589, 'epoch': 0.32, 'global_step/max_steps': '150/462', 'percentage': '32.47%', 'elapsed_time': '7m 52s', 'remaining_time': '16m 22s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-150
Train:  34%|███▎      | 155/462 [08:07<16:28,  3.22s/it]

{'loss': 0.37574465, 'acc': 0.89173193, 'grad_norm': 0.74958807, 'learning_rate': 7.951e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.317754, 'epoch': 0.34, 'global_step/max_steps': '155/462', 'percentage': '33.55%', 'elapsed_time': '8m 7s', 'remaining_time': '16m 5s'}


Train:  35%|███▍      | 160/462 [08:21<14:26,  2.87s/it]

{'loss': 0.43160443, 'acc': 0.88507328, 'grad_norm': 0.87229288, 'learning_rate': 7.804e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.318778, 'epoch': 0.35, 'global_step/max_steps': '160/462', 'percentage': '34.63%', 'elapsed_time': '8m 21s', 'remaining_time': '15m 46s'}


Train:  36%|███▌      | 165/462 [08:35<13:59,  2.83s/it]

{'loss': 0.42026367, 'acc': 0.87784357, 'grad_norm': 0.82556683, 'learning_rate': 7.654e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.319741, 'epoch': 0.36, 'global_step/max_steps': '165/462', 'percentage': '35.71%', 'elapsed_time': '8m 35s', 'remaining_time': '15m 27s'}


Train:  37%|███▋      | 170/462 [08:49<13:38,  2.80s/it]

{'loss': 0.41160889, 'acc': 0.88473854, 'grad_norm': 0.7951588, 'learning_rate': 7.5e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.320626, 'epoch': 0.37, 'global_step/max_steps': '170/462', 'percentage': '36.80%', 'elapsed_time': '8m 49s', 'remaining_time': '15m 9s'}


Train:  38%|███▊      | 175/462 [09:03<13:22,  2.80s/it]

{'loss': 0.46362596, 'acc': 0.87043877, 'grad_norm': 0.85724092, 'learning_rate': 7.343e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.321478, 'epoch': 0.38, 'global_step/max_steps': '175/462', 'percentage': '37.88%', 'elapsed_time': '9m 3s', 'remaining_time': '14m 51s'}


Train:  39%|███▉      | 180/462 [09:17<13:11,  2.81s/it]

{'loss': 0.45110412, 'acc': 0.87293034, 'grad_norm': 0.91844821, 'learning_rate': 7.183e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.322301, 'epoch': 0.39, 'global_step/max_steps': '180/462', 'percentage': '38.96%', 'elapsed_time': '9m 17s', 'remaining_time': '14m 34s'}


Train:  40%|████      | 185/462 [09:31<12:55,  2.80s/it]

{'loss': 0.45509734, 'acc': 0.87525616, 'grad_norm': 0.83037525, 'learning_rate': 7.021e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.323111, 'epoch': 0.4, 'global_step/max_steps': '185/462', 'percentage': '40.04%', 'elapsed_time': '9m 31s', 'remaining_time': '14m 16s'}


Train:  41%|████      | 190/462 [09:45<12:32,  2.77s/it]

{'loss': 0.38921607, 'acc': 0.89342604, 'grad_norm': 0.89717293, 'learning_rate': 6.855e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.323964, 'epoch': 0.41, 'global_step/max_steps': '190/462', 'percentage': '41.13%', 'elapsed_time': '9m 45s', 'remaining_time': '13m 58s'}


Train:  42%|████▏     | 195/462 [10:00<12:54,  2.90s/it]

{'loss': 0.40006323, 'acc': 0.88281565, 'grad_norm': 0.88841665, 'learning_rate': 6.688e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.324463, 'epoch': 0.42, 'global_step/max_steps': '195/462', 'percentage': '42.21%', 'elapsed_time': '10m 0s', 'remaining_time': '13m 42s'}


Train:  43%|████▎     | 200/462 [10:15<12:45,  2.92s/it]

{'loss': 0.40202012, 'acc': 0.88903809, 'grad_norm': 1.09904969, 'learning_rate': 6.518e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.32475, 'epoch': 0.43, 'global_step/max_steps': '200/462', 'percentage': '43.29%', 'elapsed_time': '10m 15s', 'remaining_time': '13m 25s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.86it/s]/it]


{'eval_loss': 0.4322798, 'eval_acc': 0.87283178, 'eval_runtime': 3.5511, 'eval_samples_per_second': 20.839, 'eval_steps_per_second': 20.839, 'epoch': 0.43, 'global_step/max_steps': '200/462', 'percentage': '43.29%', 'elapsed_time': '10m 18s', 'remaining_time': '13m 30s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-200
Train:  44%|████▍     | 205/462 [10:33<13:30,  3.15s/it]

{'loss': 0.40825367, 'acc': 0.88222885, 'grad_norm': 0.77460879, 'learning_rate': 6.346e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.323083, 'epoch': 0.44, 'global_step/max_steps': '205/462', 'percentage': '44.37%', 'elapsed_time': '10m 33s', 'remaining_time': '13m 14s'}


Train:  45%|████▌     | 210/462 [10:48<12:01,  2.86s/it]

{'loss': 0.42172127, 'acc': 0.87650099, 'grad_norm': 0.89727443, 'learning_rate': 6.172e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.323771, 'epoch': 0.45, 'global_step/max_steps': '210/462', 'percentage': '45.45%', 'elapsed_time': '10m 48s', 'remaining_time': '12m 57s'}


Train:  47%|████▋     | 215/462 [11:02<11:33,  2.81s/it]

{'loss': 0.41117101, 'acc': 0.8816576, 'grad_norm': 0.83896971, 'learning_rate': 5.997e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.324422, 'epoch': 0.46, 'global_step/max_steps': '215/462', 'percentage': '46.54%', 'elapsed_time': '11m 2s', 'remaining_time': '12m 40s'}


Train:  48%|████▊     | 220/462 [11:16<11:16,  2.79s/it]

{'loss': 0.39925132, 'acc': 0.88747568, 'grad_norm': 0.87322235, 'learning_rate': 5.821e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.325056, 'epoch': 0.48, 'global_step/max_steps': '220/462', 'percentage': '47.62%', 'elapsed_time': '11m 16s', 'remaining_time': '12m 23s'}


Train:  49%|████▊     | 225/462 [11:30<11:01,  2.79s/it]

{'loss': 0.44141474, 'acc': 0.8775528, 'grad_norm': 0.68479204, 'learning_rate': 5.644e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.325691, 'epoch': 0.49, 'global_step/max_steps': '225/462', 'percentage': '48.70%', 'elapsed_time': '11m 30s', 'remaining_time': '12m 7s'}


Train:  50%|████▉     | 230/462 [11:44<10:41,  2.77s/it]

{'loss': 0.4302598, 'acc': 0.88507605, 'grad_norm': 0.87211251, 'learning_rate': 5.466e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.326366, 'epoch': 0.5, 'global_step/max_steps': '230/462', 'percentage': '49.78%', 'elapsed_time': '11m 44s', 'remaining_time': '11m 50s'}


Train:  51%|█████     | 235/462 [11:58<10:28,  2.77s/it]

{'loss': 0.45279698, 'acc': 0.8717577, 'grad_norm': 0.94517004, 'learning_rate': 5.287e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.326983, 'epoch': 0.51, 'global_step/max_steps': '235/462', 'percentage': '50.87%', 'elapsed_time': '11m 58s', 'remaining_time': '11m 33s'}


Train:  52%|█████▏    | 240/462 [12:12<10:14,  2.77s/it]

{'loss': 0.38484833, 'acc': 0.88767023, 'grad_norm': 0.8527351, 'learning_rate': 5.108e-05, 'memory(GiB)': 5.36, 'train_speed(iter/s)': 0.327575, 'epoch': 0.52, 'global_step/max_steps': '240/462', 'percentage': '51.95%', 'elapsed_time': '12m 12s', 'remaining_time': '11m 17s'}


Train:  53%|█████▎    | 245/462 [12:26<10:06,  2.79s/it]

{'loss': 0.3876312, 'acc': 0.88989286, 'grad_norm': 0.82829338, 'learning_rate': 4.928e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.328116, 'epoch': 0.53, 'global_step/max_steps': '245/462', 'percentage': '53.03%', 'elapsed_time': '12m 26s', 'remaining_time': '11m 0s'}


Train:  54%|█████▍    | 250/462 [12:40<09:47,  2.77s/it]

{'loss': 0.40613437, 'acc': 0.88402405, 'grad_norm': 0.90533811, 'learning_rate': 4.749e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.328682, 'epoch': 0.54, 'global_step/max_steps': '250/462', 'percentage': '54.11%', 'elapsed_time': '12m 40s', 'remaining_time': '10m 44s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.64it/s]/it]


{'eval_loss': 0.43066493, 'eval_acc': 0.87129221, 'eval_runtime': 3.5894, 'eval_samples_per_second': 20.616, 'eval_steps_per_second': 20.616, 'epoch': 0.54, 'global_step/max_steps': '250/462', 'percentage': '54.11%', 'elapsed_time': '12m 43s', 'remaining_time': '10m 47s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-250
Train:  55%|█████▌    | 255/462 [12:58<10:41,  3.10s/it]

{'loss': 0.39015563, 'acc': 0.88778524, 'grad_norm': 0.98734921, 'learning_rate': 4.57e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.327294, 'epoch': 0.55, 'global_step/max_steps': '255/462', 'percentage': '55.19%', 'elapsed_time': '12m 58s', 'remaining_time': '10m 32s'}


Train:  56%|█████▋    | 260/462 [13:13<09:31,  2.83s/it]

{'loss': 0.42277737, 'acc': 0.87877226, 'grad_norm': 0.7611379, 'learning_rate': 4.392e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.327799, 'epoch': 0.56, 'global_step/max_steps': '260/462', 'percentage': '56.28%', 'elapsed_time': '13m 13s', 'remaining_time': '10m 16s'}


Train:  57%|█████▋    | 265/462 [13:27<09:17,  2.83s/it]

{'loss': 0.40877199, 'acc': 0.87974348, 'grad_norm': 0.84965622, 'learning_rate': 4.214e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.328058, 'epoch': 0.57, 'global_step/max_steps': '265/462', 'percentage': '57.36%', 'elapsed_time': '13m 27s', 'remaining_time': '10m 0s'}


Train:  58%|█████▊    | 270/462 [13:41<08:57,  2.80s/it]

{'loss': 0.39905274, 'acc': 0.88695841, 'grad_norm': 0.93498087, 'learning_rate': 4.038e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.32853, 'epoch': 0.58, 'global_step/max_steps': '270/462', 'percentage': '58.44%', 'elapsed_time': '13m 41s', 'remaining_time': '9m 44s'}


Train:  60%|█████▉    | 275/462 [13:55<08:44,  2.81s/it]

{'loss': 0.46900425, 'acc': 0.86753387, 'grad_norm': 1.02868712, 'learning_rate': 3.862e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.328952, 'epoch': 0.59, 'global_step/max_steps': '275/462', 'percentage': '59.52%', 'elapsed_time': '13m 55s', 'remaining_time': '9m 28s'}


Train:  61%|██████    | 280/462 [14:09<08:27,  2.79s/it]

{'loss': 0.43129635, 'acc': 0.87632904, 'grad_norm': 1.00259638, 'learning_rate': 3.689e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.329391, 'epoch': 0.61, 'global_step/max_steps': '280/462', 'percentage': '60.61%', 'elapsed_time': '14m 9s', 'remaining_time': '9m 12s'}


Train:  62%|██████▏   | 285/462 [14:23<08:14,  2.79s/it]

{'loss': 0.37302105, 'acc': 0.89343863, 'grad_norm': 0.875256, 'learning_rate': 3.516e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.329815, 'epoch': 0.62, 'global_step/max_steps': '285/462', 'percentage': '61.69%', 'elapsed_time': '14m 23s', 'remaining_time': '8m 56s'}


Train:  63%|██████▎   | 290/462 [14:37<07:58,  2.78s/it]

{'loss': 0.3964566, 'acc': 0.88044901, 'grad_norm': 0.843885, 'learning_rate': 3.346e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.330254, 'epoch': 0.63, 'global_step/max_steps': '290/462', 'percentage': '62.77%', 'elapsed_time': '14m 37s', 'remaining_time': '8m 40s'}


Train:  64%|██████▍   | 295/462 [14:51<07:43,  2.77s/it]

{'loss': 0.39523678, 'acc': 0.88992319, 'grad_norm': 0.84114182, 'learning_rate': 3.178e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.330681, 'epoch': 0.64, 'global_step/max_steps': '295/462', 'percentage': '63.85%', 'elapsed_time': '14m 51s', 'remaining_time': '8m 24s'}


Train:  65%|██████▍   | 300/462 [15:05<07:32,  2.79s/it]

{'loss': 0.43200812, 'acc': 0.87422123, 'grad_norm': 0.84136581, 'learning_rate': 3.012e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.331059, 'epoch': 0.65, 'global_step/max_steps': '300/462', 'percentage': '64.94%', 'elapsed_time': '15m 5s', 'remaining_time': '8m 9s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.70it/s]/it]


{'eval_loss': 0.43024224, 'eval_acc': 0.8726265, 'eval_runtime': 3.569, 'eval_samples_per_second': 20.734, 'eval_steps_per_second': 20.734, 'epoch': 0.65, 'global_step/max_steps': '300/462', 'percentage': '64.94%', 'elapsed_time': '15m 9s', 'remaining_time': '8m 11s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-300
Train:  66%|██████▌   | 305/462 [15:24<08:14,  3.15s/it]

{'loss': 0.4367877, 'acc': 0.88119011, 'grad_norm': 0.8714686, 'learning_rate': 2.849e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.329711, 'epoch': 0.66, 'global_step/max_steps': '305/462', 'percentage': '66.02%', 'elapsed_time': '15m 24s', 'remaining_time': '7m 55s'}


Train:  67%|██████▋   | 310/462 [15:38<07:15,  2.86s/it]

{'loss': 0.37596464, 'acc': 0.89398317, 'grad_norm': 0.89937401, 'learning_rate': 2.689e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.330083, 'epoch': 0.67, 'global_step/max_steps': '310/462', 'percentage': '67.10%', 'elapsed_time': '15m 38s', 'remaining_time': '7m 40s'}


Train:  68%|██████▊   | 315/462 [15:52<06:52,  2.81s/it]

{'loss': 0.42956014, 'acc': 0.88200293, 'grad_norm': 0.87038529, 'learning_rate': 2.531e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.330455, 'epoch': 0.68, 'global_step/max_steps': '315/462', 'percentage': '68.18%', 'elapsed_time': '15m 52s', 'remaining_time': '7m 24s'}


Train:  69%|██████▉   | 320/462 [16:06<06:37,  2.80s/it]

{'loss': 0.42707105, 'acc': 0.878405, 'grad_norm': 0.79014498, 'learning_rate': 2.377e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.330793, 'epoch': 0.69, 'global_step/max_steps': '320/462', 'percentage': '69.26%', 'elapsed_time': '16m 6s', 'remaining_time': '7m 9s'}


Train:  70%|███████   | 325/462 [16:20<06:22,  2.79s/it]

{'loss': 0.40841742, 'acc': 0.88120117, 'grad_norm': 1.08930814, 'learning_rate': 2.226e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.331134, 'epoch': 0.7, 'global_step/max_steps': '325/462', 'percentage': '70.35%', 'elapsed_time': '16m 20s', 'remaining_time': '6m 53s'}


Train:  71%|███████▏  | 330/462 [16:35<06:08,  2.79s/it]

{'loss': 0.42472067, 'acc': 0.87871714, 'grad_norm': 0.99994957, 'learning_rate': 2.079e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.331485, 'epoch': 0.71, 'global_step/max_steps': '330/462', 'percentage': '71.43%', 'elapsed_time': '16m 34s', 'remaining_time': '6m 37s'}


Train:  73%|███████▎  | 335/462 [16:49<06:00,  2.84s/it]

{'loss': 0.44544539, 'acc': 0.87512178, 'grad_norm': 1.02056921, 'learning_rate': 1.935e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.331533, 'epoch': 0.72, 'global_step/max_steps': '335/462', 'percentage': '72.51%', 'elapsed_time': '16m 49s', 'remaining_time': '6m 22s'}


Train:  74%|███████▎  | 340/462 [17:03<05:39,  2.79s/it]

{'loss': 0.36765707, 'acc': 0.89457407, 'grad_norm': 1.01563561, 'learning_rate': 1.795e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.331884, 'epoch': 0.74, 'global_step/max_steps': '340/462', 'percentage': '73.59%', 'elapsed_time': '17m 3s', 'remaining_time': '6m 7s'}


Train:  75%|███████▍  | 345/462 [17:18<05:26,  2.79s/it]

{'loss': 0.41896157, 'acc': 0.88344431, 'grad_norm': 1.03732395, 'learning_rate': 1.66e-05, 'memory(GiB)': 6.45, 'train_speed(iter/s)': 0.332183, 'epoch': 0.75, 'global_step/max_steps': '345/462', 'percentage': '74.68%', 'elapsed_time': '17m 18s', 'remaining_time': '5m 52s'}


Train:  76%|███████▌  | 350/462 [17:32<05:13,  2.80s/it]

{'loss': 0.39783359, 'acc': 0.88255873, 'grad_norm': 0.85950315, 'learning_rate': 1.528e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.332465, 'epoch': 0.76, 'global_step/max_steps': '350/462', 'percentage': '75.76%', 'elapsed_time': '17m 32s', 'remaining_time': '5m 36s'}



Val: 100%|██████████| 74/74 [00:04<00:00, 16.90it/s]/it]


{'eval_loss': 0.42960051, 'eval_acc': 0.87201067, 'eval_runtime': 3.5751, 'eval_samples_per_second': 20.699, 'eval_steps_per_second': 20.699, 'epoch': 0.76, 'global_step/max_steps': '350/462', 'percentage': '75.76%', 'elapsed_time': '17m 37s', 'remaining_time': '5m 38s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-350
Train:  77%|███████▋  | 355/462 [17:53<05:53,  3.30s/it]

{'loss': 0.40924296, 'acc': 0.88343334, 'grad_norm': 0.98651195, 'learning_rate': 1.402e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.330591, 'epoch': 0.77, 'global_step/max_steps': '355/462', 'percentage': '76.84%', 'elapsed_time': '17m 53s', 'remaining_time': '5m 23s'}


Train:  78%|███████▊  | 360/462 [18:08<04:56,  2.91s/it]

{'loss': 0.43473897, 'acc': 0.8762825, 'grad_norm': 0.93761587, 'learning_rate': 1.279e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.330782, 'epoch': 0.78, 'global_step/max_steps': '360/462', 'percentage': '77.92%', 'elapsed_time': '18m 7s', 'remaining_time': '5m 8s'}


Train:  79%|███████▉  | 365/462 [18:22<04:34,  2.83s/it]

{'loss': 0.45346122, 'acc': 0.87726536, 'grad_norm': 0.89926088, 'learning_rate': 1.162e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.330989, 'epoch': 0.79, 'global_step/max_steps': '365/462', 'percentage': '79.00%', 'elapsed_time': '18m 22s', 'remaining_time': '4m 52s'}


Train:  80%|████████  | 370/462 [18:37<04:23,  2.86s/it]

{'loss': 0.38205791, 'acc': 0.88931932, 'grad_norm': 0.95226693, 'learning_rate': 1.05e-05, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.331048, 'epoch': 0.8, 'global_step/max_steps': '370/462', 'percentage': '80.09%', 'elapsed_time': '18m 37s', 'remaining_time': '4m 37s'}


Train:  81%|████████  | 375/462 [18:51<04:04,  2.82s/it]

{'loss': 0.39556544, 'acc': 0.88953457, 'grad_norm': 0.8461116, 'learning_rate': 9.42e-06, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.331327, 'epoch': 0.81, 'global_step/max_steps': '375/462', 'percentage': '81.17%', 'elapsed_time': '18m 51s', 'remaining_time': '4m 22s'}


Train:  82%|████████▏ | 380/462 [19:05<03:50,  2.81s/it]

{'loss': 0.39889789, 'acc': 0.8880043, 'grad_norm': 0.85528046, 'learning_rate': 8.4e-06, 'memory(GiB)': 7.55, 'train_speed(iter/s)': 0.331558, 'epoch': 0.82, 'global_step/max_steps': '380/462', 'percentage': '82.25%', 'elapsed_time': '19m 5s', 'remaining_time': '4m 7s'}


Train:  83%|████████▎ | 385/462 [19:20<03:48,  2.96s/it]

{'loss': 0.39577029, 'acc': 0.88321772, 'grad_norm': 1.08996642, 'learning_rate': 7.43e-06, 'memory(GiB)': 8.31, 'train_speed(iter/s)': 0.33152, 'epoch': 0.83, 'global_step/max_steps': '385/462', 'percentage': '83.33%', 'elapsed_time': '19m 20s', 'remaining_time': '3m 52s'}


Train:  84%|████████▍ | 390/462 [19:35<03:22,  2.82s/it]

{'loss': 0.4265738, 'acc': 0.87573872, 'grad_norm': 0.98954219, 'learning_rate': 6.52e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.331768, 'epoch': 0.84, 'global_step/max_steps': '390/462', 'percentage': '84.42%', 'elapsed_time': '19m 34s', 'remaining_time': '3m 36s'}


Train:  85%|████████▌ | 395/462 [19:49<03:06,  2.79s/it]

{'loss': 0.40145974, 'acc': 0.88122559, 'grad_norm': 0.78503913, 'learning_rate': 5.66e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.332052, 'epoch': 0.85, 'global_step/max_steps': '395/462', 'percentage': '85.50%', 'elapsed_time': '19m 49s', 'remaining_time': '3m 21s'}


Train:  87%|████████▋ | 400/462 [20:04<02:57,  2.86s/it]

{'loss': 0.41636968, 'acc': 0.88070459, 'grad_norm': 0.73468322, 'learning_rate': 4.86e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.332072, 'epoch': 0.86, 'global_step/max_steps': '400/462', 'percentage': '86.58%', 'elapsed_time': '20m 4s', 'remaining_time': '3m 6s'}



Val: 100%|██████████| 74/74 [00:04<00:00, 15.54it/s]/it]


{'eval_loss': 0.42884487, 'eval_acc': 0.87252386, 'eval_runtime': 3.5798, 'eval_samples_per_second': 20.672, 'eval_steps_per_second': 20.672, 'epoch': 0.86, 'global_step/max_steps': '400/462', 'percentage': '86.58%', 'elapsed_time': '20m 9s', 'remaining_time': '3m 7s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-400
Train:  88%|████████▊ | 405/462 [20:24<03:04,  3.23s/it]

{'loss': 0.40595789, 'acc': 0.88368454, 'grad_norm': 0.85286325, 'learning_rate': 4.12e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.330725, 'epoch': 0.88, 'global_step/max_steps': '405/462', 'percentage': '87.66%', 'elapsed_time': '20m 24s', 'remaining_time': '2m 52s'}


Train:  89%|████████▊ | 410/462 [20:37<02:27,  2.84s/it]

{'loss': 0.41216197, 'acc': 0.88164387, 'grad_norm': 0.87484843, 'learning_rate': 3.44e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.331054, 'epoch': 0.89, 'global_step/max_steps': '410/462', 'percentage': '88.74%', 'elapsed_time': '20m 37s', 'remaining_time': '2m 37s'}


Train:  90%|████████▉ | 415/462 [20:51<02:10,  2.78s/it]

{'loss': 0.42160602, 'acc': 0.87750359, 'grad_norm': 0.91172439, 'learning_rate': 2.81e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.331349, 'epoch': 0.9, 'global_step/max_steps': '415/462', 'percentage': '89.83%', 'elapsed_time': '20m 51s', 'remaining_time': '2m 21s'}


Train:  91%|█████████ | 420/462 [21:05<01:56,  2.79s/it]

{'loss': 0.3999743, 'acc': 0.88663568, 'grad_norm': 0.92454809, 'learning_rate': 2.25e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.33162, 'epoch': 0.91, 'global_step/max_steps': '420/462', 'percentage': '90.91%', 'elapsed_time': '21m 5s', 'remaining_time': '2m 6s'}


Train:  92%|█████████▏| 425/462 [21:19<01:42,  2.78s/it]

{'loss': 0.43025627, 'acc': 0.87837324, 'grad_norm': 0.86141276, 'learning_rate': 1.75e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.331904, 'epoch': 0.92, 'global_step/max_steps': '425/462', 'percentage': '91.99%', 'elapsed_time': '21m 19s', 'remaining_time': '1m 51s'}


Train:  93%|█████████▎| 430/462 [21:33<01:28,  2.77s/it]

{'loss': 0.48124161, 'acc': 0.86716604, 'grad_norm': 0.91184419, 'learning_rate': 1.31e-06, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.332184, 'epoch': 0.93, 'global_step/max_steps': '430/462', 'percentage': '93.07%', 'elapsed_time': '21m 33s', 'remaining_time': '1m 36s'}


Train:  94%|█████████▍| 435/462 [21:47<01:14,  2.78s/it]

{'loss': 0.42900085, 'acc': 0.87547712, 'grad_norm': 0.86343181, 'learning_rate': 9.3e-07, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.332453, 'epoch': 0.94, 'global_step/max_steps': '435/462', 'percentage': '94.16%', 'elapsed_time': '21m 47s', 'remaining_time': '1m 21s'}


Train:  95%|█████████▌| 440/462 [22:01<01:01,  2.78s/it]

{'loss': 0.46473112, 'acc': 0.87145224, 'grad_norm': 0.98853517, 'learning_rate': 6.2e-07, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.332714, 'epoch': 0.95, 'global_step/max_steps': '440/462', 'percentage': '95.24%', 'elapsed_time': '22m 1s', 'remaining_time': '1m 6s'}


Train:  96%|█████████▋| 445/462 [22:15<00:47,  2.78s/it]

{'loss': 0.41415448, 'acc': 0.88200302, 'grad_norm': 0.97976071, 'learning_rate': 3.7e-07, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.332971, 'epoch': 0.96, 'global_step/max_steps': '445/462', 'percentage': '96.32%', 'elapsed_time': '22m 15s', 'remaining_time': '51s'}


Train:  97%|█████████▋| 450/462 [22:30<00:33,  2.81s/it]

{'loss': 0.41948671, 'acc': 0.88555565, 'grad_norm': 0.87642658, 'learning_rate': 1.9e-07, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.333187, 'epoch': 0.97, 'global_step/max_steps': '450/462', 'percentage': '97.40%', 'elapsed_time': '22m 29s', 'remaining_time': '35s'}



Val: 100%|██████████| 74/74 [00:03<00:00, 20.48it/s]/it]


{'eval_loss': 0.42906445, 'eval_acc': 0.87211331, 'eval_runtime': 3.5973, 'eval_samples_per_second': 20.571, 'eval_steps_per_second': 20.571, 'epoch': 0.97, 'global_step/max_steps': '450/462', 'percentage': '97.40%', 'elapsed_time': '22m 33s', 'remaining_time': '36s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-450
Train:  98%|█████████▊| 455/462 [22:48<00:21,  3.12s/it]

{'loss': 0.41153564, 'acc': 0.88095789, 'grad_norm': 0.86814803, 'learning_rate': 6e-08, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.332309, 'epoch': 0.98, 'global_step/max_steps': '455/462', 'percentage': '98.48%', 'elapsed_time': '22m 48s', 'remaining_time': '21s'}


Train: 100%|█████████▉| 460/462 [23:02<00:05,  2.84s/it]

{'loss': 0.42737613, 'acc': 0.87427845, 'grad_norm': 0.82542026, 'learning_rate': 1e-08, 'memory(GiB)': 2.4, 'train_speed(iter/s)': 0.332549, 'epoch': 0.99, 'global_step/max_steps': '460/462', 'percentage': '99.57%', 'elapsed_time': '23m 2s', 'remaining_time': '6s'}


Train: 100%|██████████| 462/462 [23:08<00:00,  2.85s/it]
Val: 100%|██████████| 74/74 [00:03<00:00, 20.67it/s]/it]


{'eval_loss': 0.42900014, 'eval_acc': 0.87252386, 'eval_runtime': 3.5451, 'eval_samples_per_second': 20.874, 'eval_steps_per_second': 20.874, 'epoch': 1.0, 'global_step/max_steps': '462/462', 'percentage': '100.00%', 'elapsed_time': '23m 11s', 'remaining_time': '0s'}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[INFO:swift] Saving model checkpoint to /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-462
Train: 100%|██████████| 462/462 [23:13<00:00,  3.02s/it]
[INFO:swift] last_model_checkpoint: /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-462
[INFO:swift] best_model_checkpoint: /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/checkpoint-400
[INFO:swift] images_dir: /root/autodl-fs/code/nlp_pj/pj6/output/deepseek/qwen2_5-0_5b-instruct/v1-20241209-214329/images


{'train_runtime': 1393.15, 'train_samples_per_second': 5.311, 'train_steps_per_second': 0.332, 'train_loss': 0.41776851, 'epoch': 1.0, 'global_step/max_steps': '462/462', 'percentage': '100.00%', 'elapsed_time': '23m 13s', 'remaining_time': '0s'}


[INFO:swift] End time of running main: 2024-12-09 22:06:59.113280
