In [20]:
from trl import SFTTrainer, SFTConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model
import json
from datasets import Dataset

In [4]:
import torch

# Load the tokenizer and the base causal-LM weights from a local model directory.
# NOTE(review): this is an absolute, machine-specific cache path — confirm it
# exists before running on another host.
model_path = "/mnt/workspace/.cache/modelscope/models/Qwen/Qwen2___5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)

# Weights are requested in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  3.75it/s]


In [5]:
# LoRA adapter configuration
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules="all-linear",  # adjust to the model as needed
)

# Rebinds `model` to the PEFT-wrapped model. Because the input and output share
# one name, re-running this cell passes the already wrapped model in again.
model = get_peft_model(model, peft_config)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [11]:
# Load data
def load_data(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. an extra trailing newline) carry
        # no record; skip them instead of letting json.loads raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Read the records and preview the first three.
# NOTE(review): the file is named "test.jsonl", yet these records are what is
# later flattened and handed to the trainer as the training set — confirm this
# is the intended file.
dataset = load_data("../data/test.jsonl")
print(dataset[:3])

[{'messages': [{'role': 'user', 'content': '由于加强了生产过程中的生态环境监控，该基地每年的无公害蔬菜的生产量除供应本省主要市场外，还销往河南、河北等省。'}, {'role': 'assistant', 'content': '由于加强了生产过程中的生态环境监控，该基地每年~~的无公害蔬菜的~~生~~产量~~##的无公害蔬菜的##量除供应本省主要市场外，还销往河南、河北等省。'}], 'source': 'FCGEC EMNLP 2022', 'domain': 'Todo', 'input_token_len': 65, 'language': 'zh', 'modified_times': 3}, {'messages': [{'role': 'user', 'content': '从根本上说，科技发展、经济的振兴，乃至整个社会的进步，主要原因是劳动者的高素质和大量优秀人才的培养决定的。'}, {'role': 'assistant', 'content': '从根本上说，科技发展、经济的振兴，乃至整个社会的进步，主要原因是劳动者的高素质和大量优秀人才的培养~~决定的~~。'}], 'source': 'FCGEC EMNLP 2022', 'domain': 'Todo', 'input_token_len': 65, 'language': 'zh', 'modified_times': 1}, {'messages': [{'role': 'user', 'content': '据《北京青年报》10月22日报道，国务院学位委员会撤销了全国25个省份175所高校的576个学位点，包括部分博士学位。'}, {'role': 'assistant', 'content': '据《北京青年报》10月22日报道，国务院学位委员会撤销了全国25个省份175所高校的576个学位点，包括部分博士学位##授权点##。'}], 'source': 'FCGEC EMNLP 2022', 'domain': 'Todo', 'input_token_len': 57, 'language': 'zh', 'modified_times': 1}]


In [12]:
# Convert the data format
def to_prompt(example):
    """Flatten one chat example into a single plain-text record.

    Args:
        example: Mapping with a "messages" entry, an iterable of dicts that
            each provide "role" and "content" keys.

    Returns:
        dict: ``{"text": ...}`` where every message is rendered as
        "<role>: <content>" on its own line and leading/trailing whitespace
        of the whole string is stripped.
    """
    # NOTE(review): this plain "role: content" layout is presumably not the
    # tokenizer's own chat template — confirm that is intended for fine-tuning.
    turns = [f"{msg['role']}: {msg['content']}" for msg in example["messages"]]
    return {"text": "\n".join(turns).strip()}

# Flatten every example, then preview the first two results.
processed = [to_prompt(example) for example in dataset]
print(processed[:2])

[{'text': 'user: 由于加强了生产过程中的生态环境监控，该基地每年的无公害蔬菜的生产量除供应本省主要市场外，还销往河南、河北等省。\nassistant: 由于加强了生产过程中的生态环境监控，该基地每年~~的无公害蔬菜的~~生~~产量~~##的无公害蔬菜的##量除供应本省主要市场外，还销往河南、河北等省。'}, {'text': 'user: 从根本上说，科技发展、经济的振兴，乃至整个社会的进步，主要原因是劳动者的高素质和大量优秀人才的培养决定的。\nassistant: 从根本上说，科技发展、经济的振兴，乃至整个社会的进步，主要原因是劳动者的高素质和大量优秀人才的培养~~决定的~~。'}]


In [13]:
# Convert the list of flattened {"text": ...} records to a Hugging Face Dataset
ds = Dataset.from_list(processed)
print(ds)

Dataset({
    features: ['text'],
    num_rows: 10
})


In [23]:
# Training configuration
training_args = SFTConfig(
    seed=42,
    output_dir="../output",
    # 2 samples per device x 8 accumulation steps = 16 samples per optimizer step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    num_train_epochs=1,
    logging_dir='../logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    bf16=True,
    report_to="tensorboard",  # send logs to TensorBoard
    logging_first_step=True,
    # Sequence-length cap. Only `max_length` is passed: `max_seq_length` is the
    # deprecated name for the same setting, and supplying both is redundant
    # (and is rejected by TRL releases that dropped the old name).
    max_length=1024,
    label_names=["labels"],
)

In [24]:
from transformers import DataCollatorForLanguageModeling

# Collator for causal language modelling (mlm=False turns off masked-LM masking).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Build the SFTTrainer
trainer = SFTTrainer(
    args=training_args,
    model=model,
    train_dataset=ds,
    data_collator=data_collator,
)

Converting train dataset to ChatML: 100%|██████████| 10/10 [00:00<00:00, 3017.05 examples/s]
Applying chat template to train dataset: 100%|██████████| 10/10 [00:00<00:00, 3577.23 examples/s]
Tokenizing train dataset: 100%|██████████| 10/10 [00:00<00:00, 1161.50 examples/s]
Truncating train dataset: 100%|██████████| 10/10 [00:00<00:00, 3456.94 examples/s]


In [25]:
# Start training
trainer.train()

# Save the adapter into the output directory the run was configured with, so it
# lands next to the run's checkpoints rather than in a separate "./output"
# folder relative to the notebook's working directory.
trainer.model.save_pretrained(trainer.args.output_dir)

  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


Step,Training Loss
