In [1]:
# Star import from the project-local `utils` module. It supplies `delete_cache`
# (called on the next line) and presumably `predict`, which the inference cell
# calls but no cell in this notebook defines — TODO confirm against utils.py.
from utils import *
# Per this cell's captured output, removes the __pycache__ directories.
delete_cache()

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoTokenizer
import os
import json

Deleting: __pycache__
All __pycache__ directories have been deleted.


In [2]:
# Base checkpoint name; reused for the Hub id, the output directory and the
# W&B project/run names further down.
model_name = "Qwen3-1.7B"

# Upper bound on tokens per training example; process_func cuts longer
# sequences down to this length.
MAX_LENGTH = 2048

# System prompt (Chinese). English gloss: "You are a Hong Kong insurance broker
# sitting an exam and must answer the questions correctly. Note that every
# multiple-choice question has a single answer. Example answer format —
# Answer: E. Explanation: ... (a brief explanation)".
PROMPT = (
    "你是一个香港保险经纪人，你在考试，你需要正确回答考试题目。"
    "请注意，所有选择题都是单选题。"
    "回答格式举例。答案：E。解释：...（简单解释一下）"
)

In [3]:
# Hub repository id shared by the tokenizer and the model weights.
pretrained_id = f"Qwen/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(pretrained_id, use_fast=False, trust_remote_code=True)

# Weights are loaded in bfloat16 and placed automatically across the available devices.
model = AutoModelForCausalLM.from_pretrained(pretrained_id, device_map="auto", dtype=torch.bfloat16)

# Must be called when gradient checkpointing is enabled.
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def process_func(example):
    """
    Tokenize one formatted record into causal-LM training features.

    The prompt is laid out by hand in ChatML form: a system turn carrying
    PROMPT, a user turn carrying example['input'], and the opening of the
    assistant turn. It is tokenized separately from example['output'], and a
    single tokenizer.pad_token_id is appended after the response as the
    closing token.

    Prompt positions are labelled -100 (the conventional ignore value for the
    loss), so only the response and the closing token are supervised. All
    three sequences are cut to at most MAX_LENGTH entries.

    Returns a dict with the keys "input_ids", "attention_mask" and "labels".
    """
    chat_prefix = (
        f"<|im_start|>system\n{PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    prompt_enc = tokenizer(chat_prefix, add_special_tokens=False)
    answer_enc = tokenizer(f"{example['output']}", add_special_tokens=False)

    prompt_ids = prompt_enc["input_ids"]
    answer_ids = answer_enc["input_ids"]
    closing = [tokenizer.pad_token_id]

    features = {
        "input_ids": prompt_ids + answer_ids + closing,
        # The closing token is attended to like any other real token.
        "attention_mask": prompt_enc["attention_mask"] + answer_enc["attention_mask"] + [1],
        "labels": [-100] * len(prompt_ids) + answer_ids + closing,
    }

    # Slicing is a no-op for sequences that already fit, so no length check is needed.
    return {key: sequence[:MAX_LENGTH] for key, sequence in features.items()}

def dataset_jsonl_transfer(origin_path, new_path):
    """
    Convert a raw exam JSONL file into the instruction/input/output JSONL
    layout used for fine-tuning.

    Every non-blank line of ``origin_path`` must be a JSON object with the
    keys "question", "think" and "answer". For each one, a record
    ``{"instruction": PROMPT, "input": <question>, "output":
    "<think>{think}</think> \\n {answer}"}`` is written to ``new_path``, one
    JSON object per line, without ASCII-escaping.

    Args:
        origin_path: Path of the source JSONL file (read as UTF-8).
        new_path: Path of the converted JSONL file (written as UTF-8,
            overwritten if it exists).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "question", "think" or "answer".
    """
    messages = []

    # Read explicitly as UTF-8: the data is Chinese text and the output below is
    # written as UTF-8, so relying on the platform default encoding would make
    # the round trip locale-dependent.
    with open(origin_path, "r", encoding="utf-8") as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            question = data["question"]
            think = data["think"]
            answer = data["answer"]
            messages.append(
                {
                    "instruction": PROMPT,
                    "input": f"{question}",
                    "output": f"<think>{think}</think> \n {answer}",
                }
            )

    # All records are parsed before the output is opened, so a malformed input
    # line raises without leaving a partially written file behind.
    with open(new_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")
            

In [5]:
# Raw splits, and the converted instruction/input/output files derived from them.
train_dataset_path = "train.jsonl"
test_dataset_path = "val.jsonl"
train_jsonl_new_path = "train_format.jsonl"
test_jsonl_new_path = "val_format.jsonl"

# Convert each split only when its formatted file does not exist yet.
for _raw_path, _formatted_path in (
    (train_dataset_path, train_jsonl_new_path),
    (test_dataset_path, test_jsonl_new_path),
):
    if not os.path.exists(_formatted_path):
        dataset_jsonl_transfer(_raw_path, _formatted_path)

In [6]:
def _tokenize_split(jsonl_path):
    """Load a formatted JSONL split and return (DataFrame, raw Dataset, tokenized Dataset)."""
    frame = pd.read_json(jsonl_path, lines=True)
    raw_ds = Dataset.from_pandas(frame)
    # The original text columns are dropped so that only process_func's outputs remain.
    tokenized = raw_ds.map(process_func, remove_columns=raw_ds.column_names)
    return frame, raw_ds, tokenized


# Training set
train_df, train_ds, train_dataset = _tokenize_split(train_jsonl_new_path)

# Validation set
eval_df, eval_ds, eval_dataset = _tokenize_split(test_jsonl_new_path)

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [8]:
args = TrainingArguments(
    # Where checkpoints go, and how the run is reported.
    output_dir=f"./output/{model_name}",
    report_to="wandb",
    run_name=model_name,
    logging_steps=2,
    # Batch geometry: 1 sample per device x 4 accumulation steps per optimizer step.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    # Evaluate every 4 steps and checkpoint every 16. save_steps is a multiple of
    # eval_steps, which load_best_model_at_end requires.
    eval_strategy="steps",
    eval_steps=4,
    save_steps=16,
    save_on_each_node=True,
    load_best_model_at_end=True,
)

In [9]:
from datetime import datetime

import wandb

# Minute-resolution timestamp (yymmddHHMM) appended to the run name so that
# repeated runs get distinct names.
now_str = datetime.now().strftime("%y%m%d%H%M")
print(now_str)

# Hyperparameters recorded with the run.
_run_config = {
    "model_name": model_name,
    "training_method": "SFT",
    "learning_rate": args.learning_rate,
}

# Started before the Trainer is built — presumably so that report_to="wandb"
# logs into this run; TODO confirm.
run = wandb.init(
    entity="kcmyteam",
    project=f"{model_name}-project",
    name=f"{args.run_name}_{now_str}",
    job_type="train",
    tags=["qwen3-1.7B", "SFT"],
    notes="Fine-tuning Qwen3 1.7B",
    config=_run_config,
)

2512130114


[34m[1mwandb[0m: Currently logged in as: [33mkratoschu[0m ([33mkcmyteam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
# padding=True pads every batch to the length of its longest example.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Left as the cell's last expression so that the returned TrainOutput is displayed.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
4,2.9581,2.171017
8,2.4214,1.484324
12,2.1928,1.320557
16,1.6481,1.257887
20,2.0017,1.214973
24,1.797,1.213902
28,1.1539,1.190584
32,1.0753,1.166571
36,1.3536,1.150552
40,1.1702,1.14419


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=75, training_loss=1.4408857520421345, metrics={'train_runtime': 157.855, 'train_samples_per_second': 1.881, 'train_steps_per_second': 0.475, 'total_flos': 550591672891392.0, 'train_loss': 1.4408857520421345, 'epoch': 3.0})

In [11]:
# Subjective spot-check: run the model on the first 3 rows of the validation file.
test_df = pd.read_json(test_jsonl_new_path, lines=True).head(3)

test_text_list = []  # kept from the original cell; nothing is appended to it here

for _, sample in test_df.iterrows():
    input_value = sample["input"]

    # Same system/user layout that process_func builds for training.
    messages = [
        {"role": "system", "content": f"{sample['instruction']}"},
        {"role": "user", "content": f"{input_value}"},
    ]
    response = predict(messages, model, tokenizer)

    response_text = f"""
    Question: {input_value}

    LLM:{response}
    """
    print(response_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



    Question: 下列哪項屬於自然保費制度的特點？
a)保費隨年齡增長而增加
b)會產生逆選擇
c)大部份保險公司已不採用
d)以上各項皆正確

    LLM:<think>自然保費制度的特點包括：保費隨年齡
增長而增加，會產生逆選擇，大部份
保險公司已不採用</think> 
 C
    

    Question: 以下哪項就自然保費釐定制度的描述不正確?
a)保險人已不再使用自然保費制度
b)該制度會出現不利於保險人的逆選擇
c)保費會隨年齡增長而增加
d)以上各項皆不是

    LLM:<think>自然保費釐定制度已不再使用，故選項A不正確</think> 
 A
    

    Question: 分擔與代位權：
a)不適用於人壽保險
b)對人壽保險的索償很重要
c)其重要性在人壽保險索償與一般保險索償一致
d)以上所有皆是

    LLM:<think>分擔與代位權適用於人壽保險</think> 
 D
    
