In [1]:
# Wildcard import from the project-local `utils` module. This notebook calls
# `delete_cache()` and `predict()` without defining them, so they presumably
# come from here — TODO confirm and replace with explicit imports.
from utils import *
# Remove cached bytecode before the run (the recorded cell output reports
# that the __pycache__ directories were deleted).
delete_cache()

import json
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoTokenizer
import os
# load_dataset()

Deleting: __pycache__
All __pycache__ directories have been deleted.


In [None]:
# System prompt (Chinese). Rough English gloss: "You are a Hong Kong insurance
# broker sitting an exam and must answer the questions correctly. Answer
# format example: Answer: C. Explanation: ... (brief explanation)".
# It is used as the system turn when tokenising training examples and is
# stored as the "instruction" field of the converted JSONL records.
PROMPT = "你是一个香港保险经纪人，你在考试，你需要正确回答考试题目。回答格式举例。答案：C。解释：...（简单解释一下）"
# Maximum number of tokens kept per training example; longer sequences are
# truncated in process_func.
MAX_LENGTH = 2048

# Model name; it is interpolated into the Hugging Face repo id
# ("Qwen/<model_name>"), the training output directory and the W&B project name.
model_name = 'Qwen3-1.7B'

In [3]:
# Hugging Face Hub repo id shared by the tokenizer and the model weights.
hub_repo_id = f"Qwen/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(
    hub_repo_id,
    use_fast=False,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    hub_repo_id,
    device_map="auto",
    dtype=torch.bfloat16,
)
# Must be called when gradient checkpointing is enabled (it is turned on in
# the TrainingArguments of this notebook).
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def process_func(example):
    """
    Tokenise one formatted record into causal-LM training features.

    The prompt part (system turn with PROMPT, user turn with
    ``example['input']``, and the opening of the assistant turn) is encoded
    separately from the answer (``example['output']``) so that the prompt
    positions can be masked out of the loss with -100.

    Args:
        example: mapping with at least the keys ``"input"`` and ``"output"``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each cut
        to at most MAX_LENGTH entries.
    """
    prompt_text = (
        f"<|im_start|>system\n{PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    prompt_enc = tokenizer(prompt_text, add_special_tokens=False)
    answer_enc = tokenizer(f"{example['output']}", add_special_tokens=False)

    prompt_ids = prompt_enc["input_ids"]
    answer_ids = answer_enc["input_ids"]
    # The pad token id is appended as the end-of-answer marker and is kept in
    # the labels, so the model is trained to emit it.
    # NOTE(review): confirm this is the intended stop token for this tokenizer
    # (as opposed to tokenizer.eos_token_id).
    terminator = [tokenizer.pad_token_id]

    features = {
        "input_ids": prompt_ids + answer_ids + terminator,
        "attention_mask": prompt_enc["attention_mask"] + answer_enc["attention_mask"] + [1],
        # Only the answer tokens and the terminator contribute to the loss.
        "labels": [-100] * len(prompt_ids) + answer_ids + terminator,
    }
    # Truncate over-long examples; slicing is a no-op for shorter ones.
    return {name: values[:MAX_LENGTH] for name, values in features.items()}

def dataset_jsonl_transfer(origin_path, new_path):
    """
    Convert a raw JSONL dataset into the instruction-tuning JSONL format.

    Each non-blank input line must be a JSON object with the keys
    ``"question"``, ``"think"`` and ``"answer"``. It is rewritten as an
    object with ``"instruction"`` (the module-level PROMPT), ``"input"``
    (the question as a string) and ``"output"``
    (``<think>{think}</think> \\n {answer}``).

    Args:
        origin_path: path of the raw JSONL file to read.
        new_path: path of the converted JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required keys.
    """
    records = []

    # Read explicitly as UTF-8 (the output is written as UTF-8 too); relying on
    # the platform default encoding breaks on non-UTF-8 locales.
    with open(origin_path, "r", encoding="utf-8") as source:
        for raw_line in source:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not raw_line.strip():
                continue
            data = json.loads(raw_line)
            question = data["question"]
            think = data["think"]
            answer = data["answer"]
            records.append(
                {
                    "instruction": PROMPT,
                    "input": f"{question}",
                    "output": f"<think>{think}</think> \n {answer}",
                }
            )

    # All records are collected before writing, so a failure while parsing
    # never leaves a half-written output file behind.
    with open(new_path, "w", encoding="utf-8") as target:
        for record in records:
            target.write(json.dumps(record, ensure_ascii=False) + "\n")
            

In [5]:
# Raw splits and the paths of their converted (instruction-format) copies.
train_dataset_path = "train.jsonl"
test_dataset_path = "val.jsonl"
train_jsonl_new_path = "train_format.jsonl"
test_jsonl_new_path = "val_format.jsonl"

# Convert each split once. An already existing formatted file is reused as-is,
# so delete it manually to force a re-conversion.
for raw_path, formatted_path in (
    (train_dataset_path, train_jsonl_new_path),
    (test_dataset_path, test_jsonl_new_path),
):
    if os.path.exists(formatted_path):
        continue
    dataset_jsonl_transfer(raw_path, formatted_path)

In [6]:
# 得到训练集
def _load_and_tokenize(jsonl_path):
    """Read a formatted JSONL split and tokenise it with process_func.

    Returns the pandas frame, the raw ``Dataset`` built from it, and the
    tokenised ``Dataset`` (with the original columns removed).
    """
    frame = pd.read_json(jsonl_path, lines=True)
    raw_dataset = Dataset.from_pandas(frame)
    tokenized = raw_dataset.map(process_func, remove_columns=raw_dataset.column_names)
    return frame, raw_dataset, tokenized


# Training split
train_df, train_ds, train_dataset = _load_and_tokenize(train_jsonl_new_path)

# Validation split
eval_df, eval_ds, eval_dataset = _load_and_tokenize(test_jsonl_new_path)

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [7]:
args = TrainingArguments(
    # Where checkpoints are written, and the run name reported to the tracker.
    output_dir=f"./output/{model_name}",
    run_name=model_name,
    report_to="wandb",
    # Optimisation: 1 sample per device step, accumulated over 4 steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    num_train_epochs=2,
    # Evaluation every 5 steps, logging and checkpointing every 10 steps.
    eval_strategy="steps",
    eval_steps=5,
    per_device_eval_batch_size=1,
    logging_steps=10,
    save_steps=10,
    save_on_each_node=True,
)

In [8]:
from datetime import datetime

import wandb

# Timestamp suffix (yymmddHHMM) so repeated runs get distinct W&B names.
now_str = datetime.now().strftime("%y%m%d%H%M")
print(now_str)

run = wandb.init(
    entity="kcmyteam",
    project=f"{model_name}-project",
    name=f"{args.run_name}_{now_str}",
    notes="Fine-tuning Qwen3 1.7B",
    tags=["qwen3-1.7B", "SFT"],
    job_type="train",

    config={
        "model_name": model_name,
        "training_method": "SFT",
        # Read hyper-parameters from the TrainingArguments instead of
        # repeating literals, so the logged config cannot drift from the
        # values actually used for training.
        "learning_rate": args.learning_rate,
        "epochs": args.num_train_epochs,
    },
)

2512121809


[34m[1mwandb[0m: Currently logged in as: [33mkratoschu[0m ([33mkcmyteam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
# Collator that pads the variable-length tokenised examples of each batch.
batch_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=batch_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
5,No log,2.778255
10,2.662800,2.791492
15,2.662800,2.783824
20,1.719500,2.790395
25,1.719500,2.767548
30,1.197400,2.74862
35,1.197400,2.751687
40,0.987100,2.761685
45,0.987100,2.765748
50,1.027700,2.766845


TrainOutput(global_step=50, training_loss=1.5189008712768555, metrics={'train_runtime': 149.701, 'train_samples_per_second': 1.323, 'train_steps_per_second': 0.334, 'total_flos': 268644830429184.0, 'train_loss': 1.5189008712768555, 'epoch': 2.0})

In [10]:
# 用测试集的前3条，主观看模型
# Qualitative check: run the fine-tuned model on the first 3 validation records.
test_df = pd.read_json(test_jsonl_new_path, lines=True)[:3]

# Rendered "question + model answer" texts, one per evaluated record.
test_text_list = []

for _, row in test_df.iterrows():
    instruction = row['instruction']
    input_value = row['input']

    # Same system/user layout as used when building the training examples.
    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"}
    ]

    # `predict` is not defined in this notebook; presumably it comes from the
    # wildcard `utils` import — TODO confirm.
    response = predict(messages, model, tokenizer)

    response_text = f"""
    Question: {input_value}

    LLM:{response}
    """

    # Keep the rendered text; the list was previously created but never filled.
    test_text_list.append(response_text)
    print(response_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



    Question: 下列哪項可被稱為人壽保險中
的「商業需求」而非「個人需
求」？
只供保誠保險內部使用，不得發放予任何公眾人士、客戶或準客戶，包括網上平台或電子媒介。
12
i) 關鍵人物
ii) 企業擁有人
iii)退休收入
iv) 僱員福利
a i, ii, iii
b i, ii, iv
c i, iii, iv
d ii, iii, iv

    LLM:<think>關鍵人物是指保單持有人
在個人生活或商業上重要或關
系的人，可被稱為人壽保險中
的「商業需求」而非「個人需
求」</think> 
 C
    

    Question: 假若保險人不把投保人正感到
胃痛一事列作重要事實，其原
因必定是：
a 胃痛是可以透過身體檢查
發覺的
b 這與投保人的健康狀況沒
有關係
c 收取附加保費就可以接受這
項投保
d 這事實不會影響保險人決定

    LLM:<think>這項事實不會影響保險人決定</think> 
 D
    

    Question: 人壽保險採用均衡保費制度,
每期支付保費均相同，並假
設：
a 利率回報不變
b 死亡保險金不變
c 經濟氣候安穩
d 保險人對死亡索償預計不變

    LLM:<think>均衡保費制度是人壽保險中
最常見的保費計算方法，其特
點是每期支付保費均相同，並
假設死亡保險金不變，經濟氣
候安穩，保險人對死亡索償預
計不變</think> 
 C
    
