# 👾Qwen2大模型微调入门

作者：林泽毅

教程文章：https://zhuanlan.zhihu.com/p/702491999  

显存要求：10GB左右  

实验过程看：https://swanlab.cn/@ZeyiLin/Qwen2-fintune/runs/cfg5f8dzkp6vouxzaxlx6/chart

## 1. 安装环境

本案例测试于modelscope==1.14.0、transformers==4.41.2、datasets==2.18.0、peft==0.11.1、accelerate==0.30.1、swanlab==0.3.9

In [None]:
%pip install torch swanlab modelscope transformers datasets peft pandas accelerate

如果是第一次使用SwanLab，则前往[SwanLab](https://swanlab.cn)注册账号后，在[用户设置](https://swanlab.cn/settings/overview)复制API Key，然后在终端执行 `swanlab login` 并粘贴该API Key完成登录。

## 2. 数据集加载

1. 在[zh_cls_fudan-news - modelscope](https://modelscope.cn/datasets/huangjintao/zh_cls_fudan-news/files)下载train.jsonl和test.jsonl到同级目录下。

<img src="../assets/dataset.png" width=600>

2. 将train.jsonl和test.jsonl进行处理，转换成new_train.jsonl和new_test.jsonl

In [5]:
import pandas as pd
import json
import os

def dataset_tsv_to_jsonl(origin_path, new_path):
    """Convert a tab-separated personality dataset into instruction-style JSONL.

    Reads ``origin_path`` with pandas using a tab separator and writes one
    JSON object per row to ``new_path``. Each object carries a fixed
    ``instruction`` string, the row's ``comment`` column as ``input`` and the
    five ``personality_*`` columns grouped under ``output``.

    Args:
        origin_path: Path of the source TSV file.
        new_path: Path of the JSONL file to write (opened in ``"w"`` mode).

    Raises:
        KeyError: If ``comment`` or one of the trait columns is missing.
    """
    trait_columns = (
        "personality_conscientiousness",
        "personality_openess",
        "personality_extraversion",
        "personality_agreeableness",
        "personality_stability",
    )
    frame = pd.read_csv(origin_path, sep='\t')

    # All records are assembled before the output file is opened.
    records = [
        {
            "instruction": "根据以下评论内容，预测评论者的个性特质。",
            "input": row['comment'],
            "output": {column: row[column] for column in trait_columns},
        }
        for _, row in frame.iterrows()
    ]

    # One JSON document per line; non-ASCII text is written verbatim.
    with open(new_path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )

# Paths of the source TSV file and of the JSONL file derived from it.
train_tsv_path = "/home/wangyanan/transformer-code1/transformers-code/13-qwen2.5-7b/dataset/train.tsv"  # absolute path; adjust for your environment
new_train_path = "/home/wangyanan/transformer-code1/transformers-code/13-qwen2.5-7b/dataset/new_personality_train.jsonl"


# test_tsv_path = "/kaggle/input/qwen25/test.tsv"  # absolute path; adjust for your environment
# new_test_path = "/kaggle/input/qwen25/new_personality_test.jsonl"
# Convert only when the target JSONL does not exist yet.
if not os.path.exists(new_train_path):
    dataset_tsv_to_jsonl(train_tsv_path, new_train_path)

# if not os.path.exists(new_test_path):
#     dataset_tsv_to_jsonl(test_tsv_path, new_test_path)

train_df = pd.read_json(new_train_path, lines=True)[:792]  # keep the first 792 rows for training (optional)
# NOTE(review): test_df is left commented out here, but later evaluation cells
# iterate over test_df — define it before running them.
# test_df = pd.read_json(new_test_path, lines=True)[:10]  # first 10 rows for a qualitative check


## 3. 下载/加载模型和tokenizer

In [3]:
from modelscope import snapshot_download, AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch

# Download the Qwen2.5-7B checkpoint from ModelScope into the current directory.
# snapshot_download returns the local snapshot directory; that path is reused
# below instead of hard-coding the cache layout.
model_dir = snapshot_download("Qwen/Qwen2.5-7B", cache_dir="./", revision="master")

# Load the tokenizer and the weights with Transformers from the downloaded snapshot.
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16)
model.enable_input_require_grads()  # must be called when gradient checkpointing is enabled

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.40s/it]


## 4. 预处理训练数据

In [7]:
from datasets import Dataset

# NOTE: process_func is defined in a later cell; run that cell first, otherwise
# the map() call below raises NameError (see the recorded cell output).
train_ds = Dataset.from_pandas(train_df)
# Apply process_func to every row and drop the original columns so that only
# the fields it returns remain.
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

NameError: name 'process_func' is not defined

In [None]:
import torch
from transformers import AutoTokenizer

# # 初始化tokenizer，这里需要您根据实际情况替换为您的模型对应的tokenizer
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def process_func(example):
    """Tokenize one dataset row and attach its five personality scores.

    The prompt is rendered as a system turn (the instruction), a user turn
    (the input) and an open assistant turn, then tokenized with the
    module-level ``tokenizer``. A single pad token is appended after the
    prompt with attention-mask value 1.

    Args:
        example: Mapping with ``instruction`` and ``input`` strings and an
            ``output`` mapping that holds the ``personality_*`` scores.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` (lists of at most
        ``MAX_LENGTH`` entries) and ``labels`` (float tensor with one value
        per trait; a score that is missing or not convertible becomes 0.0).

    NOTE(review): ``labels`` holds five floats rather than per-token ids,
    while the notebook loads a causal-LM model — confirm the training setup
    expects this label format.
    """
    MAX_LENGTH = 384
    trait_keys = (
        'personality_conscientiousness',
        'personality_openess',
        'personality_extraversion',
        'personality_agreeableness',
        'personality_stability',
    )

    # Build and tokenize the prompt; special tokens are written explicitly in
    # the text, so the tokenizer must not add its own.
    prompt = tokenizer(
        f"<|im_start|>system\n{example['instruction']}<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )

    # Reserve one position for the trailing pad token so the returned
    # sequence never exceeds MAX_LENGTH (truncating to MAX_LENGTH and then
    # appending would yield MAX_LENGTH + 1 tokens).
    input_ids = prompt["input_ids"][:MAX_LENGTH - 1]
    attention_mask = prompt["attention_mask"][:MAX_LENGTH - 1]

    # Convert the trait scores to floats, falling back to 0.0 when a score is
    # absent, null or not numeric (TypeError covers None values).
    labels = []
    for key in trait_keys:
        try:
            labels.append(float(example['output'][key]))
        except (ValueError, KeyError, TypeError):
            labels.append(0.0)

    return {
        "input_ids": input_ids + [tokenizer.pad_token_id],  # trailing pad token
        "attention_mask": attention_mask + [1],  # the trailing token is attended to
        "labels": torch.tensor(labels, dtype=torch.float),
    }

# Sample record in the same shape as one row of the converted JSONL dataset.
example = {
    "instruction": "根据以下评论内容，预测评论者的个性特质。",
    "input": "It breaks my heart to see people living in those conditions...",
    "output": {
        "personality_conscientiousness": "7",
        "personality_openess": "5.5",
        "personality_extraversion": "1",
        "personality_agreeableness": "6.5",
        "personality_stability": "6"
    }
}

# Smoke-test process_func on the sample record and print the result.
processed_example = process_func(example)
print(processed_example)

## 5. 设置LORA

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter configuration applied to the attention and MLP projection layers.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    inference_mode=False,  # training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha; see the LoRA method description for its role
    lora_dropout=0.1,  # dropout rate
)

# Wrap the loaded model with the LoRA adapters described above.
model = get_peft_model(model, config)

## 6. 训练

In [None]:
# Training hyper-parameters for the Hugging Face Trainer.
# NOTE(review): batch size 2 x 111 accumulation steps is 222 samples per
# optimizer step per device, while the training frame is cut to 792 rows —
# on a single device that is under 4 optimizer steps per epoch, so
# logging_steps=10 / save_steps=100 would rarely or never trigger. Confirm
# these values are intended.
args = TrainingArguments(
    output_dir="./output/Qwen2-personality",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=111,
    logging_steps=10,
    num_train_epochs=4,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",  # built-in reporters disabled; logging goes through the callback
   
)

In [None]:
from swanlab.integration.huggingface import SwanLabCallback
import swanlab

# SwanLab experiment-tracking callback. The experiment metadata names the
# checkpoint that this notebook actually loads (Qwen/Qwen2.5-7B) so that the
# logged run is attributed to the right model.
swanlab_callback = SwanLabCallback(
    project="Qwen2-fintune",
    experiment_name="Qwen2.5-7B",
    description="使用通义千问Qwen2.5-7B模型在personality数据集上微调。",
    config={
        "model": "Qwen/Qwen2.5-7B",
        "dataset": "illusion/personality",
    },
)

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments,
# the tokenized dataset, a padding collator and the SwanLab callback.
# NOTE(review): process_func emits 5-element float labels rather than
# per-token labels — confirm this collator/model combination accepts them.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# ====== 训练结束后的预测 ===== #

def predict(messages, model, tokenizer):
    """Generate the assistant reply for a chat and return it as text.

    Args:
        messages: Chat history as a list of ``{"role", "content"}`` dicts.
        model: Generation-capable model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.

    Returns:
        The decoded completion (prompt tokens stripped). The text is also
        printed.
    """
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Place the inputs on the model's own device instead of assuming "cuda".
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # Forward the whole encoding so the attention mask reaches generate().
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)
    # Keep only the newly generated tokens of each sequence.
    completions = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(completions, skip_special_tokens=True)[0]
    print(response)

    return response
    

# Run the model on every test row and log the conversations to SwanLab.
# NOTE(review): test_df must be defined before this cell runs; its definition
# in the data-loading cell is commented out.
test_text_list = []
for index, row in test_df.iterrows():
    instruction = row["instruction"]
    input_value = row["input"]

    # System turn carries the task instruction, user turn the comment text.
    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"},
    ]

    response = predict(messages, model, tokenizer)
    messages.append({"role": "assistant", "content": f"{response}"})
    # Render the three turns as text; the response doubles as the caption.
    result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
    test_text_list.append(swanlab.Text(result_text, caption=response))

swanlab.log({"Prediction": test_text_list})
swanlab.finish()

In [None]:
# (stray merge-conflict separator removed)
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 假设模型和分词器已经被正确初始化
# model = AutoModelForSequenceClassification.from_pretrained('model_name')
# tokenizer = AutoTokenizer.from_pretrained('model_name')

def predict(input_text, model, tokenizer):
    """Predict the five personality trait scores for one text.

    Args:
        input_text: The comment text to score.
        model: Sequence-classification model whose output exposes ``logits``
            with one value per trait.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        1-D numpy array with the raw model outputs for the first (only)
        sequence. The training labels are raw trait scores, so the outputs
        are returned as-is; a softmax would rescale them to sum to 1.
    """
    # Prefer the device the model lives on; fall back to CUDA-if-available
    # for models that do not expose a ``device`` attribute.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Truncate over-long inputs to the tokenizer's configured maximum.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)

    model.eval()  # disable dropout so predictions are deterministic
    with torch.no_grad():
        # Pass the full encoding so the attention mask is honoured.
        outputs = model(**inputs)
        logits = outputs.logits
    return logits.float().cpu().numpy()[0]

# Score every test row and collect the five trait predictions.
# NOTE(review): test_df must be defined before this cell runs, and it is read
# through a 'comment' column here — confirm it is the raw TSV frame rather
# than the converted JSONL frame (which uses 'input').
predictions_list = []
for index, row in test_df.iterrows():
    input_text = row['comment']
    predictions = predict(input_text, model, tokenizer)
    
    # Append the prediction as one record keyed by trait name.
    predictions_list.append({
        "personality_conscientiousness": predictions[0],
        "personality_openess": predictions[1],
        "personality_extraversion": predictions[2],
        "personality_agreeableness": predictions[3],
        "personality_stability": predictions[4]
    })

# Convert the collected predictions into a DataFrame.
predictions_df = pd.DataFrame(predictions_list)

# Save the DataFrame as a TSV file.
# NOTE(review): this path is under "12-qwen2.5-7b" while the training data
# paths use "13-qwen2.5-7b" — confirm the directory is intended.
output_file_path = "/home/wangyanan/transformer-code1/transformers-code/12-qwen2.5-7b/Qwen/LLM-Finetune/dataset/predictions.tsv"
predictions_df.to_csv(output_file_path, sep='\t', index=False)

print(f"预测结果已保存至：{output_file_path}")

In [128]:
from swanlab.integration.huggingface import SwanLabCallback
import swanlab

# SwanLab experiment-tracking callback (project, run name, description and
# free-form config metadata).
# NOTE(review): the metadata names Qwen2-1.5B-Instruct, whereas the model
# loading cells in this notebook use Qwen/Qwen2.5-7B and bert-base-uncased —
# confirm which model this run should be attributed to.
swanlab_callback = SwanLabCallback(
    project="Qwen2-fintune",
    experiment_name="Qwen2-1.5B-Instruct",
    description="使用通义千问Qwen2-1.5B-Instruct模型在personality数据集上微调。",
    config={
        "model": "qwen/Qwen2-1.5B-Instruct",
        "dataset": "illusion/personality",
    },
)

In [None]:
import pandas as pd
import json
import os
from modelscope import snapshot_download, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def dataset_tsv_to_jsonl(origin_path, new_path):
    """Rewrite a personality TSV file as instruction-style JSONL.

    Every row of the tab-separated file at ``origin_path`` becomes one JSON
    line in ``new_path`` with a fixed ``instruction``, the ``comment`` column
    as ``input`` and the five ``personality_*`` columns under ``output``.

    Args:
        origin_path: Path of the source TSV file.
        new_path: Path of the JSONL file to write (opened in ``"w"`` mode).

    Raises:
        KeyError: If ``comment`` or one of the trait columns is missing.
    """
    trait_columns = (
        "personality_conscientiousness",
        "personality_openess",
        "personality_extraversion",
        "personality_agreeableness",
        "personality_stability",
    )
    table = pd.read_csv(origin_path, sep='\t')

    # Records are built in full before the output file is opened.
    payloads = [
        {
            "instruction": "根据以下评论内容，预测评论者的个性特质。",
            "input": entry['comment'],
            "output": {name: entry[name] for name in trait_columns},
        }
        for _, entry in table.iterrows()
    ]

    with open(new_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(payload, ensure_ascii=False) + "\n" for payload in payloads
        )

# Source TSV and derived JSONL paths (absolute; adjust for your environment).
train_tsv_path = "/home/wangyanan/transformer-code1/transformers-code/13-qwen2.5-7b/dataset/train.tsv"
new_train_path = "/home/wangyanan/transformer-code1/transformers-code/13-qwen2.5-7b/dataset/new_personality_train.jsonl"
# Convert only when the JSONL file has not been generated yet.
if not os.path.exists(new_train_path):
    dataset_tsv_to_jsonl(train_tsv_path, new_train_path)

# Load the whole JSONL file (one JSON object per line) as the training frame.
train_df = pd.read_json(new_train_path, lines=True)

class PersonalityDataset(torch.utils.data.Dataset):
    """Map-style torch dataset yielding tokenized text and five trait scores.

    The base class is referenced as ``torch.utils.data.Dataset`` explicitly:
    the only ``Dataset`` name imported elsewhere in this notebook is
    ``datasets.Dataset``, which is not a suitable base for a class that only
    implements ``__len__`` and ``__getitem__``.

    Args:
        dataframe: pandas DataFrame with an ``input`` text column and an
            ``output`` column holding a mapping of ``personality_*`` scores.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length every sequence is padded / truncated to.
    """

    # Order of the traits in the returned label vector.
    _TRAIT_KEYS = (
        'personality_conscientiousness',
        'personality_openess',
        'personality_extraversion',
        'personality_agreeableness',
        'personality_stability',
    )

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``.

        ``labels`` is a float tensor with one entry per trait; a score that
        is missing, null or not numeric is replaced by 0.0.
        """
        item = self.dataframe.iloc[idx]
        inputs = self.tokenizer(
            item['input'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        labels = []
        for key in self._TRAIT_KEYS:
            try:
                labels.append(float(item['output'][key]))
            except (ValueError, KeyError, TypeError):
                # TypeError covers null scores (float(None)) and a null
                # ``output`` entry; keep the same best-effort default.
                labels.append(0.0)
        return {
            # The tokenizer returns a leading batch axis of size 1; flatten it.
            "input_ids": inputs['input_ids'].flatten(),
            "attention_mask": inputs['attention_mask'].flatten(),
            "labels": torch.tensor(labels, dtype=torch.float),
        }

# Load the pretrained tokenizer and a sequence-classification model with five
# outputs, one per personality trait.
# problem_type="regression" is set explicitly: the labels are float scores,
# and without it a model with num_labels > 1 and float labels is not trained
# with a regression (mean-squared-error) loss.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=5, problem_type="regression"
)

# Replace the final output layer with a fresh 5-unit linear head for the
# regression task (five trait scores).
model.classifier = nn.Linear(model.classifier.in_features, 5)

max_length = 384
dataset = PersonalityDataset(train_df, tokenizer, max_length)
batch_size = 8
# Standalone loader for manual iteration; the Trainer below builds its own
# from `dataset`. Referenced through torch.utils.data because DataLoader is
# not imported by name in this notebook.
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

args = TrainingArguments(
    output_dir="./output/Qwen2-personality",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=2,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    data_collator=None,  # use the Trainer's default collator
)

trainer.train()
# NOTE(review): test_text_list is produced by the generation-evaluation cell
# and no SwanLab callback is attached to this Trainer — confirm that logging
# it here is intended.
swanlab.log({"Prediction": test_text_list})
swanlab.finish()

In [None]:
import pandas as pd
import json
import os

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the bert-base-uncased tokenizer and a sequence-classification model
# with five output units from the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
