数据处理

In [1]:
from datasets import load_dataset

# Load only the first 500 rows of the "train" split. Despite its name,
# `test_dataset` is the data that is later formatted and used for training.
test_dataset = load_dataset(
    "YeungNLP/firefly-train-1.1M",
    split="train[:500]",
)

Repo card metadata block was not found. Setting CardData to empty.
Using the latest cached version of the dataset since YeungNLP/firefly-train-1.1M couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\xlf\.cache\huggingface\datasets\YeungNLP___firefly-train-1.1_m\default\0.0.0\92947564f0b6bac44c405272df8cd7247937fc2d (last modified on Wed Oct 30 10:43:04 2024).


In [2]:
# Display the dataset summary (column names and row count).
test_dataset

Dataset({
    features: ['kind', 'input', 'target'],
    num_rows: 500
})

In [3]:
# Inspect a single raw record to see what the 'kind', 'input' and 'target' fields hold.
test_dataset[100]

{'kind': 'ClassicalChinese',
 'input': '我当时在三司，访求太祖、仁宗的手书敕令没有见到，然而人人能传诵那些话，禁止私盐的建议也最终被搁置。\n翻译成文言文：',
 'target': '余时在三司，求访两朝墨敕不获，然人人能诵其言，议亦竟寝。'}

In [4]:
from transformers import AutoTokenizer

# Raw string literal: this Windows path contains backslashes, and "\l", "\M"
# and "\Q" are not valid escape sequences in a normal string. They only worked
# by accident (Python keeps the backslash but emits a warning, and is slated to
# reject them). The resulting path value is unchanged.
tokenizer = AutoTokenizer.from_pretrained(r"F:\llm_work\Model\Qwen2___5-0___5B-Instruct")

def format_prompt(example):
    """Render one dataset row as a single chat-formatted training string.

    The row's ``"input"`` becomes the user turn and its ``"target"`` becomes
    the assistant turn, preceded by a fixed system message. The conversation
    is rendered to text (not token ids) with the module-level ``tokenizer``'s
    chat template.

    Args:
        example: Mapping with at least the keys ``"input"`` and ``"target"``.

    Returns:
        A dict ``{"text": <rendered conversation>}``.
    """
    system_turn = {"role": "system", "content": "你是一个人工智能助手，是up主“小明”开发的."}
    user_turn = {"role": "user", "content": example["input"]}
    assistant_turn = {"role": "assistant", "content": example["target"]}

    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
    )
    return {"text": rendered}

# Format every row and drop all original columns, so that only the rendered
# "text" column produced by format_prompt remains.
dataset = test_dataset.map(
    format_prompt,
    remove_columns=test_dataset.column_names,
)
dataset


Dataset({
    features: ['text'],
    num_rows: 500
})

In [5]:
# Inspect the first formatted example to check the rendered chat template.
dataset[0]

{'text': '<|im_start|>system\n你是一个人工智能助手，是up主“小明”开发的.<|im_end|>\n<|im_start|>user\n自然语言推理：\n前提：家里人心甘情愿地养他,还有几家想让他做女婿的\n假设：他是被家里人收养的孤儿<|im_end|>\n<|im_start|>assistant\n中立<|im_end|>\n'}

In [6]:
from transformers import AutoModelForCausalLM

# Raw string literal: this Windows path contains backslashes, and "\l", "\M"
# and "\Q" are not valid escape sequences in a normal string (they trigger a
# warning and are slated to become errors). Defining the path once also keeps
# the model and the tokenizer pointing at the same checkpoint.
MODEL_PATH = r"F:\llm_work\Model\Qwen2___5-0___5B-Instruct"

# Load the base model and cast its weights to half precision.
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH).half()

# Reload the tokenizer and pad on the right-hand side of each sequence.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.padding_side = "right"



In [7]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA settings for causal-LM fine-tuning: adapters are attached to the
# modules named q_proj, k_proj and v_proj.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
    ],
    r=64,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the loaded model with the adapters described by peft_config.
model = get_peft_model(model, peft_config)


In [8]:
from transformers import Trainer, TrainingArguments

output_dir = "./outputs"

training_args = TrainingArguments(
    # Where checkpoints go and how often they are written.
    output_dir=output_dir,
    save_steps=50,
    # One sample per device step, accumulated over 4 steps per optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Optimizer and learning-rate schedule.
    optim="adamw_torch",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # Run length, logging cadence and mixed-precision training.
    num_train_epochs=1,
    logging_steps=10,
    fp16=True,
)


In [9]:
from trl import SFTTrainer

# `model` has already been wrapped with get_peft_model(model, peft_config) in
# an earlier cell, so peft_config is deliberately NOT passed again here:
# handing the trainer an already-wrapped PEFT model together with a
# peft_config is redundant.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by format_prompt
    tokenizer=tokenizer,
)

trainer.train()

# Persist the fine-tuned PEFT model so it can be reloaded for inference later.
trainer.model.save_pretrained("./result/final_model")




Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


  0%|          | 0/125 [00:00<?, ?it/s]

{'loss': 3.9053, 'grad_norm': 2.084488868713379, 'learning_rate': 0.00019745268727865774, 'epoch': 0.08}
{'loss': 3.2673, 'grad_norm': 2.499664783477783, 'learning_rate': 0.00018881364488135448, 'epoch': 0.16}
{'loss': 3.0687, 'grad_norm': 1.2388889789581299, 'learning_rate': 0.00017459411454241822, 'epoch': 0.24}
{'loss': 2.8596, 'grad_norm': 1.0219742059707642, 'learning_rate': 0.00015568756164881882, 'epoch': 0.32}
{'loss': 2.6614, 'grad_norm': 1.2218854427337646, 'learning_rate': 0.00013328195445229868, 'epoch': 0.4}
{'loss': 2.7552, 'grad_norm': 1.202169418334961, 'learning_rate': 0.00010878511965507434, 'epoch': 0.48}
{'loss': 2.4015, 'grad_norm': 1.163712739944458, 'learning_rate': 8.373628348051165e-05, 'epoch': 0.56}
{'loss': 2.5993, 'grad_norm': 1.6480128765106201, 'learning_rate': 5.9709356428633746e-05, 'epoch': 0.64}
{'loss': 2.4824, 'grad_norm': 1.0274461507797241, 'learning_rate': 3.821403869096658e-05, 'epoch': 0.72}
{'loss': 2.5851, 'grad_norm': 1.4728624820709229, 'le

In [13]:
from peft import AutoPeftModelForCausalLM

# Reload the saved fine-tuning result from disk; device placement is left to
# device_map="auto".
model = AutoPeftModelForCausalLM.from_pretrained("./result/final_model", device_map="auto")

# Fold the LoRA weights into the base model and keep the merged result.
merged_model = model.merge_and_unload()

In [15]:
from transformers import pipeline

# Text-generation pipeline built on the merged model and the existing tokenizer.
pipe = pipeline(
    task="text-generation",
    model=merged_model,
    tokenizer=tokenizer,
)

# Hand-written prompt in the same chat layout as the training text, ending
# with an open assistant turn for the model to complete.
prompt_example = """<|im_start|>system
你是一个人工智能助手，是up主“小明”开发的.<|im_end|>
<|im_start|>user
我喜欢游泳，但不喜欢吃鱼。
翻译成文言文：<|im_end|>
<|im_start|>assistant
"""

# The pipeline returns a list of results; print the text of the first one.
print(
    pipe(prompt_example, max_new_tokens=50)[0]["generated_text"]
)

<|im_start|>system
你是一个人工智能助手，是up主“小明”开发的.<|im_end|>
<|im_start|>user
我喜欢游泳，但不喜欢吃鱼。
翻译成文言文：<|im_end|>
<|im_start|>assistant
吾喜游水，然恶食鱼。


In [20]:
prompt = "我喜欢游泳，但不喜欢吃鱼。\n翻译成文言文："
messages = [
    {"role": "system", "content": "你是一个人工智能助手，是up主“小明”开发的."},
    {"role": "user", "content": prompt}
]

# Render the conversation with the tokenizer's chat template and append the
# generation prompt so the text ends with an open assistant turn.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
print(text)

# Generate with `merged_model` (the result of merge_and_unload()), the same
# model object the pipeline cell uses, rather than the leftover PEFT wrapper
# `model` whose adapters have already been merged and unloaded.
model_inputs = tokenizer([text], return_tensors="pt").to(merged_model.device)

generated_ids = merged_model.generate(
    **model_inputs,
    max_new_tokens=512
)

# generate() returns prompt + continuation; slice off the prompt tokens so
# only the newly generated tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

<|im_start|>system
你是一个人工智能助手，是up主“小明”开发的.<|im_end|>
<|im_start|>user
我喜欢游泳，但不喜欢吃鱼。
翻译成文言文：<|im_end|>
<|im_start|>assistant



'吾喜泳而不厌，然好食鱼。'

: 