In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # 添加 pad token

model = AutoModelForCausalLM.from_pretrained(model_name)
# 由于添加了新 token，需调整模型的词表大小
model.resize_token_embeddings(len(tokenizer))


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

In [17]:
from datasets import load_dataset
import json


# 假设你已将 Alpaca 指令数据集的小规模子集保存为 JSON 文件
data_files = {"train": "./alpaca_subset.json"}
dataset = load_dataset("json", data_files=data_files)["train"]

def preprocess_function(example):
    # 将 'instruction' 转换为字符串
    instruction = example["instruction"]
    if isinstance(instruction, list):
        instruction = " ".join(instruction)
    
    # 将 'output' 转换为字符串
    output = example["output"]
    if isinstance(output, list):
        output = " ".join(output)
    
    # 获取 'input' 字段，并转换为字符串（如果是列表）
    input_field = example.get("input", "")
    if isinstance(input_field, list):
        input_field = " ".join(input_field)
    
    # 根据 'input' 是否为空，选择合适的模板
    if input_field.strip():
        full_text = f"Instruction: {instruction}\nInput: {input_field}\nResponse: {output}"
    else:
        full_text = f"Instruction: {instruction}\nResponse: {output}"
    
    # 对生成的文本进行分词
    tokenized = tokenizer(full_text, truncation=True, max_length=256)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# 使用 map 方法对数据集进行预处理，并删除原始字段
dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names  # 删除原始字段
)

Map: 100%|██████████| 100/100 [00:00<00:00, 4892.17 examples/s]


In [18]:
print(dataset[:5])

{'input_ids': [6310, 2762, 25, 14026, 502], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [6310, 2762, 25, 14026, 502]}


In [19]:
from transformers import Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig

# 配置 LoRA 参数：例如 rank=4, alpha=16, dropout=0.1
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["c_attn"],  # 根据模型结构指定需要应用 LoRA 的层名称
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# 将预训练模型包装成 LoRA 模型
peft_model = get_peft_model(model, lora_config)

# 指定训练参数
training_args = TrainingArguments(
    output_dir="./qlora_output",
    num_train_epochs=3,
    per_device_train_batch_size=4,  # 根据 MacBook Air 的内存适当调低
    learning_rate=5e-5,
    logging_steps=10,
    save_steps=50,
    evaluation_strategy="no",
    fp16=False,  # MacBook Air 一般不支持 fp16 加速
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,6.6355
20,6.4049
30,6.3927
40,7.2111
50,6.9811
60,7.1805
70,7.3803
80,6.5596
90,7.4611
100,6.9459




TrainOutput(global_step=192, training_loss=6.844680259625117, metrics={'train_runtime': 14.6562, 'train_samples_per_second': 52.401, 'train_steps_per_second': 13.1, 'total_flos': 196312301568.0, 'train_loss': 6.844680259625117, 'epoch': 3.0})