In [2]:
from datasets import Dataset
from transformers import AutoTokenizer,AutoModelForCausalLM,DataCollatorForSeq2Seq,TrainingArguments,Trainer

In [3]:
ds = Dataset.load_from_disk("./alpaca_data_zh/")
ds

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 26858
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained("Langboat/bloom-1b4-zh")
tokenizer

BloomTokenizerFast(name_or_path='Langboat/bloom-1b4-zh', vocab_size=46145, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [5]:
def process_fun(example):
    MAX_LENGTH = 256
    input_ids,attention_mask,labels = {},{},{}
    instruction = "\n".join(["Human:" + example["instruction"],example["input"]]).strip() + "\n\nAssistant:"
    instruction = tokenizer(instruction)
    response = tokenizer(example["output"] + tokenizer.eos_token)
    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    labels = [-100]*len(instruction["input_ids"]) + response["input_ids"]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [6]:
tokenized_ds = ds.map(process_fun, remove_columns=ds.column_names)
tokenized_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 26858
})

In [7]:
model = AutoModelForCausalLM.from_pretrained("Langboat/bloom-1b4-zh")

# prefix_tuning 配置

In [8]:
from peft import PrefixTuningConfig,TaskType,get_peft_model
config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM,num_virtual_tokens=10,prefix_projection=True)

In [9]:
config

PrefixTuningConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.PREFIX_TUNING: 'PREFIX_TUNING'>, auto_mapping=None, peft_version='0.18.0', base_model_name_or_path=None, revision=None, inference_mode=False, num_virtual_tokens=10, token_dim=None, num_transformer_submodules=None, num_attention_heads=None, num_layers=None, modules_to_save=None, encoder_hidden_size=None, prefix_projection=True)

In [10]:
model = get_peft_model(model, config)
model.print_trainable_parameters()


trainable params: 205,641,728 || all params: 1,508,753,408 || trainable%: 13.6299


In [11]:
model.prompt_encoder
#93081对应layer*2*hidden_size，也就是每一层都有k和v，每个都是hidden_size个

ModuleDict(
  (default): PrefixEncoder(
    (embedding): Embedding(10, 2048)
    (transform): Sequential(
      (0): Linear(in_features=2048, out_features=2048, bias=True)
      (1): Tanh()
      (2): Linear(in_features=2048, out_features=98304, bias=True)
    )
  )
)

In [12]:
args = TrainingArguments(
    output_dir="./chatbot_prefix",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    logging_steps=10,
    num_train_epochs=1,
    save_steps=10,
)
#model.config.use_cache = False

In [13]:
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

  trainer = Trainer(


In [14]:

trainer.train()



Step,Training Loss
10,3.1758
20,2.6821
30,2.6418
40,2.5242
50,2.4693
60,2.4559
70,2.3311
80,2.4159
90,2.2457
100,2.3753


KeyboardInterrupt: 

In [3]:
import torch
from peft import PeftModel
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"正在使用设备: {device}")
save_path = "./chatbot_prefix/checkpoint-100/"
base_model = AutoModelForCausalLM.from_pretrained("Langboat/bloom-1b4-zh")
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = PeftModel.from_pretrained(base_model,model_id=save_path)
# 6. 移动到显卡
model = model.to(device)
model.eval() # 切换到推理模式

print("模型加载完毕，随时可以调用！")

正在使用设备: mps
模型加载完毕，随时可以调用！


In [4]:
ipt = tokenizer("Human: {}\n{}".format("考试有哪些技巧？", "").strip() + "\n\nAssistant: ", return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(
        **ipt, 
        max_new_tokens=128, 
        do_sample=True,
        top_p=0.85,
        temperature=0.35,
        repetition_penalty=1.2
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Human: 考试有哪些技巧？

Assistant: 英语的阅读和写作能力是衡量学生是否具备学习语言技能的重要指标。在大学阶段，通过大量的练习来提高学生的词汇量和语法水平是非常重要的。此外，一些优秀的课程可以帮助您掌握必要的语法和单词知识。
例如，如果您想获得更高的学术成绩或参加研究生入学测试（例如GRE、GMAT），那么就需要了解更多的专业术语和相关概念以及相关的背景知识和研究方法等内容才能更好地准备这些试题并取得理想的成绩。
