In [1]:
from datasets import Dataset
from transformers import AutoTokenizer,AutoModelForCausalLM,DataCollatorForSeq2Seq,TrainingArguments,Trainer

In [2]:
ds = Dataset.load_from_disk("./alpaca_data_zh/")
ds

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 26858
})

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Langboat/bloom-1b4-zh")
tokenizer

BloomTokenizerFast(name_or_path='Langboat/bloom-1b4-zh', vocab_size=46145, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
def process_fun(example):
    MAX_LENGTH = 256
    input_ids,attention_mask,labels = {},{},{}
    instruction = "\n".join(["Human:" + example["instruction"],example["input"]]).strip() + "\n\nAssistant:"
    instruction = tokenizer(instruction)
    response = tokenizer(example["output"] + tokenizer.eos_token)
    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    labels = [-100]*len(instruction["input_ids"]) + response["input_ids"]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [5]:
tokenized_ds = ds.map(process_fun, remove_columns=ds.column_names)
tokenized_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 26858
})

In [6]:
model = AutoModelForCausalLM.from_pretrained("Langboat/bloom-1b4-zh")

# p_tuning 配置

In [19]:
from peft import PromptEncoderConfig, TaskType, get_peft_model, PromptEncoderReparameterizationType
#encoder_dropout=0.1,encoder_num_layers=3只有设置lstm有用，mlp是定死的，只能改hidden_size
config = PromptEncoderConfig(task_type=TaskType.CAUSAL_LM,num_virtual_tokens=10,
                             encoder_reparameterization_type=PromptEncoderReparameterizationType.LSTM,
                             encoder_hidden_size=1024,encoder_dropout=0.1,encoder_num_layers=3)

In [20]:
config

PromptEncoderConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.P_TUNING: 'P_TUNING'>, auto_mapping=None, peft_version='0.18.0', base_model_name_or_path=None, revision=None, inference_mode=False, num_virtual_tokens=10, token_dim=None, num_transformer_submodules=None, num_attention_heads=None, num_layers=None, modules_to_save=None, encoder_reparameterization_type=<PromptEncoderReparameterizationType.LSTM: 'LSTM'>, encoder_hidden_size=1024, encoder_num_layers=3, encoder_dropout=0.1)

In [21]:
model = get_peft_model(model, config)
model.print_trainable_parameters()


trainable params: 83,959,808 || all params: 1,387,071,488 || trainable%: 6.0530


In [22]:
args = TrainingArguments(
    output_dir="./chatbot",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    logging_steps=10,
    num_train_epochs=1
)

In [23]:
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

  trainer = Trainer(


In [24]:
trainer.train()



Step,Training Loss
10,2.7017
20,2.5818
30,2.5842
40,2.5113
50,2.4814
60,2.4957
70,2.3702
80,2.458
90,2.3135
100,2.4366


KeyboardInterrupt: 

In [28]:
save_directory = "./manual_save_result"
trainer.model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./manual_save_result/tokenizer_config.json',
 './manual_save_result/special_tokens_map.json',
 './manual_save_result/tokenizer.json')

In [29]:
import torch
from peft import PeftModel
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"正在使用设备: {device}")
save_path = "./manual_save_result"
base_model = AutoModelForCausalLM.from_pretrained("Langboat/bloom-1b4-zh")
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = PeftModel.from_pretrained(base_model,model_id=save_path)
# 6. 移动到显卡
model = model.to(device)
model.eval() # 切换到推理模式

print("模型加载完毕，随时可以调用！")

正在使用设备: mps
模型加载完毕，随时可以调用！


In [32]:
ipt = tokenizer("Human: {}\n{}".format("考试有哪些技巧？", "").strip() + "\n\nAssistant: ", return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(
        **ipt, 
        max_new_tokens=128, 
        do_sample=True,
        top_p=0.85,
        temperature=0.35,
        repetition_penalty=1.2
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Human: 考试有哪些技巧？

Assistant: 考生在准备考前应做好以下几点：
1. 注意休息和营养。 在复习阶段，要保证充足的睡眠时间、均衡的饮食以及适当的运动。
2 . 要合理安排作息时间和学习内容。 应根据自己的实际情况制定一个合理的计划表来完成每天的任务并保持良好的状态。
3 . 不要熬夜或通宵达旦地看书做题。 这样只会加重身体负担并且影响正常的工作与生活节奏；相反，可以适当放松一下自己以缓解压力。
4 . 多参加一些有益身心的活动如阅读书籍等有助于提高记忆力及思维能力；此外，还可以多听听音乐
