In [1]:
import os
# Point Hugging Face downloads at the hf-mirror.com endpoint. This is set
# before the Hugging Face libraries are imported in the next cell --
# presumably so they pick the value up when they load (TODO confirm).
os.environ.update(HF_ENDPOINT="https://hf-mirror.com")

# Root directory for downloaded artifacts; the tokenizer and model cells
# join a per-model sub-directory onto it for their `cache_dir` argument.
cache_dir = "/root/autodl-tmp"


In [2]:

import transformers
from datasets import Dataset
from transformers import AutoTokenizer,  AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer


In [3]:

# Load the dataset that was previously saved to disk under ./alpaca_data_zh/.
ds = Dataset.load_from_disk("./alpaca_data_zh/")
# NOTE(review): if this really is a single cell, this bare expression is not
# the last statement and therefore displays nothing -- confirm whether the
# export merged two cells here.
ds

# Peek at the first record. Slicing returns a dict of lists keyed by column
# name ('output', 'input', 'instruction'), as the captured output shows.
ds[:1]





{'output': ['以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。'],
 'input': [''],
 'instruction': ['保持健康的三个提示。']}

In [4]:


# Load the tokenizer that matches the Langboat/bloom-1b4-zh checkpoint,
# caching the downloaded files under <cache_dir>/bloom-1b4-zh.
tokenizer = AutoTokenizer.from_pretrained(
    "Langboat/bloom-1b4-zh",
    cache_dir=os.path.join(cache_dir, "bloom-1b4-zh"),
)
tokenizer


BloomTokenizerFast(name_or_path='Langboat/bloom-1b4-zh', vocab_size=46145, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:

def process_func(example, max_length=256):
    """Turn one instruction record into causal-LM training features.

    The prompt is built from ``"Human: " + instruction`` and ``input`` joined
    by a newline (then stripped), followed by a blank line and
    ``"Assistant: "``. The response is ``output`` followed by the tokenizer's
    EOS token. Prompt and response are tokenized separately with the
    notebook-level ``tokenizer`` and then concatenated.

    Args:
        example: Mapping with string values under the keys ``"instruction"``,
            ``"input"`` and ``"output"``.
        max_length: Maximum number of tokens kept per example; anything
            beyond it is cut from the end. Defaults to 256.

    Returns:
        A dict with three equally long lists:
        ``"input_ids"`` (prompt ids followed by response ids),
        ``"attention_mask"`` (the two tokenizer masks concatenated) and
        ``"labels"`` (-100 for every prompt position, response ids
        otherwise, so that only the response is used as a training target).
    """
    prompt_text = "\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    prompt = tokenizer(prompt_text)
    response = tokenizer(example["output"] + tokenizer.eos_token)

    input_ids = prompt["input_ids"] + response["input_ids"]
    attention_mask = prompt["attention_mask"] + response["attention_mask"]
    # -100 marks prompt positions that must not count as prediction targets.
    labels = [-100] * len(prompt["input_ids"]) + response["input_ids"]

    # Slicing is a no-op for sequences that already fit. Note that truncation
    # cuts from the end, so an over-long example loses its trailing EOS token.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }



In [6]:

# Run process_func over every record and drop the original text columns,
# leaving only input_ids / attention_mask / labels.
tokenized_ds = ds.map(
    function=process_func,
    remove_columns=ds.column_names,
)
tokenized_ds



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 26858
})

In [7]:

# Sanity check on record 1: decode the full token sequence, then decode only
# the label positions that are not masked with -100 (i.e. the response part).
print(tokenizer.decode(tokenized_ds[1]["input_ids"]))
print(tokenizer.decode([tok for tok in tokenized_ds[1]["labels"] if tok != -100]))





Human: 解释为什么以下分数等同于1/4
输入：4/16

Assistant: 4/16等于1/4是因为我们可以约分分子分母都除以他们的最大公约数4，得到（4÷4）/ (16÷4）=1/4。分数的约分是用分子和分母除以相同的非零整数，来表示分数的一个相同的值，这因为分数实际上表示了分子除以分母，所以即使两个数同时除以同一个非零整数，分数的值也不会改变。所以4/16 和1/4是两种不同的书写形式，但它们的值相等。</s>
4/16等于1/4是因为我们可以约分分子分母都除以他们的最大公约数4，得到（4÷4）/ (16÷4）=1/4。分数的约分是用分子和分母除以相同的非零整数，来表示分数的一个相同的值，这因为分数实际上表示了分子除以分母，所以即使两个数同时除以同一个非零整数，分数的值也不会改变。所以4/16 和1/4是两种不同的书写形式，但它们的值相等。</s>


In [8]:


# Load the base causal-LM checkpoint, sharing the tokenizer's cache directory.
model = AutoModelForCausalLM.from_pretrained(
    "Langboat/bloom-1b4-zh",
    cache_dir=os.path.join(cache_dir, "bloom-1b4-zh"),
    low_cpu_mem_usage=True,
)



In [9]:

# Prompt Tuning: build the PEFT adapter configuration.

from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model

# Text whose tokens initialise the virtual prompt. It is defined once so that
# `prompt_tuning_init_text` and `num_virtual_tokens` cannot drift apart.
prompt_init_text = "下面是一段人与机器人的对话。"

# Text-initialised prompt: one virtual token per token of the init text.
# Alternative with the default initialisation and a fixed prompt length:
#   PromptTuningConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=10)
# (The cell used to build that variant first and overwrite it immediately.)
config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text=prompt_init_text,
    num_virtual_tokens=len(tokenizer(prompt_init_text)["input_ids"]),
    # The tokenizer name is passed because the init text has to be tokenized
    # when the prompt embeddings are created.
    tokenizer_name_or_path="Langboat/bloom-1b4-zh",
)

config






PromptTuningConfig(peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, num_virtual_tokens=8, token_dim=None, num_transformer_submodules=None, num_attention_heads=None, num_layers=None, prompt_tuning_init=<PromptTuningInit.TEXT: 'TEXT'>, prompt_tuning_init_text='下面是一段人与机器人的对话。', tokenizer_name_or_path='Langboat/bloom-1b4-zh', tokenizer_kwargs=None)

In [10]:
# Wrap the base model with the prompt-tuning adapter described by `config`.
# NOTE(review): this rebinds `model`, so re-running the cell passes the
# already wrapped model back in -- reload the base model first.
model = get_peft_model(
    model=model,
    peft_config=config,
    adapter_name="default",
    mixed=False,
)

model


PeftModelForCausalLM(
  (base_model): BloomForCausalLM(
    (transformer): BloomModel(
      (word_embeddings): Embedding(46145, 2048)
      (word_embeddings_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (h): ModuleList(
        (0-23): 24 x BloomBlock(
          (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (self_attention): BloomAttention(
            (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
            (dense): Linear(in_features=2048, out_features=2048, bias=True)
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (mlp): BloomMLP(
            (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
            (gelu_impl): BloomGelu()
            (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
          )
        )
      )

In [11]:

# Print the trainable vs. total parameter counts of the wrapped model.
model.print_trainable_parameters()




trainable params: 16,384 || all params: 1,303,128,064 || trainable%: 0.0013


In [12]:


# Training configuration. The output directory is derived from `cache_dir`
# so every artifact location follows the notebook's single path setting.
args = TrainingArguments(
    output_dir=os.path.join(cache_dir, "prompt_tuning_bloom_1b4"),
    per_device_train_batch_size=8,
    # Two accumulated batches of 8 per optimizer step.
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    logging_steps=10,
    num_train_epochs=1,
)

# The collator pads input_ids / attention_mask / labels per batch, since the
# tokenized examples have different lengths.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

trainer.train()



Step,Training Loss
10,2.7567
20,2.8063
30,2.7418
40,2.6965
50,2.6903
60,2.6993
70,2.7426
80,2.7506
90,2.802
100,2.7712




config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]



TrainOutput(global_step=1679, training_loss=2.607647605564283, metrics={'train_runtime': 794.4025, 'train_samples_per_second': 33.809, 'train_steps_per_second': 2.114, 'total_flos': 2.673169270505472e+16, 'train_loss': 2.607647605564283, 'epoch': 1.0})

In [13]:

# Save the PEFT model into a sub-directory of the training output directory.
# Deriving the path from `args.output_dir` keeps the two locations in sync.
model.save_pretrained(save_directory=os.path.join(args.output_dir, "save_pretrained"))







In [14]:


# Confirm that `model` is the PEFT wrapper class rather than the bare base model.
type(model)


peft.peft_model.PeftModelForCausalLM

In [15]:
# Also save through the Trainer, into a directory separate from the
# training output directory.
trainer.save_model(output_dir="/root/autodl-tmp/bloom-fined-tuning")

