## 1. Load modules

In [1]:
from datasets import load_dataset

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

In [3]:
import datasets

In [4]:
datasets.__version__

'3.6.0'

In [5]:
import warnings
warnings.filterwarnings('ignore')

## 2. Load data

In [6]:
ds = load_dataset("json", data_dir = "./alpaca_data_zh/")
ds = ds["train"]

In [7]:
ds

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 48818
})

In [8]:
ds[:3]

{'instruction': ['保持健康的三个提示。', '三原色是什么？', '描述原子的结构。'],
 'input': ['', '', ''],
 'output': ['以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。',
  '三原色通常指的是红色、绿色和蓝色（RGB）。它们是通过加色混合原理创建色彩的三种基础颜色。在以发光为基础的显示设备中（如电视、计算机显示器、智能手机和平板电脑显示屏）, 三原色可混合产生大量色彩。其中红色和绿色可以混合生成黄色，红色和蓝色可以混合生成品红色，蓝色和绿色可以混合生成青色。当红色、绿色和蓝色按相等比例混合时，可以产生白色或灰色。\n\n此外，在印刷和绘画中，三原色指的是以颜料为基础的红、黄和蓝颜色（RYB）。这三种颜色用以通过减色混合原理来创建色彩。不过，三原色的具体定义并不唯一，不同的颜色系统可能会采用不同的三原色。',
  '原子是物质的基本单位，它由三种基本粒子组成：质子、中子和电子。质子和中子形成原子核，位于原子中心，核外的电子围绕着原子核运动。\n\n原子结构具有层次性。原子核中，质子带正电，中子不带电（中性）。原子核非常小且致密，占据了原子总质量的绝大部分。电子带负电，通常围绕核运动，形成若干层次，称为壳层或电子层。电子数量与质子数量相等，使原子呈电中性。\n\n电子在每个壳层中都呈规律分布，并且不同壳层所能容纳的电子数也不同。在最里面的壳层一般只能容纳2个电子，其次一层最多可容纳8个电子，再往外的壳层可容纳的电子数逐层递增。\n\n原子核主要受到两种相互作用力的影响：强力和电磁力。强力的作用范围非常小，主要限制在原子核内，具有极强的吸引作用，使核子（质子和中子）紧密结合在一起。电磁力的作用范围较大，主要通过核外的电子与原子核相互作用，发挥作用。\n\n这就是原子的基本结构。原子内部结构复杂多样，不同元素的原子核中质子、中子数量不同

## 3. Preprocess data

In [9]:
tokenizer = AutoTokenizer.from_pretrained("Langboat/bloom-1b4-zh")

In [10]:
tokenizer

BloomTokenizerFast(name_or_path='Langboat/bloom-1b4-zh', vocab_size=46145, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [11]:
def process_func(example, max_length = 256):
    """Tokenize one instruction record into a causal-LM training sample.

    The prompt is "Human: " + instruction, a newline, and the optional input
    (surrounding whitespace stripped), followed by a blank line and
    "Assistant: ". The response is the record's output followed by the
    tokenizer's EOS token. Prompt and response are tokenized separately and
    concatenated.

    Labels are aligned one-to-one with input_ids. Prompt positions are set to
    -100 so they are ignored by the loss, while the model still receives the
    full prompt + response as input. Dropping the prompt from the labels
    instead would make input_ids and labels different lengths.

    Args:
        example: Mapping with string values under "instruction", "input" and
            "output".
        max_length: Maximum number of tokens kept per sample; longer samples
            are cut on the right. Defaults to 256.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", all of the same
        length (at most max_length).
    """
    prompt_text = "\n".join(["Human: " + example["instruction"],
                             example["input"]]).strip() + "\n\nAssistant: "
    instruction = tokenizer(prompt_text)
    response = tokenizer(example["output"] + tokenizer.eos_token)

    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    # Mask the prompt part so only response tokens contribute to the loss.
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"]

    # Right-truncate all three sequences together so they stay aligned.
    # Note: when a sample is longer than max_length, the trailing EOS token
    # appended above is cut off along with the rest of the tail.
    return {
        "input_ids" : input_ids[:max_length],
        "attention_mask" : attention_mask[:max_length],
        "labels" : labels[:max_length]
    }

In [12]:
tokenized_ds = ds.map(process_func, remove_columns = ds.column_names)

In [13]:
tokenized_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 48818
})

In [14]:
tokenized_ds[2]

{'input_ids': [26283,
  29,
  210,
  10096,
  1742,
  8328,
  7241,
  672,
  189,
  4340,
  17245,
  29,
  210,
  11392,
  584,
  10009,
  15139,
  7066,
  355,
  1954,
  1321,
  25020,
  5099,
  23972,
  7720,
  1038,
  2993,
  1020,
  554,
  655,
  27702,
  7964,
  420,
  2993,
  1020,
  24405,
  1020,
  6454,
  11392,
  3317,
  355,
  5699,
  11392,
  3669,
  355,
  3317,
  13589,
  7964,
  22273,
  2282,
  11392,
  3317,
  6053,
  672,
  189,
  11392,
  7241,
  5140,
  25008,
  1237,
  420,
  11392,
  3317,
  655,
  355,
  2993,
  1020,
  3099,
  1395,
  1936,
  355,
  39089,
  643,
  3099,
  1936,
  928,
  27810,
  927,
  420,
  11392,
  3317,
  4433,
  1225,
  2409,
  2880,
  3211,
  355,
  24588,
  658,
  11392,
  2333,
  8554,
  373,
  41727,
  420,
  7964,
  3099,
  4002,
  1936,
  355,
  6770,
  22273,
  3317,
  6053,
  355,
  6454,
  11915,
  25008,
  355,
  9833,
  15953,
  4673,
  1326,
  7964,
  4673,
  420,
  7964,
  10162,
  1235,
  2993,
  1020,
  10162,
  32775,
  355

In [15]:
tokenizer.decode(tokenized_ds[2]["input_ids"])

'Human: 描述原子的结构。\n\nAssistant: 原子是物质的基本单位，它由三种基本粒子组成：质子、中子和电子。质子和中子形成原子核，位于原子中心，核外的电子围绕着原子核运动。\n\n原子结构具有层次性。原子核中，质子带正电，中子不带电（中性）。原子核非常小且致密，占据了原子总质量的绝大部分。电子带负电，通常围绕核运动，形成若干层次，称为壳层或电子层。电子数量与质子数量相等，使原子呈电中性。\n\n电子在每个壳层中都呈规律分布，并且不同壳层所能容纳的电子数也不同。在最里面的壳层一般只能容纳2个电子，其次一层最多可容纳8个电子，再往外的壳层可容纳的电子数逐层递增。\n\n原子核主要受到两种相互作用力的影响：强力和电磁力。强力的作用范围非常小，主要限制在原子核内，具有极强的吸引作用，使核子（质子和中子）紧密结合在一起。电磁力的作用范围较大，主要通过核外的电子与原子核相互作用，发挥作用。\n\n这就是原子的'

In [16]:
tokenizer.decode(list(filter(lambda x:x!=-100,tokenized_ds[2]["labels"])))

'原子是物质的基本单位，它由三种基本粒子组成：质子、中子和电子。质子和中子形成原子核，位于原子中心，核外的电子围绕着原子核运动。\n\n原子结构具有层次性。原子核中，质子带正电，中子不带电（中性）。原子核非常小且致密，占据了原子总质量的绝大部分。电子带负电，通常围绕核运动，形成若干层次，称为壳层或电子层。电子数量与质子数量相等，使原子呈电中性。\n\n电子在每个壳层中都呈规律分布，并且不同壳层所能容纳的电子数也不同。在最里面的壳层一般只能容纳2个电子，其次一层最多可容纳8个电子，再往外的壳层可容纳的电子数逐层递增。\n\n原子核主要受到两种相互作用力的影响：强力和电磁力。强力的作用范围非常小，主要限制在原子核内，具有极强的吸引作用，使核子（质子和中子）紧密结合在一起。电磁力的作用范围较大，主要通过核外的电子与原子核相互作用，发挥作用。\n\n这就是原子的'

In [17]:
tokenized_ds[2]["input_ids"]

[26283,
 29,
 210,
 10096,
 1742,
 8328,
 7241,
 672,
 189,
 4340,
 17245,
 29,
 210,
 11392,
 584,
 10009,
 15139,
 7066,
 355,
 1954,
 1321,
 25020,
 5099,
 23972,
 7720,
 1038,
 2993,
 1020,
 554,
 655,
 27702,
 7964,
 420,
 2993,
 1020,
 24405,
 1020,
 6454,
 11392,
 3317,
 355,
 5699,
 11392,
 3669,
 355,
 3317,
 13589,
 7964,
 22273,
 2282,
 11392,
 3317,
 6053,
 672,
 189,
 11392,
 7241,
 5140,
 25008,
 1237,
 420,
 11392,
 3317,
 655,
 355,
 2993,
 1020,
 3099,
 1395,
 1936,
 355,
 39089,
 643,
 3099,
 1936,
 928,
 27810,
 927,
 420,
 11392,
 3317,
 4433,
 1225,
 2409,
 2880,
 3211,
 355,
 24588,
 658,
 11392,
 2333,
 8554,
 373,
 41727,
 420,
 7964,
 3099,
 4002,
 1936,
 355,
 6770,
 22273,
 3317,
 6053,
 355,
 6454,
 11915,
 25008,
 355,
 9833,
 15953,
 4673,
 1326,
 7964,
 4673,
 420,
 7964,
 10162,
 1235,
 2993,
 1020,
 10162,
 32775,
 355,
 1408,
 11392,
 8168,
 1936,
 27810,
 672,
 189,
 7964,
 587,
 9993,
 15953,
 4673,
 33629,
 8168,
 22003,
 10740,
 355,
 6187,
 3657,


In [18]:
tokenized_ds[2]["labels"]

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 11392,
 584,
 10009,
 15139,
 7066,
 355,
 1954,
 1321,
 25020,
 5099,
 23972,
 7720,
 1038,
 2993,
 1020,
 554,
 655,
 27702,
 7964,
 420,
 2993,
 1020,
 24405,
 1020,
 6454,
 11392,
 3317,
 355,
 5699,
 11392,
 3669,
 355,
 3317,
 13589,
 7964,
 22273,
 2282,
 11392,
 3317,
 6053,
 672,
 189,
 11392,
 7241,
 5140,
 25008,
 1237,
 420,
 11392,
 3317,
 655,
 355,
 2993,
 1020,
 3099,
 1395,
 1936,
 355,
 39089,
 643,
 3099,
 1936,
 928,
 27810,
 927,
 420,
 11392,
 3317,
 4433,
 1225,
 2409,
 2880,
 3211,
 355,
 24588,
 658,
 11392,
 2333,
 8554,
 373,
 41727,
 420,
 7964,
 3099,
 4002,
 1936,
 355,
 6770,
 22273,
 3317,
 6053,
 355,
 6454,
 11915,
 25008,
 355,
 9833,
 15953,
 4673,
 1326,
 7964,
 4673,
 420,
 7964,
 10162,
 1235,
 2993,
 1020,
 10162,
 32775,
 355,
 1408,
 11392,
 8168,
 1936,
 27810,
 672,
 189,
 7964,
 587,
 9993,
 15953,
 4673,
 33629,
 8168,
 22003,
 10740,
 355,
 6187,
 3

In [19]:
tokenizer.decode(tokenized_ds[2]["labels"][100:])

'通常围绕核运动，形成若干层次，称为壳层或电子层。电子数量与质子数量相等，使原子呈电中性。\n\n电子在每个壳层中都呈规律分布，并且不同壳层所能容纳的电子数也不同。在最里面的壳层一般只能容纳2个电子，其次一层最多可容纳8个电子，再往外的壳层可容纳的电子数逐层递增。\n\n原子核主要受到两种相互作用力的影响：强力和电磁力。强力的作用范围非常小，主要限制在原子核内，具有极强的吸引作用，使核子（质子和中子）紧密结合在一起。电磁力的作用范围较大，主要通过核外的电子与原子核相互作用，发挥作用。\n\n这就是原子的'

In [20]:
len(tokenized_ds[2]["labels"])

256

## 4. Create model

In [21]:
model = AutoModelForCausalLM.from_pretrained("Langboat/bloom-1b4-zh", low_cpu_mem_usage = True)

In [22]:
model.device

device(type='cpu')

In [23]:
model

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(46145, 2048)
    (word_embeddings_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  )
  (l

In [24]:
sum(param.numel() for param in model.parameters())

1303111680

### P tuning
Hugging Face's PEFT library provides a ready-made implementation of P-tuning.

In [25]:
import peft
# peft.__version__

In [26]:
peft.__version__

'0.17.0'

In [27]:
from peft import PromptEncoderConfig, get_peft_model, TaskType, PromptEncoderReparameterizationType

#### PEFT step 1, configuration

In [28]:
# P-tuning configuration: 10 trainable virtual-token embeddings (a soft prompt) are
# generated by a prompt encoder; no hand-written (hard) prompt text is involved.
config = PromptEncoderConfig(task_type = TaskType.CAUSAL_LM,
                             num_virtual_tokens = 10,
                             # The prompt encoder defaults to an MLP (see the config repr in the next cell);
                             # an LSTM encoder can be selected instead, at a higher compute cost, e.g.:
                            #encoder_reparameterization_type=PromptEncoderReparameterizationType.LSTM,
                            #encoder_dropout=0.1, encoder_num_layers=5, encoder_hidden_size=1024
                            )

In [29]:
config

PromptEncoderConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.P_TUNING: 'P_TUNING'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, num_virtual_tokens=10, token_dim=None, num_transformer_submodules=None, num_attention_heads=None, num_layers=None, modules_to_save=None, encoder_reparameterization_type=<PromptEncoderReparameterizationType.MLP: 'MLP'>, encoder_hidden_size=None, encoder_num_layers=2, encoder_dropout=0.0)

In [30]:
from peft import PromptEncoder
PromptEncoder??

[0;31mInit signature:[0m [0mPromptEncoder[0m[0;34m([0m[0mconfig[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mPromptEncoder[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""[0m
[0;34m    The prompt encoder network that is used to generate the virtual token embeddings for p-tuning.[0m
[0;34m[0m
[0;34m    Args:[0m
[0;34m        config ([`PromptEncoderConfig`]): The configuration of the prompt encoder.[0m
[0;34m[0m
[0;34m    Example:[0m
[0;34m[0m
[0;34m    ```py[0m
[0;34m    >>> from peft import PromptEncoder, PromptEncoderConfig[0m
[0;34m[0m
[0;34m    >>> config = PromptEncoderConfig([0m
[0;34m    ...     peft_type="P_TUNING",[0m
[0;34m    ...     task_type="SEQ_2_SEQ_LM",[0m
[0;34m    ...     num_virtual_tokens=20,[0m
[0;34m    ...     token_dim=768,[0m
[0;34m    ...     num_transformer_submodules=1,[0m
[0;34m    ...

#### PEFT step 2, create model

In [31]:
model = get_peft_model(model, config)

In [34]:
model

PeftModelForCausalLM(
  (base_model): BloomForCausalLM(
    (transformer): BloomModel(
      (word_embeddings): Embedding(46145, 2048)
      (word_embeddings_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (h): ModuleList(
        (0-23): 24 x BloomBlock(
          (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (self_attention): BloomAttention(
            (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
            (dense): Linear(in_features=2048, out_features=2048, bias=True)
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (mlp): BloomMLP(
            (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
            (gelu_impl): BloomGelu()
            (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
          )
        )
      )

**Compared with the base model, the new model has an additional `prompt_encoder` module:**
```python
(prompt_encoder): ModuleDict(
    (default): PromptEncoder(
      (embedding): Embedding(10, 2048)
      (mlp_head): Sequential(
        (0): Linear(in_features=2048, out_features=2048, bias=True)
        (1): ReLU()
        (2): Linear(in_features=2048, out_features=2048, bias=True)
        (3): ReLU()
        (4): Linear(in_features=2048, out_features=2048, bias=True)
      )
    )
  )
  (word_embeddings): Embedding(46145, 2048)
```

In [36]:
config

PromptEncoderConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.P_TUNING: 'P_TUNING'>, auto_mapping=None, base_model_name_or_path='Langboat/bloom-1b4-zh', revision=None, inference_mode=False, num_virtual_tokens=10, token_dim=2048, num_transformer_submodules=1, num_attention_heads=16, num_layers=24, modules_to_save=None, encoder_reparameterization_type=<PromptEncoderReparameterizationType.MLP: 'MLP'>, encoder_hidden_size=2048, encoder_num_layers=2, encoder_dropout=0.0)

In [35]:
model.print_trainable_parameters()

trainable params: 12,609,536 || all params: 1,315,721,216 || trainable%: 0.9584


The number of parameters that need to be trained drops from about 1.3B (the full model) to about 12.6M (the prompt encoder only).

## 5. Configure the training

In [37]:
args = TrainingArguments(
    output_dir = "./chatbot", # directory where checkpoints and other training outputs are written
    per_device_train_batch_size = 1, # samples per device per forward/backward pass (8 by default)
    gradient_accumulation_steps = 8, # 1 by default; gradients from 8 forward/backward passes are accumulated before each optimizer step, giving an effective batch size of 1 x 8 = 8 per device while keeping per-pass memory low
    logging_steps = 10, # report the training loss every 10 steps
    num_train_epochs = 1, # number of full passes over the training set
)

## 6. Create the trainer

In [38]:
trainer = Trainer(
    model = model, # PEFT-wrapped model; only the ~12.6M prompt-encoder parameters are trainable (see print_trainable_parameters above)
    args = args,
    train_dataset = tokenized_ds,
    # Builds each batch by padding the tokenized samples to a common length
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, padding = True)   
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


`data_collator` is responsible for taking a list of dataset samples (from the DataLoader) and converting it into a single batch.

Think of it as:

- The function that builds each batch during training or evaluation.

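- It handles padding and tensor conversion: here `DataCollatorForSeq2Seq` pads `input_ids`, `attention_mask`, and `labels` (the latter with `-100`) to the longest sample in the batch. Truncation to 256 tokens was already applied in `process_func`.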

#### Below, the randomly initialized virtual tokens are trained.

In [39]:
trainer.train()

Step,Training Loss
10,2.4635
20,2.4089


KeyboardInterrupt: 

In [32]:
from peft import PeftModel

In [33]:
# NOTE(review): "../checkpoint-500/" lies outside the output_dir ("./chatbot") configured in
# TrainingArguments, and the AttributeError raised by generate() below mentions `PrefixEncoder`,
# which suggests this checkpoint came from a prefix-tuning run rather than from the P-tuning
# training in this notebook -- confirm the checkpoint path.
# NOTE(review): on a fresh top-to-bottom run, `model` has already been wrapped by get_peft_model;
# presumably the plain base model should be passed here instead -- verify.
peft_model = PeftModel.from_pretrained(model = model, model_id = "../checkpoint-500/")

## 7. Model inference

In [34]:
peft_model.device

device(type='cpu')

In [35]:
# Build the prompt in the same "Human: ...\n\nAssistant: " format used by process_func and move the tensors to the model's device
ipt = tokenizer("Human: {}\n{}".format("如何提高学习效率？", "").strip() + "\n\nAssistant: ", return_tensors="pt").to(peft_model.device)
# Sample a continuation (max_length=256 caps prompt + generated tokens together) and decode the token ids back to text
print(tokenizer.decode(peft_model.generate(**ipt, max_length=256, do_sample=True)[0], skip_special_tokens=True))

AttributeError: 'PrefixEncoder' object has no attribute 'transform'