In [None]:
# transformers not support NumPy 2.0 yet
!pip install -q numpy~=1.26.4 transformers~=4.46.2
!pip install -q datasets~=3.2.0 pydantic~=2.10.4
!pip install -q peft~=0.14.0 evaluate~=0.4.3 sacrebleu~=2.5.1
!pip install -q pandas==2.0.3 # Downgrade pandas to a compatible version

In [None]:
import os

# Derive the notebook's base name (directory and extension stripped) from the
# JPY_SESSION_NAME environment variable; it is used below to build output paths.
try:
  # Attempt to get the notebook filename from the IPython environment
  __ipynb_file__ = os.path.splitext(os.path.basename(os.environ['JPY_SESSION_NAME']))[0]
except (NameError, KeyError):
  # KeyError: the variable is not set, so fall back to a fixed placeholder name
  __ipynb_file__ = 'default_notebook_name'

# 訓練 PII 遮掩模型

在這個筆記本中，我們將展示如何使用 `transformers` 套件訓練 PII (個人識別資訊) 遮掩模型。我們將使用 `transformers` 套件中的 [`Seq2SeqTrainer`](https://huggingface.co/docs/evaluate/transformers_integrations#seq2seqtrainer) 類別來微調一個 Encoder-Decoder 架構的 [Flan T5](https://huggingface.co/docs/transformers/model_doc/t5) 模型。

> Flan-T5: Flan is an instruction fine-tuning method based on prompting; Flan-T5 is a T5 model fine-tuned with it.

In [None]:
import pandas as pd
import numpy as np
import evaluate

from transformers import (
  AutoTokenizer,
  DataCollatorForSeq2Seq,
  Seq2SeqTrainingArguments,
  Seq2SeqTrainer,
  T5ForConditionalGeneration,
)
from datasets import load_dataset, DatasetDict

from typing import Any
from pydantic import BaseModel
from pprint import pprint

import torch

# 載入 PEFT 相關套件
from peft import LoraConfig, TaskType, PeftModel, get_peft_model

# Choose the compute device: prefer Apple's MPS backend when it is available,
# otherwise use CUDA, and fall back to the CPU when neither is present.
if torch.backends.mps.is_available():
  device = torch.device("mps")
elif torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


## 下載資料

In [None]:
# Load only the first 25% of the `train` split of the dataset
immutable_dataset = load_dataset("ai4privacy/pii-masking-65k", split="train[:25%]")

Using the latest cached version of the dataset since ai4privacy/pii-masking-65k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/jonas/.cache/huggingface/datasets/ai4privacy___pii-masking-65k/default/0.0.0/79cf9b886f5075a98121a2f0c23f7eae201bff1b (last modified on Wed Feb 12 23:36:48 2025).


  ### 資料包含什麼？

In [None]:
# Show the features (columns) and the number of rows of the raw dataset
immutable_dataset

Dataset({
    features: ['masked_text', 'unmasked_text', 'token_entity_labels', 'tokenised_unmasked_text'],
    num_rows: 5397
})

In [None]:
# Inspect the first record of the dataset.
# The column width is set to unlimited so that long texts are not truncated.
pd.set_option('display.max_colwidth', None)
pd.DataFrame(immutable_dataset[:1])

Unnamed: 0,masked_text,unmasked_text,token_entity_labels,tokenised_unmasked_text
0,"[PREFIX_1] [FIRSTNAME_1] [MIDDLENAME_1] [LASTNAME_1], as a [JOBDESCRIPTOR_1] [JOBTITLE_1] at [COMPANY_NAME_1], your knowledge of change management is vital for our company's transformation. We request you to create a change management strategy.","Mr. Adolphus Reagan Ziemann, as a Central Principal Applications Executive at McLaughlin, Nader and Purdy, your knowledge of change management is vital for our company's transformation. We request you to create a change management strategy.","[B-PREFIX, I-PREFIX, B-FIRSTNAME, I-FIRSTNAME, B-MIDDLENAME, B-LASTNAME, I-LASTNAME, I-LASTNAME, O, O, O, B-JOBDESCRIPTOR, B-JOBTITLE, I-JOBTITLE, I-JOBTITLE, O, B-COMPANY_NAME, I-COMPANY_NAME, I-COMPANY_NAME, I-COMPANY_NAME, I-COMPANY_NAME, I-COMPANY_NAME, I-COMPANY_NAME, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[mr, ., adolph, ##us, reagan, z, ##ie, ##mann, ,, as, a, central, principal, applications, executive, at, mclaughlin, ,, nad, ##er, and, pu, ##rdy, ,, your, knowledge, of, change, management, is, vital, for, our, company, ', s, transformation, ., we, request, you, to, create, a, change, management, strategy, .]"


這個表格結構，包含四個欄位，分別是：

* `masked_text`: 這是一個包含 PII 遮掩的文本，我們將使用這個文本來訓練模型，以此為輸出。

* `unmasked_text`: 這是一個包含 PII 資訊的原始文本，我們將使用這個文本來訓練模型，以此為輸入。

其餘 `token_entity_labels` 及 `tokenised_unmasked_text` 是我們在訓練過程中不會使用到的欄位。


### 資料前處理

方便演示及訓練，我們將對資料進行以下前處理：

1. 保留 `masked_text` 及 `unmasked_text` 欄位。
2. 將資料集分為訓練集，驗證集及測試集。

In [None]:
# Keep only the features we need: 'masked_text' and 'unmasked_text'
dataset = immutable_dataset.remove_columns(['token_entity_labels', 'tokenised_unmasked_text'])
# Show the resulting dataset
dataset

Dataset({
    features: ['masked_text', 'unmasked_text'],
    num_rows: 5397
})

In [None]:
# Hold out 0.1% of the data as the test set
test_dataset = dataset.train_test_split(
  test_size=0.001, # 0.1% of the data is used for testing
  shuffle=False, # No shuffling, so the split is the same across runs
  )
# Split the remainder into 80% training and 20% validation sets
train_dataset = test_dataset['train'].train_test_split(
  test_size=0.2, # 20% of the data is used for validation
  shuffle=False, # No shuffling, so the split is the same across runs
  )
# Assemble the three splits. Note that this rebinds `dataset` from a single
# Dataset to a DatasetDict with 'train', 'validation' and 'test' entries.
dataset = DatasetDict({
  'train': train_dataset['train'],
  'validation': train_dataset['test'],
  'test': test_dataset['test'],
  })
# Show the resulting splits
dataset

DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text'],
        num_rows: 4312
    })
    validation: Dataset({
        features: ['masked_text', 'unmasked_text'],
        num_rows: 1079
    })
    test: Dataset({
        features: ['masked_text', 'unmasked_text'],
        num_rows: 6
    })
})

In [None]:
# Show the first `first_n_data` records of the training split
first_n_data = 3
pd.set_option('display.max_colwidth', None)
pd.DataFrame(dataset['train'].select(range(first_n_data)))

Unnamed: 0,masked_text,unmasked_text
0,"[PREFIX_1] [FIRSTNAME_1] [MIDDLENAME_1] [LASTNAME_1], as a [JOBDESCRIPTOR_1] [JOBTITLE_1] at [COMPANY_NAME_1], your knowledge of change management is vital for our company's transformation. We request you to create a change management strategy.","Mr. Adolphus Reagan Ziemann, as a Central Principal Applications Executive at McLaughlin, Nader and Purdy, your knowledge of change management is vital for our company's transformation. We request you to create a change management strategy."
1,"Hello [FIRSTNAME_1], would you please investigate the potential fallouts associated with the revisions in the [JOBAREA_1] department? Please incorporate your findings in your management strategy required previously.","Hello Hannah, would you please investigate the potential fallouts associated with the revisions in the Security department? Please incorporate your findings in your management strategy required previously."
2,We also request a review of our policies with respect to the upcoming changes and to bring in your expertise in case a policy change is advised. You can communicate the updates via email at [EMAIL_1].,We also request a review of our policies with respect to the upcoming changes and to bring in your expertise in case a policy change is advised. You can communicate the updates via email at Bartholome_Goldner85@yahoo.com.


## 訓練參數

### 批次大小 (Batch Size) 和 梯度累積步數 (Gradient Accumulation Steps)

批次大小（batch size）和梯度累積步數（gradient accumulation steps）之間的關係可以簡單地說明如下：

* 批次大小（batch size）：每次訓練迭代中使用的樣本數量。較大的批次大小通常需要更多的內存。
* 梯度累積步數（gradient accumulation steps）：在更新模型權重之前累積梯度的迭代次數。這允許使用較小的批次大小來模擬較大的批次大小。

當內存限制無法直接使用大批次大小時，可以通過梯度累積來實現。例如：

* 如果批次大小是 8，梯度累積步數是 4，這相當於使用批次大小為 32（8 * 4）進行訓練。

這樣可以在內存有限的情況下實現大批次大小的效果。

### 訓練設定

In [None]:
# Training-related settings: one place for the hyper-parameters and paths used in this notebook
class Config(BaseModel):
  model_name: str = 'google/flan-t5-base' # pre-trained checkpoint to fine-tune (Flan: Fine-tuned Language Network, with T5)
  saved_model_path: str = os.path.join('saved_model', f'{__ipynb_file__}') # path to save the trained model
  saved_lora_path: str = os.path.join('saved_model', f'{__ipynb_file__}_lora') # path to save the trained LoRA model
  batch_size: int = 4 # size of the input batch in training and evaluation
  gradient_accumulation_steps: int = 2 # number of steps whose gradients are accumulated before one weight update
  epochs: int = 2 # number of times to iterate over the entire training dataset
  lr: float = 2e-4 # learning rate, controls how fast or slow the model learns
  weight_decay: float = 0.01 # weight decay, helps the model stay simple and avoid overfitting by penalizing large weights.
  eval_metric: str = 'bleu' # evaluation metric, 'bleu' or 'sacrebleu'

  # Text-generation settings
  temperature: float = 0.1 # temperature for sampling
  max_new_tokens: int = 125 # upper limit on the number of newly generated tokens
  repetition_penalty: float = 1.5 # repetition penalty; 1.0 means no penalty, larger values penalize repeated tokens more

  # LoRA settings
  rank: int = 128 # rank of the LoRA layers
  lora_alpha: int = rank * 2 # alpha for LoRA scaling; computed once from the default `rank` above when the class is defined, so passing a different `rank` to Config(...) does not change it
  lora_dropout: float = 0.05 # dropout probability for LoRA layers

config = Config()

## Fine-tuning 前的表現

### 載入預訓練分詞器 (Tokenizer)

In [None]:
# Load the tokenizer that belongs to the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(
  config.model_name,
)
pprint(tokenizer)

T5TokenizerFast(name_or_path='google/flan-t5-base', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>'

In [None]:
# Inspect the tokenizer for information such as the padding token and the padding side
pprint(tokenizer.pad_token)

'<pad>'


In [None]:
pprint(tokenizer.padding_side)

'right'


* 如果沒有定義 `pad_token`，請定義一個 `pad_token`，並將其加入 Tokenizer 中。
* 如果 `padding_side` 不是 `right`，請將其設定為 `right`。

In [None]:
# If the tokenizer defines no padding token, reuse the EOS token for padding
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
  print('=== 設定 Padding Token ===')
  pprint(tokenizer)
# Make sure padding_side is 'right'
if tokenizer.padding_side != 'right':
  tokenizer.padding_side = 'right'
  print('=== 設定 Padding Side ===')
  pprint(tokenizer)

### 載入預訓練模型

`T5ForConditionalGeneration` 是用於摘要、翻譯等序列到序列 (Sequence to Sequence, Seq2Seq) 任務的類別，我們透過它載入預訓練的 Flan T5 模型來進行文本生成任務。

In [None]:
# Load the pre-trained checkpoint named by `config.model_name` and move it to the selected device
model = T5ForConditionalGeneration.from_pretrained(
  config.model_name,
).to(device)

In [None]:
pprint(model)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

這是一個典型的 Encoder-Decoder 模型。

```text
T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    ...
  )
  (decoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    ...
  )
  (lm_head): Linear(in_features=768, out_features=32128, bias=False)
)
```

### 詠唱格式化 (Prompt Formatting)

定義詠唱 (Prompt) 格式，我們將創建一個格式化函數。

In [None]:
system_message = 'Mask the personal identifiable information:'

def instruction_formatter(x, tokenize: bool = False):
  """Build the model prompt for one dataset record.

  Args:
    x: A mapping with an 'unmasked_text' entry holding the raw text.
    tokenize: When False (default) the prompt string is returned; when True
      the prompt is tokenized and the result is moved to `device`.

  Returns:
    The prompt string, or the tokenizer output for the prompt
    (PyTorch tensors placed on `device`).
  """
  # Named `prompt` rather than `input` so the built-in `input()` is not shadowed.
  prompt = f"{system_message} {x['unmasked_text']}"
  if not tokenize:
    return prompt
  # The prompt is wrapped in a list so the tokenizer returns a batch of size one.
  return tokenizer(
    [prompt],
    max_length=tokenizer.model_max_length,
    truncation=True,
    padding=True,
    return_tensors='pt',
  ).to(device)


In [None]:
# tokenize=False: no tokenization is performed and the prompt text itself is returned
# NOTE(review): the name `input` shadows the built-in `input()` for the rest of the session
input = instruction_formatter(dataset['test'][0], tokenize=False)
pprint(input)

('Mask the personal identifiable information: We need a comprehensive review '
 'of privacy laws and regulations regarding the handling and sharing of masked '
 'credit card numbers, such as 1160254613057002.')


In [None]:
# tokenize=True: the prompt is tokenized and the token-ID and attention-mask tensors are returned
tokenized_input = instruction_formatter(dataset['test'][0], tokenize=True)
pprint(tokenized_input)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0'),
 'input_ids': tensor([[23709,     8,   525, 22185,   251,    10,   101,   174,     3,     9,
          3452,  1132,    13,  4570,  3786,    11,  4750,  1918,     8,  5834,
            11,  2178,    13,     3,    51, 23552,   998,   895,  2302,     6,
           224,    38,   850,  3328,  1828,  4448, 21448, 28363, 12328,     1]],
       device='mps:0')}


Tokenizer 回傳內容包含兩個主要部分：`input_ids` 和 `attention_mask`。以下是詳細解釋：

* `input_ids`: 是一個張量 (tensor)，包含了輸入文本的 token IDs。這些 IDs 是由 tokenizer 將文本轉換為數字表示後得到的。

* `attention_mask`: 同樣是一個張量，用於指示模型應該關注哪些位置。值為 1 的位置表示應該關注，值為 0 的位置表示應該忽略。在這個例子中，`attention_mask` 的值全為 1，表示模型應該關注所有位置。

In [None]:
# Decode every token ID back to its text piece and print the pairs one per line.
# The IDs are converted to plain ints first, and the loop variable is not called
# `id`, so the built-in `id()` stays usable in later cells.
for token_id in tokenized_input['input_ids'][0].tolist():
  print(f'{token_id} -> {tokenizer.decode([token_id])}')

23709 -> Mask
8 -> the
525 -> personal
22185 -> identifiable
251 -> information
10 -> :
101 -> We
174 -> need
3 -> 
9 -> a
3452 -> comprehensive
1132 -> review
13 -> of
4570 -> privacy
3786 -> laws
11 -> and
4750 -> regulations
1918 -> regarding
8 -> the
5834 -> handling
11 -> and
2178 -> sharing
13 -> of
3 -> 
51 -> m
23552 -> asked
998 -> credit
895 -> card
2302 -> numbers
6 -> ,
224 -> such
38 -> as
850 -> 11
3328 -> 60
1828 -> 25
4448 -> 46
21448 -> 130
28363 -> 570
12328 -> 02.
1 -> </s>


經過 Tokenizer 處理後再還原回文本，我們可以看到 Tokenizer 處理後的文本最後多了一個 `</s>` token，這是因為 T5 模型的輸入文本需要以 `</s>` 結尾。`</s>` token 用於指示模型輸入的結束，即 `eos_token`。

### Fine-tuning 前的表現

#### 單筆演示生成回應

In [None]:
# Generate a response with the pre-trained model.
# `temperature` only takes effect in sampling mode, so sampling is enabled
# explicitly; without `do_sample=True` decoding is greedy and the temperature
# setting is ignored.
output_ids = model.generate(
  **tokenized_input,
  do_sample=True,
  temperature=config.temperature,
  max_new_tokens=config.max_new_tokens,
  repetition_penalty=config.repetition_penalty,
)

  test_elements = torch.tensor(test_elements)


In [None]:
output_ids

tensor([[    0,     3,    51, 23552,   998,   895,  2302,     1]],
       device='mps:0')

In [None]:
# Convert output_ids back to text
output = tokenizer.decode(
  output_ids[0],
  skip_special_tokens=False, # whether special tokens (e.g. start and end markers) are dropped from the decoded text
)

In [None]:
pprint(output)

'<pad> masked credit card numbers</s>'


#### 批次處理模型表現

初步了解如何生成模型的回應後，我們將定義一個 `generator()` 函數來生成模型的回應。這個函數接受一筆資料及模型，並回傳模型生成的回應。藉由這個函數，我們可以批次處理資料。


In [None]:
# The steps above wrapped into one function, so records can be processed in bulk
def generator(x, model):
  """Generate the model's response for a single dataset record.

  Args:
    x: A record with an 'unmasked_text' entry; it is turned into a
      tokenized prompt by `instruction_formatter`.
    model: The model whose `generate` method produces the output token IDs.

  Returns:
    The decoded response text with special tokens removed.
  """
  prompt_inputs = instruction_formatter(x, tokenize=True)
  generated_ids = model.generate(
    **prompt_inputs,
    do_sample=True, # required for `temperature` to take effect; greedy decoding ignores it
    temperature=config.temperature,
    max_new_tokens=config.max_new_tokens,
    repetition_penalty=config.repetition_penalty,
  )
  # One prompt is tokenized per call, so the first sequence is the response.
  return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
# Generation can take a while, so only the first `first_n_data` test records
# are processed. Each record is passed through the pre-trained model and the
# decoded answer is stored in a new `pt_response` column.
first_n_dataset = (
  dataset['test']
  .select(range(first_n_data))
  .map(
    lambda example: {**example, "pt_response": generator(example, model)},
    batched=False,
  )
)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.54s/ examples]


In [None]:
# Show the pre-trained model's predictions
pd.set_option('display.max_colwidth', None)
pd.DataFrame(first_n_dataset)

Unnamed: 0,masked_text,unmasked_text,pt_response
0,"We need a comprehensive review of privacy laws and regulations regarding the handling and sharing of masked credit card numbers, such as [MASKEDNUMBER_1].","We need a comprehensive review of privacy laws and regulations regarding the handling and sharing of masked credit card numbers, such as 1160254613057002.",masked credit card numbers
1,Our client [LASTNAME_1] has a query about privacy laws regarding the use and storage of their device's MAC address [MAC_1].,Our client VonRueden has a query about privacy laws regarding the use and storage of their device's MAC address 97:f0:ee:00:e3:09.,MAC address: Our client VonRueden has a query about privacy laws regarding the use and storage of their device's MAC address 97:f0:ee:00:e3:09.
2,"Is it legal to store user agent information, such as [USERAGENT_1] in accordance with privacy laws?","Is it legal to store user agent information, such as Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.10.2; rv:9.8) Gecko/20100101 Firefox/9.8.0 in accordance with privacy laws?",no


## 訓練模型

### LoRA 的訓練策略

LoRA（Low-Rank Adaptation）是一種用於訓練大型語言模型的技術，旨在提高訓練效率並減少計算資源的需求。以下是為何需要透過LoRA訓練的一些原因：

* 降低計算成本：LoRA 通過將模型的權重矩陣分解為低秩矩陣，顯著減少了參數的數量，從而降低了計算成本和內存需求。

* 加速訓練速度：由於參數數量減少，LoRA 可以加速模型的訓練過程，使得在相同的硬件資源下能夠更快地完成訓練。

![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_diagram.png)

In [None]:
# Report the total and the trainable parameter counts of the pre-trained model.
# The trainable count is very large, which is why Low Rank Adaptation (LoRA) is used to reduce it.
print('Parameters: {:,}, Trainable Parameters: {:,}'.format(
  model.num_parameters(),
  model.num_parameters(only_trainable=True)))

Parameters: 247,577,856, Trainable Parameters: 247,577,856


#### LoRA 配置

* `task_type`: TaskType.SEQ_2_SEQ_LM 指定任務類型為序列到序列的語言模型 (Sequence-to-Sequence Language Model)。

* `rank`: 是低秩矩陣的秩(rank)，它決定了 LoRA 層的參數數量。較低的 `r` 值意味著較少的參數，從而減少了模型的計算和存儲需求。具體來說，LoRA 通過將全連接層的權重矩陣分解為兩個低秩矩陣來實現參數高效化。`r` 值越小，這兩個低秩矩陣的維度越小，這個練習我們採用 128。

* `lora_alpha`: 是一個縮放因子，用於調整 LoRA 層的輸出。它控制了低秩矩陣的影響力。較高的 `lora_alpha` 值會增加 LoRA 層的影響力，也就是說值越高，越容易把大模型既有的能力給覆蓋掉。具體來說，LoRA 層的輸出會乘以這個縮放因子，這個練習我們採用常見的比例為 `rank` 的兩倍。

* `lora_dropout`: 是一個丟棄率，用於在訓練過程中隨機丟棄 LoRA 層的一部分輸出。這有助於防止過擬合，並提高模型的泛化能力。例如，`lora_dropout` 設置為 0.1 表示在每次前向傳播中，有 10% 的 LoRA 層輸出會被隨機設置為零。

* `target_modules`: 指定了應用 LoRA 的目標模塊。這通常是模型中的某些特定層或子模塊，例如 Transformer 模型中的注意力層，可以透過 `model.named_parameters()` 查看。通過指定 `target_modules`，你可以靈活地選擇在哪些層應用 LoRA，以便在保持模型性能的同時減少參數數量。

> 廣為周知的模型當未指定 `target_modules`，透過 `get_peft_model` 加載 LoRA 適配模型時，會自動設定。
> 可以先嘗試不指定，若出現錯誤再試著設定注意力相關的參數層。


In [None]:
# LoRA configuration
lora_config = LoraConfig(
  task_type=TaskType.SEQ_2_SEQ_LM, # sequence-to-sequence language modelling
  r=config.rank, # rank of the low-rank matrices
  lora_alpha=config.lora_alpha, # scaling factor
  lora_dropout=config.lora_dropout, # dropout applied in the LoRA layers
  # target_modules=['v', 'q'], # unlike Phi3ForCausalLM, which needs explicit target_modules, they can be determined automatically here
)

pprint(lora_config)

LoraConfig(task_type=<TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>,
           peft_type=<PeftType.LORA: 'LORA'>,
           auto_mapping=None,
           base_model_name_or_path=None,
           revision=None,
           inference_mode=False,
           r=128,
           target_modules=None,
           exclude_modules=None,
           lora_alpha=256,
           lora_dropout=0.05,
           fan_in_fan_out=False,
           bias='none',
           use_rslora=False,
           modules_to_save=None,
           init_lora_weights=True,
           layers_to_transform=None,
           layers_pattern=None,
           rank_pattern={},
           alpha_pattern={},
           megatron_config=None,
           megatron_core='megatron.core',
           loftq_config={},
           eva_config=None,
           use_dora=False,
           layer_replication=None,
           runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False),
           lora_bias=False)


#### 加載 LoRA 適配模型

搭配預訓模型及 LoRA 配置，我們可以加載 LoRA 適配模型。我們可以觀察受到降維影響的模型層。

In [None]:
# Wrap the pre-trained model with LoRA adapters
# NOTE(review): PEFT appears to inject the adapter layers into `model` itself —
# confirm before reusing `model` as the untouched pre-trained baseline.
peft_model = get_peft_model(
  model, # pre-trained model
  lora_config, # LoRA configuration
)

In [None]:
pprint(lora_config)

LoraConfig(task_type=<TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>,
           peft_type=<PeftType.LORA: 'LORA'>,
           auto_mapping=None,
           base_model_name_or_path='google/flan-t5-base',
           revision=None,
           inference_mode=False,
           r=128,
           target_modules={'v', 'q'},
           exclude_modules=None,
           lora_alpha=256,
           lora_dropout=0.05,
           fan_in_fan_out=False,
           bias='none',
           use_rslora=False,
           modules_to_save=None,
           init_lora_weights=True,
           layers_to_transform=None,
           layers_pattern=None,
           rank_pattern={},
           alpha_pattern={},
           megatron_config=None,
           megatron_core='megatron.core',
           loftq_config={},
           eva_config=None,
           use_dora=False,
           layer_replication=None,
           runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False),
           lora_bias=False)


#### LoRA 適配模型

加載 LoRA 適配模型後, 觀察受 LoRA 影響的模型參數

In [None]:
peft_model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=128, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=128, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
          

雖然這次我們沒有特別指定 `target_modules`，但是在這個例子中，我們使用的是 Flan T5 模型，因此預設 Q 及 V 注意力層受到 LoRA 的影響。

```text
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=128, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=128, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                    (lora_magnitude_vector): ModuleDict()
                  )
                  (k): Linear(in_features=768, out_features=768, bias=False)
                  (v): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=128, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=128, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                    (lora_magnitude_vector): ModuleDict()
                  )
                  (o): Linear(in_features=768, out_features=768, bias=False)
                  (relative_attention_bias): Embedding(32, 12)
                )
```

訓練參數量也從原先 247M 大大減少為 14M。

In [None]:
# Show the number of trainable parameters
peft_model.print_trainable_parameters()

trainable params: 14,155,776 || all params: 261,733,632 || trainable%: 5.4085


### 資料預處理

在訓練模型之前，我們需要對資料進行預處理。這包括將文本轉換為模型可以理解的格式，包含輸入及輸出標籤 (Label)。

#### 定義預處理函數

In [None]:
def preprocess_function(dataset):
  """Tokenize a batch of records into model inputs and training labels.

  Args:
    dataset: A batch (mapping of column name to list of values) with the
      columns 'unmasked_text' (model input) and 'masked_text' (target).

  Returns:
    The tokenized inputs (`input_ids`, `attention_mask`) with an added
    `labels` entry holding the target token IDs, in which padding
    positions are replaced by -100.
  """
  inputs = [f'{system_message} {q}' for q in dataset['unmasked_text']]
  tokenized_inputs = tokenizer(
    inputs,
    max_length=tokenizer.model_max_length,
    truncation=True,
    padding=True,
    return_tensors='pt',
  )
  # `text_target=` tokenizes the targets in target mode; it replaces the
  # deprecated `as_target_tokenizer()` context manager.
  tokenized_targets = tokenizer(
    text_target=dataset['masked_text'],
    max_length=tokenizer.model_max_length,
    truncation=True,
    padding=True,
    return_tensors='pt',
  )
  # Replace only the padding positions with -100 so the loss ignores them.
  # Padding is identified through the attention mask rather than a token ID:
  # the real end-of-sequence token stays in the labels (the model has to learn
  # when to stop), and the masking remains correct even when the pad token is
  # the same as the EOS token.
  tokenized_inputs['labels'] = tokenized_targets['input_ids'].masked_fill(
    tokenized_targets['attention_mask'] == 0,
    -100,
  )
  return tokenized_inputs

In [None]:
pprint(preprocess_function(dataset['test'][:first_n_data]))

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[23709,     8,   525, 22185,   251,    10,   101,   174,     3,     9,
          3452,  1132,    13,  4570,  3786,    11,  4750,  1918,     8,  5834,
            11,  2178,    13,     3,    51, 23552,   998,   895,  2302,     6,
           224,    38,   850,  3328,  1828,  4448, 21448, 28363, 12328,     1



經過預處理後，輸入文本將被轉換為模型可以理解的格式，包含 `input_ids` 和 `attention_mask`；輸出文本將被轉換為 `labels`，其中不應計入損失的位置會被置換為 -100，讓損失函數忽略這些位置。

#### 批次處理資料

使用 `Dataset.map()` 方法，選項設置為 `batched=True`。

In [None]:
# Tokenize every split with `preprocess_function`
tokenized_dataset = dataset.map(
  preprocess_function,
  batched=True, # process the records in batches
  remove_columns=dataset['train'].column_names, # remove the columns that are no longer needed
)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4312/4312 [00:00<00:00, 8577.39 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1079/1079 [00:00<00:00, 10109.62 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1806.72 examples/s]


### 資料校對器 (Data Collator)

在微調語言模型時，使用 data collator 是為了有效地準備和處理批次數據。以下是使用 data collator 的幾個主要原因：

* 動態填充 (Dynamic Padding): 不同長度的序列需要填充到相同的長度，以便能夠在同一批次中進行處理。Data collator 可以自動計算每個批次的最大長度，並對序列進行適當的填充。

* 批次處理 (Batch Processing): Data collator 可以將多個樣本組合成一個批次，這樣可以更高效地利用計算資源，特別是在使用 GPU 或 TPU 時。

* 生成注意力掩碼 (Attention Masks): 在填充序列時，data collator 會生成相應的注意力掩碼 (attention masks)，以確保模型只關注實際的數據部分，而忽略填充部分。

* 簡化代碼 (Code Simplification): 使用 data collator 可以簡化數據處理的代碼，減少手動處理數據的繁瑣步驟，讓開發者專注於模型設計和訓練。

總之，data collator 在微調語言模型時提供了便利和效率，確保數據能夠以一致且高效的方式進行處理。

在這邊我們使用 `DataCollatorForSeq2Seq` 是一個專門用於 BART 或 T5 這類 Seq2Seq 模型的數據整理器。

In [None]:
# Set up DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(
  tokenizer=tokenizer,
  model=peft_model,
)

In [None]:
# Demonstrate the output of DataCollatorForSeq2Seq.
# The collator expects a list with one feature dict per example, so the first
# `first_n_data` records are fetched one by one; slicing the dataset would
# return a single dict of columns and produce a batch with an extra dimension.
features = [tokenized_dataset["test"][i] for i in range(first_n_data)]
batch = data_collator(features)
pprint(batch)

{'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0]]]),
 'decoder_input_ids': tensor([[[    0,   101,   174,     3,     9,  3452,  1132,    13,  4570,  3786,
             11,  4750,  1918,     8,  5834,    11,  2178,    13,     3,    51

這裡要注意的主要是各個例子的長度不一，所以長度不足的例子的 `input_ids` 和 `attention_mask` 已經在右側以 `<pad>` 標記（其 ID 是 0）填充。類似地，我們可以看到 `labels` 已用 -100 填充，以確保填充標記被損失函數忽略。最後，我們可以看到一個新的 `decoder_input_ids`，它通過在開頭插入 `<pad>` 標記將標籤向右移動。

我們終於擁有了訓練所需的所有的前期準備！我們現在只需要使用標準參數實例化訓練器。

### 模型評估函數

在訓練過程中包含度量標準通常有助於評估模型的性能。您可以使用 Evaluate 庫快速加載評估方法。

BLEU 和 ROUGE 分數都是在機器翻譯與文本摘要等文本生成任務中廣泛使用的重要評估指標，但它們側重的方面不同，BLEU 側重於精確度，而 ROUGE 側重於召回率。

* `BLEU (Bilingual Evaluation Understudy)`: BLEU 分數側重於精確度，最初是為了自動評估機器翻譯的品質而設計。BLEU 分數量化機器翻譯的文本與參考翻譯之間的相似性，這種測量是使用 n-gram 進行的。

* `SacreBLEU`: SacreBLEU 通過標準化和簡化使用過程，提供了一個更一致和方便的評分方法。SacreBLEU 的標準化計算方法確保了不同實驗和研究之間的分數具有可比性，並且自動處理常見的文本預處理步驟，使其更易於使用。

* `ROUGE (Recall-Oriented Understudy for Gisting Evaluation)`: ROUGE 分數側重於召回率。它將自動生成的摘要或翻譯與一個或多個參考進行比較。ROUGE 分數範圍從 0 到 1，反映了機器生成的摘要與參考之間的相似性，分數越高表示相似性越大。

In [None]:
metric = evaluate.load(config.eval_metric)

def compute_metrics(eval_pred):
    """Compute the BLEU score of the generated predictions against the labels.

    Args:
        eval_pred: A pair `(predictions, labels)` of token-ID arrays. Either
            array may contain -100 where sequences were padded.

    Returns:
        A dict with a single "bleu" entry holding the score reported by the
        loaded metric.

    Raises:
        ValueError: If the metric result contains neither 'bleu' nor 'score'.
    """
    # Unpack predictions and labels from the input
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple whose first element holds the token IDs
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token ID and cannot be decoded, so it is replaced by
    # the pad token ID in both arrays; the pad tokens are then dropped again by
    # `skip_special_tokens=True`.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode token IDs back to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute the evaluation metric
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
    )
    if 'bleu' in result: # for bleu
        return {"bleu": result["bleu"]}
    elif 'score' in result: # for sacrebleu
        return {"bleu": result["score"]}
    else:
        raise ValueError('The evaluation metric is not supported.')

### 訓練參數設定

用於設定訓練過程中的各種參數，如學習率、批次大小、梯度累積步數、訓練 epoch 數、權重衰減等。

* `output_dir` 指定了訓練輸出的目錄。
* `logging_steps` 訓練時的日誌步數，決定每隔多少步輸出一次訓練日誌。
* `eval_strategy` 和 `save_strategy` 設定為 'steps'，表示以 step 為單位進行評估和儲存：未設定 `eval_steps` 時，每 `logging_steps` 個 steps 評估一次；儲存則依 `save_steps`（預設 500）個 steps 進行一次。
* `load_best_model_at_end` 設定為 `True`，表示訓練結束後會載入最佳模型。
* `report_to` 設定為 'none'，禁用了 wandb 報告。
* `predict_with_generate` 設定為 `True`，表示在評估過程中使用生成的文本進行預測。
* `save_total_limit` 設定了最多儲存 5 個 checkpoints。

In [None]:
# Hyper-parameters and bookkeeping options for the Seq2Seq training run.
training_args = Seq2SeqTrainingArguments(
  # Output, logging and checkpointing
  output_dir=os.path.join('saved_model', f'{__ipynb_file__}_train_log'), # directory for training output
  logging_steps=100, # log every 100 steps (the default is every 500 steps)
  report_to='none', # disable wandb reporting (the Colab environment expects wandb by default)
  save_strategy='steps', # checkpoint saving strategy
  save_total_limit=5, # keep at most 5 checkpoints
  load_best_model_at_end=True, # reload the best model once training finishes
  # Evaluation
  eval_strategy='steps', # evaluation strategy
  predict_with_generate=True, # generate text for the eval set and compute metrics on it
  per_device_eval_batch_size=config.batch_size, # evaluation batch size per device
  # Optimisation
  num_train_epochs=config.epochs, # total number of training epochs
  learning_rate=config.lr, # learning rate
  weight_decay=config.weight_decay, # weight decay
  per_device_train_batch_size=config.batch_size, # training batch size per device
  gradient_accumulation_steps=config.gradient_accumulation_steps, # gradient accumulation steps
)

### 訓練器初始化

用於初始化訓練器，並開始訓練模型。

* `model` 是要訓練的模型。
* `tokenizer` 是用於處理文本的分詞器。
* `train_dataset` 和 `eval_dataset` 是訓練和評估數據集。
* `data_collator` 是用於整理數據的數據整理器。
* `compute_metrics` 是用於計算度量標準的函數。

In [None]:
# Build the trainer that fine-tunes the LoRA-wrapped model.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated in transformers 4.46+ and emits a FutureWarning.
trainer = Seq2SeqTrainer(
    model=peft_model, # model to train
    processing_class=tokenizer, # tokenizer used to process the text
    args=training_args, # training arguments
    train_dataset=tokenized_dataset['train'], # training dataset
    eval_dataset=tokenized_dataset['validation'], # evaluation dataset
    data_collator=data_collator, # data collator
    compute_metrics=compute_metrics, # evaluation function
)

  trainer = Seq2SeqTrainer(


### 開始訓練

In [None]:
# Start training; this may take a while.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Bleu
100,5.2916,0.283348,0.24145
200,0.3338,0.119987,0.241154
300,0.1878,0.074505,0.234354
400,0.136,0.059236,0.234213
500,0.1112,0.048941,0.235815
600,0.0934,0.044429,0.236459
700,0.0827,0.03991,0.237017
800,0.0799,0.037173,0.235019
900,0.0734,0.035591,0.239071
1000,0.0744,0.035304,0.236368


  test_elements = torch.tensor(test_elements)
  test_elements = torch.tensor(test_elements)


TrainOutput(global_step=1078, training_loss=0.6050178195196092, metrics={'train_runtime': 3267.3594, 'train_samples_per_second': 2.639, 'train_steps_per_second': 0.33, 'total_flos': 2399136932966400.0, 'train_loss': 0.6050178195196092, 'epoch': 2.0})

訓練完成後，您可以通過運行 `Trainer.evaluate()` 方法在驗證集上評估模型的性能。它會計算模型的損失和其他評估指標，並返回這些結果。這對於了解模型在未見數據上的表現非常有用。

In [None]:
# Evaluate the trained model on the trainer's evaluation dataset and display
# the resulting loss and metrics.
trainer.evaluate()

  test_elements = torch.tensor(test_elements)


{'eval_loss': 0.03530417010188103,
 'eval_bleu': 0.23636758649616496,
 'eval_runtime': 204.9096,
 'eval_samples_per_second': 5.266,
 'eval_steps_per_second': 1.318,
 'epoch': 2.0}

#### 保存 LoRA 模型參數

In [None]:
# Save the LoRA parameters to the path configured for the adapter.
peft_model.save_pretrained(config.saved_lora_path)

#### 保存 Tokenizer

In [None]:
# Save the tokenizer files to the configured model directory.
tokenizer.save_pretrained(config.saved_model_path)

('saved_model/pii-masking-encode-decode-dfca0739-dd45-403e-912e-dc82b3708a3f/tokenizer_config.json',
 'saved_model/pii-masking-encode-decode-dfca0739-dd45-403e-912e-dc82b3708a3f/special_tokens_map.json',
 'saved_model/pii-masking-encode-decode-dfca0739-dd45-403e-912e-dc82b3708a3f/spiece.model',
 'saved_model/pii-masking-encode-decode-dfca0739-dd45-403e-912e-dc82b3708a3f/added_tokens.json',
 'saved_model/pii-masking-encode-decode-dfca0739-dd45-403e-912e-dc82b3708a3f/tokenizer.json')

### 釋放資源

In [None]:
# Release the training objects and hand their memory back to the accelerator.
import gc

# Drop the references that keep the trainer and tokenizer alive.
del trainer
del tokenizer

# Move the weights to the CPU before dropping the reference to the model.
peft_model.to('cpu')
del peft_model

# Collect garbage first so unreachable tensors are actually freed, then ask the
# caching allocator to release its now-unused blocks. Emptying the cache before
# collecting would leave those tensors' memory reserved.
gc.collect()

if torch.cuda.is_available():
  torch.cuda.empty_cache()

# The notebook prefers the MPS device when it is available, so release its
# cache too (the `torch.mps` module only exists in newer PyTorch releases).
if torch.backends.mps.is_available() and hasattr(torch, 'mps'):
  torch.mps.empty_cache()

169

## 評估微調模型

### 載入微調分詞器 (Tokenizer)

從已經完成訓練的模型取得 Tokenizer，可以留意這個訓練時保存下來的 Tokenizer 仍保有訓練時的設定，包含 `pad_token` 和 `padding_side`。

In [None]:
# Reload the tokenizer from the directory it was saved to after training.
tokenizer = AutoTokenizer.from_pretrained(config.saved_model_path)

In [None]:
# Inspect the reloaded tokenizer's padding token (the padding side is shown in
# the next cell).
pprint(tokenizer.pad_token)

'<pad>'


In [None]:
# Show on which side the reloaded tokenizer pads sequences.
pprint(tokenizer.padding_side)

'right'


### 載入微調後模型

In [None]:
# Attach the saved LoRA adapter to the pretrained base model ...
ft_model = PeftModel.from_pretrained(model, config.saved_lora_path)
# ... and move the combined model to the selected device.
ft_model = ft_model.to(device)

### Fine-tuning 後的表現

In [None]:
# Generate a response with the fine-tuned model for each example and store it
# in the `ft_response` column of first_n_dataset (one example at a time).
first_n_dataset = first_n_dataset.map(
  lambda example: {**example, "ft_response": generator(example, ft_model)},
  batched=False,
)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.06s/ examples]


In [None]:
# Display the fine-tuned model's predictions without truncating long text columns.
pd.options.display.max_colwidth = None
pd.DataFrame(first_n_dataset)

Unnamed: 0,masked_text,unmasked_text,pt_response,ft_response
0,"We need a comprehensive review of privacy laws and regulations regarding the handling and sharing of masked credit card numbers, such as [MASKEDNUMBER_1].","We need a comprehensive review of privacy laws and regulations regarding the handling and sharing of masked credit card numbers, such as 1160254613057002.",masked credit card numbers,"We need a comprehensive review of privacy laws and regulations regarding the handling and sharing of masked credit card numbers, such as [CREDITCARDNUMBER_1]. We need a comprehensive review of privacy laws and regulations regarding the handling and sharing of masked credit card numbers, such as [CREDITCARDISSUER_1]."
1,Our client [LASTNAME_1] has a query about privacy laws regarding the use and storage of their device's MAC address [MAC_1].,Our client VonRueden has a query about privacy laws regarding the use and storage of their device's MAC address 97:f0:ee:00:e3:09.,MAC address: Our client VonRueden has a query about privacy laws regarding the use and storage of their device's MAC address 97:f0:ee:00:e3:09.,Our client [MIDDLENAME_1] has a query about privacy laws regarding the use and storage of their device's MAC address [MAC_1]. Our client [JOBTITLE_1] has a query about privacy laws regarding the use and storage of their device's MAC address [MAC_1].
2,"Is it legal to store user agent information, such as [USERAGENT_1] in accordance with privacy laws?","Is it legal to store user agent information, such as Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.10.2; rv:9.8) Gecko/20100101 Firefox/9.8.0 in accordance with privacy laws?",no,"Is it legal to store user agent information, such as [USERAGENT_1] in accordance with privacy laws?"


# (Optional) Download files from Colab workspace

In [None]:
# ![[ ! -z "${COLAB_GPU}" ]] && tar cvzf saved_encoder_model.tgz sample_data/saved_encoder_model/
# ![[ ! -z "${COLAB_GPU}" ]] && tar cvzf saved_lora_model.tgz sample_data/saved_lora_model/

In [None]:
# import os
# if 'COLAB_GPU' in os.environ:
#   from google.colab import files
#   files.download('saved_encoder_model.tgz')
#   files.download('saved_lora_model.tgz')