In [1]:
# transformers not support NumPy 2.0 yet
!pip install -q numpy~=1.26.4 transformers~=4.46.2
!pip install -q datasets pydantic peft
# !pip install -q bitsandbytes

# 訓練 PII 遮掩模型

In [2]:
import pandas as pd

from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  TrainingArguments,
  Trainer,
)
from datasets import load_dataset, DatasetDict
from transformers import (
  DataCollatorForSeq2Seq,
  BitsAndBytesConfig,
  pipeline,
)

from pydantic import BaseModel
from pprint import pprint

import torch

# 載入 PEFT 相關套件
from peft import LoraConfig, TaskType, PeftModel, get_peft_model

# Pick the compute device: Apple MPS takes precedence, then CUDA, otherwise CPU
if torch.backends.mps.is_available():
  device = torch.device("mps")
elif torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


## 下載資料

In [3]:
# Load only the first 50% of the dataset's `train` split
immutable_dataset = load_dataset("ai4privacy/pii-masking-65k", split="train[:50%]")

## 資料包含什麼？

In [4]:
# Fixed seed so the shuffled splits are identical across runs
split_seed = 42
# Reserve 0.05% of the loaded data for testing
test_dataset = immutable_dataset.train_test_split(
  test_size=0.0005, # 0.05% of the data is used for testing
  shuffle=True,
  seed=split_seed, # without a seed the shuffle differs on every run
  )
# Split the remainder into 80% training and 20% validation sets
train_dataset = test_dataset['train'].train_test_split(
  test_size=0.2, # 20% of the data is used for validation
  shuffle=True,
  seed=split_seed, # keeps train/validation membership stable across runs
  )
# NOTE: this rebinds `immutable_dataset` from a Dataset to a DatasetDict, so the cell
# cannot be re-run without first re-running the cell that loads the dataset.
immutable_dataset = DatasetDict({
  'train': train_dataset['train'],
  'validation': train_dataset['test'],
  'test': test_dataset['test'],
  })
immutable_dataset


DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text', 'token_entity_labels', 'tokenised_unmasked_text'],
        num_rows: 17261
    })
    validation: Dataset({
        features: ['masked_text', 'unmasked_text', 'token_entity_labels', 'tokenised_unmasked_text'],
        num_rows: 4316
    })
    test: Dataset({
        features: ['masked_text', 'unmasked_text', 'token_entity_labels', 'tokenised_unmasked_text'],
        num_rows: 11
    })
})

In [5]:
# Keep only the required features: 'masked_text', 'unmasked_text'
dataset = immutable_dataset.remove_columns(['token_entity_labels', 'tokenised_unmasked_text'])
dataset

DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text'],
        num_rows: 17261
    })
    validation: Dataset({
        features: ['masked_text', 'unmasked_text'],
        num_rows: 4316
    })
    test: Dataset({
        features: ['masked_text', 'unmasked_text'],
        num_rows: 11
    })
})

In [6]:
# Show the first `first_n_data` training records
first_n_data = 3
# Do not truncate long text columns when the DataFrame is displayed
pd.set_option('display.max_colwidth', None)
pd.DataFrame(dataset['train'].select(range(first_n_data)))

Unnamed: 0,masked_text,unmasked_text
0,Could you please analyze if the website visited most frequently by our user agent software [USERAGENT_1] is necessary for our operation?,"Could you please analyze if the website visited most frequently by our user agent software Mozilla/5.0 (Windows; U; Windows NT 5.0) AppleWebKit/535.1.1 (KHTML, like Gecko) Chrome/14.0.808.0 Safari/535.1.1 is necessary for our operation?"
1,"Dear Team, we have received several reports about software issues within the vicinity of IP [IP_1]. Please investigate this matter urgently, and update [EMAIL_1] with your findings.","Dear Team, we have received several reports about software issues within the vicinity of IP 97fe:fef5:3941:ab2c:58dc:bb80:a8a6:03ac. Please investigate this matter urgently, and update Lilliana.Greenholt56@yahoo.com with your findings."
2,Abbiamo individuato alcune comunità online che discutono attivamente dei nostri prodotti e servizi. Apprezzeremmo se potesse tenere d'occhio queste comunità e segnalare eventuali discussioni o feedback importanti. Prestare particolare attenzione a eventuali post degli utenti nella [STATE_1] [COUNTY_1]. Abbiamo bisogno di questi dati per perfezionare la nostra strategia di relazioni pubbliche.,Abbiamo individuato alcune comunità online che discutono attivamente dei nostri prodotti e servizi. Apprezzeremmo se potesse tenere d'occhio queste comunità e segnalare eventuali discussioni o feedback importanti. Prestare particolare attenzione a eventuali post degli utenti nella Illinois Buckinghamshire. Abbiamo bisogno di questi dati per perfezionare la nostra strategia di relazioni pubbliche.


## 訓練設定

In [7]:
# Training settings; a small batch size combined with gradient accumulation steps saves memory
class Config(BaseModel):
  """Hyper-parameters and paths used by this notebook.

  All fields have defaults, so `Config()` yields the settings used below.
  """
  model_name: str = 'microsoft/Phi-3.5-mini-instruct'
  saved_model_path: str = 'sample_data/saved_encoder_model' # path to save the trained model
  saved_lora_path: str = 'sample_data/saved_lora_model' # path to save the trained LoRA adapter
  train_batch_size: int = 2 # size of the input batch in training
  eval_batch_size: int = 2 # size of the input batch in evaluation
  gradient_accumulation_steps: int = 2 # number of update steps to accumulate before performing a backward/update pass
  epochs: int = 1 # number of training epochs; in testing, at least 5 epochs were needed to see results
  lr: float = 2e-5 # learning rate, controls how fast or slow the model learns
  weight_decay: float = 0.01 # weight decay, helps the model stay simple and avoid overfitting by penalizing large weights.

  # LoRA settings
  rank: int = 128 # rank of the PEFT model

# Single shared settings instance read by the cells below
config = Config()

In [8]:
# Quantization 壓縮設定，將 model 壓縮至 8-bit
# quantization_config = BitsAndBytesConfig(
#   load_in_8bit=True,
# )

## 先觀察 Fine-tuning 前的表現

### 詠唱格式化 (Prompt Formatting)

先定義我們的詠唱 (Prompt) 格式。為此，我們將創建一個格式化函數。

In [9]:
def instruction_formatter(x):
  """Build the PII-masking prompt for a single record.

  Args:
    x: mapping that provides the raw text under the key 'unmasked_text'.

  Returns:
    The prompt string: a leading newline, the instruction line, the
    `Input:` line carrying the raw text, and a trailing `Output:` line
    followed by a newline and two spaces.
  """
  prompt_lines = [
    '',
    '    Given the information below, mask the personal identifiable information.',
    '',
    f"    Input: {x['unmasked_text']}",
    '',
    '    Output:',
    '  ',
  ]
  return '\n'.join(prompt_lines)

In [10]:
# Load the tokenizer that belongs to the pretrained model
tokenizer = AutoTokenizer.from_pretrained(
  config.model_name,
)
# Inspect the tokenizer: does it define a PADDING token?
pprint(tokenizer)

GPT2TokenizerFast(name_or_path='distilbert/distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [12]:
# Reuse the EOS token as the padding token when the tokenizer does not define one
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
  print('=== 設定 Padding Token ===')
  pprint(tokenizer)
# Make sure padding_side is 'right'
if tokenizer.padding_side != 'right':
  tokenizer.padding_side = 'right'
  print('=== 設定 Padding Side ===')
  pprint(tokenizer)

GPT2TokenizerFast(name_or_path='distilbert/distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [13]:
# Load the pretrained model with bfloat16 weights and move it to the selected device
model = AutoModelForCausalLM.from_pretrained(
  config.model_name,
  torch_dtype=torch.bfloat16,
  low_cpu_mem_usage=True,
#  quantization_config=quantization_config,
).to(device)

ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
# Print the name and dtype of every model parameter
for name, param in model.named_parameters():
  print(f'{name}: {param.dtype}')

In [12]:
# Build a text-generation pipeline around the pretrained model and its tokenizer
generator = pipeline(
  task='text-generation',
  model=model,
  tokenizer=tokenizer,
  device=device,
)

In [13]:
# Show what the pretrained model generates for the first test record
# Named `prompt` so that the builtin `input` is not shadowed for the rest of the session
prompt = instruction_formatter(dataset['test'][0])
response = generator(
  prompt,
  max_new_tokens=512, # upper bound on the number of newly generated tokens
  repetition_penalty=1.5, # repetition penalty between 1 and 2: 1.0 (no penalty), 2.0 (maximum penalty)
)
print(response[0]['generated_text'])

  test_elements = torch.tensor(test_elements)



    Given the information below, mask the personal identifiable information.

    Input: To ensure network security in our Configuration, kindly change your system password to pEyDYbbpBRSs at your earliest convenience.

    Output:
   If my service is up and running on a new computer with multiple CPUs but you still need an old school backup solution before sending us copies of them that would be extremely important not only for yourself when I put this software over there –but also because it makes using Microsoft's Azure services much easier since they give more secure access as well by adding back-end encryption functionality which reduces all those extra bitbuckets going missing (unless we're working directly from Amazon).


## 資料預處理

In [14]:
def process_func(x):
  """Tokenize one record into a causal-LM training example.

  The prompt built by `instruction_formatter(x)` and the target
  `x['masked_text']` (followed by the EOS token) are tokenized separately
  and concatenated. Prompt positions are labelled -100 so that only the
  response tokens carry a training label.

  Args:
    x: record providing 'masked_text', plus whatever `instruction_formatter` reads.

  Returns:
    dict with equally long lists 'input_ids', 'attention_mask' and 'labels'.
  """
  instruction = tokenizer(
    instruction_formatter(x),
    truncation=True)
  # Tokenize the response with the end-of-sequence token appended.
  # add_special_tokens=False: the response continues the prompt, so a tokenizer
  # that adds special tokens (e.g. a BOS token) must not insert them mid-sequence.
  response = tokenizer(
    x['masked_text'] + tokenizer.eos_token,
    truncation=True,
    add_special_tokens=False)
  # Combine the instruction and response
  token_ids = instruction['input_ids'] + response['input_ids']
  attention_mask = instruction['attention_mask'] + response['attention_mask']
  # The labels are the response token ids, but ignore the instruction token ids by setting them to -100
  labels = [-100] * len(instruction['input_ids']) + response['input_ids']

  # Each part was truncated on its own, so the concatenation can still exceed
  # the tokenizer's maximum length; bound the combined sequence as well.
  max_length = tokenizer.model_max_length

  return {
    'input_ids': token_ids[:max_length],
    'attention_mask': attention_mask[:max_length],
    'labels': labels[:max_length],
  }

In [15]:
# Apply process_func to each record (one at a time) and drop the original text columns
tokenized_dataset = dataset.map(
  process_func,
  batched=False,
  remove_columns=dataset['train'].column_names,
)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 17261/17261 [00:07<00:00, 2286.91 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4316/4316 [00:01<00:00, 2290.18 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 536.32 examples/s]


In [16]:
# Inspect the first tokenized training example.
# Index the row first so only one example is read instead of whole columns.
first_example = tokenized_dataset['train'][0]
print('=== 輸入資料 ===')
print(tokenizer.decode(first_example['input_ids']))
print()
print('=== 標註資料 (非 -100) ===')
# Labels equal to -100 mark ignored (prompt) positions; decode only the rest
print(tokenizer.decode(
  [token_id for token_id in first_example['labels'] if token_id != -100]
))


=== 輸入資料 ===

    Given the information below, mask the personal identifiable information.

    Input: Dans le cas où vous décideriez de procéder au paiement en cryptomonnaie, veuillez trouver notre adresse Ethereum 0xbf9555a9cc7d3ca2bfab02bfdf43210a8cc5bb0d pour des transactions rapides et faciles.

    Output:
  Dans le cas où vous décideriez de procéder au paiement en cryptomonnaie, veuillez trouver notre adresse Ethereum [ETHEREUMADDRESS_1] pour des transactions rapides et faciles.<|endoftext|>

=== 標註資料 (非 -100) ===
Dans le cas où vous décideriez de procéder au paiement en cryptomonnaie, veuillez trouver notre adresse Ethereum [ETHEREUMADDRESS_1] pour des transactions rapides et faciles.<|endoftext|>


## 訓練模型

您現在可以開始訓練您的模型了！使用 AutoModelForCausalLM 加載預訓練的模型：

In [18]:
# Show the total and the trainable parameter counts of the model
print('Parameters: {:,}, Trainable Parameters: {:,}'.format(
  model.num_parameters(),
  model.num_parameters(only_trainable=True)))

Parameters: 81,912,576, Trainable Parameters: 81,912,576


### LoRA 的訓練策略 - 降維打擊

LoRA（Low-Rank Adaptation）是一種用於訓練大型語言模型的技術，旨在提高訓練效率並減少計算資源的需求。以下是為何需要透過LoRA訓練的一些原因：

降低計算成本：LoRA 會凍結預訓練模型的原始權重，只額外訓練一組低秩矩陣（以 ΔW = BA 近似權重的更新量），因此可訓練參數的數量顯著減少，從而降低了計算成本和記憶體需求。

加速訓練速度：由於參數數量減少，LoRA 可以加速模型的訓練過程，使得在相同的硬件資源下能夠更快地完成訓練。

![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_diagram.png)

In [19]:
# PEFT (LoRA) configuration
# NOTE(review): 'qkv_proj' presumably matches the attention projection module name of
# the model in config.model_name — confirm when switching to a different model.
# NOTE(review): lora_alpha is not set here, so the library default is used — confirm
# that the default suits rank=config.rank.
lora_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  r=config.rank,
  target_modules=['qkv_proj'],
)
pprint(lora_config)

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
           peft_type=<PeftType.LORA: 'LORA'>,
           auto_mapping=None,
           base_model_name_or_path=None,
           revision=None,
           inference_mode=False,
           r=256,
           target_modules=None,
           exclude_modules=None,
           lora_alpha=8,
           lora_dropout=0.0,
           fan_in_fan_out=False,
           bias='none',
           use_rslora=False,
           modules_to_save=None,
           init_lora_weights=True,
           layers_to_transform=None,
           layers_pattern=None,
           rank_pattern={},
           alpha_pattern={},
           megatron_config=None,
           megatron_core='megatron.core',
           loftq_config={},
           eva_config=None,
           use_dora=False,
           layer_replication=None,
           runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False),
           lora_bias=False)


In [20]:
# Build the PEFT model from the pretrained model and the LoRA configuration
peft_model = get_peft_model(
  model, # pretrained model
  lora_config, # PEFT configuration
)



In [22]:
# Display the PEFT model to see which modules were affected by PEFT
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=256, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=256, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_ma

In [None]:
# Print the name and dtype of every PEFT model parameter
for name, param in peft_model.named_parameters():
  print(f'{name}: {param.dtype}')

In [None]:
# Train the PEFT model in half precision as well
# NOTE(review): the base model was loaded with torch_dtype=torch.bfloat16, while
# .half() presumably casts to float16 — confirm which precision is intended.
peft_model = peft_model.half()

In [23]:
# Show how many parameters are trainable after applying PEFT
peft_model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 86,631,168 || trainable%: 5.4468


In [24]:
# Training hyper-parameters, taken from `config`
training_args = TrainingArguments(
  output_dir='sample_data/train_output_pii_masking',
  learning_rate=config.lr,
  per_device_train_batch_size=config.train_batch_size,
  per_device_eval_batch_size=config.eval_batch_size,
  gradient_accumulation_steps=config.gradient_accumulation_steps,
  num_train_epochs=config.epochs,
  weight_decay=config.weight_decay,
  eval_strategy='epoch', # evaluate once per epoch
  save_strategy='epoch', # save a checkpoint once per epoch
  load_best_model_at_end=True,
  report_to='none', # Disable wandb on colab
  adam_epsilon=1e-4, # a larger Adam epsilon is needed when training in half precision
)

trainer = Trainer(
  model=peft_model,
  args=training_args,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['validation'],
  # Pads input_ids, attention_mask and labels to a common length per batch
  data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True),
  # `tokenizer=` is deprecated in the pinned transformers 4.46; `processing_class` replaces it
  processing_class=tokenizer,
)

  trainer = Trainer(


In [25]:
# Start training; this may take a while
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7512,0.562725
2,0.5838,0.442927
3,0.58,0.382811
4,0.5205,0.354371
5,0.4606,0.344917


TrainOutput(global_step=21580, training_loss=0.6572439197260986, metrics={'train_runtime': 12374.7877, 'train_samples_per_second': 6.974, 'train_steps_per_second': 1.744, 'total_flos': 5542076207185920.0, 'train_loss': 0.6572439197260986, 'epoch': 5.0})

In [26]:
# Save the LoRA adapter weights (the original f-string was unterminated, a SyntaxError)
peft_model.save_pretrained(config.saved_lora_path)

In [27]:
# Load the saved LoRA adapter on top of the base model
new_model = PeftModel.from_pretrained(model, config.saved_lora_path)

print("=== 合併前的模型結構 ===")
print(new_model)

# merge_and_unload() returns the base model with the LoRA weights folded in.
# The result must be rebound; otherwise `new_model` stays the PEFT wrapper and
# the merged model is never the object that gets printed or saved.
new_model = new_model.merge_and_unload()

print("=== 合併後的模型結構 ===")
print(new_model)

# Save the model
# trainer.save_model(config.saved_model_path)

=== 合併前的模型結構 ===
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=256, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=256, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
       

In [28]:
# Save `new_model` and the tokenizer to the same directory
new_model.save_pretrained(config.saved_model_path)
tokenizer.save_pretrained(config.saved_model_path)

('sample_data/saved_encoder_model/tokenizer_config.json',
 'sample_data/saved_encoder_model/special_tokens_map.json',
 'sample_data/saved_encoder_model/vocab.json',
 'sample_data/saved_encoder_model/merges.txt',
 'sample_data/saved_encoder_model/added_tokens.json',
 'sample_data/saved_encoder_model/tokenizer.json')

In [29]:
# Build a text-generation pipeline from the model saved at config.saved_model_path
generator = pipeline(
  task='text-generation',
  model=config.saved_model_path,
  device=device,
  tokenizer=tokenizer,
)

In [30]:
# Show what the new model generates for the first test record
# Named `prompt` so that the builtin `input` is not shadowed for the rest of the session
prompt = instruction_formatter(dataset['test'][0])
response = generator(
  prompt,
  max_new_tokens=512, # upper bound on the number of newly generated tokens
  repetition_penalty=1.5, # repetition penalty between 1 and 2: 1.0 (no penalty), 2.0 (maximum penalty)
)
print(response[0]['generated_text'])

  test_elements = torch.tensor(test_elements)



    Given the information below, mask the personal identifiable information.

    Input: To ensure network security in our Configuration, kindly change your system password to pEyDYbbpBRSs at your earliest convenience.

    Output:
   [email protected] Create a virtual address inside this computer before moving from an attacker mode or destination and re-enter each one of three ways by default using ssh : mbox -O "https://localhost/1" /etc/.ssh_hostname if desired Please enter such command on any terminal like sudo ssl5, ezm7r4f2d6ac8c9e0ba0801af3ec80eb35cfb50ddef86a00bd8319be40fa8868ca74aa052618257522dc1315573844476777293637892748703942deb12655954ea09043355cd4590df2028876664244623176978bf96db11ae06da491094248341407841643264309224799582796061ffab21fd31fb81fe53ad63129488708986052004112879852459530760926540725530430856305306406405321576082082582562542662782982973093354497325923875840810964849860620519970780912770610600093303108505678559621595086070186596797210535916537916836910737435536